Snippets

Edel SM Nagios plugin that checks the RTAPI webapp ping, then writes the status to /tmp/_status.rtapi for Nagios to fetch

Created by Edelberto Mania last modified
#!/usr/bin/env python
# nagios plugin to check response time of the rtapi "ping" function
# Edelberto Mania <ed@zenoradio.com>
# 20160125
# 20160201 - backgrounded, write status to $status_file that is read by a separate nagios plugin 

# Title of this check and the URL that every ping requests.
endpoint_title = "API Response Time"
endpoint = "https://rtapi.zenoradio.com/api/Ping/read?data=1"

# Number of queries to send per check.
number_of_queries = 1

# Duration for the test (200 in production).
test_duration = 200

# Duration for recovery (300 in production).
recovery_duration = 300

# Interval between checks, in seconds (15 in production).
interval = 15

# File the current status is saved to.
status_file = '/tmp/_status.rtapi'


# Response-time thresholds, in seconds, at which an alert is generated.
warning_threshold = 1
critical_threshold = 2

# Status codes.
ok = 0
warning = 1
critical = 2

import os,sys,requests,time,datetime
from timeit import timeit

## on version python2, we sometimes encounter 'InsecurePlatformWarning'
## uncomment below to disable the warning
#if sys.version[0]=='2':
#	import requests.packages.urllib3
#	requests.packages.urllib3.disable_warnings()

def ping(timeout=10):
    """Query the ping endpoint once and classify the outcome.

    Args:
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block the monitor
            forever. The default is well above ``critical_threshold``, so a
            request that times out is already critical by response time.

    Returns:
        A ``(status, detail)`` tuple. ``status`` is ``critical`` when the
        request fails or the HTTP status is not 200, otherwise ``ok``.
        ``detail`` is an error message on failure; on success it is the
        decoded JSON body, or the raw response text if the body is not JSON.
    """
    try:
        response = requests.get(endpoint, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt and
        # SystemExit must reach the caller so Ctrl-C can stop the monitor.
        return (critical, 'endpoint is down')

    # Check the status first: the body is irrelevant for a non-200 reply.
    if response.status_code != requests.codes.ok:
        return (critical, 'returns non-200 HTTP status code')

    try:
        payload = response.json()
    except ValueError:
        # Body is not valid JSON; fall back to the raw text.
        payload = response.text
    return (ok, payload)

def return_status(elapsed):
    """Translate a response time in seconds into a status code.

    Returns ``critical`` when ``elapsed`` reaches ``critical_threshold``,
    ``warning`` when it reaches ``warning_threshold``, and ``ok`` otherwise.
    """
    # Checked from most to least severe; the first threshold reached wins.
    levels = (
        (critical_threshold, critical),
        (warning_threshold, warning),
    )
    for limit, code in levels:
        if elapsed >= limit:
            return code
    return ok

def write_data(d):
    """Overwrite ``status_file`` with ``d`` followed by a newline."""
    with open(status_file, 'w') as handle:
        handle.write('{0}\n'.format(d))
	
def _timed_check():
    """Run ``number_of_queries`` pings; return ``(exit_code, elapsed_seconds)``.

    ``exit_code`` is the worse of the ping results and the latency status from
    ``return_status``. This way an endpoint that fails quickly (connection
    refused, non-200 reply) counts as a failure instead of a fast success.
    """
    from timeit import default_timer

    worst = ok
    started = default_timer()
    for _ in range(number_of_queries):
        worst = max(worst, ping()[0])
    elapsed = default_timer() - started
    return max(worst, return_status(elapsed)), elapsed


if __name__ == '__main__':
    ## from Noah's (Crowley) criteria
    ## every 15-sec check, alert if 50% failed in 120 seconds
    ## recover at 80% in 300 seconds period

    count = 0         # checks performed in the current window
    failure = 0       # failed checks in the current window
    degraded = False  # True while the alarm is raised

    write_data(ok)
    try:
        while True:
            exit_code, elapsed_time_in_seconds = _timed_check()

            ## for testing only
            #from random import random
            #elapsed_time_in_seconds=random()
            ## end

            ## if all is clear, max 'count' of check is 8
            ## if failure is 4 or beyond, max 'count' of check is 20
            if not degraded:
                if failure >= 4:
                    # Threshold reached: raise the alarm, start a fresh window.
                    count = 0
                    failure = 0
                    degraded = True
                    write_data(critical)
                    print('    [ALARM is set and ongoing!]')
                elif count >= 8:
                    # Window elapsed without alarm: 120 seconds/15-sec check == 8
                    failure = 0
                    count = 0

            ## degraded mode
            else:
                print('    [currently in ALARM mode {0} {1} ]'.format(failure, count))
                if count >= 20:
                    count = 0  # 300 seconds/15-sec check == 20
                if count == 0:
                    failure = 0

                ## if 80% of checks are successful, recover now
                # NOTE(review): 80% success over 20 checks would mean
                # failure <= 4 (the original trailing comment was "# 4"),
                # but the code allows up to 14 failures -- confirm intent.
                if count == 19 and failure <= 14:
                    degraded = False
                    failure = 0
                    write_data(ok)
                    print('    <** turning off ALARM **>')

            ## record failure count
            if exit_code != ok:
                failure += 1

            print('count: {0}, failure: {1}, degraded: {4}, exitcode: {2}, elapsed_time: {3}'.format(count, failure, exit_code, elapsed_time_in_seconds, degraded))
            count += 1
            time.sleep(interval)
    except KeyboardInterrupt:
        print('Quitting...')
        sys.exit(0)


Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.