Snippets
Edel SM Nagios plugin to check RTAPI webapp ping, then writes the status in /tmp/_status.rtapi for nagios to fetch
Created by
Edelberto Mania
last modified
#!/usr/bin/env python
# nagios plugin to check response time of the rtapi "ping" function
# Edelberto Mania <ed@zenoradio.com>
# 20160125
# 20160201 - backgrounded, write status to $status_file that is read by a separate nagios plugin
## human-readable label for this check
## NOTE(review): not referenced by the code visible in this file
endpoint_title="API Response Time"
## URL requested by ping() on every check
endpoint="https://rtapi.zenoradio.com/api/Ping/read?data=1"
## number of queries to be sent (how many times ping() is called per timed check)
number_of_queries=1
## duration for test (200 in production)
## NOTE(review): not referenced by the code visible in this file -- confirm it is still needed
test_duration=200
## duration for recovery (300 in production)
## NOTE(review): not referenced by the code visible in this file -- confirm it is still needed
recovery_duration=300
## interval (15 in production): seconds slept between two consecutive checks
interval=15
## status saved in file (written by write_data, read by a separate nagios plugin)
status_file='/tmp/_status.rtapi'
## threshold (response time, in seconds) to generate alert;
## return_status() compares the measured elapsed time against these
warning_threshold=1
critical_threshold=2
## status codes used throughout the script; the values follow the
## Nagios plugin convention (0=OK, 1=WARNING, 2=CRITICAL)
ok=0
warning=1
critical=2
import os,sys,requests,time,datetime
from timeit import timeit
## on version python2, we sometimes encounter 'InsecurePlatformWarning'
## uncomment below to disable the warning
#if sys.version[0]=='2':
# import requests.packages.urllib3
# requests.packages.urllib3.disable_warnings()
def ping(timeout=10):
    """Send one GET request to ``endpoint`` and report how it went.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection, which would freeze the whole monitor loop.

    Returns:
        A ``(status, detail)`` tuple. ``status`` is ``critical`` when the
        request fails or the HTTP status is not 200, otherwise ``ok``.
        ``detail`` is a short error message on failure, or the decoded
        JSON body (falling back to the raw text) on success.
    """
    try:
        response = requests.get(endpoint, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only network/HTTP-level failures mean "down"; KeyboardInterrupt
        # and SystemExit must keep propagating so the script can be stopped.
        return (critical, 'endpoint is down')
    if response.status_code != requests.codes.ok:
        return (critical, 'returns non-200 HTTP status code')
    try:
        payload = response.json()
    except ValueError:
        # Body is not valid JSON; hand back the raw text instead.
        payload = response.text
    return (ok, payload)
def return_status(elapsed):
    """Grade a response time (seconds) as ``ok``, ``warning`` or ``critical``."""
    # Thresholds are tried from most to least severe; the first one that
    # the elapsed time reaches decides the result.
    graded_limits = (
        (critical_threshold, critical),
        (warning_threshold, warning),
    )
    for limit, code in graded_limits:
        if elapsed >= limit:
            return code
    return ok
def write_data(d):
    """Overwrite ``status_file`` with ``d`` followed by a newline."""
    with open(status_file, 'w') as handle:
        line = '{0}\n'.format(d)
        handle.write(line)
def _timed_check():
    """Run ``number_of_queries`` pings and grade the outcome.

    Returns:
        ``(exit_code, elapsed_seconds)`` where ``exit_code`` is the worse of
        the statuses reported by ``ping()`` and the response-time grade from
        ``return_status()``. Timing alone is not enough: a refused
        connection or an HTTP error comes back quickly and would otherwise
        be counted as a healthy check.
    """
    from timeit import default_timer
    worst_ping_status = ok
    started = default_timer()
    for _ in range(number_of_queries):
        ping_status, _detail = ping()
        worst_ping_status = max(worst_ping_status, ping_status)
    elapsed = default_timer() - started
    return max(worst_ping_status, return_status(elapsed)), elapsed


if __name__=='__main__':
    ## from Noah's (Crowley) criteria
    ## every 15-sec check, alert if 50% failed in 120 seconds
    ## recover at 80% in 300 seconds period
    count=0
    failure=0
    degraded=False
    ## start from a clean state so the reader plugin never sees a stale alarm
    write_data(ok)
    try:
        while True:
            exit_code,elapsed_time_in_seconds=_timed_check()
            ## if all is clear, max 'count' of check is 8
            ## if failure is 4 or beyond, max 'count' of check is 20
            if not degraded:
                if failure>=4:
                    ## enough failures in the current window: raise the alarm
                    ## and start a fresh (longer) recovery window
                    count=0
                    failure=0
                    degraded=True
                    write_data(critical)
                    print(' [ALARM is set and ongoing!]')
                elif count>=8:
                    ## window finished without reaching the failure limit
                    failure=0
                    count=0 # 120 seconds/15-sec check == 8
            ## degraded mode
            else:
                print(' [currently in ALARM mode {0} {1} ]'.format(failure,count))
                if count>=20:count=0 # 300 seconds/15-sec check == 20
                if count==0:failure=0
                ## if 80% of checks are successful, recover now
                ## NOTE(review): 80% of 20 checks implies failure<=4 (the
                ## original trailing comment here was "# 4"); confirm whether
                ## 14 is a leftover test value before changing it
                if count==19 and failure<=14:
                    degraded=False
                    failure=0
                    write_data(ok)
                    print(' <** turning off ALARM **>')
            ## record failure count
            if exit_code!=ok:failure+=1
            print('count: {0}, failure: {1}, degraded: {4}, exitcode: {2}, elapsed_time: {3}'.format(count,failure,exit_code,elapsed_time_in_seconds,degraded))
            count+=1
            time.sleep(interval)
    except KeyboardInterrupt:
        print('Quitting...')
        sys.exit(0)
|
Comments (0)
You can clone a snippet to your computer for local editing. Learn more.