I was asked recently to get Nagios to flag *under*utilisation for a bunch of WAN links.
I had been using a shell script written, I think, by Garry Cook and Israel Brewster, to which I had added a number of hacks for extra functionality, but I couldn't get this additional modification working without a complete rewrite.
#!/usr/bin/python
#
# NAME: check_rrd.py
# AUTHOR: Philip Damian-Grint
# MODIFIED: 6th March 2014
# VERSION: 0.5
#
# DESCRIPTION:
# Nagios Plugin to compare utilisation values from an RRD file with
# warning and critical thresholds.
# Features:
# 1. Threshold units and RRD units can be individually specified.
# Thresholds default to Kilobytes/sec and RRD units default to Bytes/sec (MRTG default)
# 2. RRD filepath can be supplied on the command line or via environment variable
# 3. Threshold direction can be reversed so that low utilisation can also be checked
# 4. Time period can be specified in minutes, hours, days or months; a basic mean average
# is taken over multiple records. Defaults to 10 minutes
# 5. Multipliers used for unit conversion can be decimal (default) or binary
# 6. Threshold behaviour can be specified so that only one direction, both directions,
# any (default) direction, or the sum of both directions can be checked against the threshold.
# 7. A maximum age of data threshold can be specified
# 8. An http link (or any text) can be appended to line 1 output.
#
# Notes:
# 1. This has been tested on a Centos 6.4 system with Nagios v4, rrdtool v1.4.8,
# and Python 2.6.6
# 2. All errors prior to fetching data or resulting in invalid or suspect data return UNKNOWN.
# 3. At present, only AVERAGE values are processed
# 4. Verbose includes a report on number of empty records, latest timestamp, RRD file processed,
# threshold behaviour and threshold direction
#
# Example configuration:
#
# file: checkcommands.cfg
#
# # Check 7-day average sum of in and out not below supplied thresholds,
# # and insert a link to MRTG at the end of line 1
# #'check_under_util' command definition
# define command{
# command_name check_under_util
# command_line $USER1$/check_rrd -f /usr/local/mrtg/share/rrd/$ARG1$.rrd -w $ARG2$ -c $ARG3$ -r -p 7days -m sumonly -v -l '<a href=/mrtg/cgi-bin/mrtg-rrd.cgi/$ARG1$.html style=font-size:6pt target=_blank>MRTG</a>'
# }
#
import argparse
from argparse import RawTextHelpFormatter
import os
import re
import rrdtool
import sys
import time
class CheckRRD(object):
    '''Structure to store key variables and data.

    The class attributes are constant lookup tables. The instance
    attributes created in __init__ are the defaults for one plugin run;
    the processing functions in this file update them (and attach further
    attributes) on the module-level instance as the check progresses.
    '''

    # Nagios states - the offset of each name is the plugin return code
    states = ('OK', 'WARNING', 'CRITICAL', 'UNKNOWN')

    # Units for thresholds and data - offsets index into the multiplier tables
    units = ('b', 'B', 'K', 'M', 'G')

    # 5x5 tables to convert data units into threshold units. Both axes follow
    # the order of `units` (b, B, K, M, G): the row is selected by the
    # threshold unit and the column by the RRD data unit, and the cell is the
    # factor the data value is multiplied by.
    #
    # The binary table is written as exact power-of-two expressions rather
    # than truncated decimals, so converting there and back is lossless.
    multi_bin = (
        (1, 8, 8 * 1024, 8 * 1024 ** 2, 8 * 1024 ** 3),
        (1.0 / 8, 1, 1024, 1024 ** 2, 1024 ** 3),
        (1.0 / (8 * 1024), 1.0 / 1024, 1, 1024, 1024 ** 2),
        (1.0 / (8 * 1024 ** 2), 1.0 / 1024 ** 2, 1.0 / 1024, 1, 1024),
        (1.0 / (8 * 1024 ** 3), 1.0 / 1024 ** 3, 1.0 / 1024 ** 2, 1.0 / 1024, 1),
    )
    multi_dec = (
        (1, 8, 8000, 8000000, 8000000000),
        (0.125, 1, 1000, 1000000, 1000000000),
        (0.000125, 0.001, 1, 1000, 1000000),
        (0.000000125, 0.000001, 0.001, 1, 1000),
        (1.25E-10, 0.000000001, 0.000001, 0.001, 1),
    )

    def __init__(self):
        '''Set the defaults for a fresh plugin run.'''
        self.version = '0.5'
        self.output = ''            # Nagios plugin output line 1
        self.info = ''              # additional output for line 2 onwards
        self.status = CheckRRD.states.index('OK')  # default to successful return code
        self.empty = 0              # number of empty records found in dataset
        self.stale_secs = False     # conditional data age check (False = no check)
        self.verbose = False
def parse_args():
    '''Retrieve and sanity-check script arguments.

    Parses the command line and stores the validated settings on the
    module-level ``rrdchk`` object: rrd_path, runits, tunits, warning,
    critical, opt_eq, period, behaviour, multipliers, stale_secs and href
    (plus verbose/info when -v is given).

    Returns True when every argument is acceptable. Any validation failure
    is reported through bail(), whose False result is returned, so the
    plugin finishes with status UNKNOWN. Note that argparse itself
    terminates the process when required options are missing or a value is
    not one of the permitted choices.
    '''
    parser = argparse.ArgumentParser(
        description='RRD Threshold Check Script v{0}'.format(rrdchk.version),
        formatter_class=RawTextHelpFormatter,
        epilog='Notes:'
               + '\n- Warning and critical thresholds are AVERAGE values.'
               + '\n- Units for stored data and thresholds:'
               + '\n "b"=bps, "B"=Bps, "K"=KBps, "M"=MBps, "G"=GBps'
               + '\n output BW uses threshold units'
               + '\n- Threshold behaviour:'
               + '\n "inout": both IN and OUT must breach'
               + '\n "sumonly": sum of IN and OUT must breach'
               + '\n "inonly"/"outonly": specified threshold must breach'
               + '\n "any": either threshold can breach')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-f', dest='rrd_file',
                       action='store',
                       help='rrd file-path')
    group.add_argument('-e', dest='rrd_env',
                       action='store',
                       help='rrd environment variable')
    parser.add_argument('-r', dest='direction',
                        action='store_true',
                        default=False,
                        help='reverse threshold direction (low check)')
    parser.add_argument('-b', dest='binary',
                        action='store_true',
                        default=False,
                        help='use binary (1024) multiples instead of decimal (1000)')
    parser.add_argument('-m', dest='threshold',
                        action='store',
                        default='any',
                        choices=['any', 'inout', 'inonly', 'outonly', 'sumonly'],
                        help='threshold behaviour: (any|inout|inonly|outonly|sumonly)')
    parser.add_argument('-l', dest='embedded_link',
                        action='store',
                        help='http link to append to output line 1')
    parser.add_argument('-a', dest='age_check',
                        action='store',
                        default=False,
                        help='data age threshold in seconds')
    parser.add_argument('-w', dest='warning',
                        action='store',
                        required=True,
                        help='warning threshold')
    parser.add_argument('-c', dest='critical',
                        action='store',
                        required=True,
                        help='critical threshold')
    parser.add_argument('-p', dest='period',
                        action='store',
                        default='10minutes',
                        help='time period: N{minutes|hours|days|months}, default 10minutes')
    parser.add_argument('-d', dest='rrd_units',
                        action='store',
                        choices=['b', 'B', 'K', 'M', 'G'],
                        default='B',
                        help='rrd data units (default Bytes/sec)')
    parser.add_argument('-u', dest='thresh_units',
                        action='store',
                        choices=['b', 'B', 'K', 'M', 'G'],
                        default='K',
                        help='threshold units (default Kilobytes/sec)')
    parser.add_argument('-v', dest='verbose',
                        action='store_true',
                        help='verbose output')

    # Any arguments? This must be tested before parse_args() is called,
    # because argparse exits by itself once it sees that the required
    # options are missing. Reporting through bail() keeps the status UNKNOWN
    # and puts the usage text after the status line.
    if len(sys.argv) == 1:
        return bail('No arguments supplied\n' + parser.format_usage())
    args = parser.parse_args()

    # How much verbosity?
    if args.verbose:
        rrdchk.verbose = True
        rrdchk.info = '\n'

    # Path to RRD file? (argparse guarantees exactly one of -f / -e)
    if args.rrd_file:
        rrdchk.rrd_path = args.rrd_file
    elif args.rrd_env:
        try:
            rrdchk.rrd_path = os.environ[args.rrd_env]
        except KeyError:
            return bail('Error reading environment variable {0}'.format(args.rrd_env))
    if rrdchk.verbose:
        rrdchk.info += 'RRD file:{0}'.format(rrdchk.rrd_path)

    # Input and output units
    rrdchk.runits = args.rrd_units
    rrdchk.tunits = args.thresh_units

    # Warning and Critical supplied? Negative values are rejected as well as
    # non-numeric ones, matching what the error message promises.
    try:
        warning = int(args.warning)
        critical = int(args.critical)
    except (TypeError, ValueError):
        warning = critical = -1
    if warning < 0 or critical < 0:
        return bail('Warning ({0}) and Critical ({1}) thresholds must be positive integers'.format(args.warning, args.critical))
    rrdchk.warning = warning
    rrdchk.critical = critical

    # Threshold higher or lower?
    if args.direction:
        rrdchk.opt_eq = '<='
        if rrdchk.verbose:
            rrdchk.info += ', checking for LOW threshold'
    else:
        rrdchk.opt_eq = '>='

    # Reasonable time period? The pattern is anchored at the end so that
    # trailing junk cannot slip through to rrdtool.
    period = re.match(r'([0-9]+)(minutes|hours|days|months)$', args.period)
    if not period:
        return bail('Invalid time period')
    # Every unit is capped at the equivalent of one year
    longest = {'months': 12, 'days': 365, 'hours': 8760, 'minutes': 525600}
    if int(period.group(1)) > longest[period.group(2)]:
        return bail('Unreasonable time period')
    rrdchk.period = args.period

    # Threshold behaviour (already restricted to valid choices by argparse)
    rrdchk.behaviour = args.threshold

    # Binary vs decimal multipliers for calculations?
    if args.binary:
        rrdchk.multipliers = CheckRRD.multi_bin
    else:
        rrdchk.multipliers = CheckRRD.multi_dec

    # Record age check required?
    if args.age_check:
        try:
            stale_secs = int(args.age_check)
        except (TypeError, ValueError):
            stale_secs = -1
        if stale_secs < 0:
            return bail('Data age threshold must be a positive integer, when present')
        rrdchk.stale_secs = stale_secs

    # Store embedded link if supplied
    if args.embedded_link:
        rrdchk.href = args.embedded_link
    else:
        rrdchk.href = ''
    return True
def fetch_data():
    '''Retrieve traffic samples for requested period.

    Reads the RRD header via rrdtool.info, optionally enforces the maximum
    data age (rrdchk.stale_secs), then fetches the AVERAGE rows for
    rrdchk.period into rrdchk.dataset.

    Returns True on success. Returns the result of bail() (False, status
    UNKNOWN) when rrdtool reports an error, the data is too old, or the
    file does not hold exactly two data sources.
    '''
    def _latest_slot():
        '''Format the RRD's last update time as local time.'''
        return time.strftime('%Y-%m-%d %H:%M:%S',
                             time.localtime(rrdchk.rrd_info['last_update']))

    # First check data age
    try:
        rrdchk.rrd_info = rrdtool.info(rrdchk.rrd_path)
    except rrdtool.error as e:
        return bail('Error from RRDTOOL.info: {0}'.format(e))
    if rrdchk.stale_secs:
        if int(time.time()) - rrdchk.rrd_info['last_update'] > rrdchk.stale_secs:
            return bail('Data age check failed: latest dataslot more than {0} minutes ago ({1})'.format(
                int(rrdchk.stale_secs / 60.0),
                _latest_slot()))

    # Then pull a dataset
    try:
        _times, ds_names, dataset = rrdtool.fetch(rrdchk.rrd_path,
                                                  'AVERAGE',
                                                  '-s-{0}'.format(rrdchk.period))
    except rrdtool.error as e:
        return bail('Error from RRDTOOL.fetch: {0}'.format(e))
    # Each row is later unpacked as an (in, out) pair, so anything other than
    # two data sources has to be reported here instead of raising a traceback.
    if len(ds_names) != 2:
        return bail('Expected 2 data sources in RRD file, found {0}'.format(len(ds_names)))
    rrdchk.dataset = dataset

    if rrdchk.verbose:
        rrdchk.info += ', latest dataslot found: {0}'.format(_latest_slot())
    return True
def normalise_data():
    '''Convert data to same units as thresholds.

    Averages the IN and OUT columns of rrdchk.dataset over the rows that
    hold data in both columns, then scales the averages from the RRD units
    (rrdchk.runits) to the threshold units (rrdchk.tunits). Results are
    stored in rrdchk.in_average/out_average and rrdchk.in_norm/out_norm;
    rrdchk.empty receives the number of skipped rows.

    Returns True on success, or the result of bail() (False, status
    UNKNOWN) when no row contains data.
    '''
    # Sum dataset, keep track of empty slots. Float accumulators keep the
    # averages from being truncated by integer division under Python 2.
    in_sum = out_sum = 0.0
    rrdchk.empty = 0
    for in_slot, out_slot in rrdchk.dataset:
        if in_slot is None or out_slot is None:
            # A row missing either direction is skipped entirely
            rrdchk.empty += 1
            continue
        in_sum += in_slot
        out_sum += out_slot

    total = len(rrdchk.dataset)
    if rrdchk.verbose:
        rrdchk.info += ', found {0} empty records out of {1}'.format(rrdchk.empty, total)

    # Check for empty dataset (this also covers a dataset with no rows)
    if rrdchk.empty == total:
        return bail('No records in the time period contained data')

    # Calculate averages over the populated rows only
    populated = total - rrdchk.empty
    rrdchk.in_average = in_sum / populated
    rrdchk.out_average = out_sum / populated

    # Convert averages into threshold units: row = threshold unit,
    # column = RRD data unit
    factor = rrdchk.multipliers[CheckRRD.units.index(rrdchk.tunits)][CheckRRD.units.index(rrdchk.runits)]
    rrdchk.in_norm = rrdchk.in_average * factor
    rrdchk.out_norm = rrdchk.out_average * factor
    return True
def check_threshold():
    '''Carry out threshold checking on normalised data.

    Compares rrdchk.in_norm, rrdchk.out_norm and their sum with
    rrdchk.warning and rrdchk.critical, in the direction selected by
    rrdchk.opt_eq ('<=' for a low check, otherwise a high check), and
    raises rrdchk.status according to rrdchk.behaviour. The status is never
    lowered. Always returns True.
    '''
    low_check = rrdchk.opt_eq == '<='

    def _severity(value):
        '''Return the Nagios state offset that value earns against the thresholds.'''
        # Plain comparisons replace the eval() of a built-up expression string
        if low_check:
            warn = value <= rrdchk.warning
            crit = value <= rrdchk.critical
        else:
            warn = value >= rrdchk.warning
            crit = value >= rrdchk.critical
        if crit:
            return CheckRRD.states.index('CRITICAL')
        if warn:
            return CheckRRD.states.index('WARNING')
        return CheckRRD.states.index('OK')

    # Calculate all possible statuses
    in_status = _severity(rrdchk.in_norm)
    out_status = _severity(rrdchk.out_norm)
    sum_status = _severity(rrdchk.out_norm + rrdchk.in_norm)

    # Now determine which will contribute to Nagios output
    #   ANY     - triggered by either the IN or the OUT threshold
    #   INONLY  - triggered by the IN threshold, OUT not checked
    #   OUTONLY - triggered by the OUT threshold, IN not checked
    #   INOUT   - triggered only if both IN and OUT breach
    #   SUMONLY - triggered only if the sum of IN and OUT breaches
    if rrdchk.behaviour == 'inonly':
        note = ', IN threshold used, OUT ignored'
        candidate = in_status
    elif rrdchk.behaviour == 'outonly':
        note = ', OUT threshold used, IN ignored'
        candidate = out_status
    elif rrdchk.behaviour == 'inout':
        note = ', Both IN and OUT thresholds used'
        # Both directions must breach, so the milder of the two applies
        candidate = min(in_status, out_status)
    elif rrdchk.behaviour == 'sumonly':
        note = ', Sum of IN and OUT thresholds used'
        candidate = sum_status
    else:
        # default either/or case last: the more severe direction applies
        note = ', Either IN or OUT thresholds used'
        candidate = max(in_status, out_status)

    if rrdchk.verbose:
        rrdchk.info += note
    if candidate > rrdchk.status:
        rrdchk.status = candidate
    return True
def bail(msg):
    '''Record a processing error: output line and status both become UNKNOWN.

    msg is the explanatory text placed after the 'UNKNOWN - ' prefix.
    Always returns False, so callers can simply ``return bail(...)``.
    '''
    failure_line = 'UNKNOWN - ' + msg
    unknown_code = CheckRRD.states.index('UNKNOWN')
    rrdchk.output = failure_line
    rrdchk.status = unknown_code
    return False
def build_output():
    '''Prepare Nagios-Plugin standard output.

    Builds output line 1 in rrdchk.output from the final status, the time
    period, the normalised IN/OUT averages (4 decimal places, in threshold
    units) and the optional embedded link. Always returns True.
    '''
    # 'b' and 'B' already spell out bits/bytes, giving "bps" and "Bps";
    # only the scaled units need a 'B' appended ("KBps", "MBps", "GBps").
    unit_suffix = '' if rrdchk.tunits in ('b', 'B') else 'B'
    rrdchk.output = '{0} - Average BW ({1}) in: {2:.4f}{4}{5}ps, out: {3:.4f}{4}{5}ps {6}'.format(
        CheckRRD.states[rrdchk.status],
        rrdchk.period,
        rrdchk.in_norm,
        rrdchk.out_norm,
        rrdchk.tunits,
        unit_suffix,
        rrdchk.href)
    return True
################################
# MAIN starts here
################################
# Shared state object read and updated by every function in this script
rrdchk = CheckRRD()
# Run the stages in order; all() stops at the first stage that returns False,
# in which case bail() has already set the UNKNOWN output line and status.
if all(check() for check in (parse_args, fetch_data, normalise_data, check_threshold)):
    build_output()
# Verbose details (empty unless -v was given) follow output line 1
rrdchk.output += rrdchk.info
# Parenthesised single-argument form prints identically on Python 2 and 3
print(rrdchk.output)
# The Nagios return code is the offset of the state in CheckRRD.states
sys.exit(rrdchk.status)