Thursday, 6 March 2014

A Flexible RRD Checker for Nagios

I was asked recently to get Nagios to flag *under*utilisation for a bunch of WAN links.

I had been using a shell script written, I think, by Garry Cook and Israel Brewster, to which I had added a number of hacks for extra functionality, but I couldn't get this additional modification working without a complete rewrite.


#!/usr/bin/python
#
# NAME:     check_rrd.py
# AUTHOR:   Philip Damian-Grint
# MODIFIED: 6th March 2014
# VERSION:  0.5
#
# DESCRIPTION:
#   Nagios Plugin to compare utilisation values from an RRD file with 
#   warning and critical thresholds.
#   Features:
#   1.  Threshold units and RRD units can be individually specified.
#       Thresholds default to Kilobytes/sec and RRD units default to Bytes/sec (MRTG default)
#   2.  RRD filepath can be supplied on the command line or via environment variable
#   3.  Threshold direction can be reversed so that low utilisation can also be checked
#   4.  Time period can be specified in minutes, hours, days or months; a basic mean average
#       is taken over multiple records. Defaults to 10 minutes
#   5.  Multipliers used for unit conversion can be decimal (default) or binary
#   6.  Threshold behaviour can be specified so that only one direction, both directions, 
#       any (default) direction, or the sum of both directions can be checked against the threshold.
#   7.  A maximum age of data threshold can be specified
#   8.  An http link (or any text) can be appended to line 1 output.
#
# Notes:
#   1.  This has been tested on a Centos 6.4 system with Nagios v4, rrdtool v1.4.8,
#       and Python 2.6.6
#   2.  All errors prior to fetching data or resulting in invalid or suspect data return UNKNOWN.
#   3.  At present, only AVERAGE values are processed
#   4.  Verbose includes a report on number of empty records, latest timestamp, RRD file processed,
#       threshold behaviour and threshold direction
#
#   Example configuration:
# 
# file: checkcommands.cfg
#
# # Check 7-day average sum of in and out not below supplied thresholds, 
# #   and insert a link to MRTG at the end of line 1
# #'check_under_util' command definition
# define command{
#    command_name    check_under_util
#    command_line    $USER1$/check_rrd -f /usr/local/mrtg/share/rrd/$ARG1$.rrd -w $ARG2$ -c $ARG3$ -r -p 7days -m sumonly -v -l '<a href=/mrtg/cgi-bin/mrtg-rrd.cgi/$ARG1$.html style=font-size:6pt target=_blank>MRTG</a>'
#    }
# 

import argparse
from argparse import RawTextHelpFormatter
import os
import re
import rrdtool
import sys
import time

class CheckRRD(object):
    '''Structure to store key variables and data.

    Class attributes hold the constants shared by the whole plugin (Nagios
    state names, unit codes and the unit-conversion tables); instance
    attributes hold the output text and the running check status.
    '''

    # Nagios states - offsets = return code
    states = ('OK', 'WARNING', 'CRITICAL', 'UNKNOWN')

    # Units for thresholds and data - offsets used to index into multiplier table
    units = ('b', 'B', 'K', 'M', 'G')

    # 5x5 tables to convert data units into threshold units.
    # Rows and columns are both ordered b,B,K,M,G; multiplying a value by
    # table[row][column] converts it from the column's unit into the row's unit.
    #
    # The binary fractions are written as exact reciprocals of powers of two
    # (with a float numerator so Python 2 does not truncate them to 0) instead
    # of rounded decimal literals, so opposite entries are exact inverses.
    multi_bin = ((1, 8, 8192, 8388608, 8589934592),
                 (0.125, 1, 1024, 1048576, 1073741824),
                 (1.0 / 8192, 1.0 / 1024, 1, 1024, 1048576),
                 (1.0 / 8388608, 1.0 / 1048576, 1.0 / 1024, 1, 1024),
                 (1.0 / 8589934592, 1.0 / 1073741824, 1.0 / 1048576, 1.0 / 1024, 1))
    multi_dec = ((1, 8, 8000, 8000000, 8000000000),
                 (0.125, 1, 1000, 1000000, 1000000000),
                 (0.000125, 0.001, 1, 1000, 1000000),
                 (0.000000125, 0.000001, 0.001, 1, 1000),
                 (1.25E-10, 0.000000001, 0.000001, 0.001, 1))

    def __init__(self):
        '''Initialise the defaults that exist before any argument is parsed.'''
        self.version = '0.5'
        self.output = ''        # Nagios plugin output line 1
        self.info = ''          # additional output for line 2 onwards
        self.status = CheckRRD.states.index('OK')   # default to successful return code
        self.empty = 0          # number of empty records found in dataset
        self.stale_secs = False # conditional data age check
        self.verbose = False

class _NagiosArgumentParser(argparse.ArgumentParser):
    '''ArgumentParser whose usage errors exit with the Nagios UNKNOWN code.'''

    def error(self, message):
        '''Report a command-line error as UNKNOWN instead of argparse's exit code 2.

        Exit code 2 would be read by Nagios as CRITICAL. The usage text still
        goes to stderr; the status line goes to stdout.
        '''
        self.print_usage(sys.stderr)
        sys.stdout.write('UNKNOWN - {0}\n'.format(message))
        sys.exit(CheckRRD.states.index('UNKNOWN'))


def parse_args():
    '''Retrieve and sanity-check script arguments.

    Parsed settings are stored as attributes on the module-level ``rrdchk``
    object (rrd_path, runits, tunits, warning, critical, opt_eq, period,
    behaviour, multipliers, stale_secs, href, verbose, info).

    Returns True when every argument is acceptable; otherwise the result of
    bail(), i.e. False with the status set to UNKNOWN. Errors detected by
    argparse itself (missing or malformed options) terminate the process with
    the UNKNOWN exit code.
    '''

    parser = _NagiosArgumentParser(
            description='RRD Threshold Check Script v{0}'.format(rrdchk.version),
            formatter_class=RawTextHelpFormatter,
            epilog='Notes:'
                    + '\n- Warning and critical thresholds are AVERAGE values.'
                    + '\n- Units for stored data and thresholds:'
                    + '\n  "b"=bps, "B"=Bps, "K"=KBps, "M"=MBps, "G"=GBps'
                    + '\n  output BW uses threshold units'
                    + '\n- Threshold behaviour:'
                    + '\n  "inout": both IN and OUT must breach'
                    + '\n  "sumonly": sum of IN and OUT must breach'
                    + '\n  "inonly"/"outonly": specified threshold must breach'
                    + '\n  "any": either threshold can breach')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument( '-f', dest='rrd_file',
                            action='store',
                            help='rrd file-path')
    group.add_argument( '-e', dest='rrd_env',
                            action='store',
                            help='rrd environment variable')
    parser.add_argument('-r', dest='direction',
                            action='store_true',
                            default=False,
                            help='reverse threshold direction (low check)')
    parser.add_argument('-b', dest='binary',
                            action='store_true',
                            default=False,
                            help='use binary (1024) multiples instead of decimal (1000)')
    parser.add_argument('-m', dest='threshold',
                            action='store',
                            default='any',
                            choices=['any','inout','inonly','outonly','sumonly'],
                            help='threshold behaviour: (any|inout|inonly|outonly|sumonly)')
    parser.add_argument('-l', dest='embedded_link',
                            action='store',
                            help='http link to append to output line 1')
    parser.add_argument('-a', dest='age_check',
                            action='store',
                            default=False,
                            help='data age threshold in seconds')
    parser.add_argument('-w', dest='warning',
                            action='store',
                            required=True,
                            help='warning threshold')
    parser.add_argument('-c', dest='critical',
                            action='store',
                            required=True,
                            help='critical threshold')
    parser.add_argument('-p', dest='period',
                            action='store',
                            default='10minutes',
                            help='time period: N{minutes|hours|days|months}, default 10minutes')
    parser.add_argument('-d', dest='rrd_units',
                            action='store',
                            choices=['b','B','K','M','G'],
                            default='B',
                            help='rrd data units (default Bytes/sec)')
    parser.add_argument('-u', dest='thresh_units',
                            action='store',
                            choices=['b','B','K','M','G'],
                            default='K',
                            help='threshold units (default Kilobytes/sec)')
    parser.add_argument('-v', dest='verbose',
                            action='store_true',
                            help='verbose output')

    # An empty command line never gets past this call: the required options
    # make argparse invoke error(), which exits with UNKNOWN.
    args = parser.parse_args()

    # How much verbosity?
    if args.verbose:
        rrdchk.verbose = True
        rrdchk.info = '\n'

    # Path to RRD file?
    if args.rrd_file:
        rrdchk.rrd_path = args.rrd_file
    elif args.rrd_env:
        try:
            rrdchk.rrd_path = os.environ[args.rrd_env]
        except KeyError:
            return bail('Error reading environment variable {0}'.format(args.rrd_env))
    if rrdchk.verbose:
        rrdchk.info += 'RRD file:{0}'.format(rrdchk.rrd_path)

    # Input and output units
    rrdchk.runits = args.rrd_units
    rrdchk.tunits = args.thresh_units

    # Warning and Critical supplied?
    threshold_error = 'Warning ({0}) and Critical ({1}) thresholds must be positive integers'.format(
            args.warning, args.critical)
    try:
        rrdchk.warning = int(args.warning)
        rrdchk.critical = int(args.critical)
    except (TypeError,ValueError):
        return bail(threshold_error)
    # int() happily parses "-5"; a negative rate threshold is meaningless
    if rrdchk.warning < 0 or rrdchk.critical < 0:
        return bail(threshold_error)

    # Threshold higher or lower?
    if args.direction:
        rrdchk.opt_eq = '<='
        if rrdchk.verbose:
            rrdchk.info += ', checking for LOW threshold'
    else:
        rrdchk.opt_eq = '>='

    # Reasonable time period? Each limit is one year expressed in that unit.
    # The pattern is anchored at the end so trailing junk is rejected rather
    # than being handed on inside the period string.
    longest = {'minutes': 525600, 'hours': 8760, 'days': 365, 'months': 12}
    period = re.match(r'([0-9]+)(minutes|hours|days|months)\Z', args.period)
    if not period:
        return bail('Invalid time period')
    if int(period.group(1)) > longest[period.group(2)]:
        return bail('Unreasonable time period')
    rrdchk.period = args.period

    # Mandatory thresholds?
    rrdchk.behaviour = args.threshold

    # Binary vs decimal multipliers for calculations?
    if args.binary:
        rrdchk.multipliers = CheckRRD.multi_bin
    else:
        rrdchk.multipliers = CheckRRD.multi_dec

    # Record age check required? (a value of 0 leaves the check disabled)
    if args.age_check:
        age_error = 'Data age threshold must be a positive integer, when present'
        try:
            stale_secs = int(args.age_check)
        except (TypeError,ValueError):
            return bail(age_error)
        if stale_secs < 0:
            return bail(age_error)
        rrdchk.stale_secs = stale_secs

    # Store embedded link if supplied
    if args.embedded_link:
        rrdchk.href = args.embedded_link
    else:
        rrdchk.href = ''

    return True

def fetch_data():
    '''Retrieve traffic samples for requested period.

    Reads the RRD header via rrdtool.info (stored in rrdchk.rrd_info),
    optionally rejects data older than rrdchk.stale_secs, then fetches the
    AVERAGE rows for the last rrdchk.period into rrdchk.dataset.

    Returns True on success; otherwise the result of bail(), i.e. False with
    the status set to UNKNOWN.
    '''

    def last_update_text():
        # Formatted lazily so 'last_update' is only read when it is reported
        return time.strftime('%Y-%m-%d %H:%M:%S',
                             time.localtime(rrdchk.rrd_info['last_update']))

    # First check data age
    try:
        rrdchk.rrd_info = rrdtool.info(rrdchk.rrd_path)
    except rrdtool.error as e:
        return bail('Error from RRDTOOL.info: {0}'.format(e))

    if rrdchk.stale_secs:
        if int(time.time()) - rrdchk.rrd_info['last_update'] > rrdchk.stale_secs:
            return bail('Data age check failed: latest dataslot ({0}) is more than {1} seconds old'.format(
                    last_update_text(),
                    rrdchk.stale_secs))

    # Then pull a dataset: ((start, end, step), (ds names...), [rows...])
    try:
        _time_info, ds_names, rows = rrdtool.fetch(rrdchk.rrd_path,
                                                   'AVERAGE',
                                                   '-s-{0}'.format(rrdchk.period))
    except rrdtool.error as e:
        return bail('Error from RRDTOOL.fetch: {0}'.format(e))

    # The rest of the script unpacks every row as an (in, out) pair, so any
    # other number of data sources is reported instead of raising ValueError.
    if len(ds_names) != 2:
        return bail('Expected 2 data sources in RRD file, found {0}'.format(len(ds_names)))
    rrdchk.dataset = rows

    if rrdchk.verbose:
        rrdchk.info += ', latest dataslot found: {0}'.format(last_update_text())

    return True

def normalise_data():
    '''Convert data to same units as thresholds.

    Averages the IN and OUT columns of rrdchk.dataset over the rows in which
    both values are present, counting the remaining rows in rrdchk.empty, then
    scales the averages from the RRD units into the threshold units.

    Sets rrdchk.in_average, rrdchk.out_average, rrdchk.in_norm and
    rrdchk.out_norm. Returns True on success, or the result of bail() (False,
    status UNKNOWN) when no row holds data.
    '''

    # Sum dataset, keep track of empty slots
    in_sum = out_sum = 0
    for in_slot, out_slot in rrdchk.dataset:
        if in_slot is None or out_slot is None:
            rrdchk.empty += 1
            continue
        in_sum += in_slot
        out_sum += out_slot

    total = len(rrdchk.dataset)
    if rrdchk.verbose:
        rrdchk.info += ', found {0} empty records out of {1}'.format(rrdchk.empty, total)

    # Check for empty dataset
    if rrdchk.empty == total:
        return bail('No records in the time period contained data')

    # Calculate averages; float() guards against integer division on Python 2
    valid = total - rrdchk.empty
    rrdchk.in_average = float(in_sum) / valid
    rrdchk.out_average = float(out_sum) / valid

    # Convert averages into threshold units (row = threshold unit, column = RRD unit)
    factor = rrdchk.multipliers[CheckRRD.units.index(rrdchk.tunits)][CheckRRD.units.index(rrdchk.runits)]
    rrdchk.in_norm = rrdchk.in_average * factor
    rrdchk.out_norm = rrdchk.out_average * factor

    return True

def check_threshold():
    '''Carry out threshold checking on normalised data.

    Grades the IN rate, the OUT rate and their sum against rrdchk.warning and
    rrdchk.critical using the comparison named by rrdchk.opt_eq ('<=' for a
    low check, otherwise '>='), then raises rrdchk.status according to
    rrdchk.behaviour. Always returns True.
    '''
    import operator

    # A real comparison function replaces eval() on a formatted string
    breached = operator.le if rrdchk.opt_eq == '<=' else operator.ge
    warning_code = CheckRRD.states.index('WARNING')
    critical_code = CheckRRD.states.index('CRITICAL')

    def grade(value):
        # Critical is tested last so it overrides warning when both are breached
        status = 0
        if breached(value, rrdchk.warning):
            status = warning_code
        if breached(value, rrdchk.critical):
            status = critical_code
        return status

    # Calculate all possible statuses
    in_status = grade(rrdchk.in_norm)
    out_status = grade(rrdchk.out_norm)
    sum_status = grade(rrdchk.out_norm + rrdchk.in_norm)

    # Now determine which will contribute to Nagios output

    # ANY - threshold triggered by either threshold
    # INONLY - threshold only triggered if IN thresholds, OUT not checked
    # OUTONLY - threshold only triggered if OUT thresholds, IN not checked
    # INOUT - threshold only triggered if both IN and OUT threshold
    # SUMONLY - threshold only triggered if the sum of IN and OUT thresholds

    # Check IN, ignore OUT
    if rrdchk.behaviour == 'inonly':
        if rrdchk.verbose:
            rrdchk.info += ', IN threshold used, OUT ignored'
        rrdchk.status = max(rrdchk.status, in_status)
    # Check OUT, ignore IN
    elif rrdchk.behaviour == 'outonly':
        if rrdchk.verbose:
            rrdchk.info += ', OUT threshold used, IN ignored'
        rrdchk.status = max(rrdchk.status, out_status)
    # Check IN AND OUT
    elif rrdchk.behaviour == 'inout':
        if rrdchk.verbose:
            rrdchk.info += ', Both IN and OUT thresholds used'
        # Both directions must breach, so the milder of the two is reported
        if out_status > rrdchk.status and in_status > rrdchk.status:
            rrdchk.status = min(in_status, out_status)
    # Check the sum of IN and OUT
    elif rrdchk.behaviour == 'sumonly':
        if rrdchk.verbose:
            rrdchk.info += ', Sum of IN and OUT thresholds used'
        rrdchk.status = max(rrdchk.status, sum_status)
    # default either/or case last
    else:
        if rrdchk.verbose:
            rrdchk.info += ', Either IN or OUT thresholds used'
        rrdchk.status = max(rrdchk.status, in_status, out_status)

    return True

def bail(msg):
    '''Record a processing error: UNKNOWN output line and UNKNOWN status.

    Always returns False so callers can write ``return bail(...)``.
    '''
    first_line = 'UNKNOWN - ' + msg
    unknown_code = CheckRRD.states.index('UNKNOWN')
    rrdchk.output = first_line
    rrdchk.status = unknown_code
    return False

def build_output():
    '''Prepare Nagios-Plugin standard output.

    Builds line 1 in rrdchk.output from the final status, the period, the
    normalised IN/OUT averages (4 decimal places), the unit label and the
    optional embedded link. Always returns True.
    '''
    # 'b' and 'B' are already complete unit names (bps, Bps); the larger units
    # are byte multiples and take a trailing 'B' (KBps, MBps, GBps).
    if rrdchk.tunits in ('b', 'B'):
        byte_suffix = ''
    else:
        byte_suffix = 'B'

    rrdchk.output = '{0} - Average BW ({1}) in: {2:.4f}{4}{5}ps, out: {3:.4f}{4}{5}ps {6}'.format(
            CheckRRD.states[rrdchk.status],
            rrdchk.period,
            rrdchk.in_norm,
            rrdchk.out_norm,
            rrdchk.tunits,
            byte_suffix,
            rrdchk.href)

    return True

################################
# MAIN starts here
################################
# Shared state object read and written by every function in this script
rrdchk = CheckRRD()

# Run the stages in order; all() stops at the first stage that returns False,
# in which case the output and status set by bail() are reported as they are.
if all(check() for check in (parse_args, fetch_data, normalise_data, check_threshold)):
    build_output()
    rrdchk.output += rrdchk.info

# Single-argument print(...) behaves the same on Python 2 and Python 3
print(rrdchk.output)
sys.exit(rrdchk.status)