# Example rule to send an event through to Nagios
#
# Send any PingCheck through. Strip the first four characters (the prefix)
# into $1 and send $2 (the nagios passive command) to the %nagiosCmd pipe.
#
# NOTE: SEC ends a rule definition at the first blank or comment line, so
# in-rule remarks in this file use the "rem" keyword rather than "#".
#
type = single
desc = submit pingcheck
ptype = regexp
pattern = ^(... )(.*;PingCheck;.*)$
action = write %nagiosCmd $2

# reschedule service checks.
#
# Service checks can bunch up. If this happens when using check_by_ssh
# ssh fails in a particular way, so reschedule the ssh check some
# random number of seconds in the future to respread the checks.
#
# The eval stores in %d the current time (%u) plus int(rand(20))+23, i.e.
# 23 to 42 whole seconds ahead. The resched_<host>_in_progress context
# lives for 60 seconds and, through the "context" line, suppresses any
# further reschedule for the same host while it exists.
#
type = single
desc = reschedule ssh exchange failure some random 23-43 seconds in future
rem = Have only one pending reschedule per host in any minute.
ptype = regexp
pattern = PROCESS_SERVICE_CHECK_RESULT\;([^;]+)\;([^;]+)\;[0123]\;Remote command execution failed: ssh_exchange_identification: Connection closed by remote host
rem = $1 = host name $2 = service description
context = ! resched_$1_in_progress
action = eval %d (int(rand(20))+23+%u;); create resched_$1_in_progress 60 ; \
         write %nagiosCmd ([%u] SCHEDULE_FORCED_SVC_CHECK;$1;$2;%d)

#
# Require 4 consecutive ok messages in a row before changing the
# status of the service. Note the ok component of this service
# must be mode three and other states can be in mode 2 or 3.
#
# used for service rtables_split_check
#
# This first rule only watches for non-ok results (status 1, 2 or 3) and
# resets the counter kept by the rule that follows it ("reset +1"). The
# string given to reset must stay identical to that rule's desc.
#
type = single
rem = use takenext to have the failure asserted by the default rule
continue = takenext
desc = detect non-ok state for rtables_split_check on host $2
ptype = regexp
rem = look for mode 3 active events.
pattern = ^.3a (\[[0-9]*\] PROCESS_SERVICE_CHECK_RESULT\;([^;]+)\;rtables_split_check\;[123]\;.*)
rem = reset the SingleWithThreshold if a non-ok check occurs
action = reset +1 require 4 consecutive ok's for rtables_split_check on host $2

# the event that matches this rule has already been sent to the nagios
# core maintaining the current error status (the digit at the beginning
# of the string). So we just count to 4 before we send the clear.
type = SingleWithThreshold
desc = require 4 consecutive ok's for rtables_split_check on host $2
ptype = regexp
rem = match only when current service state is not ok
pattern = ^[123]3a (\[[0-9]*\] PROCESS_SERVICE_CHECK_RESULT\;([^;]+)\;rtables_split_check\;0\;.*)
action = write %nagiosCmd $1
rem = window = 3 minute interval/cycle * 4 cycles * 60 sec/minute + 30 sec process time
window = 750
thresh = 4

# Capture the mode 3 event CronDaemonCheck on host concord
# If 2 cron processes are running submit the result as ok
# between 6PM and 2AM. Normally 2 processes is a warning.
#
# The rewrite only happens while the backups_running context exists and
# the process count captured in $3 equals 2; the status field is changed
# from 1 to 0 and the text is prefixed with "[backups running]".
#
# NOTE(review): the calendar rules below create backups_running during
# hours 0-6 and delete it during hours 7-23, which does not match the
# "6PM to 2AM" wording here and in the desc - confirm the intended window.
#
type = single
desc = concord can have 2 cron processes between 6PM and 2 am for backups
ptype = regexp
pattern = ^... (\[[0-9]+\] PROCESS_SERVICE_CHECK_RESULT\;concord\;CronDaemonCheck);1;(PROCS WARNING: ([0-9]+) processes with command name .crond.*)
context = backups_running && =($3 == 2)
action = write %nagiosCmd ($1;0;[backups running] $2)

# see section 5.3.4 in
# http://sixshooter.v6.thrupoint.net/SEC-examples/article-part2.html
# to understand the reason for the calendar spec.
#
# Both calendar rules fire on every minute of their hour range; the
# context test makes each one act only when the context is in the
# opposite state.
#
type = calendar
time = * 0-6 * * *
desc = start backups_running context
context = [! backups_running]
action = create backups_running

type = calendar
time = * 7-23 * * *
desc = stop backups_running context
context = [backups_running]
action = delete backups_running

# replace useless event output which is a line of @ signs with a
# useful error message.
#
# $2 is the passive command up to the last ";" before the failure text,
# $3 is whatever follows the run of @ characters.
#
type = single
desc = shorten and diagnose ssh failure with row of @@@@@@@@
ptype = regexp
pattern = ^(... )(.*);Remote command execution failed: \@+(.*)$
action = write %nagiosCmd ($2;ssh key failure, likely remote host identification has changed. $3.)

# This event is a bit long (100 periods) so we shorten it.
#
# The run of periods after the quote is replaced by the literal text
# written in the action; $2 and $3 are the text before and after it.
#
type = single
desc = shorten ReadProcCheck text
ptype = regexp
pattern = ^(... )(.*)'\.+(.*)$
action = write %nagiosCmd ($2 \.*$3)