# This rules file looks at automountd mount failure error messages
# and reports when the same error occurs on the same host at least
# twice in 60 seconds, or when the same server host causes an error to
# be generated on at least three unique hosts in 5 minutes.

## standard preamble

# Drop every event that does not contain the substring "automountd".
type= single
continue= dontcont
ptype= nsubstr
pattern= automountd
desc= Eliminate events that aren't processed by this ruleset.
action= none

# If the EVENT_PROCESSED context already exists, stop processing here.
type= single
continue= dontcont
ptype= tvalue
pattern= TRUE
desc= See if other ruleset has processed this event
action= none
context= [EVENT_PROCESSED]

# Otherwise claim the event by creating EVENT_PROCESSED and carry on
# with the rules below.
type= single
continue= takenext
ptype= tvalue
pattern= TRUE
desc= Stop other rulesets from processing this event
action= create EVENT_PROCESSED

####
#
# This rule looks for two events from a single client system
# in one minute (thresh= 2, window= 60). It then suppresses following
# reports for 30 minutes (the 1800 second lifetime of
# suppress_automountd_mount_failure_$1_$2).
#
# Example input:
# May 29 13:14:12 u12.example.com automountd[186]: \
# exem:/disk/sd0g/sources/conserver-8.3: No such file or directory
#

# When the threshold is reached: store the triggering line, mail the
# accumulated context to admin, and start the suppression context.
# The subject must be passed to mailx with -s; without it the quoted
# text is taken as a recipient address.
type= singlewiththreshold
desc= automount failure to mount $2 on $1
continue= takenext
ptype= regexp
pattern= ([\w._-]+) automountd\[[0-9]+\]: (.*): No such file or directory
action= add automountd_mount_failure_$1_$2 $0; \
        report automountd_mount_failure_$1_$2 \
          /usr/bin/mailx -s "Automount failure on $1" admin ;\
        create suppress_automountd_mount_failure_$1_$2 1800
context= ! suppress_automountd_mount_failure_$1_$2
thresh= 2
window= 60

# accumulate the events into a context for the threshold rule to
# report. Reset the accumulating context's (automountd_mount_failure_$1_$2)
# lifetime to just over the detection window (61 seconds). This also
# means that the context will clear itself, reducing memory requirements
# when it is no longer needed to provide a trail of errors.
type= single
continue= takenext
ptype= regexp
pattern= ([\w._-]+) automountd\[[0-9]+\]: (.*): No such file or directory
context= ! suppress_automountd_mount_failure_$1_$2
desc= accumulate automount events for host $1 mount $2
action= add automountd_mount_failure_$1_$2 $0 ;\
        set automountd_mount_failure_$1_$2 61

#####
#
# This rule looks for problems with the server system.
# It gathers the events looking for multiple hosts reporting
# a problem with the same filesystem. It reports once 3 such events
# occur in 5 minutes (thresh= 3, window= 300), then suppresses
# reporting for 30 minutes (the 1800 second lifetime of
# suppress_automountd_server_failures_$2:$3). Each client host is
# counted only once because of the context
# server_$2_client_$1_mount_failure_$3
#
# Example input:
# May 29 13:14:12 u12.example.com automountd[186]: \
# exem:/disk/sd0g/sources/conserver-8.3: No such file or directory
#

# use of context guarantees only unique client hosts are counted
type= singlewiththreshold
continue= takenext
ptype= regexp
pattern= ([\w._-]+) automountd\[[0-9]+\]: ([^:]+):(.+): No such file or directory
context= ! server_$2_client_$1_mount_failure_$3 && \
         ! suppress_automountd_server_failures_$2:$3
desc= nfs mount/server failure on $2 mounting $3
action= add server_$2_mount_failure_$3 $0; \
        report server_$2_mount_failure_$3 \
          /usr/bin/mailx -s "mount_failure server $2" admin ;\
        create suppress_automountd_server_failures_$2:$3 1800
thresh= 3
window= 300

# record the event and create the mount failure contexts
# The expire time is set to 5 minutes which allows the following
# report:
#   A  host 1 error foo:/opt (0 minutes)
#   B* host 2 error foo:/opt (4 minutes)
#   C  host 1 error foo:/opt (4.5 minutes) (Ignored by threshold
#                                         command, but accumulated)
#   D* host 1 error foo:/opt (6 minutes)
#   E* host 3 error foo:/opt (8 minutes)
# of 5 items when item E occurs because there have been 3 unique hosts
# reporting in 5 minutes (B, D, E). Item C is ignored because it is
# from the same host as item A, and can't complete the three unique
# event criteria.
# NOTE(review): the record rule runs "create" on the per-client context
# for every unsuppressed event, including C. If that restarts the
# 300 second lifetime, D (6 minutes) would also be skipped by the
# threshold rule -- confirm against the sec "create" action semantics.
type= single
ptype= regexp
pattern= ([\w._-]+) automountd\[[0-9]+\]: ([^:]+):(.+): No such file or directory
context= ! suppress_automountd_server_failures_$2:$3
desc= record mount failure server $2 directory $3
action= add server_$2_mount_failure_$3 $0; \
        set server_$2_mount_failure_$3 300; \
        create server_$2_client_$1_mount_failure_$3 300

# While suppress_automountd_server_failures_$2:$3 exists the record
# rule above does not match, so a rule is needed here to consume the
# event. Otherwise the event would be passed on to the default ruleset
# for reporting.
type= suppress
ptype= regexp
pattern= ([\w._-]+) automountd\[[0-9]+\]: ([^:]+):(.+): No such file or directory
context= suppress_automountd_server_failures_$2:$3
desc= Capture suppressed errors to prevent default rules from triggering.

# If we get here, we must not have handled the event, so remove the
# EVENT_PROCESSED marker that the preamble created.
type= single
continue= dontcont
ptype= tvalue
pattern= TRUE
desc= unset EVENT_PROCESSED
action= delete EVENT_PROCESSED

# publish