# This is valid Tcl code that is loaded into a running tkwatcher.
# tkwatcher just sources the code right now.  Ultimately mark's parser
# should do some of the job.
#
# See the man page for a description of this file.
#
# I run tkwatcher with this file once an hour.
#
# GENERAL TESTS -- look for HOST SPECIFIC later in the file.

# Run vmstat and look for any change in swap-ins, swap-outs, or
# pages swapped in/out.  Each pattern captures one counter; each
# test fires on any change from the previous run.
set watch(1,memory) {"vmstat -s" "virtual memory stats"
    {
	{ {swap ins} {swap_ins 0 %d} }
	{ {swap outs} {swap_outs 0 %d} }
	{ {pages swapped\ in} {pg_swap_in 0 %d} }
	{ {pages swapped\ out} {pg_swap_out 0 %d} }
    }
    {
	{swap_ins change}
	{swap_outs change}
	{pg_swap_in change}
	{pg_swap_out change}
    }
}

# Look for available swap dropping by 50%, less than 1000, or less
# than 10% free (the calc test derives a free-percentage value).
set watch(2,memory) {"swap -s" "swap space"
    {
	{"^total" {allocated 1 %d} {reserved 5 %d} {used 8 %d} {available 10 %d}}
    }
    {
	{available pdelta_down 50%}
	{available range 1000 10000000}
	{available_% calc 100.0*(@available@)/(@available@+@used@) range 10 100}
    }
}
# {available delta 30000 10000000}

# Look at nfs server stats for an increase of more than 5,
# any change in null_received, getattr% outside of the range of 0-30
# and increasing by more than 5, and
# symlink% outside the range of 0-10 and increasing by more than 5.
#
# BUGFIX(review): the badcalls variable was captured as
# "badcalls_rcp_server" (rcp/rpc transposed) but tested as
# "badcalls_rpc_server", so the delta_up test could never match.
set watch(3,nfs,network) { "nfsstat -s" "nfs server stats"
    {
	{=3 {header 0-end %H}}
	{=4 {calls_rpc_server 0 %d} {badcalls_rpc_server 1 %d}
	    {null_received 2 %d} {badlength 3 %d} {xdrcall 4 %d} }
	{=9 {header 0-end %H}}
	{=10 {null% 1 %d} {getattr% 3 %d} }
	{=12 {symlink% 13 %d}}
    }
    {
	{badcalls_rpc_server delta_up 5 }
	{null_received delta_up 0}
	{getattr% range 0 30 delta_up 5}
	{symlink% range 0 10 delta_up 5}
    }
}

# The timeout_badxid stuff is tricky.  Only if the ratio between
# badxid and timeouts is 2 or less do we want to report.  So we split
# the range into 0.5 to 1 and 1 to 2 depending on which variable is bigger.
# This displays the use of the iftrue/iffalse tests.
# nfs client stats; skipped on host box7 (!box7 tag).  Captures the
# client RPC counters and derives retransmit/badxid ratios.
set watch(4,nfs,network,!box7) { "nfsstat -c" "nfs client stats"
    {
	{=3 {header 0-end %H}}
	{=4 {calls_rpc_server 0 %d} {badcalls_rpc_server 1 %d} {retransmits 2 %d}
	    {badxid 3 %d} {timeout 4 %d} {timers_expire 7 %d} }
	{=10 {getattr% 3 %d} {readlink% 12 %d} }
    }
    {
	{badcalls_rpc_server delta_up 10}
	{retrans_% calc 100.0*@retransmits@/@calls_rpc_server@ range 0 5 delta_up 5}
	{retrans_badxid calc 100*(@retransmits@-@badxid@)/@retransmits@ range 50 100 delta_down 0 }
	{timeout_badxid calc @timeout@>@badxid@ iftrue calc (@timeout@*1.0)/@badxid@ range 2 10000 }
	{timeout_badxid_1 calc @timeout@<@badxid@ iftrue calc (@timeout@*1.0)/@badxid@ range 0 0.5 }
    }
}

# Look for disk usage output from iostat.  Use the stats from the last
# line to allow noise from startup of the program to stabilize.
# Disk usage should be in the 0-40 range, and report only if
# 5 reports are out of range in the last 10 measurements.
#
# BUGFIX(review): this watch is disabled, but only the "set" line was
# commented out; when the file is sourced, the remaining lines would be
# parsed as bogus Tcl commands and raise errors.  Every line of the
# disabled block is now commented.
#set watch(5,disk) {"iostat -D 1 5" "disk usage"
#    {
#	{=1 {header 0-end %H}}
#	{=2 {header 0-end %h}}
#	{=7 {disk1_usage 2 %f} {disk2_usage 5 %f} {disk3_usage 8 %f} {disk4_usage 11 %f} }
#    }
#    {
#	{disk1_usage range 0 40 cycle 5 10}
#	{disk2_usage range 0 40 cycle 5 10}
#	{disk3_usage range 0 40 cycle 5 10}
#	{disk4_usage range 0 40 cycle 5 10}
#    }
#}

# Look at disk space.  /mnt filesystems are ignored; /var gets its own
# tighter thresholds; swap gets its own as well.
set watch(6,disk) {"df -k" "disk space"
    {
	{"=1" {header 0-end %H}}
	{"^/mnt" {ignore 0 %s} }
	{"^/.*/var" {filesystem 0 %k} {varcapacity 4 %d}}
	{"^/" {filesystem 0 %k} {capacity 4 %d}}
	{"^swap" {swapfile 0 %k} {swapcapacity 4 %d}}
    }
    {
	{capacity delta_up 10 range 0 80}
	{capacity delta_up 30 range 0 60 severity alert}
	{capacity range 0 99 severity emerg orgroup a}
	{capacity range 0 98 delta_up 0 severity alert orgroup a}
	{capacity range 0 90 delta_up 0 orgroup a}
	{varcapacity range 0 75 change}
	{varcapacity delta_up 10 range 0 50 severity warning}
	{swapcapacity range 0 80}
    }
}

# Look at inode availability.
set watch(7,disk) {"/usr/ucb/df -i" "inodes"
    {
	{"=1" {header 0-end %H}}
	{"^/" {filesystem 0 %k} {%used 3 %d} {num_free 2 %d}}
    }
    {
	{%used range 0 85}
	{num_free range 1000 10000000}
    }
}

# Look at kernel buffer pool failures.
set watch(8,memory) { "echo kmastat | crash" "memory stats"
    {
	{ =1 {header 0-end %H}}
	{ =2 {header 0-end %h}}
	{^small {size 0 %k} {pools 1 %d} {bytes_in_pool 2 %d} {bytes_allocated 3 %d} {failures 4 %d}}
	{^outsize {size 0 %k} {bytes_allocated 3 %d} {failures 4 %d}}
	{^large {size 0 %k} {pools 1 %d} {bytes_in_pool 2 %d} {bytes_allocated 3 %d} {failures 4 %d}}
    }
    {
	{failures change}
    }
}

# Look at all sorts of interface stats: errors, collisions,
# collisions as a percentage of output packets, queue lengths.
# The loopback interface (lo0) is matched first so "all" skips it.
set watch(9,network) { "netstat -i" "Interface stats"
    {
	{ =1 {header 0-end %H} }
	{"lo0" {interface 0 %k} }
	{all {interface 0 %k} {Packets_in 4 %d} {Errors_in 5 %d}
	    {Packets_out 6 %d} {Errors_out 7 %d} {Collisions 8 %d} {Queue_length 9 %d} }
    }
    {
	{ collision_perc_of_output calc "@Collisions@*100.0/@Packets_out@" range 0 20 delta_up 1}
	{ Collisions delta_up 1000000 }
	{ Errors_in delta_up 5 }
	{ Errors_out delta_up 10 }
	{ Errors_perc_of_output calc "@Errors_out@*100.0/@Packets_out@" range 0 1 delta_up 1}
	{ Queue_length change }
    }
}

# Look at netstat info for network errors.  The sed inserts a space
# after "=" so the counter value lands in its own field.
set watch(10,network) { {sh -c "netstat -s | sed 's/=/= /'"} "Netstat errors"
    {
	{ "udpInCksumErrs" {udp_checksum_errs 5 %d} {udp_in_ovrflow 5 %d} }
	{ "ipInCksumErrs" {ip_checksum_errs 5 %d} }
	{ "icmpInCksumErrs" {icmp_checksum_errs 2 %d} }
	{ "udpInErrors" {udp_in_errors 5 %d} }
	{ "tcpInErrs" {tcp_in_errors 2 %d} }
	{ "rawipInOverflows" {rawip_in_ovrflow 2 %d} }
    }
    {
	{udp_in_errors delta_up 1 }
	{tcp_in_errors delta_up 1}
	{udp_checksum_errs delta_up 1 }
	{ip_checksum_errs delta_up 1 }
	{icmp_checksum_errs delta_up 1}
	{udp_in_ovrflow delta_up 1}
	{rawip_in_ovrflow delta_up 1}
    }
}

# Look for runaway processes.  These will be at the top of the
# top output and will have racked up more than 55 minutes
# of time in the past hour.
# xlocks often get a lot of time when the processor is idle,
# so we exclude them from the top report.  Lines 8-16 of top output
# are the per-process lines; each is keyed by pid.
set watch(11,process) {"/usr/local/bin/top -b 10" "Process time"
    {
	{ =7 {header 0-end %H}}
	{ "xlock$" {ignore 0 %s} }
	{ =8 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =9 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =10 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =11 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =12 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =13 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =14 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =15 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =16 {proc_pid 0 %k} {proc_time 7 %d} }
    }
    {
	{proc_time delta_up 55}
    }
}

# Report print jobs that are stalled (job id unchanged between runs).
# BUGFIX(review): the "=4" line captured into jobid3 twice (copy-paste
# error), so the {jobid4 nochange suppress 5} test below referenced a
# variable that was never defined.  It now captures jobid4.
set watch(12,print) { "lpstat" "stalled print jobs"
    {
	{"=1" {jobid1 0 %s} {jobid1key 0 %k}}
	{"=2" {jobid2 0 %s} {jobid2key 0 %k}}
	{"=3" {jobid3 0 %s} {jobid3key 0 %k}}
	{"=4" {jobid4 0 %s} {jobid4key 0 %k}}
    }
    {
	{jobid1 nochange suppress 5}
	{jobid2 nochange suppress 5}
	{jobid3 nochange suppress 5}
	{jobid4 nochange suppress 5}
    }
}

# Check that the lp scheduler is running and the default printer is sp2.
set watch(13,print) { "lpstat -s" "lp subsystem"
    {
	{ =1 {scheduler_status 2 %s} }
	{ =2 {system_default 3 %s} }
    }
    {
	{scheduler_status value running}
	{system_default value sp2}
    }
}

# Make sure the standard system daemons are all running.
set watch(14,daemons) { "ps -ef" "Daemon status"
    {
	{ =1 {header 0-end %s} }
	{ {root .*amd} {process_amd 7 %s}}
	{ {root .*/usr/sbin/cron} {process_cron 7 %s}}
	{ {root .*inetd -s} {process_inetd 7 %s}}
	{ {root .*lockd} {process_lockd 7 %s}}
	{ {root .*lpNet} {process_lpNet 7 %s}}
	{ {root .*lpsched} {process_lpsched 7 %s}}
	{ {root .*mountd} {process_mountd 7 %s}}
	{ {root .*nfsd} {process_nfsd 7 %s}}
	{ {root .*rpcbind} {process_rpcbind 7 %s}}
	{ {root .*statd} {process_statd 7 %s}}
	{ {root .*syslogd} {process_syslogd 7 %s}}
	{ {root .*utmpd} {process_utmpd 7 %s}}
    }
    {
	{ process_amd required }
	{ process_cron required }
	{ process_inetd required }
	{ process_lockd required }
	{ process_lpNet required }
	{ process_lpsched required }
	{ process_mountd required }
	{ process_nfsd required }
	{ process_rpcbind required }
	{ process_statd required }
	{ process_syslogd required }
	{ process_utmpd required }
    }
}

## HOST SPECIFIC

# Report if the ncd font server cannot be contacted, or if the
# reported server name changes.
set watch(xs1.1,fsinfo) { {sh -c "/tools/X11R6/bin/fsinfo -server xs1:7000 2>&1"} "ncdfs server"
    {
	{{unable\ to\ open\ server} {servername 5 %s}}
    }
    {
	{servername value "error"}
	{servername change}
    }
}

# Watch throughput to nj-ops-1; alarm on absolute floor or a sharp drop.
set watch(box7.1,speed) { {sh -c "PATH=/tools/scotty/bin /tools/scotty/bin/tcpspeed nj-ops-1"} "tcpspeed to nj"
    {
	{all { rate 0 %f } }
    }
    {
	{rate range 0.300 10000}
	{rate delta_down 0.200}
    }
}

# The output should always be at least 1 since the grep will match its
# own command line, however this has been returning 0 for some reason.
# So run it under sh and explicitly exit with 0 to suppress the "child
# process terminated abnormally" message.
set watch(box7.2,xs1.2,tftp) { {sh -c "ps -eadf | grep in.tftp | wc -l; exit 0"} "Tftp process count"
    {
	{=1 {count 0 %d} }
    }
    {
	{count range 0 5}
    }
}

# ntp sanity checks: the "*" peer is the host we are synced to.
set watch(box7.3,ntp) { "/tools/xntp/bin/ntpq -c peers" "ntp time check"
    {
	{ =1 {header 0-end %H}}
	{ =2 {ignore 0-1 %s}}
	{ {147.249.240.255} {ignore 0 %s}}
	{ {\*} {mhost 0 %k} {refid 1 %s} {reachability 6 %d} {offset 8 %d}}
	{ all {host 0 %k} {refid 1 %s} {reachability 6 %d} {offset 8 %d}}
    }
    {
	{mhost required severity warning}
	{reachability cycle 4 4 suppress 1 nochange range 374 377}
	{offset range -20 20 cycle 3 5}
	{offset range -50 50}
	{refid novalue 0.0.0.0}
	{mhost last}
	{refid disable change}
    }
}

# Make sure all the expected ntp peers are still listed.
set watch(box7.4,ntp) { "/tools/xntp/bin/ntpq -n -c peers" "ntp hosts check"
    {
	{ {192.168.240.255} {broadcast 0 %s}}
	{ {127.127.1.1} {local 0 %s}}
	{ {158.121.104.4} {timeserver_cs 0 %s}}
	{ {192.168.240.71} {nfs1 0 %s}}
	{ {192.168.240.72} {nfs2 0 %s}}
	{ {192.168.240.2} {cisco2 0 %s}}
	{ {192.168.240.180} {ts1 0 %s}}
    }
    {
	{ broadcast required severity alert }
	{ local required severity alert }
	{ timeserver_cs required severity alert }
	{ nfs1 required severity alert }
	{ nfs2 required severity alert }
	{ cisco2 required severity alert }
	{ ts1 required severity alert }
    }
}

# The qplus license has expired so eliminate it from consideration.
# This also checks for a given license being out for more than three
# hours; then it is suppressed for 12 hours so checkouts over weekends
# etc generate less noise.
#
# Note that a "cycle 3 3" and a "suppress 12" actually waits for 15 runs
# before reporting, since the suppress delivers 12 failed runs in a row,
# and thus resets the cycle to be empty, and it takes the cycle three
# more positive tests before it will allow a report.
#
set watch(box7.5,lmgrd) { {sh -c "LM_LICENSE_FILE=/etc/license.dat /tools/lmgrd/bin/lmstat -A 2>&1"} "lmgrd license check"
    {
	{ {IST\ \(v3\.x\):} {status 2 %s} {Istatus 2 %s}}
	{ {suntechd\ \(v3\.x\):} {status 2 %s} {sstatus 2 %s}}
	{ {qplus\ :} {ignore 0 %s}}
	{ {LOTUS123\ \(v3\.x\):} {status 2 %s} {Lstatus 2 %s}}
	{ {iddis\.com\ /dev/} {key 0-end %k} {user 0 %s}}
    }
    {
	{status value UP}
	{user nochange cycle 3 3 suppress 12}
	{Istatus required}
	{sstatus required severity alert}
	{Lstatus required}
    }
}

# Daemons specific to box7.
# BUGFIX(review): the array index was "box7.6.daemons" — a "." where
# every other entry uses "," to separate the host id from the category
# tag (cf. box7.5,lmgrd).  With the dot, the "daemons" category tag
# would not be recognized.
set watch(box7.6,daemons) { "ps -ef" "Daemon status"
    {
	{ =1 {header 0-end %s} }
	{ {root .*arpwatch} {process_arpwatch 7 %s}}
	{ {root .*logtrapd} {process_logtrapd 10 %s}}
	{ {root .*lmgrd} {process_lmgrd 7 %s}}
	{ {root .*xntpd} {process_xntpd 7 %s}}
    }
    {
	{ process_arpwatch required }
	{ process_logtrapd required }
	{ process_lmgrd required }
	{ process_xntpd required }
    }
}

# Make sure that our hosts are still listed in the windows dns server,
# and that things haven't failed again.  Really wish the IT folks would
# monitor this stuff.
# Verify sol is resolvable through the first windows dns server.
set watch(dns5) {"dig @192.20.3.7 sol.site.example.com" "sol in dns server 1"
    {
	{"^sol.site.example.com" {solname 1 %s} }
    }
    {
	{solname required}
    }
}

# Same check against the second dns server.
set watch(dns6) {"dig @192.20.3.6 sol.site.example.com" "sol in dns server 2"
    {
	{"^sol.site.example.com" {solname 1 %s} }
    }
    {
	{solname required}
    }
}

# A simple rule that calls a function: aval is exported as a global,
# and bval is handed to the Tcl proc "test" along with aval.
set watch(test) { "echo 3 2" "tclproc test"
    {
	{=1 {aval 0 %s} {bval 1 %s}}
    }
    {
	{ aval global}
	{ bval tcl test {@aval@ @bval@} change}
    }
}

# The function called by the rule above.  Returns 1 (and stores the
# report text in the caller's "result" variable via uplevel) when the
# sum of the two values is non-zero; returns 0 otherwise.
proc test {new {old 0}} {
    puts "in test $new $old\n"
    # Braced condition instead of the original unbraced [expr $new + $old]:
    # identical numeric behavior, but safe against substitution surprises
    # and byte-compilable.
    if {$new + $old} {
	uplevel 1 {set result "error output"}
	return 1
    }
    return 0
}

# A very simple hook function run by tkwatcher; delegates to atest.
proc Watcher_proc {} {
    atest
}

# Another simple function called by a function; currently a no-op.
proc atest {} {
    #puts test
}