# This is valid Tcl code that is loaded into a running tkwatcher.
# tkwatcher just sources the code right now.  Ultimately mark's parser
# should do some of the job.
#
# See the man page for a description of this file.
#
# I run tkwatcher with this file once an hour.
#
# GENERAL TESTS -- look for HOST SPECIFIC later in the file.

# Run vmstat and look for any change in swap-ins, swap-outs, or
# pages swapped in/out.  Each pattern captures one counter; each
# test fires on any change from the previous run.
set watch(1,memory) {"vmstat -s" "virtual memory stats"
    {
	{ {swap ins} {swap_ins 0 %d} }
	{ {swap outs} {swap_outs 0 %d} }
	{ {pages swapped\ in} {pg_swap_in 0 %d} }
	{ {pages swapped\ out} {pg_swap_out 0 %d} }
    }
    {
	{swap_ins change}
	{swap_outs change}
	{pg_swap_in change}
	{pg_swap_out change}
    }
}

# Look for available swap dropping by 50%, less than 1000, or less
# than 10% free (the calc test derives a free-percentage value).
set watch(2,memory) {"swap -s" "swap space"
    {
	{"^total" {allocated 1 %d} {reserved 5 %d} {used 8 %d} {available 10 %d}}
    }
    {
	{available pdelta_down 50%}
	{available range 1000 10000000}
	{available_% calc 100.0*(@available@)/(@available@+@used@) range 10 100}
    }
}
# {available delta 30000 10000000}

# Look at nfs server stats for an increase of more than 5,
# any change in null_received, getattr% outside of the range of 0-30
# and increasing by more than 5, and
# symlink% outside the range of 0-10 and increasing by more than 5.
#
# BUGFIX(review): the badcalls variable was captured as
# "badcalls_rcp_server" (rcp/rpc transposed) but tested as
# "badcalls_rpc_server", so the delta_up test could never match.
set watch(3,nfs,network) { "nfsstat -s" "nfs server stats"
    {
	{=3 {header 0-end %H}}
	{=4 {calls_rpc_server 0 %d} {badcalls_rpc_server 1 %d}
	    {null_received 2 %d} {badlength 3 %d} {xdrcall 4 %d} }
	{=9 {header 0-end %H}}
	{=10 {null% 1 %d} {getattr% 3 %d} }
	{=12 {symlink% 13 %d}}
    }
    {
	{badcalls_rpc_server delta_up 5 }
	{null_received delta_up 0}
	{getattr% range 0 30 delta_up 5}
	{symlink% range 0 10 delta_up 5}
    }
}

# The timeout_badxid stuff is tricky.  Only if the ratio between
# badxid and timeouts is 2 or less do we want to report.  So we split
# the range into 0.5 to 1 and 1 to 2 depending on which variable is bigger.
# This displays the use of the iftrue/iffalse tests.
# nfs client stats; skipped on host box7 (!box7 tag).  Captures the
# client RPC counters and derives retransmit/badxid ratios.
set watch(4,nfs,network,!box7) { "nfsstat -c" "nfs client stats"
    {
	{=3 {header 0-end %H}}
	{=4 {calls_rpc_server 0 %d} {badcalls_rpc_server 1 %d} {retransmits 2 %d}
	    {badxid 3 %d} {timeout 4 %d} {timers_expire 7 %d} }
	{=10 {getattr% 3 %d} {readlink% 12 %d} }
    }
    {
	{badcalls_rpc_server delta_up 10}
	{retrans_% calc 100.0*@retransmits@/@calls_rpc_server@ range 0 5 delta_up 5}
	{retrans_badxid calc 100*(@retransmits@-@badxid@)/@retransmits@ range 50 100 delta_down 0 }
	{timeout_badxid calc @timeout@>@badxid@ iftrue calc (@timeout@*1.0)/@badxid@ range 2 10000 }
	{timeout_badxid_1 calc @timeout@<@badxid@ iftrue calc (@timeout@*1.0)/@badxid@ range 0 0.5 }
    }
}

# Look for disk usage output from iostat.  Use the stats from the last
# line to allow noise from startup of the program to stabilize.
# Disk usage should be in the 0-40 range, and report only if
# 5 reports are out of range in the last 10 measurements.
#
# BUGFIX(review): this watch is disabled, but only the "set" line was
# commented out; when the file is sourced, the remaining lines would be
# parsed as bogus Tcl commands and raise errors.  Every line of the
# disabled block is now commented.
#set watch(5,disk) {"iostat -D 1 5" "disk usage"
#    {
#	{=1 {header 0-end %H}}
#	{=2 {header 0-end %h}}
#	{=7 {disk1_usage 2 %f} {disk2_usage 5 %f} {disk3_usage 8 %f} {disk4_usage 11 %f} }
#    }
#    {
#	{disk1_usage range 0 40 cycle 5 10}
#	{disk2_usage range 0 40 cycle 5 10}
#	{disk3_usage range 0 40 cycle 5 10}
#	{disk4_usage range 0 40 cycle 5 10}
#    }
#}

# Look at disk space.  /mnt filesystems are ignored; /var gets its own
# tighter thresholds; swap gets its own as well.
set watch(6,disk) {"df -k" "disk space"
    {
	{"=1" {header 0-end %H}}
	{"^/mnt" {ignore 0 %s} }
	{"^/.*/var" {filesystem 0 %k} {varcapacity 4 %d}}
	{"^/" {filesystem 0 %k} {capacity 4 %d}}
	{"^swap" {swapfile 0 %k} {swapcapacity 4 %d}}
    }
    {
	{capacity delta_up 10 range 0 80}
	{capacity delta_up 30 range 0 60 severity alert}
	{capacity range 0 99 severity emerg orgroup a}
	{capacity range 0 98 delta_up 0 severity alert orgroup a}
	{capacity range 0 90 delta_up 0 orgroup a}
	{varcapacity range 0 75 change}
	{varcapacity delta_up 10 range 0 50 severity warning}
	{swapcapacity range 0 80}
    }
}

# Look at inode availability.
set watch(7,disk) {"/usr/ucb/df -i" "inodes"
    {
	{"=1" {header 0-end %H}}
	{"^/" {filesystem 0 %k} {%used 3 %d} {num_free 2 %d}}
    }
    {
	{%used range 0 85}
	{num_free range 1000 10000000}
    }
}

# Look at kernel buffer pool failures.
set watch(8,memory) { "echo kmastat | crash" "memory stats"
    {
	{ =1 {header 0-end %H}}
	{ =2 {header 0-end %h}}
	{^small {size 0 %k} {pools 1 %d} {bytes_in_pool 2 %d} {bytes_allocated 3 %d} {failures 4 %d}}
	{^outsize {size 0 %k} {bytes_allocated 3 %d} {failures 4 %d}}
	{^large {size 0 %k} {pools 1 %d} {bytes_in_pool 2 %d} {bytes_allocated 3 %d} {failures 4 %d}}
    }
    {
	{failures change}
    }
}

# Look at all sorts of interface stats: errors, collisions,
# collisions as a percentage of output packets, queue lengths.
# The loopback interface (lo0) is matched first so "all" skips it.
set watch(9,network) { "netstat -i" "Interface stats"
    {
	{ =1 {header 0-end %H} }
	{"lo0" {interface 0 %k} }
	{all {interface 0 %k} {Packets_in 4 %d} {Errors_in 5 %d}
	    {Packets_out 6 %d} {Errors_out 7 %d} {Collisions 8 %d} {Queue_length 9 %d} }
    }
    {
	{ collision_perc_of_output calc "@Collisions@*100.0/@Packets_out@" range 0 20 delta_up 1}
	{ Collisions delta_up 1000000 }
	{ Errors_in delta_up 5 }
	{ Errors_out delta_up 10 }
	{ Errors_perc_of_output calc "@Errors_out@*100.0/@Packets_out@" range 0 1 delta_up 1}
	{ Queue_length change }
    }
}

# Look at netstat info for network errors.  The sed inserts a space
# after "=" so the counter value lands in its own field.
set watch(10,network) { {sh -c "netstat -s | sed 's/=/= /'"} "Netstat errors"
    {
	{ "udpInCksumErrs" {udp_checksum_errs 5 %d} {udp_in_ovrflow 5 %d} }
	{ "ipInCksumErrs" {ip_checksum_errs 5 %d} }
	{ "icmpInCksumErrs" {icmp_checksum_errs 2 %d} }
	{ "udpInErrors" {udp_in_errors 5 %d} }
	{ "tcpInErrs" {tcp_in_errors 2 %d} }
	{ "rawipInOverflows" {rawip_in_ovrflow 2 %d} }
    }
    {
	{udp_in_errors delta_up 1 }
	{tcp_in_errors delta_up 1}
	{udp_checksum_errs delta_up 1 }
	{ip_checksum_errs delta_up 1 }
	{icmp_checksum_errs delta_up 1}
	{udp_in_ovrflow delta_up 1}
	{rawip_in_ovrflow delta_up 1}
    }
}

# Look for runaway processes.  These will be at the top of the
# top output and will have racked up more than 55 minutes
# of time in the past hour.
# xlocks often get a lot of time when the processor is idle,
# so we exclude them from the top report.  Lines 8-16 of top output
# are the per-process lines; each is keyed by pid.
set watch(11,process) {"/usr/local/bin/top -b 10" "Process time"
    {
	{ =7 {header 0-end %H}}
	{ "xlock$" {ignore 0 %s} }
	{ =8 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =9 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =10 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =11 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =12 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =13 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =14 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =15 {proc_pid 0 %k} {proc_time 7 %d} }
	{ =16 {proc_pid 0 %k} {proc_time 7 %d} }
    }
    {
	{proc_time delta_up 55}
    }
}

# Report print jobs that are stalled (job id unchanged between runs).
# BUGFIX(review): the "=4" line captured into jobid3 twice (copy-paste
# error), so the {jobid4 nochange suppress 5} test below referenced a
# variable that was never defined.  It now captures jobid4.
set watch(12,print) { "lpstat" "stalled print jobs"
    {
	{"=1" {jobid1 0 %s} {jobid1key 0 %k}}
	{"=2" {jobid2 0 %s} {jobid2key 0 %k}}
	{"=3" {jobid3 0 %s} {jobid3key 0 %k}}
	{"=4" {jobid4 0 %s} {jobid4key 0 %k}}
    }
    {
	{jobid1 nochange suppress 5}
	{jobid2 nochange suppress 5}
	{jobid3 nochange suppress 5}
	{jobid4 nochange suppress 5}
    }
}

# Check that the lp scheduler is running and the default printer is sp2.
set watch(13,print) { "lpstat -s" "lp subsystem"
    {
	{ =1 {scheduler_status 2 %s} }
	{ =2 {system_default 3 %s} }
    }
    {
	{scheduler_status value running}
	{system_default value sp2}
    }
}

# Make sure the standard system daemons are all running.
set watch(14,daemons) { "ps -ef" "Daemon status"
    {
	{ =1 {header 0-end %s} }
	{ {root .*amd} {process_amd 7 %s}}
	{ {root .*/usr/sbin/cron} {process_cron 7 %s}}
	{ {root .*inetd -s} {process_inetd 7 %s}}
	{ {root .*lockd} {process_lockd 7 %s}}
	{ {root .*lpNet} {process_lpNet 7 %s}}
	{ {root .*lpsched} {process_lpsched 7 %s}}
	{ {root .*mountd} {process_mountd 7 %s}}
	{ {root .*nfsd} {process_nfsd 7 %s}}
	{ {root .*rpcbind} {process_rpcbind 7 %s}}
	{ {root .*statd} {process_statd 7 %s}}
	{ {root .*syslogd} {process_syslogd 7 %s}}
	{ {root .*utmpd} {process_utmpd 7 %s}}
    }
    {
	{ process_amd required }
	{ process_cron required }
	{ process_inetd required }
	{ process_lockd required }
	{ process_lpNet required }
	{ process_lpsched required }
	{ process_mountd required }
	{ process_nfsd required }
	{ process_rpcbind required }
	{ process_statd required }
	{ process_syslogd required }
	{ process_utmpd required }
    }
}

## HOST SPECIFIC

# Report if the ncd font server cannot be contacted, or if the
# reported server name changes.
set watch(xs1.1,fsinfo) { {sh -c "/tools/X11R6/bin/fsinfo -server xs1:7000 2>&1"} "ncdfs server"
    {
	{{unable\ to\ open\ server} {servername 5 %s}}
    }
    {
	{servername value "error"}
	{servername change}
    }
}

# Watch throughput to nj-ops-1; alarm on absolute floor or a sharp drop.
set watch(box7.1,speed) { {sh -c "PATH=/tools/scotty/bin /tools/scotty/bin/tcpspeed nj-ops-1"} "tcpspeed to nj"
    {
	{all { rate 0 %f } }
    }
    {
	{rate range 0.300 10000}
	{rate delta_down 0.200}
    }
}

# The output should always be at least 1 since the grep will match its
# own command line, however this has been returning 0 for some reason.
# So run it under sh and explicitly exit with 0 to suppress the "child
# process terminated abnormally" message.
set watch(box7.2,xs1.2,tftp) { {sh -c "ps -eadf | grep in.tftp | wc -l; exit 0"} "Tftp process count"
    {
	{=1 {count 0 %d} }
    }
    {
	{count range 0 5}
    }
}

# ntp sanity checks: the "*" peer is the host we are synced to.
set watch(box7.3,ntp) { "/tools/xntp/bin/ntpq -c peers" "ntp time check"
    {
	{ =1 {header 0-end %H}}
	{ =2 {ignore 0-1 %s}}
	{ {147.249.240.255} {ignore 0 %s}}
	{ {\*} {mhost 0 %k} {refid 1 %s} {reachability 6 %d} {offset 8 %d}}
	{ all {host 0 %k} {refid 1 %s} {reachability 6 %d} {offset 8 %d}}
    }
    {
	{mhost required severity warning}
	{reachability cycle 4 4 suppress 1 nochange range 374 377}
	{offset range -20 20 cycle 3 5}
	{offset range -50 50}
	{refid novalue 0.0.0.0}
	{mhost last}
	{refid disable change}
    }
}

# Make sure all the expected ntp peers are still listed.
set watch(box7.4,ntp) { "/tools/xntp/bin/ntpq -n -c peers" "ntp hosts check"
    {
	{ {192.168.240.255} {broadcast 0 %s}}
	{ {127.127.1.1} {local 0 %s}}
	{ {158.121.104.4} {timeserver_cs 0 %s}}
	{ {192.168.240.71} {nfs1 0 %s}}
	{ {192.168.240.72} {nfs2 0 %s}}
	{ {192.168.240.2} {cisco2 0 %s}}
	{ {192.168.240.180} {ts1 0 %s}}
    }
    {
	{ broadcast required severity alert }
	{ local required severity alert }
	{ timeserver_cs required severity alert }
	{ nfs1 required severity alert }
	{ nfs2 required severity alert }
	{ cisco2 required severity alert }
	{ ts1 required severity alert }
    }
}

# The qplus license has expired so eliminate it from consideration.
# This also checks for a given license being out for more than three
# hours; then it is suppressed for 12 hours so checkouts over weekends
# etc generate less noise.
#
# Note that a "cycle 3 3" and a "suppress 12" actually waits for 15 runs
# before reporting, since the suppress delivers 12 failed runs in a row,
# and thus resets the cycle to be empty, and it takes the cycle three
# more positive tests before it will allow a report.
#
set watch(box7.5,lmgrd) { {sh -c "LM_LICENSE_FILE=/etc/license.dat /tools/lmgrd/bin/lmstat -A 2>&1"} "lmgrd license check"
    {
	{ {IST\ \(v3\.x\):} {status 2 %s} {Istatus 2 %s}}
	{ {suntechd\ \(v3\.x\):} {status 2 %s} {sstatus 2 %s}}
	{ {qplus\ :} {ignore 0 %s}}
	{ {LOTUS123\ \(v3\.x\):} {status 2 %s} {Lstatus 2 %s}}
	{ {iddis\.com\ /dev/} {key 0-end %k} {user 0 %s}}
    }
    {
	{status value UP}
	{user nochange cycle 3 3 suppress 12}
	{Istatus required}
	{sstatus required severity alert}
	{Lstatus required}
    }
}

# Daemons specific to box7.
# BUGFIX(review): the array index was "box7.6.daemons" — a "." where
# every other entry uses "," to separate the host id from the category
# tag (cf. box7.5,lmgrd).  With the dot, the "daemons" category tag
# would not be recognized.
set watch(box7.6,daemons) { "ps -ef" "Daemon status"
    {
	{ =1 {header 0-end %s} }
	{ {root .*arpwatch} {process_arpwatch 7 %s}}
	{ {root .*logtrapd} {process_logtrapd 10 %s}}
	{ {root .*lmgrd} {process_lmgrd 7 %s}}
	{ {root .*xntpd} {process_xntpd 7 %s}}
    }
    {
	{ process_arpwatch required }
	{ process_logtrapd required }
	{ process_lmgrd required }
	{ process_xntpd required }
    }
}

# Make sure that our hosts are still listed in the windows dns server,
# and that things haven't failed again.  Really wish the IT folks would
# monitor this stuff.
# Verify sol is resolvable through the first windows dns server.
set watch(dns5) {"dig @192.20.3.7 sol.site.example.com" "sol in dns server 1"
    {
	{"^sol.site.example.com" {solname 1 %s} }
    }
    {
	{solname required}
    }
}

# Same check against the second dns server.
set watch(dns6) {"dig @192.20.3.6 sol.site.example.com" "sol in dns server 2"
    {
	{"^sol.site.example.com" {solname 1 %s} }
    }
    {
	{solname required}
    }
}

# A simple rule that calls a function: aval is exported as a global,
# and bval is handed to the Tcl proc "test" along with aval.
set watch(test) { "echo 3 2" "tclproc test"
    {
	{=1 {aval 0 %s} {bval 1 %s}}
    }
    {
	{ aval global}
	{ bval tcl test {@aval@ @bval@} change}
    }
}

# The function called by the rule above.  Returns 1 (and stores the
# report text in the caller's "result" variable via uplevel) when the
# sum of the two values is non-zero; returns 0 otherwise.
proc test {new {old 0}} {
    puts "in test $new $old\n"
    # Braced condition instead of the original unbraced [expr $new + $old]:
    # identical numeric behavior, but safe against substitution surprises
    # and byte-compilable.
    if {$new + $old} {
	uplevel 1 {set result "error output"}
	return 1
    }
    return 0
}

# A very simple hook function run by tkwatcher; delegates to atest.
proc Watcher_proc {} {
    atest
}

# Another simple function called by a function; currently a no-op.
proc atest {} {
    #puts test
}