#!/usr/bin/perl -w
# $Id: slpolice,v 1.24 2004/08/26 15:04:21 ws150726 Exp $
################ Introduction ################
#
# Copyright 2000-2004 by Wilson Snyder.  This program is free software;
# you can redistribute it and/or modify it under the terms of either the GNU
# General Public License or the Perl Artistic License.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
######################################################################

require 5.005;
use IO::File;
use Pod::Text;
use Getopt::Long;
use Schedule::Load::Hosts;
use Time::localtime;
use strict;

######################################################################

use vars qw($Debug
	    $Debug_User
	    $Nice19 $Sendmail
	    $Opt_Renice_Min $Opt_Cpu_Min $Opt_Reserved_Min
	    %Complaints
	    );

$Nice19 = "renice19 -only_if_at";
$Sendmail = "/usr/lib/sendmail";

######################################################################

my %opt_server_params = ();
if (!GetOptions (
		 "help"		=> \&usage,
		 "debug!"	=> \$Debug,
		 "cpu_min=i"	=> \$Opt_Cpu_Min,
		 "renice_min=i"	=> \$Opt_Renice_Min,
		 "reserved_min=i"=> \$Opt_Reserved_Min,
		 "port=i"	=> sub {shift; $opt_server_params{port} = shift;},
		 "dhost=s"	=> sub {shift; push @{$opt_server_params{dhost}}, split(':',shift);},
		 )) {
    usage();
}

my $scheduler = Schedule::Load::Hosts->fetch(%opt_server_params);
gather_loads($scheduler);
gather_reserveds($scheduler);
complain();

#----------------------------------------------------------------------

sub usage {
    print '$Id: slpolice,v 1.24 2004/08/26 15:04:21 ws150726 Exp $ ', "\n";
    $SIG{__WARN__} = sub{};	#pod2text isn't clean.
    pod2text($0);
    exit(1);
}

######################################################################

sub gather_loads {
    my $scheduler = shift;

    (my $FORMAT =           "%-12s   %6s    %-8s     %4s    %6s     %-5s    %6s     %5s%%    %s\n") =~ s/\s\s+/ /g;
    foreach my $host ( @{$scheduler->hosts} ){
	foreach my $p ( sort {$b->pctcpu <=> $a->pctcpu}
			@{$host->top_processes} ) {
	    my $mach = $host->hostname;
	    my $name = $p->uname;
	    #print "ck $line\n";
	    next if !$p->time;
	    next if $name eq "root";
	    my $pid = $p->pid;
	    my $min = $p->time/60.0;
	    $min = 0  if ($Debug);
	    $min = 9999 if ($Debug && $p->uid eq $<);  # Test... Every one of runner's violates
	    my $line = sprintf ($FORMAT, 
				$host->hostname,
				$p->pid, 
				$p->uname,		$p->nice0, 
				int(($p->size||0)/1024/1024)."M",
				$p->state, 		$p->time_hhmm,
				sprintf("%3.1f", $p->pctcpu),
				$p->fname);

	    print "Min $min  Name $name   Pid $pid\n" if $Debug;
	    if ($Opt_Renice_Min
		&& $min >= $Opt_Renice_Min) {
		my $lowered = 0;
		my $succ = `ssh $mach $Nice19 $pid 2>&1`;
		print "Lowering $mach $pid $succ\n" if $Debug;
		if ($succ !~ /%/) {
		    $Complaints{$name}{niced}{$pid}
		    = {one_subject => sprintf("Reniced Process %5d on $mach", $pid),
		       many_subject => sprintf("Reniced Processes"),
		       body_header => ("The following processes were reniced to 19\n"
				       ."Use `renice10 <pid>` to prevent this.\n"),
		       body_line => "   ".$line,
		       };
		}
	    } 
	    
	    if ($Opt_Cpu_Min
		&& $min >= $Opt_Cpu_Min) {
		$Complaints{$name}{cpu}{$pid}
		    = {one_subject => sprintf("High CPU Time Process %5d on $mach", $pid),
		       many_subject => sprintf("CPU Consuming Processes"),
		       body_header => ("The following processes have large CPU times\n"
				       ."Please consider killing them.\n"),
		       body_line => "   ".$line,
		       };
	    }
	}
    }
}

sub gather_reserveds {
    my $scheduler = shift;

    (my $FORMAT =             "%-12s   %-25s  %s\n") =~ s/\s\s+/ /g;
    foreach my $host ( @{$scheduler->hosts} ){
	if ($host->reserved) {
	    my $ostype = $host->archname ." ". $host->osvers;
	    $ostype =~ s/enterprise//;
	    $ostype .= " (on ".$host->slreportd_hostname.")" if $host->slreportd_hostname ne $host->hostname;

	    # Really the scheduler should provide preparsed information....
	    if ($host->reserved !~ /(\S+) at (\d\d)-(\S+) (\d\d):(\d\d)(;?.*)$/) {
		print "Res Parse Failed: ".$host->reserved."\n" if $Debug;
		next;
	    }

	    my ($name,$mday,$mon, $hr,$dmin,$cmt) = ($1, $2,$3,  $4,$5,    $6);
	    print "Compare then $mday $hr:$dmin  now ",localtime->mday," ",localtime->hour,":",localtime->min,"\n"  if $Debug;
	    my $min = localtime->min - $dmin;
	    $min += 60 * (localtime->hour - $hr);
	    $min += 24 * 60 * (localtime->mday - $mday);
	    $min = 0 if $min < 0;	# Too lazy to check months.  Let user get away with it.
	    #$min = 9999 if ($Debug);  # Test... Every one of runner's violates

	    my $res = "Reserved: ".$host->reserved;
	    my $line = sprintf ($FORMAT,
				$host->hostname, 
				$ostype,
				$res);
	    print "Res: $min: $line\n" if $Debug;
	    
	    if ($name ne "root"
		&& $Opt_Reserved_Min
		&& !$cmt
		&& $min >= $Opt_Reserved_Min) {
		$Complaints{$name}{reserved}{$host->hostname}
		    = {one_subject => sprintf("Long Reservation of %s", $host->hostname),
		       many_subject => sprintf("Long Reservations"),
		       body_header => ("The following reservations have been around for a long time\n"
				       ."Please consider releasing them.  Or, use a --comment with\n"
				       ."the reservation explaining your reservation reason.\n"),
		       body_line => "   ".$line,
		       };
	    }
	}
    }
}

sub complain {
    for my $to (sort (keys %Complaints)) {
	my $body = "";
	$body .= "DEBUGGING.  Really-To: $to\n\n" if $Debug;
	my $mailto = $to;
	$mailto = $ENV{USER} if $Debug;

	my $subj = undef;
	for my $topic (sort (keys %{$Complaints{$to}})) {
	    my $newtopic = 1;
	    for my $proc (sort (keys %{$Complaints{$to}{$topic}})) {
		my $procref = $Complaints{$to}{$topic}{$proc};
		if (!defined $subj) {
		    $subj = $procref->{one_subject};
		} else {
		    $subj = $procref->{many_subject};
		}
		if ($newtopic) {
		    $newtopic = 0;
		    $body .= "\n";
		    $body .= $procref->{body_header};
		    $body .= "\n";
		}
		$body .= $procref->{body_line};
	    }
	}

	# Some cleanups
	$body =~ s/\n\n+/\n\n/mg;
	print "To: $to\nSubject: $subj\n$body\n" if $Debug;

	# Send the mail
	my $cmd = "$Sendmail -F 'Rschedule Police' -f root $mailto";
	my $fh = IO::File->new("|$cmd") or die "%Error: $! $cmd";
	print $fh "To: $mailto\n";
	print $fh "From: Rschedule Police <root>\n";
	print $fh "Subject: Rschedule Police: $subj\n";
	print $fh "\n";
	print $fh "To see latest status, use:  rloads or rhosts\n";
	print $fh "\n";
	print $fh "$body\n";
	$fh->close;
    }
}

######################################################################
######################################################################
__END__

=pod

=head1 NAME

slpolice - Warn and renice top CPU hogs

=head1 SYNOPSIS

B<slpolice>
[ B<--help> ]
[ B<--port=>I<port> ]
[ B<--dhost=>I<host> ]
[ B<--cpu-hours> ]
[ B<--version> ]
[ B<--version> ]

=head1 DESCRIPTION

slpolice will determine the top cpu users across a cluster of hosts.
It will send mail if a process has over a specified amount of cpu time.

It will also mail if a user has a reservation for a long period of time.

Usually slpolice is run with a crontab entry similar to:

    5 8-21 * * * /usr/local/bin/slpolice --cpu_min 120 --reserved_min 120 >/dev/null 2>&1

This sends warnings each hour after 2 hours of CPU time.  It does not check
at night so that long overnight jobs will not receive warnings.

This program is most valuable when used with the L<nicercizerd> program, or
a operating system where nice 19 processes get only leftover cpu resources.
It requires a program called L<nice19> which is a version of nice that is
setgid root and renices a job to 19.  This comes with L<nicercizerd>.

=head1 ARGUMENTS

=over 4

=item --help

Displays this message and program version and exits.

=item --port <portnumber>

Specifies the port number that slchoosed uses.

=item --dhost <hostname>

Specifies the host name that slchoosed uses.  May be specified multiple
times to specify backup hosts.  Defaults to SLCHOOSED_HOST environment
variable, which contains colon separated host names.

=item --cpu_min

Number of cpu minutes the job should have before being reported to the user.
Defaults to 0, which is off.

=item --renice_min

Number of minutes after which the nice value of a high cpu using process is
not 10 is reniced to 19.  Defaults to 0, which is off.

=item --reserved_min

Number of minutes a host may be reserved before reporting it to the user.
Defaults to 0, which is off.

=item --version

Displays program version and exits.

=back

=head1 DISTRIBUTION

The latest version is available from CPAN and from L<http://www.veripool.com/>.

L<nicercizerd> is available from L<http://veripool.com>.

Copyright 1998-2004 by Wilson Snyder.  This package is free software; you
can redistribute it and/or modify it under the terms of either the GNU
Lesser General Public License or the Perl Artistic License.

=head1 AUTHORS

Wilson Snyder <wsnyder@wsnyder.org>

=head1 SEE ALSO

L<Schedule::Load>, L<nicercizerd>,  L<nice19>,

=cut
######################################################################
### Local Variables:
### compile-command: "./slpolice --debug --cpu_min 120 --reserved_min 120 "
### End:
