#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long qw(:config autohelp);
use POE::Component::IRC::Common qw(l_irc);
use Pod::Usage;

my $VERSION = '0.01';

# Parse the command line. --ignore-regex and --ignore-nick may be given more
# than once; every occurrence is appended to the corresponding array.
GetOptions(
    'f|format=s'       => \(my $format = ''),
    'c|case=s'         => \(my $case = 'rfc1459'),
    'b|bot=s'          => \my $bot,
    'r|ignore-regex=s' => \my @ignore_regexes,
    'n|ignore-nick=s'  => \my @ignore_nicks,
    's|strip=s'        => \(my $strip = ''),
) or pod2usage();

# Pick the patterns that recognise a normal message and an action line for
# the chosen log format. Both formats start with timestamp field(s).
#   $privmsg captures (nick, message)
#   $action  captures the whole action text, which begins with the nick
my ($privmsg, $action);
if ($format eq 'pocoirc') {
    $privmsg = qr/^\S+ \S+ <(\S+)> (.*)/;
    $action = qr/^\S+ \S+ \* (.*)/;
}
elsif ($format eq 'irssi') {
    # the '.' skips one character after '<' (presumably irssi's nick mode
    # prefix such as '@', '+' or a space)
    $privmsg = qr/^\S+\s+<.(\S+)> (.*)/;
    $action = qr/^\S+\s+\* (.*)/;
}
elsif ($format eq '') {
    die "No log format specified with --format\n";
}
else {
    die "Unknown log format '$format' (expected 'pocoirc' or 'irssi')\n";
}

# Reject unknown --strip values instead of silently doing nothing.
if ($strip ne '' && $strip !~ /^(?:color|formatting|both)$/) {
    die "Invalid --strip value '$strip' (expected 'color', 'formatting' or 'both')\n";
}
my $strip_color      = $strip =~ /^(?:color|both)$/;
my $strip_formatting = $strip =~ /^(?:formatting|both)$/;

# Loop-invariant preparation: normalise the nicks we compare against once,
# and compile the user-supplied regexes up front so that a broken pattern is
# reported immediately rather than in the middle of the log.
my $bot_lc        = defined $bot ? l_irc($bot, $case) : undef;
my %ignored_nick  = map { l_irc($_, $case) => 1 } @ignore_nicks;
my @ignore_res    = map { qr/$_/ } @ignore_regexes;

# Only irssi logs need the encoding guesswork, so only then load Encode.
my $is_irssi = $format eq 'irssi';
if ($is_irssi) {
    require Encode;
    require Encode::Guess;
}

LINE: while (my $line = <STDIN>) {
    chomp $line;

    # encoding might be inconsistent, convert everything to UTF-8
    if ($is_irssi) {
        # guess_encoding() returns an encoding object on success and an
        # error *string* on failure, so the result must be tested with ref();
        # a plain boolean test is always true.
        my $guess = Encode::Guess::guess_encoding($line, 'utf8');
        $line = ref $guess
            ? Encode::decode('utf8', $line)
            : Encode::decode('cp1252', $line)
        ;
    }

    # These helpers live in POE::Component::IRC::Common but are not imported
    # at the top of the file, hence the fully-qualified calls.
    if ($strip_color) {
        $line = POE::Component::IRC::Common::strip_color($line);
    }
    if ($strip_formatting) {
        $line = POE::Component::IRC::Common::strip_formatting($line);
    }

    my ($nick, $msg);
    if (($nick, $msg) = $line =~ /$privmsg/) {
        # PRIVMSG: drop a leading "botname: " style address, if any
        if (defined $bot_lc) {
            my ($first) = $msg =~ /^\s*([^0\s][^\s:,;.!?]*)[:,;.!?]?\s*/;
            if (defined $first && l_irc($first, $case) eq $bot_lc) {
                $msg =~ s/\Q$first\E[:,;.!?]?\s*//;
            }
        }
    }
    elsif (($msg) = $line =~ /$action/) {
        # ACTION: the acting nick is the first word of the text
        ($nick) = $msg =~ /^(\S+)/;

        # an action with no text has no nick and nothing worth learning
        next LINE if !defined $nick;
    }
    else {
        # neither a message nor an action (joins, parts, topic changes, ...)
        next LINE;
    }

    # skip it if we want to ignore this nick
    next LINE if $ignored_nick{ l_irc($nick, $case) };

    # skip if it matches a regex
    for my $regex (@ignore_res) {
        next LINE if $msg =~ $regex;
    }

    # irssi lines were decoded to characters above; turn them back into
    # UTF-8 bytes so STDOUT receives well-formed UTF-8 (pocoirc lines were
    # never decoded and are passed through untouched).
    my $out = $is_irssi ? Encode::encode('UTF-8', $msg) : $msg;
    print "$out\n";
}

=head1 NAME

irchal_seed - Creates a L<MegaHAL|AI::MegaHAL> training file from logs
generated by L<POE::Component::IRC::Plugin::Logger|POE::Component::IRC::Plugin::Logger>
or irssi.

=head1 SYNOPSIS

B<irchal_seed> <options>

 Options:
   -f, --format=NAME        Specify a log format (pocoirc or irssi)
   -c, --case=NAME          The IRC server's case mapping (see below)
   -b, --bot=NICK           The name of the IRC bot
   -r, --ignore-regex=RE    A regex for lines to ignore (may be repeated)
   -n, --ignore-nick=NICK   A nickname to ignore (may be repeated)
   -s, --strip=OPT          Strip 'color', 'formatting', or 'both'

Input is accepted on STDIN, and output is delivered to STDOUT.

The C<--case> option controls the second parameter passed to C<l_irc>
from L<POE::Component::IRC::Common|POE::Component::IRC::Common>. The default
is B<'rfc1459'>. You usually don't have to change this. Consult that
package's documentation for more details.

You should specify the C<--bot> option if your bot was present when the
channel was logged, as it will strip the "botname: " part from messages
in which users addressed the bot.

If your bot (or any other bot in the channel) can take commands that all
start in a similar way, use the C<--ignore-regex> option to ignore those
lines, e.g. C<--ignore-regex='^!\w+'> for bots that react to commands
prefixed with '!'.

 Example:
  # convert logs, ignoring lines containing URLs
  cat ~/logs/\#chan.log | irchal_seed -f pocoirc -b MyBot -r '\w+://\w' > megahal.trn

=head1 AUTHOR

Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com

=head1 LICENSE AND COPYRIGHT

Copyright 2009 Hinrik E<Ouml>rn SigurE<eth>sson

This program is free software, you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
