#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long qw(:config autohelp);
use POE::Component::IRC::Common qw(l_irc strip_color strip_formatting irc_to_utf8);
use Pod::Usage;

my $VERSION = '0.04';

# Command-line options. --ignore-regex and --ignore-nick may be given more
# than once; every value is collected.
GetOptions(
    'f|format=s'       => \(my $format = ''),
    'c|case=s'         => \(my $case = 'rfc1459'),
    'b|bot=s'          => \my $bot,
    'r|ignore-regex=s' => \my @ignore_regexes,
    'n|ignore-nick=s'  => \my @ignore_nicks,
    'v|version'        => sub { print "irchailo-seed version $VERSION\n"; exit },
) or pod2usage();

# Per-format patterns. $privmsg captures (nick, message); $action captures
# the whole action text, which begins with the acting nick.
my ($privmsg, $action);
if ($format eq 'pocoirc') {
    $privmsg = qr/^\S+ \S+ <(\S+)> (.*)/;
    $action = qr/^\S+ \S+ \* (.*)/;
}
elsif ($format eq 'irssi') {
    # the '.' after '<' skips the one-character mode prefix before the nick
    $privmsg = qr/^\S+\s+<.(\S+)> (.*)/;
    $action = qr/^\S+\s+\* (.*)/;
}
else {
    die "No log format specified with --format\n";
}

# Compile the ignore patterns once, so a malformed regex is reported at
# startup instead of when the first candidate line reaches it.
my @ignore_patterns;
for my $source (@ignore_regexes) {
    my $compiled = eval { qr/$source/ };
    die "Invalid --ignore-regex '$source': $@" if !defined $compiled;
    push @ignore_patterns, $compiled;
}

# Case-fold the ignored nicks once, using the requested IRC case mapping.
my %ignored_nick = map { (l_irc($_, $case) => 1) } @ignore_nicks;

# Matches the "botname: " style prefix of a message addressed to the bot.
# Something other than more nick characters must follow the name
# (punctuation, whitespace or end of message), so that --bot=Al does not
# mangle a message starting with "Alice".
my $bot_prefix = defined $bot
    ? qr/^\s*\Q$bot\E(?:[:,;.!?~]\s?|\s|\z)/i
    : undef;

binmode STDOUT, ':utf8';

# Only irssi lines go through irc_to_utf8 below; pocoirc lines would
# otherwise stay raw bytes and be encoded a second time by the STDOUT layer.
# NOTE(review): assumes pocoirc logs are UTF-8 encoded -- confirm.
binmode STDIN, ':encoding(UTF-8)' if $format eq 'pocoirc';

LINE: while (my $line = <STDIN>) {
    chomp $line;

    # irssi's encoding might be inconsistent, convert everything to UTF-8
    $line = irc_to_utf8($line) if $format eq 'irssi';

    $line = strip_color($line);
    $line = strip_formatting($line);

    my ($nick, $msg);
    if (($nick, $msg) = $line =~ /$privmsg/) {
        # PRIVMSG: drop the leading "botname: " if the bot was addressed
        $msg =~ s/$bot_prefix// if defined $bot_prefix;
    }
    elsif (($msg) = $line =~ /$action/) {
        # ACTION: the acting nick is the first word of the action text
        ($nick) = $msg =~ /^(\S+)/;
    }
    else {
        # neither a message nor an action (joins, parts, notices, ...)
        next LINE;
    }

    # skip it if we want to ignore this nick ($nick is undefined for an
    # empty action, in which case there is nothing to compare)
    next LINE if defined $nick && $ignored_nick{ l_irc($nick, $case) };

    # skip if it matches a regex
    for my $pattern (@ignore_patterns) {
        next LINE if $msg =~ $pattern;
    }

    print "$msg\n";
}

=head1 NAME

irchailo-seed - Creates a L<Hailo|Hailo> training file from logs generated
by L<POE::Component::IRC::Plugin::Logger|POE::Component::IRC::Plugin::Logger>
or irssi.

=head1 SYNOPSIS

B<irchailo-seed> <options>

 Options:
   -f FORMAT, --format=FORMAT       Log format (pocoirc or irssi)
   -c CASE,   --case=CASE           The IRC server's case mapping
   -b NICK,   --bot=NICK            The nickname of the IRC bot
   -r RX,     --ignore-regex=RX     A regex for lines to ignore
   -n NICK,   --ignore-nick=NICK    A nickname to ignore
   -h,        --help                Print this help message
   -v,        --version             Print version number

Input is accepted on STDIN, and output is delivered to STDOUT.

You should use the C<--bot> option if your bot was present when the channel
was logged, as it will strip the "botname: " prefix from messages in which
users addressed the bot.

If your bot (or any other bot in the channel) can take commands that all
start in a similar way, use the C<--ignore-regex> option to ignore those
lines, e.g. C<--ignore-regex='^!\w+'> for bots that react to commands
prefixed with '!'.

Example:

  # convert irssi log, stripping "MyBot: " prefix and ignoring lines with URLs
  cat ~/logs/\#chan.log | irchailo-seed -f irssi -b MyBot -r '\w+://\w' > hailo.trn

The C<--case> option controls the second parameter passed to C<l_irc>
from L<POE::Component::IRC::Common|POE::Component::IRC::Common>. The default
is B<'rfc1459'>. You usually don't have to change this. Consult that
package's documentation for more information.

=head1 AUTHOR

Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com

=head1 LICENSE AND COPYRIGHT

Copyright 2010 Hinrik E<Ouml>rn SigurE<eth>sson

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
