eval 'exec perl -S $0 "$@"'
    if $runnning_under_some_shell
;

#-----------------------------------------------------------------------

=head1 NAME

sitemapper - script for generating site maps

=head1 SYNOPSIS

    sitemapper [ -verbose ] [ -help ] [-doc ] [ -depth <depth> ] [ -proxy <proxy URL> ] [ -title <page title> ] -site <base URL>

=cut

#-----------------------------------------------------------------------

require 5.003;
use strict;

#-----------------------------------------------------------------------

=head1 DESCRIPTION

B<sitemapper> generates site maps for a given site. It traverses a site from
the base URL given as the L<OPTIONS/-site> option and generates an HTML page
consisting of a bulleted list which reflects the structure of the site. 

The structure reflects the distance from the home page of the pages listed;
i.e.  the first level bullets are pages accessible directly from the home page,
the next level, pages accessible from those pages, etc. Obviously, pages that
are linked from "higher" up pages may appear in the "wrong place" in the tree,
than they "belong".

=head1 OPTIONS

=over 4

=item -depth

Option to specify the depth of the site map generated. If no specified, 
generates a sitemap of unlimited depth.

=item -site

Option to specify a base URL to generate a site map for.

=item -proxy

Specify an HTTP proxy to use.

=item -title

Option to specify a page title for the site map.

=item -help

Display a short help message to standard output, with a brief
description of purpose, and supported command-line switches.

=item -doc

Display the full documentation for the script,
generated from the embedded pod format doc.

=item -verbose

Enable verbose reporting as the script runs.

=back

=head1 ENVIRONMENT

B<sitemapper> makes use of the C<$http_proxy> environment variable, if it is
set.

=head1 SEE ALSO

Getopt::Long (L<Getopt::Long>)
IO::File (L<IO::File>)
LWP::UserAgent (L<LWP::UserAgent>)
HTML::LinkExtor (L<HTML::LinkExtor>)
URI::URL (L<URI::URL>)
Pod::Text (L<Pod::Text>)
MD5 (L<MD5>)
Date::Format (L<Date::Format>)

=cut

#-----------------------------------------------------------------------

use Getopt::Long;
use IO::File;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use Pod::Text;
use MD5;
use Date::Format;

#------------------------------------------------------------------------------
#
# Public global variables
#
#------------------------------------------------------------------------------

use vars qw( $VERSION );

$VERSION = '1.000';

#------------------------------------------------------------------------------
#
# Private global variables
#
#------------------------------------------------------------------------------

my (

    # command line options - see pod

    $opt_verbose,
    $opt_depth,
    $opt_title,
    $opt_site,
    $opt_help,
    $opt_doc,
    $opt_proxy,
    $opt_output,

    $Contact,           # contect e-mail address
    $Name,              # name of the program
    %MD5Hash,           # MD5 hash to identify identical pages
    %UrlSeen,           # hash of URLs already seen
);

( $Name ) = $0 =~ m{([^/]+)$};
$Contact = 'wrigley@cre.canon.co.uk';

#-----------------------------------------------------------------------

=head1 BUGS

Should use WWW::Robot (L<WWW::Robot>) to do the site traversal.

=head1 AUTHOR

Ave Wrigley E<lt>wrigley@cre.canon.co.ukE<gt>
Web Group, Canon Research Centre Europe

=head1 COPYRIGHT

Copyright (c) 1998 Canon Research Centre Europe. All rights reserved.

This script is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

#==============================================================================
#
# Start of main
#
#==============================================================================

#------------------------------------------------------------------------------
#
# Get command line options
#
#------------------------------------------------------------------------------

&usage unless GetOptions
    'help'      => \$opt_help,
    'doc'       => \$opt_doc,
    'depth=i'   => \$opt_depth,
    'site=s'    => \$opt_site,
    'output=s'  => \$opt_output,
    'proxy=s'   => \$opt_proxy,
    'title=s'   => \$opt_title,
    'verbose'   => \$opt_verbose,
;

&usage if $opt_help;
&pod_self if $opt_doc;
&usage unless $opt_site;

#------------------------------------------------------------------------------
#
# Turn on autoflushing
#
#------------------------------------------------------------------------------

$|++;

#------------------------------------------------------------------------------
#
# Create MD5 object & LWP::UserAgent object
#
#------------------------------------------------------------------------------

my $MD5_Obj = new MD5;
my $Robot = new LWP::UserAgent;

#------------------------------------------------------------------------------
#
# Set the proxy from the environment
#
#------------------------------------------------------------------------------

if ( defined( $opt_proxy ) )
{
    &verbose( "proxy = $opt_proxy ..." );
    $Robot->proxy( [ 'http' ], $opt_proxy );
}
elsif ( defined( $ENV{ http_proxy } ) )
{
    &verbose( "getting proxy from environment ..." );
    &verbose( "proxy = $ENV{ http_proxy } ..." );
    $Robot->env_proxy;
}

#------------------------------------------------------------------------------
#
# Initialize the %UrlSeen hash (used to detect urls already seen)
#
#------------------------------------------------------------------------------

%UrlSeen = ();

#------------------------------------------------------------------------------
#
# Initialize the MD5Hash (used to detect different URLs which have the same
# content)
#
#------------------------------------------------------------------------------

%MD5Hash = ();

#------------------------------------------------------------------------------
#
# Create the root node
#
#------------------------------------------------------------------------------

my $root;

$root->{ 'url' } = $opt_site;

#------------------------------------------------------------------------------
#
# create site tree from the root node, recursively
#
#------------------------------------------------------------------------------

&create_site_tree( $root );

#------------------------------------------------------------------------------
#
# create the output file handle - either file sceified by -output, or STDOUT
#
#------------------------------------------------------------------------------

my $fh = new IO::File defined( $opt_output ) ? ">$opt_output" : ">&STDOUT" ;
die "$opt_output : $!\n" unless defined $fh;

#------------------------------------------------------------------------------
#
# print the header
#
#------------------------------------------------------------------------------

page_start( 
    $fh, 
    defined( $opt_title ) ? $opt_title : "Site Map for $opt_site" 
);

#------------------------------------------------------------------------------
#
# print site tree (recursively)
#
#------------------------------------------------------------------------------

print $fh '<UL>';
print_site_tree( $fh, $root );
print $fh '</UL>';

#------------------------------------------------------------------------------
#
# print the footer
#
#------------------------------------------------------------------------------

page_end( $fh );

#==============================================================================
#
# End of main
#
#==============================================================================

#==============================================================================
#
# Subroutines
#
#==============================================================================

#------------------------------------------------------------------------------
#
# hexhash - get an MD5 hex hash value for the contents of this URL
#
#------------------------------------------------------------------------------

sub hexhash
{
    my $html = shift;

    $MD5_Obj->reset;
    return $MD5_Obj->hexhash( $html );
}

#------------------------------------------------------------------------------
#
# get_links - returns a list of the links for a given HTML string
#
#------------------------------------------------------------------------------

sub get_links
{
    my $url     = shift;
    my $html    = shift;
    my $base    = shift;

    my ( @links );

    &verbose( "extracting links from $url ..." );

    my @frame_links = &expand_frameset( $url, $html, $base );
    return @frame_links if @frame_links;

    my $link_extor = new HTML::LinkExtor( 

        # anonymous callback function for HTML::LinkExtor

        sub {

            my ( $tag, %attr ) = @_; 
            my ( $link );

            # grab anchor / area links

            if( lc( $tag ) =~ /^a(?:rea)?$/ )
            {
                return unless defined( $link = $attr{ 'href' } );
            }
            else
            {
                return;
            }

            # ignore off site links

            unless ( $link =~ m!^$opt_site! )
            {
                return;
            }

            # strip hashes (i.e. ignore / don't distinguish page internal links)

            $link =~ s!#.*!!;

            # only follow html links (.html or .htm or no extension)

            if ( $link =~ m{\.([^./]+)$} )
            {
                return unless $1 =~ /^html?$/;
            }

            # only follow links we haven't seen yet ...

            return if $UrlSeen{ $link };
            $UrlSeen{ $link }++;

            &verbose( "adding $link ..." );

            push( @links, $link );
        },
        $base
    );

    # do the business ...

    $link_extor->parse( $html );

    # ... and return the links created in the callback

    return( @links );
}

#------------------------------------------------------------------------------
#
# expand_frameset - if this URL is a frameset, extract the links from the
# frames in that frameset
#
#------------------------------------------------------------------------------

sub expand_frameset
{
    my $url     = shift;
    my $html    = shift;
    my $base    = shift;

    # try extracting any frames ...

    my ( @frames, @links );

    my $frame_extor = new HTML::LinkExtor( 
        sub {

            my ( $tag, %attr ) = @_;
            my ( $frame );

            push( @frames, $frame )
                if ( lc( $tag ) eq 'frame' )
                and defined( $frame = $attr{ 'src' } )
            ;
        },
        $base
    );

    # do the business ...

    $frame_extor->parse( $html );

    if ( @frames )
    {
        foreach my $frame ( @frames )
        {
            my ( $content, $base ) = &get_content_and_base( $frame );
            push( @links, &get_links( $frame, $content, $base ) )
                if defined $content
            ;
        }
        return @links;
    }

    # no frames!

    return ();
}

#------------------------------------------------------------------------------
#
# get_response - get HTTP::Response object for a given URL. Returns undef on
# failure
#
#------------------------------------------------------------------------------

sub get_response
{
    my $url = shift;

    &verbose( "getting $url ..." );

    my $response = $Robot->request( new HTTP::Request( 'GET', $url ) );

    if ( $response->is_success )
    {
        return $response;
    }
    else
    {
        &verbose( "failed to get $url ..." );
        return undef;
    }
}

#------------------------------------------------------------------------------
#
# get_title - get the title from an HTML string
#
#------------------------------------------------------------------------------

sub get_title
{
    my $html = shift;

    # get title from page - if no title is specified, get the contents of the
    # first H1, H2, H3, H4, H5, or H6 tag

    my ( $title ) = 
        $html =~ m!<TITLE[^>]*>(.*?)</TITLE>!i or
        $html =~ m!<H([1-6])[^>]*>(?:.*?)</H\1>!i
    ;

    return $title;

}

#------------------------------------------------------------------------------
#
# create_site_tree - create the site tree data structure (recursively)
#
#------------------------------------------------------------------------------

sub create_site_tree
{
    my $node = shift;
    my $depth = shift || 0;

    my $url = $node->{ 'url' };
    my ( $content, $base ) = &get_content_and_base( $url );
    return unless defined( $content );
    $node->{ 'title' } = &get_title( $content ) || $url;

    my ( @links ) = &get_links( $url, $content, $base )
        if ( ! defined( $opt_depth ) ) or $depth < $opt_depth
    ;

    for ( @links )
    {
        my $child_node;
        $child_node->{ 'url' } = $_;
        push( @{ $node->{ children } }, $child_node );
        &create_site_tree( $child_node, $depth + 1 );
    }
}

#------------------------------------------------------------------------------
#
# get_content_and_base - get the HTML content, and the base URL for a given
# URL. Return undef if GET fails, or if have seen that page before
#
#------------------------------------------------------------------------------

sub get_content_and_base
{
    my $url = shift;

    my $response = &get_response( $url );
    return ( undef, undef ) unless defined $response;
    my $base = $response->base;
    my $content = $response->content;
    my $hash = &hexhash( $content );
    if ( $MD5Hash{ $hash } )
    {
        &verbose( "$url is identical to $MD5Hash{ $hash } ..." );
        return ( undef, undef );
    }
    $MD5Hash{ $hash } = $url;
    return ( $content, $base );
}

#------------------------------------------------------------------------------
#
# print_site_tree - print the code for each of the "children" of the root
# URL (recursively), based on the site tree structure
#
#------------------------------------------------------------------------------

sub print_site_tree
{
    my $fh      = shift;
    my $node    = shift;

    return if $node->{ deleted };

    my $title           = $node->{ 'title' };
    my $url             = $node->{ 'url' };
    my $children        = $node->{ 'children' };

    return unless $title and $url;

    print_node( $fh, $title, $url );

    if ( $children and @{ $children } )
    {
        print $fh '<UL>';
        for ( @{ $children } )
        {
            &print_site_tree( $fh, $_ );
        }
        print $fh '</UL>';
    }
}

#------------------------------------------------------------------------------
#
# print_node - print out a node of the site tree
#
#------------------------------------------------------------------------------

sub print_node
{
    my $fh      = shift;
    my $title   = shift;
    my $url     = shift;

    my ( $path );
    ( $path = $url ) =~ s!^$opt_site!!;
    print $fh <<HTML_CHILD;
<LI>
    <DL>
        <DT><A HREF = "$url"><B>$title</B></A>
        <DD><A HREF = "$url">$path</A>
    </DL>
HTML_CHILD
}

#------------------------------------------------------------------------------
#
# page_start - print out a standard HTML page header
#
#------------------------------------------------------------------------------

sub page_start
{
    my $fh      = shift;
    my $title   = shift;

    print $fh <<HTML_HEADER;
<HTML>
    <HEAD>
        <TITLE>$title</TITLE>
    </HEAD>
    <BODY BGCOLOR = "#FFFFFF">
        <H1>$title</H1>
        <HR NOSHADE>
HTML_HEADER
}

#------------------------------------------------------------------------------
#
# page_end - print out a standard HTML page footer
#
#------------------------------------------------------------------------------

sub page_end
{
    my $fh = shift;

    my $when = time2str( "on %A the %o of %B %Y at %r", time );

    print $fh <<HTML_FOOTER;
        <HR NOSHADE>
        <TABLE WIDTH = "100%">
            <TR>
                <TD VALIGN = "TOP" ALIGN = "LEFT">
                    $Name version $VERSION
                <TD VALIGN = "TOP" ALIGN = "RIGHT">
                    <A HREF = "mailto:$Contact">$Contact</A>
            <TR>
                <TD COLSPAN = 2 VALIGN = "TOP" ALIGN = "LEFT">
                    Generated $when
        </TABLE>
    </BODY>
</HTML>
HTML_FOOTER
}

#------------------------------------------------------------------------------
#
# usage - die with a usage message
#
#------------------------------------------------------------------------------

sub usage
{
    die <<USAGE;

$Name version $VERSION

usage : 
    $Name [options]

options :
    -site <base URL>    REQUIRED        # base URL of the site
    -output <filename>  OPTIONAL        # output file
    -verbose            OPTIONAL        # Print verbose messages
    -depth <depth>      OPTIONAL        # Maximum depth of sitemap
    -proxy <proxy URL>  OPTIONAL        # HTTP proxy
    -help               OPTIONAL        # Print this message
    -doc                OPTIONAL        # Print POD documentation
USAGE
}

#------------------------------------------------------------------------------
#
# pod_self - dump my POD as text
#
#------------------------------------------------------------------------------

sub pod_self
{
    $Pod::Text::termcap = 1;
    print pod2text( $0 );
    exit( 0 );
}

#------------------------------------------------------------------------------
#
# verbose - echo to STDERR if in verbose mode
#
#------------------------------------------------------------------------------

sub verbose
{
    return unless $opt_verbose;
    print STDERR "$Name: ", shift, "\n";
}

#==============================================================================
#
# End of subroutines
#
#==============================================================================
