#!/usr/bin/perl -n

# $Id: apache_referers,v 2.1 2003/11/18 14:59:54 struan Exp $

use strict;
use warnings;
use lib qw(/usr/local/lib/perl5/5.005);
use Template;
use Storable;
use POSIX qw( mktime );
use Getopt::Long;

use vars qw(@referers $template $store $last_date $num_hours $host %engines
            %new_engines $uri_length $ref_length $warn_new_engine 
            $host_length @logspammers $logspam);

BEGIN {
    # These are the configuration values to alter
    $store = $ENV{HOME} . "/.referers";     # where the store lives
    $num_hours = 24;        # number of hours of referers to use
    $host = 'exo.org.uk';   # domain of your website (don't include www)
    $uri_length = 20;       # truncate display uris after n characters
    $ref_length = 50;       # truncate display refs after n characters
    $host_length = 25;      # truncate display host after n characters
    $warn_new_engine = 1;   # warn if we can't extract terms from a query?
    # list of host that are either referer spam or sources of referer spam
    @logspammers = qw(141.85.3.130 217.73.164.106 girlie.giftdraw.com);
    
    # End config options

    # we only need the ones that don't use /q(?:uery)/ as the search
    # parameter
    %engines = (
        'search.yahoo.com'      =>  qr/(?:p|va)=([^&]*)/,
        'www.amazon.com'        =>  qr/websearch\.field-keywords=([^&]*)/,
        'www.mywebsearch.com'   =>  qr/searchfor=([^&]*)/,
        'kd.mysearch.myway.com' =>  qr/searchfor=([^&]*)/,
        'mysearch.myway.com'    =>  qr/searchfor=([^&]*)/,
        'www.mirago.co.uk'      =>  qr/qry=([^&]*)/,
        'search.iwon.com'       =>  qr/searchfor=([^&]*)/,
        'www.overture.com'      =>  qr/Keywords=([^&]*)/,
        'msxml.infospace.com'   =>  qr/qkw=([^&]*)/,
        'www.bellsouth.net'     =>  qw/string=([^&]*)/,
        'www.sirsearch.com'     =>  qr/Keywords=([^&]*)/,
    );

    $logspam = '(?:' . join('|', @logspammers) . ')';
    
    GetOptions( 'tmpl=s' => \$template );

    if ( $store and -f $store) {
        my $referers = retrieve($store);
        @referers = @$referers;
        
        # check we have data and grab newest referer from last run
        if ( $referers[0]->[0] ) {
            $last_date = $referers[$#referers]->[2];
        } else { # prob bad data so whack it. 
            undef @referers 
        }
    }

    $last_date ||= 0;
}

# mmmm, regular expressions
m#^(\S+)\s+\S+\s+\S+\s\[([^\]]*?)\]\s+"\S+\s+(\S+).*?"\s+\d+\s+\d+\s+"([^-].*?)"#;

# grab this so it's not wiped out by regex below
my $ip = $1;
my $date = $2;
my $uri = $3;
my $ref = $4;

push @referers, [$ref, $uri, $date, $ip] 
    if ($ref and $last_date < ( $date ? $date = date_convert($date) : 0 ) );

END {
    # stop if no information.
    $referers[0]->[0] or exit;

    $host =~ s/\./\\./g; # escape . in host name
    my $n_hours_ago = time - ($num_hours * 60 * 60);
    # only want external referers and those less than a n hours old
    @referers = grep { $_->[0] !~ m#^http://(?:www\.)?$host# 
                        && $_->[3] !~ /$logspam/ # referer log spam :(
                        && $_->[0] !~ /$logspam/ # referer log spam :(
                        && $_->[2] > $n_hours_ago } @referers;
   
    # to template or not to template
    if ($template) {
        # generate info suitable for display
        @referers = map { my $enc = $_->[0];
                          my $disp_uri = $_->[1];

                          # get search terms or make uri xhtml safe (ish)
                          my $search = get_search_terms($enc);
                          $search ? $enc = $search 
                                  : $enc =~ s/&([#a-z0-9]+[^;])/&amp;$1/i; 

                          # truncate URIs and hostnames
                          $enc = substr($enc, 0, $ref_length) . '...' 
                            if length($enc) > $ref_length;
                          $disp_uri = substr($disp_uri, 0, $uri_length) . '...' 
                            if length($disp_uri) > $uri_length;
                          $_->[3] = '...' . substr($_->[3], -$host_length)  
                            if length($_->[3]) > $host_length;

                          # create our data structure
                          { ref => { disp => $enc, uri => $_->[0] }, 
                            uri => { disp => $disp_uri, uri => $_->[1] }, 
                            date => { disp => display_date($_->[2]), date => $_->[2] }, 
                            host => $_->[3] 
                          }; 
                    } @referers;

        # we want to be able to pass absolute paths to TT
        my $tt = Template->new( ABSOLUTE => 1 ) or die $Template::ERROR;
        $tt->process($template, { refs => \@referers }) or die $tt->error();

        # get rid of display junk before storing
        @referers = map { [ $_->{ref}->{uri}, $_->{uri}->{uri}, 
                            $_->{date}->{date}, $_->{host} ] } @referers;
    } else {
        foreach (@referers) {
            # just spit out a list of referers
            print $_->[0] . "\n"; 
        }
    }

    # and store
    if ( $store ) {
        store \@referers, $store or warn "can't store to $store";
    }
}

# oooh, the sophistication
sub date_convert {
    my $date = shift;

    my %date_to_num  = (
        Jan =>  '0',
        Feb =>  '1',
        Mar =>  '2',
        Apr =>  '3',
        May =>  '4',
        Jun =>  '5',
        Jul =>  '6',
        Aug =>  '7',
        Sep =>  '8',
        Oct =>  '9',
        Nov =>  '10',
        Dec =>  '11',
    );

    my @date = ($date =~ m#(\d+)/([a-zA-Z]+)/(\d+):(\d+):(\d+):(\d+)#);
    my $time = mktime($date[5], $date[4], $date[3], $date[0], $date_to_num{$date[1]}, $date[2] - 1900);

    return $time;
}

sub display_date {
    my $date = shift;
    my $disp_date = '';

    my @today = localtime();
    my @ref_day = localtime($date);

    $disp_date = sprintf("%02d:%02d:%02d", 
                            $ref_day[2], $ref_day[1] ,$ref_day[0]);

    # date is not today so need to add date to time
    if ( ($today[3] != $ref_day[3]) || ($today[4] != $ref_day[4]) ) {
        $disp_date = sprintf("%02d/%02d", $ref_day[3], ( $ref_day[4] + 1 ) ) 
                     . " $disp_date";
    }

    return $disp_date;
}

# try and match with a known search engine. either explicit host or
# one of the larger search engines if no match.
sub which_engine {
    my $host = shift;
    
    if ( $engines{$host} ) {
        return $engines{$host};
    } elsif ( $host =~ /yahoo/ ) {
        return $engines{'search.yahoo.com'};
    }

    # most engines use this so we take a guess that it's this
    return qr/q(?:uery)?=([^&]*)/;
}

sub get_search_terms {
    my $uri = shift;

    # regex taken from URI.pm docs
    my ($scheme, $engine, $path, $query, $fragment) =
        $uri =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

    return 0 unless $query;
                
    # strip guff from query. regex from URI::Escape docs
    $query =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    # bloody lycos seems to encode this stuff twice...
    $query =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    # and get rid of + as they're usually there as spaces...
    $query =~ s/\+/ /g;

    my $engine_re = which_engine($engine);

    my ($terms) = ( $query =~ m/$engine_re/ );
    $engine =~ s/^www\.(.*)/$1/;
    return "$engine search for $terms" if $terms;

    # looks like we don't know how to extract the search terms
    unless ( $warn_new_engine and $new_engines{$engine} ) {
        warn "unknown engine: $engine";
        $new_engines{$engine} = 1;
    }  
    
    return 0;
}

__END__

=head1 NAME 

apache_referers

=head1 DESCRIPTION

Quick and dirty script to spit out the last n hours of referers and their targets from your apache logs.

=head1 USAGE

    apache_referers.pl [-t /path/to/tt2/template] /path/to/apache.log

The results of processing the template get spat out to standard out.

If you don't supply a template (which should be a Template Toolkit template) 
then it will just spit out a list of referering URIs.

If you supply a template then you get a chunk more information. See the 
section later on for details. You should just be able to edit the default
template to get what you want.

=head1 CONFIGURATION

Edit the values of the variables in the first BEGIN block. It'll run fine with the defaults although you'll see internal referers for your site unless you edit $host.

In case the comments are a bit too oblique this is what the variables do:

=over 4

=item $store

Path to the file to keep information between runs. You'll want this as otherwise when your logs are rotated any referer not in the new log file will dissapear. If you set this to 0 then nothing will be stored.

=item $num_hours

How many hours worth of referers you want to display

=item $host

The hostname for your website excluding the starting www (if it has one).

=item $uri_length

Length at which to truncate the displayed URI for the target page. '...' will be appended to the end of truncated URIs.

=item $ref_length

Length at which to truncate the URI of the referering page. This also applies to search descriptions.

=item $host_length

Length at which to truncate the referering host. This is truncated from the left hand side.

=item $warn_new_engine

If false then you won't get warnings about URIs that apache_referers can't extract search terms from.

=item @logspammers

Some people seem to think that spamming your referer logs is a sensible way
to generate traffic to their sites. If you add IPs or URIs of hosts that 
are either the refered to site or making the referal then they'll not turn 
up in the output. People pretty much suck.

=back

=head1 Template

The template is handed an array called refs. Each entry in the array has the
following values in it:

=over 4

=item entry.ref.disp

URI of referer suitable for display. i.e truncated and all the escapes decoded.
If this was a search engine query then you get the search terms instead.

=item entry.ref.uri

Raw URI of the referer.

=item entry.uri.disp

Truncated URI of target page

=item entry.uri.uri

Raw URI of target page

=item entry.date.disp

Date of refereral either in HH::MM::SS form if it was today or dd/mm HH::MM::SS
if it was earlier

=item entry.date.date

Date of referal as seconds since epoch

=item entry.host

Truncated host name or IP.

=over

So, to list all the refering URIs in cronological order:

    [% FOREACH entry = refs %]
        [% entry.ref.uri %]
    [% END %]

=head1 CAVEATS

Assumes log uses combined log format.

=head1 REQUIREMENTS

Template, GetOpt::Long, Storable.

=head1 AUTHOR

Struan Donald 
<code@exo.org.uk>
http://www.exo.org.uk/code/

=head1 COPYRIGHT

Copyright (C) 2003 Struan Donald. All rights reserved.

This program is free software; you can redistribute                             
it and/or modify it under the same terms as Perl itself.

=cut
