#!/usr/bin/perl
#
# Copyright (c) 2003,2004 by USUDA Hisashi <usu@d1.dion.ne.jp>
#
# Usage: searchreferer.pl [-i][-t limit][-c countlimit][-p path] < access_log
#

use CGI qw/:standard -no_debug/;
use Text::Iconv;
use Jcode;

my $Version = "searchreferer.pl version 1.2";
my $Copyright = "Copyright (c) 2003,2004 by usu\@d1.dion.ne.jp";

# Write the command-line synopsis to STDERR and terminate the script with
# exit status 1.  Never returns.
sub usage {
	print STDERR
		"Usage: searchreferer.pl [-i][-t n][-c n][-p path] < access_log\n",
		"   -i: ignore case distinctions.\n",
		"   -t: output only top n.\n",
		"   -c: output only value n.\n",
		"   -p: output only specific path.\n";
	exit 1;
}

# count_keywords($s)
#
# Split a search phrase into keywords and tally them.
#
#   $s - search phrase; an undefined value is ignored.
#        NOTE(review): assumed to be an EUC-JP byte string, because the
#        callers convert to EUC-JP first -- confirm for the goo branch.
#
# Side effects: increments $KEYWORD_CNT{keyword} for each keyword and
# $KEYWORD_CNT_ALL once per keyword.  Keywords are lower-cased first when
# $IGNCASE is set.  Returns nothing useful.
#
# Keywords are separated by whitespace or by the EUC-JP ideographic space
# (bytes 0xA1 0xA1).  The previous split pattern, /\s+|()/, contained an
# alternative that matches the empty string, which made split() break the
# phrase into single characters; presumably a literal ideographic space was
# lost from that pattern when the file was transcoded.
sub count_keywords {
  my($s) = @_;
  our(%KEYWORD_CNT, $KEYWORD_CNT_ALL, $IGNCASE);	# package-wide tallies/flags
  my @kwds;
  my $word = '';

  return unless defined $s;

  # Walk the string one token at a time, anchored with \G, so that the byte
  # pair 0xA1 0xA1 is only taken as a separator when it is a whole character
  # and not the tail of one character followed by the head of the next.
  while($s =~ /\G(?:(\s+|\xA1\xA1)|(\x8F[\xA1-\xFE]{2}|\x8E[\xA1-\xDF]|[\xA1-\xFE]{2}|.))/gs) {
	if(defined $1) {
		push(@kwds, $word) if length $word;
		$word = '';
	} else {
		$word .= $2;
	}
  }
  push(@kwds, $word) if length $word;

  foreach my $kwd (@kwds) {
	$kwd = lc($kwd) if $IGNCASE;
	$KEYWORD_CNT{$kwd}++;
	$KEYWORD_CNT_ALL++;
  }
}

# parse_qstring($url, $line)
#
# Examine a referer URL.  When it is a result page of a known search engine,
# extract the search phrase from its query string, convert it to EUC-JP and
# pass it to count_keywords().
#
#   $url  - referer URL taken from the access log
#   $line - zero-based index of the log line; only used (as $line+1) in the
#           debug output printed when $DEBUG is set
#
# Returns 1 for Google, 2 for Yahoo, 3 for MSN, 4 for Goo, and 0 when the URL
# is not a recognised search referer.  A recognised engine is reported even
# when no search phrase could be extracted.
sub parse_qstring {
  my($url, $line) = @_;
  my($ret, $engine, $qs, $e);

  return 0 unless defined $url;

  # Identify the engine before building any helper objects: most referers
  # are not search engines at all.
  if($url =~ m{^https?://[^.]+\.yahoo\.[^/]+/[^?]*\?(\S+)$}) {
	($ret, $engine, $qs) = (2, 'yahoo', $1);
  } elsif($url =~ m{^https?://[^.]+\.google\.[^/]+/[^?]*\?(\S+)$}) {
	($ret, $engine, $qs) = (1, 'google', $1);
  } elsif($url =~ m{^https?://search\.msn\.[^/]+/[^?]*\?(\S+)$}) {
	($ret, $engine, $qs) = (3, 'msn', $1);
  } elsif($url =~ m{^https?://search\.goo\.ne\.jp/[^?]*\?(\S+)$}) {
	($ret, $engine, $qs) = (4, 'goo', $1);
  } else {
	return 0;
  }

  # The query string is fed through parse_params() rather than given to the
  # constructor as an initializer.
  my $cgi = CGI->new();
  $cgi->parse_params($qs);

  # param() is always forced into scalar context below: in list context a
  # missing parameter yields an empty list, which silently shifts or drops
  # the arguments of the surrounding call.
  if($engine eq 'yahoo') {
	$e = scalar $cgi->param('p');
	# Jcode guesses the source encoding on its own.
	Jcode::convert(\$e, "euc") if defined $e;
  } elsif($engine eq 'google') {
	my $s = scalar $cgi->param('q');
	if(defined $s) {
		# "cache:<url> words..." -> keep only the words
		$s = $1 if $s =~ /^cache:\S+\s+(.*)$/;
		# decode literal \xHH escapes
		$s =~ s{\\x([0-9a-fA-F]{2})}{chr(hex($1))}gsex;
		my $ie = scalar $cgi->param('ie');
		my $lr = scalar $cgi->param('lr');
		my $from = "UTF-8";
		if(defined $ie && $ie =~ /^S(?:hift[-_]?)?JIS$/i) {
			$from = "Shift-JIS";
		} elsif(!$ie && defined $lr && $lr eq 'lang_ja') {
			# no explicit input encoding on a Japanese-only search
			$from = "Shift-JIS";
		}
		$e = Text::Iconv->new($from, "EUC-JP")->convert($s);
	}
  } elsif($engine eq 'msn') {
	my $s = scalar $cgi->param('q');
	if(defined $s) {
		my $cp = scalar $cgi->param('CP');
		my $from = (defined $cp && $cp eq "932") ? "Shift-JIS" : "UTF-8";
		$e = Text::Iconv->new($from, "EUC-JP")->convert($s);
	}
  } else {
	# goo: used as-is -- presumably already EUC-JP; TODO confirm
	$e = scalar $cgi->param('MT');
  }

  count_keywords($e) if defined $e;
  printf("  %-8s\"%s\" (%d)\n", "$engine:", defined $e ? $e : '', $line+1)
	if $DEBUG;
  $ret;
}

##

# $DEBUG = 1;
my ($i, $t) = (0, 0);

# Command-line options.  Parsing stops at the first argument that does not
# start with '-'; whatever remains in @ARGV is read as log input by <>.
#   -t n     sets $TLIMIT         -c n     sets $CLIMIT
#   -p path  sets $SPATH          -i       sets $IGNCASE
#   -d       sets $DEBUG          -v       prints version and exits
# A missing value, a non-numeric value for -t/-c, or an unknown option ends
# in usage(), which exits.
while (@ARGV && $ARGV[0] =~ /^-/) {
	my $opt = shift @ARGV;
	if($opt eq '-t' || $opt eq '-c') {
		my $n = shift @ARGV;
		usage() unless defined $n && $n =~ /^\d+$/;
		if($opt eq '-t') { $TLIMIT = $n; } else { $CLIMIT = $n; }
		next;
	}
	if($opt eq '-p') {
		$SPATH = shift @ARGV;
		usage() unless defined $SPATH;
		next;
	}
	if($opt eq '-i') { $IGNCASE = 1; next; }
	if($opt eq '-d') { $DEBUG = 1; next; }
	if($opt eq '-v') { print "$Version\n$Copyright\n"; exit 0; }
	usage();
}

# Report banner: shows the path filter when one was given with -p.
print "[Google/Yahoo/MSN/Goo θз";
if($SPATH) {
	print " : $SPATH";
} else {
	print " : ٤";
}
print "]\n";
print "\n" if $DEBUG;

# Scan the log (files left in @ARGV, otherwise STDIN).  $all ends up holding
# the number of lines read; @ACCESS_CNT[0..3] count the referers recognised
# by parse_qstring() (its return value minus one is the slot).
$all = 0;
while (defined(my $entry = <>)) {
	chomp $entry;	# chomp, not chop: the last line may lack a newline
	# host ident user [date] "request" status bytes "referer"
	if($entry =~ /^\S+ \S+ \S+ \[[^\]]*\] \"([^"]*)\" \S+ \S+ \"([^"]*)\"/) {
		my ($request, $referer) = ($1, $2);
		# -p names a literal path, so quote it instead of letting
		# '.', '+', '?' etc. act as regex metacharacters.
		if(!$SPATH || ($request =~ /\S+\s+\Q$SPATH\E\s/)) {
			my $r = parse_qstring($referer, $all);
			$ACCESS_CNT[$r-1]++ if $r > 0;
		}
	} else {
		print "Warning: bad format: $entry\n" if $DEBUG;
	}
	$all++;
}

# Per-engine referer totals.  Skipped when a path filter is active.
# The "other" row is every log line that was not counted for an engine.
unless($SPATH) {
	my @hits  = map { $ACCESS_CNT[$_] } 0 .. 3;
	my $other = $all;
	$other -= $_ foreach @hits;

	print "\n[Google/Yahoo Υ]\n\n";
	printf("  Google    %8d\n", $hits[0]);
	printf("  Yahoo     %8d\n", $hits[1]);
	printf("  MSN       %8d\n", $hits[2]);
	printf("  Goo       %8d\n", $hits[3]);
	printf("  <¾>  %8d\n", $other);
	printf("  ------------------\n  Total     %8d\n", $all);
}

# Keyword ranking, most frequent first.  Output stops after $TLIMIT rows
# (-t) or at the first keyword seen fewer than $CLIMIT times (-c); whatever
# was not listed is summed into a single "other" row.
print "\n[ɽи";
printf("(%d)", $TLIMIT) if $TLIMIT;
printf("(%dޤ)", $CLIMIT) if $CLIMIT;
print "]\n\n";

# $i counts the rows printed, $t sums the counts printed so far.
($i, $t) = (0, 0);
# Ties are broken by keyword so the order is the same on every run instead
# of following hash order.
foreach my $k (sort { $KEYWORD_CNT{$b} <=> $KEYWORD_CNT{$a} || $a cmp $b }
		keys %KEYWORD_CNT) {
	last if $CLIMIT && $KEYWORD_CNT{$k} < $CLIMIT;
	printf("  %-40s %8d\n", $k, $KEYWORD_CNT{$k});
	$t += $KEYWORD_CNT{$k};
	$i++;
	last if $TLIMIT && $i >= $TLIMIT;
}
# Only emit the "other" row when something was actually left out; this
# avoids a row showing 0 when exactly $TLIMIT keywords exist.
my $rest = ($KEYWORD_CNT_ALL || 0) - $t;
printf("  %-40s %8d\n", "<¾>", $rest) if $rest > 0;

print "  " . '-' x 49 . "\n";
printf("  %-40s %8d\n", "Total", $KEYWORD_CNT_ALL);
print "\n-- \n Generated by $Version\n";
