[OT] benchmarking "typical" programs
Simon Wistow
simon at thegestalt.org
Fri Sep 21 18:02:16 BST 2012
On Fri, Sep 21, 2012 at 10:22:44AM +0100, Nicholas Clark said:
> I'm not a search engineer (recovering or otherwise), so this represents
> rather more work that I wanted to do.
I'll try and knock something together but really it's a fairly simple
algorithm. Warning: untested:
# Build an inverted index: token => { document => occurrence count }.
my %index;
for my $doc (@corpus) {
    my $contents = slurp($doc);
    for my $token ( tokenize($contents) ) {
        $index{$token}{$doc}++;    # arrow between subscripts is optional
    }
}
my $D = scalar(@corpus);    # total number of documents in the corpus

# Score every document against every query using tf-idf, then print a
# ranked result list per query.
foreach my $query (@queries) {
    my %results;
    my @tokens = tokenize($query);
    foreach my $token (@tokens) {
        # Bug fix: %index is a plain hash, not a reference (was $index->{...}).
        my $docs = $index{$token} or next;    # token never seen in the corpus
        # Bug fix: "size" is not a Perl builtin; use scalar keys for the
        # document frequency. Guard $d so log($D / $d) can't divide by zero.
        my $d = scalar keys %$docs;
        next unless $d;
        # Bug fix: dereference the hashref (was "keys %docs", an empty hash).
        foreach my $doc (keys %$docs) {
            # http://en.wikipedia.org/wiki/Tf*idf
            my $tf  = $docs->{$doc};          # term frequency in this doc
            my $idf = log($D / $d);           # inverse document frequency
            $results{$doc} += $tf * $idf;
        }
    }
    my $count = 1;
    # Bug fix: sort the keys, not the flattened key/value list of %results.
    foreach my $doc (sort { $results{$b} <=> $results{$a} } keys %results) {
        # Bug fix: %results is a hash, not a reference (was $results->{$doc}).
        print "$count) $doc (score ".$results{$doc}.")\n";
        $count++;
    }
}
# Split text on whitespace, drop stop words, and stem what remains.
# Returns the list of stemmed tokens.
sub tokenize {
    my ($text) = @_;
    return map  { stem($_) }
           grep { !$STOP_WORDS{$_} }
           split ' ', $text;
}
# world's most useless stemmer -- strips a single trailing common
# suffix (ing/s/ed/ly); here for munging performance checking only
sub stem {
    my $word = shift;
    # Bug fix: the original "s!(ing|s|ed|ly$);" was an unterminated
    # substitution (missing closing delimiter and replacement) and
    # would not compile. Delete one matching suffix at end-of-word.
    $word =~ s!(ing|s|ed|ly)$!!;
    return $word;
}
More information about the london.pm
mailing list