[OT] benchmarking "typical" programs
Simon Wistow
simon at thegestalt.org
Fri Sep 21 18:02:16 BST 2012
On Fri, Sep 21, 2012 at 10:22:44AM +0100, Nicholas Clark said:
> I'm not a search engineer (recovering or otherwise), so this represents
> rather more work that I wanted to do.
I'll try and knock something together but really it's a fairly simple
algorithm. Warning: untested:
# Build an inverted index: token => { document => occurrence count }.
my %index;
for my $doc (@corpus) {
    my $contents = slurp($doc);
    for my $token ( tokenize($contents) ) {
        $index{$token}{$doc}++;    # arrow between subscripts is optional
    }
}
my $D = scalar(@corpus);    # total number of documents in the corpus

# Score every document against every query using tf-idf, then print a
# ranked result list per query.
foreach my $query (@queries) {
    my %results;
    my @tokens = tokenize($query);
    foreach my $token (@tokens) {
        # Bug fix: %index is a plain hash, not a reference (was $index->{...}).
        my $docs = $index{$token} or next;    # token never seen in the corpus
        # Bug fix: "size" is not a Perl builtin; use scalar keys for the
        # document frequency. Guard $d so log($D / $d) can't divide by zero.
        my $d = scalar keys %$docs;
        next unless $d;
        # Bug fix: dereference the hashref (was "keys %docs", an empty hash).
        foreach my $doc (keys %$docs) {
            # http://en.wikipedia.org/wiki/Tf*idf
            my $tf  = $docs->{$doc};          # term frequency in this doc
            my $idf = log($D / $d);           # inverse document frequency
            $results{$doc} += $tf * $idf;
        }
    }
    my $count = 1;
    # Bug fix: sort the keys, not the flattened key/value list of %results.
    foreach my $doc (sort { $results{$b} <=> $results{$a} } keys %results) {
        # Bug fix: %results is a hash, not a reference (was $results->{$doc}).
        print "$count) $doc (score ".$results{$doc}.")\n";
        $count++;
    }
}
# Split text on whitespace, drop stop words, and stem what remains.
# Returns the list of stemmed tokens.
sub tokenize {
    my ($text) = @_;
    return map  { stem($_) }
           grep { !$STOP_WORDS{$_} }
           split ' ', $text;
}
# world's most useless stemmer -- strips a single trailing common
# suffix (ing/s/ed/ly); here for munging performance checking only
sub stem {
    my $word = shift;
    # Bug fix: the original "s!(ing|s|ed|ly$);" was an unterminated
    # substitution (missing closing delimiter and replacement) and
    # would not compile. Delete one matching suffix at end-of-word.
    $word =~ s!(ing|s|ed|ly)$!!;
    return $word;
}
More information about the london.pm
mailing list