[KinoSearch] Problem with stopalizer
Marvin Humphrey
marvin at rectangular.com
Wed Jun 7 18:16:21 PDT 2006
Hi Mike,
I've fixed this in Subversion. Please try repository revision 959.
On Jun 6, 2006, at 10:49 AM, Mike Wexler wrote:
> (+body:cream), (+body:), (+body:wheat)
>
> the problem is the middle expression. This doesn't seem to match
> anything.
> The stopalizer is changing "of" into "" instead of removing it
> entirely as a term. Any suggestions on how to fix this?
The fix turned out to be pretty simple (scroll all the way down), but
it needed thorough testing to verify that it didn't have any nasty
side effects.
Marvin Humphrey
Rectangular Research
http://www.rectangular.com/
$ svn diff
Index: t/601-queryparser.t
===================================================================
--- t/601-queryparser.t (revision 946)
+++ t/601-queryparser.t (working copy)
@@ -4,18 +4,50 @@
use lib 't';
use KinoSearch qw( kdump );
-use Test::More 'no_plan';
+use Test::More tests => 205;
use File::Spec::Functions qw( catfile );
BEGIN { use_ok('KinoSearch::QueryParser::QueryParser') }
-use KinoSearchTestInvIndex qw( create_invindex );
+use KinoSearch::InvIndexer;
use KinoSearch::Searcher;
+use KinoSearch::Store::RAMInvIndex;
use KinoSearch::Analysis::Tokenizer;
+use KinoSearch::Analysis::Stopalizer;
+use KinoSearch::Analysis::PolyAnalyzer;
my $whitespace_tokenizer
= KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/ );
+my $stopalizer
+ = KinoSearch::Analysis::Stopalizer->new( stoplist => { x => 1 } );
+my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
+ analyzers => [ $whitespace_tokenizer, $stopalizer, ], );
+my @docs = ( 'x', 'y', 'z', 'x a', 'x a b', 'x a b c', 'x foo a b c
d', );
+my $invindex = KinoSearch::Store::RAMInvIndex->new( create => 1 );
+my $stop_invindex = KinoSearch::Store::RAMInvIndex->new( create => 1 );
+my $invindexer = KinoSearch::InvIndexer->new(
+ invindex => $invindex,
+ analyzer => $whitespace_tokenizer,
+);
+my $stop_invindexer = KinoSearch::InvIndexer->new(
+ invindex => $stop_invindex,
+ analyzer => $polyanalyzer,
+);
+$invindexer->spec_field( name => 'content' );
+$stop_invindexer->spec_field( name => 'content' );
+
+for my $content_string (@docs) {
+ my $doc = $invindexer->new_doc;
+ $doc->set_value( content => $content_string );
+ $invindexer->add_doc($doc);
+ $doc = $stop_invindexer->new_doc;
+ $doc->set_value( content => $content_string );
+ $stop_invindexer->add_doc($doc);
+}
+$invindexer->finish;
+$stop_invindexer->finish;
+
my $OR_parser = KinoSearch::QueryParser::QueryParser->new(
analyzer => $whitespace_tokenizer,
default_field => 'content',
@@ -26,98 +58,120 @@
default_boolop => 'AND',
);
-my @docs = ( 'x', 'y', 'z', 'x a', 'x a b', 'x a b c', 'x a b c
d', );
-my $invindex = create_invindex(@docs);
+my $OR_stop_parser = KinoSearch::QueryParser::QueryParser->new(
+ analyzer => $polyanalyzer,
+ default_field => 'content',
+);
+my $AND_stop_parser = KinoSearch::QueryParser::QueryParser->new(
+ analyzer => $polyanalyzer,
+ default_field => 'content',
+ default_boolop => 'AND',
+);
-my $searcher = KinoSearch::Searcher->new( invindex => $invindex );
+my $searcher = KinoSearch::Searcher->new( invindex => $invindex );
+my $stop_searcher = KinoSearch::Searcher->new( invindex =>
$stop_invindex );
my @logical_tests = (
- 'b' => [ 3, 3, ],
- '(a)' => [ 4, 4, ],
- '"a"' => [ 4, 4, ],
- '"(a)"' => [ 0, 0, ],
- '("a")' => [ 4, 4, ],
+ 'b' => [ 3, 3, 3, 3, ],
+ '(a)' => [ 4, 4, 4, 4, ],
+ '"a"' => [ 4, 4, 4, 4, ],
+ '"(a)"' => [ 0, 0, 0, 0, ],
+ '("a")' => [ 4, 4, 4, 4, ],
- 'a b' => [ 4, 3, ],
- 'a (b)' => [ 4, 3, ],
- 'a "b"' => [ 4, 3, ],
- 'a ("b")' => [ 4, 3, ],
- 'a "(b)"' => [ 4, 0, ],
+ 'a b' => [ 4, 3, 4, 3, ],
+ 'a (b)' => [ 4, 3, 4, 3, ],
+ 'a "b"' => [ 4, 3, 4, 3, ],
+ 'a ("b")' => [ 4, 3, 4, 3, ],
+ 'a "(b)"' => [ 4, 0, 4, 0, ],
- '(a b)' => [ 4, 3, ],
- '"a b"' => [ 3, 3, ],
- '("a b")' => [ 3, 3, ],
- '"(a b)"' => [ 0, 0, ],
+ '(a b)' => [ 4, 3, 4, 3, ],
+ '"a b"' => [ 3, 3, 3, 3, ],
+ '("a b")' => [ 3, 3, 3, 3, ],
+ '"(a b)"' => [ 0, 0, 0, 0, ],
- 'a b c' => [ 4, 2, ],
- 'a (b c)' => [ 4, 2, ],
- 'a "b c"' => [ 4, 2, ],
- 'a ("b c")' => [ 4, 2, ],
- 'a "(b c)"' => [ 4, 0, ],
- '"a b c"' => [ 2, 2, ],
+ 'a b c' => [ 4, 2, 4, 2, ],
+ 'a (b c)' => [ 4, 2, 4, 2, ],
+ 'a "b c"' => [ 4, 2, 4, 2, ],
+ 'a ("b c")' => [ 4, 2, 4, 2, ],
+ 'a "(b c)"' => [ 4, 0, 4, 0, ],
+ '"a b c"' => [ 2, 2, 2, 2, ],
- '-x' => [ 0, 0, ],
- 'x -c' => [ 3, 3, ],
- 'x "-c"' => [ 5, 0, ],
- 'x +c' => [ 2, 2, ],
- 'x "+c"' => [ 5, 0, ],
+ '-x' => [ 0, 0, 0, 0, ],
+ 'x -c' => [ 3, 3, 0, 0, ],
+ 'x "-c"' => [ 5, 0, 0, 0, ],
+ 'x +c' => [ 2, 2, 2, 2, ],
+ 'x "+c"' => [ 5, 0, 0, 0, ],
- '+x +c' => [ 2, 2, ],
- '+x -c' => [ 3, 3, ],
- '-x +c' => [ 0, 0, ],
- '-x -c' => [ 0, 0, ],
+ '+x +c' => [ 2, 2, 2, 2, ],
+ '+x -c' => [ 3, 3, 0, 0, ],
+ '-x +c' => [ 0, 0, 2, 2, ],
+ '-x -c' => [ 0, 0, 0, 0, ],
- 'x y' => [ 6, 0, ],
- 'x a d' => [ 5, 1, ],
- 'x "a d"' => [ 5, 0, ],
+ 'x y' => [ 6, 0, 1, 1, ],
+ 'x a d' => [ 5, 1, 4, 1, ],
+ 'x "a d"' => [ 5, 0, 0, 0, ],
+ '"x a"' => [ 3, 3, 3, 3, ],
- 'x AND y' => [ 0, 0, ],
- 'x OR y' => [ 6, 6, ],
- 'x AND NOT y' => [ 5, 5, ],
+ 'x AND y' => [ 0, 0, 1, 1, ],
+ 'x OR y' => [ 6, 6, 1, 1, ],
+ 'x AND NOT y' => [ 5, 5, 0, 0, ],
- 'x (b OR c)' => [ 5, 3, ],
- 'x AND (b OR c)' => [ 3, 3, ],
- 'x OR (b OR c)' => [ 5, 5, ],
- 'x (y OR c)' => [ 6, 2, ],
- 'x AND (y OR c)' => [ 2, 2, ],
+ 'x (b OR c)' => [ 5, 3, 3, 3, ],
+ 'x AND (b OR c)' => [ 3, 3, 3, 3, ],
+ 'x OR (b OR c)' => [ 5, 5, 3, 3, ],
+ 'x (y OR c)' => [ 6, 2, 3, 3, ],
+ 'x AND (y OR c)' => [ 2, 2, 3, 3, ],
- 'a AND NOT (b OR "c d")' => [ 1, 1, ],
- 'a AND NOT "a b"' => [ 1, 1, ],
- 'a AND NOT ("a b" OR "c d")' => [ 1, 1, ],
+ 'a AND NOT (b OR "c d")' => [ 1, 1, 1, 1, ],
+ 'a AND NOT "a b"' => [ 1, 1, 1, 1, ],
+ 'a AND NOT ("a b" OR "c d")' => [ 1, 1, 1, 1, ],
- '+"b c" -d' => [ 1, 1, ],
- '"a b" +d' => [ 1, 1, ],
+ '+"b c" -d' => [ 1, 1, 1, 1, ],
+ '"a b" +d' => [ 1, 1, 1, 1, ],
- 'x AND NOT (b OR (c AND d))' => [ 2, 2, ],
+ 'x AND NOT (b OR (c AND d))' => [ 2, 2, 0, 0, ],
- '-(+foo)' => [ 0, 0 ],
+ '-(+notthere)' => [ 0, 0, 0, 0 ],
- 'content:b' => [ 3, 3, ],
- 'bogusfield:a' => [ 0, 0, ],
- 'bogusfield:a content:b' => [ 3, 0, ],
+ 'content:b' => [ 3, 3, 3, 3, ],
+ 'bogusfield:a' => [ 0, 0, 0, 0, ],
+ 'bogusfield:a content:b' => [ 3, 0, 3, 0, ],
);
-do {
- my $i = 0;
- while ( $i < @logical_tests ) {
- my $qstring = $logical_tests[ $i++ ];
- my $OR_expected = $logical_tests[$i][0];
- my $query = $OR_parser->parse($qstring);
- my $hits = $searcher->search( query => $query );
- $hits->seek( 0, 50 );
- is( $hits->total_hits, $OR_expected, "OR: $qstring" );
+my $i = 0;
+while ( $i < @logical_tests ) {
+ my $qstring = $logical_tests[$i];
+ $i++;
- $query = $AND_parser->parse($qstring);
- my $AND_expected = $logical_tests[ $i++ ][1];
- $hits = $searcher->search( query => $query );
- $hits->seek( 0, 50 );
- is( $hits->total_hits, $AND_expected, "AND: $qstring" );
- $hits->{searcher} = undef, $hits->{reader} = undef,
- $hits->{weight} = undef,
- # kdump($query);
- # exit;
- }
- }
+ my $query = $OR_parser->parse($qstring);
+ my $hits = $searcher->search( query => $query );
+ $hits->seek( 0, 50 );
+ is( $hits->total_hits, $logical_tests[$i][0], "OR: $qstring" );
+ $query = $AND_parser->parse($qstring);
+ $hits = $searcher->search( query => $query );
+ $hits->seek( 0, 50 );
+ is( $hits->total_hits, $logical_tests[$i][1], "AND: $qstring" );
+
+ $query = $OR_stop_parser->parse($qstring);
+ $hits = $stop_searcher->search( query => $query );
+ $hits->seek( 0, 50 );
+ is( $hits->total_hits, $logical_tests[$i][2], "stoplist-OR:
$qstring" );
+
+ $query = $AND_stop_parser->parse($qstring);
+ $hits = $stop_searcher->search( query => $query );
+ $hits->seek( 0, 50 );
+ is( $hits->total_hits, $logical_tests[$i][3],
+ "stoplist-AND: $qstring" );
+
+ $i++;
+
+ $hits->{searcher} = undef;
+ $hits->{reader} = undef;
+ $hits->{weight} = undef;
+ #kdump($query);
+ #exit;
+}
+
Index: lib/KinoSearch/QueryParser/QueryParser.pm
===================================================================
--- lib/KinoSearch/QueryParser/QueryParser.pm (revision 946)
+++ lib/KinoSearch/QueryParser/QueryParser.pm (working copy)
@@ -163,6 +163,7 @@
elsif (s/([^"(\s]+)//) {
my $token_texts = $self->_analyze($1);
my @terms = map { KinoSearch::Index::Term->new( $field,
$_ ) }
+ grep { $_ ne '' }
@$token_texts;
for my $term (@terms) {
my $query
slothbear:~/Desktop/ksfixhits marvin$
_______________________________________________
KinoSearch mailing list
KinoSearch at rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch
More information about the kinosearch mailing list