[KinoSearch] Problem with stopalizer
Mike Wexler
mwexler at tias.com
Wed Jun 7 18:31:19 PDT 2006
Marvin Humphrey wrote:
> Hi Mike,
>
> I've fixed this in subversion. Please try repository revision 959.
Perfect. Thanks.
>
> On Jun 6, 2006, at 10:49 AM, Mike Wexler wrote:
>
>> (+body:cream), (+body:), (+body:wheat)
>>
>> The problem is the middle expression. This doesn't seem to match
>> anything.
>> The stopalizer is changing "of" into "" instead of removing it entirely
>> as a term. Any suggestions on how to fix this?
>
> The fix turned out to be pretty simple (scroll all the way down), but
> it needed thorough testing to verify that it didn't have any nasty
> side effects.
>
> Marvin Humphrey
> Rectangular Research
> http://www.rectangular.com/
>
> $ svn diff
> Index: t/601-queryparser.t
> ===================================================================
> --- t/601-queryparser.t (revision 946)
> +++ t/601-queryparser.t (working copy)
> @@ -4,18 +4,50 @@
> use lib 't';
> use KinoSearch qw( kdump );
> -use Test::More 'no_plan';
> +use Test::More tests => 205;
> use File::Spec::Functions qw( catfile );
> BEGIN { use_ok('KinoSearch::QueryParser::QueryParser') }
> -use KinoSearchTestInvIndex qw( create_invindex );
> +use KinoSearch::InvIndexer;
> use KinoSearch::Searcher;
> +use KinoSearch::Store::RAMInvIndex;
> use KinoSearch::Analysis::Tokenizer;
> +use KinoSearch::Analysis::Stopalizer;
> +use KinoSearch::Analysis::PolyAnalyzer;
> my $whitespace_tokenizer
> = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/ );
> +my $stopalizer
> + = KinoSearch::Analysis::Stopalizer->new( stoplist => { x => 1 } );
> +my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
> + analyzers => [ $whitespace_tokenizer, $stopalizer, ], );
> +my @docs = ( 'x', 'y', 'z', 'x a', 'x a b', 'x a b c', 'x foo a b c
> d', );
> +my $invindex = KinoSearch::Store::RAMInvIndex->new( create => 1 );
> +my $stop_invindex = KinoSearch::Store::RAMInvIndex->new( create => 1 );
> +my $invindexer = KinoSearch::InvIndexer->new(
> + invindex => $invindex,
> + analyzer => $whitespace_tokenizer,
> +);
> +my $stop_invindexer = KinoSearch::InvIndexer->new(
> + invindex => $stop_invindex,
> + analyzer => $polyanalyzer,
> +);
> +$invindexer->spec_field( name => 'content' );
> +$stop_invindexer->spec_field( name => 'content' );
> +
> +for my $content_string (@docs) {
> + my $doc = $invindexer->new_doc;
> + $doc->set_value( content => $content_string );
> + $invindexer->add_doc($doc);
> + $doc = $stop_invindexer->new_doc;
> + $doc->set_value( content => $content_string );
> + $stop_invindexer->add_doc($doc);
> +}
> +$invindexer->finish;
> +$stop_invindexer->finish;
> +
> my $OR_parser = KinoSearch::QueryParser::QueryParser->new(
> analyzer => $whitespace_tokenizer,
> default_field => 'content',
> @@ -26,98 +58,120 @@
> default_boolop => 'AND',
> );
> -my @docs = ( 'x', 'y', 'z', 'x a', 'x a b', 'x a b c', 'x a b c
> d', );
> -my $invindex = create_invindex(@docs);
> +my $OR_stop_parser = KinoSearch::QueryParser::QueryParser->new(
> + analyzer => $polyanalyzer,
> + default_field => 'content',
> +);
> +my $AND_stop_parser = KinoSearch::QueryParser::QueryParser->new(
> + analyzer => $polyanalyzer,
> + default_field => 'content',
> + default_boolop => 'AND',
> +);
> -my $searcher = KinoSearch::Searcher->new( invindex => $invindex );
> +my $searcher = KinoSearch::Searcher->new( invindex => $invindex );
> +my $stop_searcher = KinoSearch::Searcher->new( invindex =>
> $stop_invindex );
> my @logical_tests = (
> - 'b' => [ 3, 3, ],
> - '(a)' => [ 4, 4, ],
> - '"a"' => [ 4, 4, ],
> - '"(a)"' => [ 0, 0, ],
> - '("a")' => [ 4, 4, ],
> + 'b' => [ 3, 3, 3, 3, ],
> + '(a)' => [ 4, 4, 4, 4, ],
> + '"a"' => [ 4, 4, 4, 4, ],
> + '"(a)"' => [ 0, 0, 0, 0, ],
> + '("a")' => [ 4, 4, 4, 4, ],
> - 'a b' => [ 4, 3, ],
> - 'a (b)' => [ 4, 3, ],
> - 'a "b"' => [ 4, 3, ],
> - 'a ("b")' => [ 4, 3, ],
> - 'a "(b)"' => [ 4, 0, ],
> + 'a b' => [ 4, 3, 4, 3, ],
> + 'a (b)' => [ 4, 3, 4, 3, ],
> + 'a "b"' => [ 4, 3, 4, 3, ],
> + 'a ("b")' => [ 4, 3, 4, 3, ],
> + 'a "(b)"' => [ 4, 0, 4, 0, ],
> - '(a b)' => [ 4, 3, ],
> - '"a b"' => [ 3, 3, ],
> - '("a b")' => [ 3, 3, ],
> - '"(a b)"' => [ 0, 0, ],
> + '(a b)' => [ 4, 3, 4, 3, ],
> + '"a b"' => [ 3, 3, 3, 3, ],
> + '("a b")' => [ 3, 3, 3, 3, ],
> + '"(a b)"' => [ 0, 0, 0, 0, ],
> - 'a b c' => [ 4, 2, ],
> - 'a (b c)' => [ 4, 2, ],
> - 'a "b c"' => [ 4, 2, ],
> - 'a ("b c")' => [ 4, 2, ],
> - 'a "(b c)"' => [ 4, 0, ],
> - '"a b c"' => [ 2, 2, ],
> + 'a b c' => [ 4, 2, 4, 2, ],
> + 'a (b c)' => [ 4, 2, 4, 2, ],
> + 'a "b c"' => [ 4, 2, 4, 2, ],
> + 'a ("b c")' => [ 4, 2, 4, 2, ],
> + 'a "(b c)"' => [ 4, 0, 4, 0, ],
> + '"a b c"' => [ 2, 2, 2, 2, ],
> - '-x' => [ 0, 0, ],
> - 'x -c' => [ 3, 3, ],
> - 'x "-c"' => [ 5, 0, ],
> - 'x +c' => [ 2, 2, ],
> - 'x "+c"' => [ 5, 0, ],
> + '-x' => [ 0, 0, 0, 0, ],
> + 'x -c' => [ 3, 3, 0, 0, ],
> + 'x "-c"' => [ 5, 0, 0, 0, ],
> + 'x +c' => [ 2, 2, 2, 2, ],
> + 'x "+c"' => [ 5, 0, 0, 0, ],
> - '+x +c' => [ 2, 2, ],
> - '+x -c' => [ 3, 3, ],
> - '-x +c' => [ 0, 0, ],
> - '-x -c' => [ 0, 0, ],
> + '+x +c' => [ 2, 2, 2, 2, ],
> + '+x -c' => [ 3, 3, 0, 0, ],
> + '-x +c' => [ 0, 0, 2, 2, ],
> + '-x -c' => [ 0, 0, 0, 0, ],
> - 'x y' => [ 6, 0, ],
> - 'x a d' => [ 5, 1, ],
> - 'x "a d"' => [ 5, 0, ],
> + 'x y' => [ 6, 0, 1, 1, ],
> + 'x a d' => [ 5, 1, 4, 1, ],
> + 'x "a d"' => [ 5, 0, 0, 0, ],
> + '"x a"' => [ 3, 3, 3, 3, ],
> - 'x AND y' => [ 0, 0, ],
> - 'x OR y' => [ 6, 6, ],
> - 'x AND NOT y' => [ 5, 5, ],
> + 'x AND y' => [ 0, 0, 1, 1, ],
> + 'x OR y' => [ 6, 6, 1, 1, ],
> + 'x AND NOT y' => [ 5, 5, 0, 0, ],
> - 'x (b OR c)' => [ 5, 3, ],
> - 'x AND (b OR c)' => [ 3, 3, ],
> - 'x OR (b OR c)' => [ 5, 5, ],
> - 'x (y OR c)' => [ 6, 2, ],
> - 'x AND (y OR c)' => [ 2, 2, ],
> + 'x (b OR c)' => [ 5, 3, 3, 3, ],
> + 'x AND (b OR c)' => [ 3, 3, 3, 3, ],
> + 'x OR (b OR c)' => [ 5, 5, 3, 3, ],
> + 'x (y OR c)' => [ 6, 2, 3, 3, ],
> + 'x AND (y OR c)' => [ 2, 2, 3, 3, ],
> - 'a AND NOT (b OR "c d")' => [ 1, 1, ],
> - 'a AND NOT "a b"' => [ 1, 1, ],
> - 'a AND NOT ("a b" OR "c d")' => [ 1, 1, ],
> + 'a AND NOT (b OR "c d")' => [ 1, 1, 1, 1, ],
> + 'a AND NOT "a b"' => [ 1, 1, 1, 1, ],
> + 'a AND NOT ("a b" OR "c d")' => [ 1, 1, 1, 1, ],
> - '+"b c" -d' => [ 1, 1, ],
> - '"a b" +d' => [ 1, 1, ],
> + '+"b c" -d' => [ 1, 1, 1, 1, ],
> + '"a b" +d' => [ 1, 1, 1, 1, ],
> - 'x AND NOT (b OR (c AND d))' => [ 2, 2, ],
> + 'x AND NOT (b OR (c AND d))' => [ 2, 2, 0, 0, ],
> - '-(+foo)' => [ 0, 0 ],
> + '-(+notthere)' => [ 0, 0, 0, 0 ],
> - 'content:b' => [ 3, 3, ],
> - 'bogusfield:a' => [ 0, 0, ],
> - 'bogusfield:a content:b' => [ 3, 0, ],
> + 'content:b' => [ 3, 3, 3, 3, ],
> + 'bogusfield:a' => [ 0, 0, 0, 0, ],
> + 'bogusfield:a content:b' => [ 3, 0, 3, 0, ],
> );
> -do {
> - my $i = 0;
> - while ( $i < @logical_tests ) {
> - my $qstring = $logical_tests[ $i++ ];
> - my $OR_expected = $logical_tests[$i][0];
> - my $query = $OR_parser->parse($qstring);
> - my $hits = $searcher->search( query => $query );
> - $hits->seek( 0, 50 );
> - is( $hits->total_hits, $OR_expected, "OR: $qstring" );
> +my $i = 0;
> +while ( $i < @logical_tests ) {
> + my $qstring = $logical_tests[$i];
> + $i++;
> - $query = $AND_parser->parse($qstring);
> - my $AND_expected = $logical_tests[ $i++ ][1];
> - $hits = $searcher->search( query => $query );
> - $hits->seek( 0, 50 );
> - is( $hits->total_hits, $AND_expected, "AND: $qstring" );
> - $hits->{searcher} = undef, $hits->{reader} = undef,
> - $hits->{weight} = undef,
> - # kdump($query);
> - # exit;
> - }
> - }
> + my $query = $OR_parser->parse($qstring);
> + my $hits = $searcher->search( query => $query );
> + $hits->seek( 0, 50 );
> + is( $hits->total_hits, $logical_tests[$i][0], "OR: $qstring" );
> + $query = $AND_parser->parse($qstring);
> + $hits = $searcher->search( query => $query );
> + $hits->seek( 0, 50 );
> + is( $hits->total_hits, $logical_tests[$i][1], "AND: $qstring" );
> +
> + $query = $OR_stop_parser->parse($qstring);
> + $hits = $stop_searcher->search( query => $query );
> + $hits->seek( 0, 50 );
> + is( $hits->total_hits, $logical_tests[$i][2], "stoplist-OR:
> $qstring" );
> +
> + $query = $AND_stop_parser->parse($qstring);
> + $hits = $stop_searcher->search( query => $query );
> + $hits->seek( 0, 50 );
> + is( $hits->total_hits, $logical_tests[$i][3],
> + "stoplist-AND: $qstring" );
> +
> + $i++;
> +
> + $hits->{searcher} = undef;
> + $hits->{reader} = undef;
> + $hits->{weight} = undef;
> + #kdump($query);
> + #exit;
> +}
> +
> Index: lib/KinoSearch/QueryParser/QueryParser.pm
> ===================================================================
> --- lib/KinoSearch/QueryParser/QueryParser.pm (revision 946)
> +++ lib/KinoSearch/QueryParser/QueryParser.pm (working copy)
> @@ -163,6 +163,7 @@
> elsif (s/([^"(\s]+)//) {
> my $token_texts = $self->_analyze($1);
> my @terms = map { KinoSearch::Index::Term->new( $field,
> $_ ) }
> + grep { $_ ne '' }
> @$token_texts;
> for my $term (@terms) {
> my $query
> slothbear:~/Desktop/ksfixhits marvin$
>
>
>
>
More information about the kinosearch
mailing list