[KinoSearch] Problem with stopalizer

Mike Wexler mwexler at tias.com
Wed Jun 7 18:31:19 PDT 2006



Marvin Humphrey wrote:
> Hi Mike,
>
> I've fixed this in subversion.  Please try repository revision 959.
Perfect. Thanks.

>
> On Jun 6, 2006, at 10:49 AM, Mike Wexler wrote:
>
>> (+body:cream), (+body:), (+body:wheat)
>>
>> The problem is the middle expression. This doesn't seem to match 
>> anything.
>> The stopalizer is changing "of" into "" instead of removing it entirely 
>> as a term. Any suggestions on how to fix this?
>
> The fix turned out to be pretty simple (scroll all the way down), but 
> it needed thorough testing to verify that it didn't have any nasty 
> side effects.
>
> Marvin Humphrey
> Rectangular Research
> http://www.rectangular.com/
>
> $ svn diff
> Index: t/601-queryparser.t
> ===================================================================
> --- t/601-queryparser.t (revision 946)
> +++ t/601-queryparser.t (working copy)
> @@ -4,18 +4,50 @@
> use lib 't';
> use KinoSearch qw( kdump );
> -use Test::More 'no_plan';
> +use Test::More tests => 205;
> use File::Spec::Functions qw( catfile );
> BEGIN { use_ok('KinoSearch::QueryParser::QueryParser') }
> -use KinoSearchTestInvIndex qw( create_invindex );
> +use KinoSearch::InvIndexer;
> use KinoSearch::Searcher;
> +use KinoSearch::Store::RAMInvIndex;
> use KinoSearch::Analysis::Tokenizer;
> +use KinoSearch::Analysis::Stopalizer;
> +use KinoSearch::Analysis::PolyAnalyzer;
> my $whitespace_tokenizer
>      = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/ );
> +my $stopalizer
> +    = KinoSearch::Analysis::Stopalizer->new( stoplist => { x => 1 } );
> +my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
> +    analyzers => [ $whitespace_tokenizer, $stopalizer, ], );
> +my @docs = ( 'x', 'y', 'z', 'x a', 'x a b', 'x a b c', 'x foo a b c d', );
> +my $invindex      = KinoSearch::Store::RAMInvIndex->new( create => 1 );
> +my $stop_invindex = KinoSearch::Store::RAMInvIndex->new( create => 1 );
> +my $invindexer    = KinoSearch::InvIndexer->new(
> +    invindex => $invindex,
> +    analyzer => $whitespace_tokenizer,
> +);
> +my $stop_invindexer = KinoSearch::InvIndexer->new(
> +    invindex => $stop_invindex,
> +    analyzer => $polyanalyzer,
> +);
> +$invindexer->spec_field( name      => 'content' );
> +$stop_invindexer->spec_field( name => 'content' );
> +
> +for my $content_string (@docs) {
> +    my $doc = $invindexer->new_doc;
> +    $doc->set_value( content => $content_string );
> +    $invindexer->add_doc($doc);
> +    $doc = $stop_invindexer->new_doc;
> +    $doc->set_value( content => $content_string );
> +    $stop_invindexer->add_doc($doc);
> +}
> +$invindexer->finish;
> +$stop_invindexer->finish;
> +
> my $OR_parser = KinoSearch::QueryParser::QueryParser->new(
>      analyzer      => $whitespace_tokenizer,
>      default_field => 'content',
> @@ -26,98 +58,120 @@
>      default_boolop => 'AND',
> );
> -my @docs     = ( 'x', 'y', 'z', 'x a', 'x a b', 'x a b c', 'x a b c d', );
> -my $invindex = create_invindex(@docs);
> +my $OR_stop_parser = KinoSearch::QueryParser::QueryParser->new(
> +    analyzer      => $polyanalyzer,
> +    default_field => 'content',
> +);
> +my $AND_stop_parser = KinoSearch::QueryParser::QueryParser->new(
> +    analyzer       => $polyanalyzer,
> +    default_field  => 'content',
> +    default_boolop => 'AND',
> +);
> -my $searcher = KinoSearch::Searcher->new( invindex => $invindex );
> +my $searcher      = KinoSearch::Searcher->new( invindex => $invindex );
> +my $stop_searcher = KinoSearch::Searcher->new( invindex => $stop_invindex );
> my @logical_tests = (
> -    'b'     => [ 3, 3, ],
> -    '(a)'   => [ 4, 4, ],
> -    '"a"'   => [ 4, 4, ],
> -    '"(a)"' => [ 0, 0, ],
> -    '("a")' => [ 4, 4, ],
> +    'b'     => [ 3, 3, 3, 3, ],
> +    '(a)'   => [ 4, 4, 4, 4, ],
> +    '"a"'   => [ 4, 4, 4, 4, ],
> +    '"(a)"' => [ 0, 0, 0, 0, ],
> +    '("a")' => [ 4, 4, 4, 4, ],
> -    'a b'     => [ 4, 3, ],
> -    'a (b)'   => [ 4, 3, ],
> -    'a "b"'   => [ 4, 3, ],
> -    'a ("b")' => [ 4, 3, ],
> -    'a "(b)"' => [ 4, 0, ],
> +    'a b'     => [ 4, 3, 4, 3, ],
> +    'a (b)'   => [ 4, 3, 4, 3, ],
> +    'a "b"'   => [ 4, 3, 4, 3, ],
> +    'a ("b")' => [ 4, 3, 4, 3, ],
> +    'a "(b)"' => [ 4, 0, 4, 0, ],
> -    '(a b)'   => [ 4, 3, ],
> -    '"a b"'   => [ 3, 3, ],
> -    '("a b")' => [ 3, 3, ],
> -    '"(a b)"' => [ 0, 0, ],
> +    '(a b)'   => [ 4, 3, 4, 3, ],
> +    '"a b"'   => [ 3, 3, 3, 3, ],
> +    '("a b")' => [ 3, 3, 3, 3, ],
> +    '"(a b)"' => [ 0, 0, 0, 0, ],
> -    'a b c'     => [ 4, 2, ],
> -    'a (b c)'   => [ 4, 2, ],
> -    'a "b c"'   => [ 4, 2, ],
> -    'a ("b c")' => [ 4, 2, ],
> -    'a "(b c)"' => [ 4, 0, ],
> -    '"a b c"'   => [ 2, 2, ],
> +    'a b c'     => [ 4, 2, 4, 2, ],
> +    'a (b c)'   => [ 4, 2, 4, 2, ],
> +    'a "b c"'   => [ 4, 2, 4, 2, ],
> +    'a ("b c")' => [ 4, 2, 4, 2, ],
> +    'a "(b c)"' => [ 4, 0, 4, 0, ],
> +    '"a b c"'   => [ 2, 2, 2, 2, ],
> -    '-x'     => [ 0, 0, ],
> -    'x -c'   => [ 3, 3, ],
> -    'x "-c"' => [ 5, 0, ],
> -    'x +c'   => [ 2, 2, ],
> -    'x "+c"' => [ 5, 0, ],
> +    '-x'     => [ 0, 0, 0, 0, ],
> +    'x -c'   => [ 3, 3, 0, 0, ],
> +    'x "-c"' => [ 5, 0, 0, 0, ],
> +    'x +c'   => [ 2, 2, 2, 2, ],
> +    'x "+c"' => [ 5, 0, 0, 0, ],
> -    '+x +c' => [ 2, 2, ],
> -    '+x -c' => [ 3, 3, ],
> -    '-x +c' => [ 0, 0, ],
> -    '-x -c' => [ 0, 0, ],
> +    '+x +c' => [ 2, 2, 2, 2, ],
> +    '+x -c' => [ 3, 3, 0, 0, ],
> +    '-x +c' => [ 0, 0, 2, 2, ],
> +    '-x -c' => [ 0, 0, 0, 0, ],
> -    'x y'     => [ 6, 0, ],
> -    'x a d'   => [ 5, 1, ],
> -    'x "a d"' => [ 5, 0, ],
> +    'x y'     => [ 6, 0, 1, 1, ],
> +    'x a d'   => [ 5, 1, 4, 1, ],
> +    'x "a d"' => [ 5, 0, 0, 0, ],
> +    '"x a"'   => [ 3, 3, 3, 3, ],
> -    'x AND y'     => [ 0, 0, ],
> -    'x OR y'      => [ 6, 6, ],
> -    'x AND NOT y' => [ 5, 5, ],
> +    'x AND y'     => [ 0, 0, 1, 1, ],
> +    'x OR y'      => [ 6, 6, 1, 1, ],
> +    'x AND NOT y' => [ 5, 5, 0, 0, ],
> -    'x (b OR c)'     => [ 5, 3, ],
> -    'x AND (b OR c)' => [ 3, 3, ],
> -    'x OR (b OR c)'  => [ 5, 5, ],
> -    'x (y OR c)'     => [ 6, 2, ],
> -    'x AND (y OR c)' => [ 2, 2, ],
> +    'x (b OR c)'     => [ 5, 3, 3, 3, ],
> +    'x AND (b OR c)' => [ 3, 3, 3, 3, ],
> +    'x OR (b OR c)'  => [ 5, 5, 3, 3, ],
> +    'x (y OR c)'     => [ 6, 2, 3, 3, ],
> +    'x AND (y OR c)' => [ 2, 2, 3, 3, ],
> -    'a AND NOT (b OR "c d")'     => [ 1, 1, ],
> -    'a AND NOT "a b"'            => [ 1, 1, ],
> -    'a AND NOT ("a b" OR "c d")' => [ 1, 1, ],
> +    'a AND NOT (b OR "c d")'     => [ 1, 1, 1, 1, ],
> +    'a AND NOT "a b"'            => [ 1, 1, 1, 1, ],
> +    'a AND NOT ("a b" OR "c d")' => [ 1, 1, 1, 1, ],
> -    '+"b c" -d' => [ 1, 1, ],
> -    '"a b" +d'  => [ 1, 1, ],
> +    '+"b c" -d' => [ 1, 1, 1, 1, ],
> +    '"a b" +d'  => [ 1, 1, 1, 1, ],
> -    'x AND NOT (b OR (c AND d))' => [ 2, 2, ],
> +    'x AND NOT (b OR (c AND d))' => [ 2, 2, 0, 0, ],
> -    '-(+foo)' => [ 0, 0 ],
> +    '-(+notthere)' => [ 0, 0, 0, 0 ],
> -    'content:b'              => [ 3, 3, ],
> -    'bogusfield:a'           => [ 0, 0, ],
> -    'bogusfield:a content:b' => [ 3, 0, ],
> +    'content:b'              => [ 3, 3, 3, 3, ],
> +    'bogusfield:a'           => [ 0, 0, 0, 0, ],
> +    'bogusfield:a content:b' => [ 3, 0, 3, 0, ],
> );
> -do {
> -    my $i = 0;
> -    while ( $i < @logical_tests ) {
> -        my $qstring     = $logical_tests[ $i++ ];
> -        my $OR_expected = $logical_tests[$i][0];
> -        my $query       = $OR_parser->parse($qstring);
> -        my $hits        = $searcher->search( query => $query );
> -        $hits->seek( 0, 50 );
> -        is( $hits->total_hits, $OR_expected, "OR:    $qstring" );
> +my $i = 0;
> +while ( $i < @logical_tests ) {
> +    my $qstring = $logical_tests[$i];
> +    $i++;
> -        $query = $AND_parser->parse($qstring);
> -        my $AND_expected = $logical_tests[ $i++ ][1];
> -        $hits = $searcher->search( query => $query );
> -        $hits->seek( 0, 50 );
> -        is( $hits->total_hits, $AND_expected, "AND:   $qstring" );
> -        $hits->{searcher} = undef, $hits->{reader} = undef,
> -            $hits->{weight} = undef,
> -            # kdump($query);
> -            # exit;
> -    }
> -    }
> +    my $query = $OR_parser->parse($qstring);
> +    my $hits = $searcher->search( query => $query );
> +    $hits->seek( 0, 50 );
> +    is( $hits->total_hits, $logical_tests[$i][0], "OR:    $qstring" );
> +    $query = $AND_parser->parse($qstring);
> +    $hits = $searcher->search( query => $query );
> +    $hits->seek( 0, 50 );
> +    is( $hits->total_hits, $logical_tests[$i][1], "AND:   $qstring" );
> +
> +    $query = $OR_stop_parser->parse($qstring);
> +    $hits = $stop_searcher->search( query => $query );
> +    $hits->seek( 0, 50 );
> +    is( $hits->total_hits, $logical_tests[$i][2], "stoplist-OR:   $qstring" );
> +
> +    $query = $AND_stop_parser->parse($qstring);
> +    $hits = $stop_searcher->search( query => $query );
> +    $hits->seek( 0, 50 );
> +    is( $hits->total_hits, $logical_tests[$i][3],
> +        "stoplist-AND:   $qstring" );
> +
> +    $i++;
> +
> +    $hits->{searcher} = undef;
> +    $hits->{reader}   = undef;
> +    $hits->{weight}   = undef;
> +    #kdump($query);
> +    #exit;
> +}
> +
> Index: lib/KinoSearch/QueryParser/QueryParser.pm
> ===================================================================
> --- lib/KinoSearch/QueryParser/QueryParser.pm   (revision 946)
> +++ lib/KinoSearch/QueryParser/QueryParser.pm   (working copy)
> @@ -163,6 +163,7 @@
>          elsif (s/([^"(\s]+)//) {
>              my $token_texts = $self->_analyze($1);
>              my @terms = map { KinoSearch::Index::Term->new( $field, $_ ) }
> +                grep { $_ ne '' }
>                  @$token_texts;
>              for my $term (@terms) {
>                  my $query
> slothbear:~/Desktop/ksfixhits marvin$
>
>
>
>





More information about the kinosearch mailing list