[KinoSearch] utf8 (unicode) any progress on TokenBatch?

Marvin Humphrey marvin at rectangular.com
Tue Aug 15 22:48:02 PDT 2006




On Aug 15, 2006, at 1:11 AM, Marc Elser wrote:

> Please, let me know if you fixed these problems.

I've taken a shot at it.  :)  Please give the current repository  
revision 1030 a try.

Marvin Humphrey
Rectangular Research
http://www.rectangular.com/


slothbear:~/projects/ks marvin$ svn diff -r 1026
Index: t/601-queryparser.t
===================================================================
--- t/601-queryparser.t (revision 1026)
+++ t/601-queryparser.t (working copy)
@@ -4,17 +4,20 @@
use lib 't';
use KinoSearch qw( kdump );
-use Test::More tests => 205;
+use Test::More tests => 207;
use File::Spec::Functions qw( catfile );
BEGIN { use_ok('KinoSearch::QueryParser::QueryParser') }
+use KinoSearchTestInvIndex qw( create_invindex );
+
use KinoSearch::InvIndexer;
use KinoSearch::Searcher;
use KinoSearch::Store::RAMInvIndex;
use KinoSearch::Analysis::Tokenizer;
use KinoSearch::Analysis::Stopalizer;
use KinoSearch::Analysis::PolyAnalyzer;
+use KinoSearch::Util::StringHelper qw( utf8_flag_on );
my $whitespace_tokenizer
      = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/ );
@@ -175,3 +178,16 @@
      #exit;
}
+my $motorhead = "Mot\xC3\xB6rhead";
+utf8_flag_on($motorhead);
+$invindex = create_invindex($motorhead);
+my $tokenizer = KinoSearch::Analysis::Tokenizer->new;
+$searcher = KinoSearch::Searcher->new(
+    analyzer => $tokenizer,
+    invindex => $invindex,
+);
+
+my $hits = $searcher->search('Mot');
+is( $hits->total_hits, 0, "Pre-test - indexing worked properly" );
+$hits = $searcher->search($motorhead);
+is( $hits->total_hits, 1, "QueryParser parses UTF-8 strings correctly" );
Index: lib/KinoSearch/Analysis/Tokenizer.pm
===================================================================
--- lib/KinoSearch/Analysis/Tokenizer.pm        (revision 1026)
+++ lib/KinoSearch/Analysis/Tokenizer.pm        (working copy)
@@ -3,7 +3,6 @@
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Analysis::Analyzer );
-use locale;
BEGIN {
      __PACKAGE__->init_instance_vars(
@@ -50,20 +49,24 @@
      # alias input to $_
      while ( $batch->next ) {
          local $_ = $batch->get_text;
+        my $copy = $_;
-        # ensure that pos is set to 0 for this scalar
-        pos = 0;
-
          # accumulate token start_offsets and end_offsets
          my ( @starts, @ends );
-        1 while ( m/$separator_re/g and push @starts,
-            pos and m/$token_re/g and push @ends, pos );
+        my $orig_length = bytes::length($_);
+        while (1) {
+            s/$separator_re//;
+            push @starts, $orig_length - bytes::length($_);
+            last unless s/$token_re//;
+            push @ends, $orig_length - bytes::length($_);
+        }
+
          # correct for overshoot
          $#starts = $#ends;
          # add the new tokens to the batch
-        $new_batch->add_many_tokens( $_, \@starts, \@ends );
+        $new_batch->add_many_tokens( $copy, \@starts, \@ends );
      }
      return $new_batch;
Index: lib/KinoSearch/Analysis/TokenBatch.pm
===================================================================
--- lib/KinoSearch/Analysis/TokenBatch.pm       (revision 1026)
+++ lib/KinoSearch/Analysis/TokenBatch.pm       (working copy)
@@ -69,7 +69,6 @@
      char *string_start = SvPV(string_sv, len);
      I32 i;
      const I32 max = av_len(starts_av);
-    STRLEN unicount = 0;
      for (i = 0; i <= max; i++) {
          STRLEN start_offset, end_offset;
@@ -93,24 +92,9 @@
              Kino_confess("end_offset > len (%d > %"UVuf")",
                  end_offset, (UV)len);
-        /* advance the pointer past as many unicode characters as required */
-        while (1) {
-            if (unicount == start_offset)
-                break;
-
-            /* header byte */
-            string_start++;
-
-            /* continutation bytes */
-            while ((*string_start & 0xC0) == 0xC0)
-                string_start++;
-
-            unicount++;
-        }
-
          /* calculate the start of the substring and add the token */
          token = Kino_Token_new(
-            string_start,
+            string_start + start_offset,
              (end_offset - start_offset),
              start_offset,
              end_offset,
Index: lib/KinoSearch/Index/Term.pm
===================================================================
--- lib/KinoSearch/Index/Term.pm        (revision 1026)
+++ lib/KinoSearch/Index/Term.pm        (working copy)
@@ -12,6 +12,8 @@
      __PACKAGE__->ready_get_set(qw( field text ));
}
+use KinoSearch::Util::StringHelper qw( utf8_flag_on utf8_flag_off );
+
sub new {
      croak("usage: KinoSearch::Index::Term->new( field, text )")
          unless @_ == 3;
@@ -26,6 +28,7 @@
sub new_from_string {
      my ( $class, $termstring, $finfos ) = @_;
      my $field_num = unpack( 'n', bytes::substr( $termstring, 0, 2, '' ) );
+    utf8_flag_on($termstring);
      my $field_name = $finfos->field_name($field_num);
      return __PACKAGE__->new( $field_name, $termstring );
}
@@ -37,7 +40,9 @@
      my ( $self, $finfos ) = @_;
      my $field_num = $finfos->get_field_num( $self->{field} );
      return unless defined $field_num;
-    return pack( 'n', $field_num ) . $self->{text};
+    my $termtext = $self->{text};
+    utf8_flag_off($termtext);
+    return pack( 'n', $field_num ) . $termtext;
}
sub to_string {
Index: lib/KinoSearch/Util/StringHelper.pm
===================================================================
--- lib/KinoSearch/Util/StringHelper.pm (revision 1026)
+++ lib/KinoSearch/Util/StringHelper.pm (working copy)
@@ -3,7 +3,7 @@
use warnings;
use base qw( Exporter );
-our @EXPORT_OK = qw( utf8_flag_on );
+our @EXPORT_OK = qw( utf8_flag_on utf8_flag_off );
1;
@@ -19,6 +19,12 @@
PPCODE:
      SvUTF8_on(sv);
+void
+utf8_flag_off(sv)
+    SV *sv;
+PPCODE:
+    SvUTF8_off(sv);
+
__H__
#ifndef H_KINO_STRING_HELPER
slothbear:~/projects/ks marvin$





_______________________________________________
KinoSearch mailing list
KinoSearch at rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch




More information about the kinosearch mailing list