[KinoSearch] utf8 (unicode) any progress on TokenBatch?
Marvin Humphrey
marvin at rectangular.com
Tue Aug 15 22:48:02 PDT 2006
On Aug 15, 2006, at 1:11 AM, Marc Elser wrote:
> Please, let me know if you fixed these problems.
I've taken a shot at it. :) Please give the current repository
revision 1030 a try.
Marvin Humphrey
Rectangular Research
http://www.rectangular.com/
slothbear:~/projects/ks marvin$ svn diff -r 1026
Index: t/601-queryparser.t
===================================================================
--- t/601-queryparser.t (revision 1026)
+++ t/601-queryparser.t (working copy)
@@ -4,17 +4,20 @@
use lib 't';
use KinoSearch qw( kdump );
-use Test::More tests => 205;
+use Test::More tests => 207;
use File::Spec::Functions qw( catfile );
BEGIN { use_ok('KinoSearch::QueryParser::QueryParser') }
+use KinoSearchTestInvIndex qw( create_invindex );
+
use KinoSearch::InvIndexer;
use KinoSearch::Searcher;
use KinoSearch::Store::RAMInvIndex;
use KinoSearch::Analysis::Tokenizer;
use KinoSearch::Analysis::Stopalizer;
use KinoSearch::Analysis::PolyAnalyzer;
+use KinoSearch::Util::StringHelper qw( utf8_flag_on );
my $whitespace_tokenizer
= KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/ );
@@ -175,3 +178,16 @@
#exit;
}
+my $motorhead = "Mot\xC3\xB6rhead";
+utf8_flag_on($motorhead);
+$invindex = create_invindex($motorhead);
+my $tokenizer = KinoSearch::Analysis::Tokenizer->new;
+$searcher = KinoSearch::Searcher->new(
+ analyzer => $tokenizer,
+ invindex => $invindex,
+);
+
+my $hits = $searcher->search('Mot');
+is( $hits->total_hits, 0, "Pre-test - indexing worked properly" );
+$hits = $searcher->search($motorhead);
+is( $hits->total_hits, 1, "QueryParser parses UTF-8 strings correctly" );
Index: lib/KinoSearch/Analysis/Tokenizer.pm
===================================================================
--- lib/KinoSearch/Analysis/Tokenizer.pm (revision 1026)
+++ lib/KinoSearch/Analysis/Tokenizer.pm (working copy)
@@ -3,7 +3,6 @@
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Analysis::Analyzer );
-use locale;
BEGIN {
__PACKAGE__->init_instance_vars(
@@ -50,20 +49,24 @@
# alias input to $_
while ( $batch->next ) {
local $_ = $batch->get_text;
+ my $copy = $_;
- # ensure that pos is set to 0 for this scalar
- pos = 0;
-
# accumulate token start_offsets and end_offsets
my ( @starts, @ends );
- 1 while ( m/$separator_re/g and push @starts,
- pos and m/$token_re/g and push @ends, pos );
+ my $orig_length = bytes::length($_);
+ while (1) {
+ s/$separator_re//;
+ push @starts, $orig_length - bytes::length($_);
+ last unless s/$token_re//;
+ push @ends, $orig_length - bytes::length($_);
+ }
+
# correct for overshoot
$#starts = $#ends;
# add the new tokens to the batch
- $new_batch->add_many_tokens( $_, \@starts, \@ends );
+ $new_batch->add_many_tokens( $copy, \@starts, \@ends );
}
return $new_batch;
Index: lib/KinoSearch/Analysis/TokenBatch.pm
===================================================================
--- lib/KinoSearch/Analysis/TokenBatch.pm (revision 1026)
+++ lib/KinoSearch/Analysis/TokenBatch.pm (working copy)
@@ -69,7 +69,6 @@
char *string_start = SvPV(string_sv, len);
I32 i;
const I32 max = av_len(starts_av);
- STRLEN unicount = 0;
for (i = 0; i <= max; i++) {
STRLEN start_offset, end_offset;
@@ -93,24 +92,9 @@
Kino_confess("end_offset > len (%d > %"UVuf")",
end_offset, (UV)len);
- /* advance the pointer past as many unicode characters as required */
- while (1) {
- if (unicount == start_offset)
- break;
-
- /* header byte */
- string_start++;
-
- /* continutation bytes */
- while ((*string_start & 0xC0) == 0xC0)
- string_start++;
-
- unicount++;
- }
-
/* calculate the start of the substring and add the token */
token = Kino_Token_new(
- string_start,
+ string_start + start_offset,
(end_offset - start_offset),
start_offset,
end_offset,
Index: lib/KinoSearch/Index/Term.pm
===================================================================
--- lib/KinoSearch/Index/Term.pm (revision 1026)
+++ lib/KinoSearch/Index/Term.pm (working copy)
@@ -12,6 +12,8 @@
__PACKAGE__->ready_get_set(qw( field text ));
}
+use KinoSearch::Util::StringHelper qw( utf8_flag_on utf8_flag_off );
+
sub new {
croak("usage: KinoSearch::Index::Term->new( field, text )")
unless @_ == 3;
@@ -26,6 +28,7 @@
sub new_from_string {
my ( $class, $termstring, $finfos ) = @_;
my $field_num = unpack( 'n', bytes::substr( $termstring, 0, 2, '' ) );
+ utf8_flag_on($termstring);
my $field_name = $finfos->field_name($field_num);
return __PACKAGE__->new( $field_name, $termstring );
}
@@ -37,7 +40,9 @@
my ( $self, $finfos ) = @_;
my $field_num = $finfos->get_field_num( $self->{field} );
return unless defined $field_num;
- return pack( 'n', $field_num ) . $self->{text};
+ my $termtext = $self->{text};
+ utf8_flag_off($termtext);
+ return pack( 'n', $field_num ) . $termtext;
}
sub to_string {
Index: lib/KinoSearch/Util/StringHelper.pm
===================================================================
--- lib/KinoSearch/Util/StringHelper.pm (revision 1026)
+++ lib/KinoSearch/Util/StringHelper.pm (working copy)
@@ -3,7 +3,7 @@
use warnings;
use base qw( Exporter );
-our @EXPORT_OK = qw( utf8_flag_on );
+our @EXPORT_OK = qw( utf8_flag_on utf8_flag_off );
1;
@@ -19,6 +19,12 @@
PPCODE:
SvUTF8_on(sv);
+void
+utf8_flag_off(sv)
+ SV *sv;
+PPCODE:
+ SvUTF8_off(sv);
+
__H__
#ifndef H_KINO_STRING_HELPER
slothbear:~/projects/ks marvin$
_______________________________________________
KinoSearch mailing list
KinoSearch at rectangular.com
http://www.rectangular.com/mailman/listinfo/kinosearch
More information about the kinosearch mailing list