83 lines
3.0 KiB
Perl
83 lines
3.0 KiB
Perl
|
# ==================================================================
|
||
|
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||
|
#
|
||
|
# GT::SQL::Search::Base::Common
|
||
|
# Author : Aki Mimoto
|
||
|
# CVS Info : 087,071,086,086,085
|
||
|
# $Id: Common.pm,v 1.8 2004/10/13 21:45:02 aki Exp $
|
||
|
#
|
||
|
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||
|
# ==================================================================
|
||
|
#
|
||
|
# Description:
|
||
|
# Base classes upon which all search drivers are based
|
||
|
#
|
||
|
package GT::SQL::Search::Base::Common;
|
||
|
|
||
|
use strict;
|
||
|
use Exporter;
|
||
|
use vars qw/ @ISA @EXPORT $STOPWORDS /;
|
||
|
|
||
|
@ISA = qw( Exporter );
|
||
|
@EXPORT = qw( &_tokenize &_check_word $STOPWORDS );
|
||
|
|
||
|
$STOPWORDS = { map { $_ => 1 } qw/
|
||
|
of about or all several also she among since an some and such are than
|
||
|
as that at the be them because there been these between they both this
|
||
|
but those by to do toward during towards each upon either for from was
|
||
|
had were has what have when he where her which his while however with if
|
||
|
within in would into you your is it its many more most must on re it
|
||
|
test not above add am pm jan january feb february mar march apr april
|
||
|
may jun june jul july aug august sep sept september oct october nov
|
||
|
november dec december find & > < we http com www inc other
|
||
|
including
|
||
|
/ };
|
||
|
|
||
|
sub _tokenize {
|
||
|
#--------------------------------------------------------------------------------
|
||
|
# takes a strings and chops it up into little bits
|
||
|
my $self = shift;
|
||
|
my $text = shift;
|
||
|
my ( @words, $i, %rejected, $word, $code );
|
||
|
|
||
|
# split on any non-word (includes accents) characters
|
||
|
@words = split /[^\w\x80-\xFF\-]+/, lc $text;
|
||
|
$self->debug_dumper( "Words: ", \@words ) if ($self->{_debug});
|
||
|
|
||
|
# drop all words that are too small, etc.
|
||
|
$i = 0;
|
||
|
while ( $i <= $#words ) {
|
||
|
$word = $words[ $i ];
|
||
|
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
|
||
|
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
|
||
|
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
|
||
|
splice( @words, $i, 1 );
|
||
|
$rejected{$word} = $self->{'rejections'}->{$code};
|
||
|
}
|
||
|
else {
|
||
|
$i++; # Words ok.
|
||
|
}
|
||
|
}
|
||
|
$self->debug_dumper( "Accepted Words: ", \@words ) if ($self->{_debug});
|
||
|
$self->debug_dumper( "Rejected Words: ", \%rejected ) if ($self->{_debug});
|
||
|
|
||
|
return ( \@words, \%rejected );
|
||
|
}
|
||
|
|
||
|
sub _check_word {
|
||
|
#--------------------------------------------------------------------------------
|
||
|
# Returns an error code if it is an invalid word, otherwise returns nothing.
|
||
|
#
|
||
|
my $self = shift;
|
||
|
my $word = shift;
|
||
|
my $code;
|
||
|
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
|
||
|
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
|
||
|
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
|
||
|
return $code;
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
1;
|