# ================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::SQL::Search::Base::Common # Author : Aki Mimoto # CVS Info : 087,071,086,086,085 # $Id: Common.pm,v 1.8 2004/10/13 21:45:02 aki Exp $ # # Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. # ================================================================== # # Description: # Base classes upon which all search drivers are based # package GT::SQL::Search::Base::Common; use strict; use Exporter; use vars qw/ @ISA @EXPORT $STOPWORDS /; @ISA = qw( Exporter ); @EXPORT = qw( &_tokenize &_check_word $STOPWORDS ); $STOPWORDS = { map { $_ => 1 } qw/ of about or all several also she among since an some and such are than as that at the be them because there been these between they both this but those by to do toward during towards each upon either for from was had were has what have when he where her which his while however with if within in would into you your is it its many more most must on re it test not above add am pm jan january feb february mar march apr april may jun june jul july aug august sep sept september oct october nov november dec december find & > < we http com www inc other including / }; sub _tokenize { #-------------------------------------------------------------------------------- # takes a strings and chops it up into little bits my $self = shift; my $text = shift; my ( @words, $i, %rejected, $word, $code ); # split on any non-word (includes accents) characters @words = split /[^\w\x80-\xFF\-]+/, lc $text; $self->debug_dumper( "Words: ", \@words ) if ($self->{_debug}); # drop all words that are too small, etc. $i = 0; while ( $i <= $#words ) { $word = $words[ $i ]; if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or (length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or (length($word) > $self->{max_word_size} and $code = 'TOOBIG')) { splice( @words, $i, 1 ); $rejected{$word} = $self->{'rejections'}->{$code}; } else { $i++; # Words ok. } } $self->debug_dumper( "Accepted Words: ", \@words ) if ($self->{_debug}); $self->debug_dumper( "Rejected Words: ", \%rejected ) if ($self->{_debug}); return ( \@words, \%rejected ); } sub _check_word { #-------------------------------------------------------------------------------- # Returns an error code if it is an invalid word, otherwise returns nothing. # my $self = shift; my $word = shift; my $code; if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or (length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or (length($word) > $self->{max_word_size} and $code = 'TOOBIG')) { return $code; } return; } 1;