First pass at adding key files
This commit is contained in:
@ -0,0 +1,82 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::Base::Common
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Common.pm,v 1.8 2004/10/13 21:45:02 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Base classes upon which all search drivers are based
|
||||
#
|
||||
package GT::SQL::Search::Base::Common;
|
||||
|
||||
use strict;
|
||||
use Exporter;
|
||||
use vars qw/ @ISA @EXPORT $STOPWORDS /;
|
||||
|
||||
@ISA = qw( Exporter );
|
||||
@EXPORT = qw( &_tokenize &_check_word $STOPWORDS );
|
||||
|
||||
$STOPWORDS = { map { $_ => 1 } qw/
|
||||
of about or all several also she among since an some and such are than
|
||||
as that at the be them because there been these between they both this
|
||||
but those by to do toward during towards each upon either for from was
|
||||
had were has what have when he where her which his while however with if
|
||||
within in would into you your is it its many more most must on re it
|
||||
test not above add am pm jan january feb february mar march apr april
|
||||
may jun june jul july aug august sep sept september oct october nov
|
||||
november dec december find & > < we http com www inc other
|
||||
including
|
||||
/ };
|
||||
|
||||
sub _tokenize {
|
||||
#--------------------------------------------------------------------------------
|
||||
# takes a strings and chops it up into little bits
|
||||
my $self = shift;
|
||||
my $text = shift;
|
||||
my ( @words, $i, %rejected, $word, $code );
|
||||
|
||||
# split on any non-word (includes accents) characters
|
||||
@words = split /[^\w\x80-\xFF\-]+/, lc $text;
|
||||
$self->debug_dumper( "Words: ", \@words ) if ($self->{_debug});
|
||||
|
||||
# drop all words that are too small, etc.
|
||||
$i = 0;
|
||||
while ( $i <= $#words ) {
|
||||
$word = $words[ $i ];
|
||||
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
|
||||
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
|
||||
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
|
||||
splice( @words, $i, 1 );
|
||||
$rejected{$word} = $self->{'rejections'}->{$code};
|
||||
}
|
||||
else {
|
||||
$i++; # Words ok.
|
||||
}
|
||||
}
|
||||
$self->debug_dumper( "Accepted Words: ", \@words ) if ($self->{_debug});
|
||||
$self->debug_dumper( "Rejected Words: ", \%rejected ) if ($self->{_debug});
|
||||
|
||||
return ( \@words, \%rejected );
|
||||
}
|
||||
|
||||
sub _check_word {
|
||||
#--------------------------------------------------------------------------------
|
||||
# Returns an error code if it is an invalid word, otherwise returns nothing.
|
||||
#
|
||||
my $self = shift;
|
||||
my $word = shift;
|
||||
my $code;
|
||||
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
|
||||
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
|
||||
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
|
||||
return $code;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
1;
|
Reference in New Issue
Block a user