discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/Base/Common.pm

83 lines
3.0 KiB
Perl
Raw Permalink Normal View History

2024-06-17 11:49:12 +00:00
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::Base::Common
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Common.pm,v 1.8 2004/10/13 21:45:02 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Base classes upon which all search drivers are based
#
package GT::SQL::Search::Base::Common;
use strict;
use Exporter;
use vars qw/ @ISA @EXPORT $STOPWORDS /;
@ISA = qw( Exporter );
@EXPORT = qw( &_tokenize &_check_word $STOPWORDS );
$STOPWORDS = { map { $_ => 1 } qw/
of about or all several also she among since an some and such are than
as that at the be them because there been these between they both this
but those by to do toward during towards each upon either for from was
had were has what have when he where her which his while however with if
within in would into you your is it its many more most must on re it
test not above add am pm jan january feb february mar march apr april
may jun june jul july aug august sep sept september oct october nov
november dec december find &amp &gt &lt we http com www inc other
including
/ };
sub _tokenize {
#--------------------------------------------------------------------------------
# takes a strings and chops it up into little bits
my $self = shift;
my $text = shift;
my ( @words, $i, %rejected, $word, $code );
# split on any non-word (includes accents) characters
@words = split /[^\w\x80-\xFF\-]+/, lc $text;
$self->debug_dumper( "Words: ", \@words ) if ($self->{_debug});
# drop all words that are too small, etc.
$i = 0;
while ( $i <= $#words ) {
$word = $words[ $i ];
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
splice( @words, $i, 1 );
$rejected{$word} = $self->{'rejections'}->{$code};
}
else {
$i++; # Words ok.
}
}
$self->debug_dumper( "Accepted Words: ", \@words ) if ($self->{_debug});
$self->debug_dumper( "Rejected Words: ", \%rejected ) if ($self->{_debug});
return ( \@words, \%rejected );
}
sub _check_word {
#--------------------------------------------------------------------------------
# Returns an error code if it is an invalid word, otherwise returns nothing.
#
my $self = shift;
my $word = shift;
my $code;
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
return $code;
}
return;
}
1;