# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
#   GT::SQL::Search::Base::Common
#   Author  : Aki Mimoto
#   CVS Info : 087,071,086,086,085      
#   $Id: Common.pm,v 1.8 2004/10/13 21:45:02 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
# ==================================================================
#
# Description:
#   Base classes upon which all search drivers are based
#
package GT::SQL::Search::Base::Common;

use strict;
use Exporter;
use vars qw/ @ISA @EXPORT $STOPWORDS /;

    @ISA = qw( Exporter );
    @EXPORT = qw( &_tokenize &_check_word $STOPWORDS );

    $STOPWORDS = { map { $_ => 1 } qw/
        of about or all several also she among since an some and such are than
        as that at the be them because there been these between they both this
        but those by to do toward during towards each upon either for from was
        had were has what have when he where her which his while however with if
        within in would into you your is it its many more most must on re it
        test not above add am pm jan january feb february mar march apr april
        may jun june jul july aug august sep sept september oct october nov
        november dec december find &amp &gt &lt we http com www inc other
        including 
    / };

sub _tokenize {
#--------------------------------------------------------------------------------
# takes a strings and chops it up into little bits
    my $self    = shift;
    my $text    = shift;
    my ( @words, $i, %rejected, $word, $code );

# split on any non-word (includes accents) characters
    @words = split /[^\w\x80-\xFF\-]+/, lc $text;
    $self->debug_dumper( "Words: ", \@words ) if ($self->{_debug});

# drop all words that are too small, etc.
    $i = 0;
    while ( $i <= $#words ) {
        $word = $words[ $i ];
        if ((exists $self->{stopwords}{$word}   and ($code = 'STOPWORD')) or
            (length($word) < $self->{min_word_size} and $code = 'TOOSMALL' )  or
            (length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
                splice( @words, $i, 1 );
                $rejected{$word}    = $self->{'rejections'}->{$code};
        }
        else {
            $i++;   # Words ok.
        }
    }
    $self->debug_dumper( "Accepted Words: ", \@words  )   if ($self->{_debug});
    $self->debug_dumper( "Rejected Words: ", \%rejected ) if ($self->{_debug});
    
    return ( \@words, \%rejected );
}

sub _check_word {
#--------------------------------------------------------------------------------
# Returns an error code if it is an invalid word, otherwise returns nothing.
#
    my $self = shift;
    my $word = shift;
    my $code;
    if ((exists $self->{stopwords}{$word}      and ($code = 'STOPWORD')) or
        (length($word) < $self->{min_word_size} and $code = 'TOOSMALL' )  or
        (length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
            return $code;
    }
    return;
}

1;