First pass at adding key files
This commit is contained in:
		@@ -0,0 +1,411 @@
 | 
			
		||||
# ====================================================================
 | 
			
		||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
 | 
			
		||||
#
 | 
			
		||||
#   GT::SQL::Search::INTERNAL::Indexer
 | 
			
		||||
#   Author: Aki Mimoto
 | 
			
		||||
#   CVS Info : 087,071,086,086,085      
 | 
			
		||||
#   $Id: Indexer.pm,v 1.11 2004/01/13 01:35:19 jagerman Exp $
 | 
			
		||||
#
 | 
			
		||||
# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
 | 
			
		||||
# ====================================================================
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
package GT::SQL::Search::INTERNAL::Indexer;
 | 
			
		||||
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Preamble information related to the object
 | 
			
		||||
    use strict;
 | 
			
		||||
    use vars qw/@ISA $ATTRIBS $VERSION $DEBUG /;
 | 
			
		||||
    use GT::SQL::Search::Base::Indexer;
 | 
			
		||||
    @ISA = qw/ GT::SQL::Search::Base::Indexer /;
 | 
			
		||||
    $DEBUG   = 0;
 | 
			
		||||
    $VERSION = sprintf "%d.%03d", q$Revision: 1.11 $ =~ /(\d+)\.(\d+)/;
 | 
			
		||||
 | 
			
		||||
sub load {
 | 
			
		||||
    shift;
 | 
			
		||||
    return GT::SQL::Search::INTERNAL::Indexer->new(@_)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub drop_search_driver {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $table = $self->{table}->name;
 | 
			
		||||
    my $rc1 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Word_List");
 | 
			
		||||
    my $rc2 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Score_List");
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub add_search_driver {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $name    = $self->{table}->name;
 | 
			
		||||
 | 
			
		||||
# first create the table that handles the words.
 | 
			
		||||
    my $creator = $self->{table}->creator ( $name . "_Word_List" );
 | 
			
		||||
    $creator->cols(
 | 
			
		||||
        Word_ID => {
 | 
			
		||||
            pos      => 1,
 | 
			
		||||
            type     => 'int',
 | 
			
		||||
            not_null => 1,
 | 
			
		||||
            unsigned => 1
 | 
			
		||||
        },
 | 
			
		||||
        Word => {
 | 
			
		||||
            pos     => 2,
 | 
			
		||||
            type    => 'varchar',
 | 
			
		||||
            not_null=> 1,
 | 
			
		||||
            size    => '50'
 | 
			
		||||
        },
 | 
			
		||||
        Frequency => {
 | 
			
		||||
            pos     => 3,
 | 
			
		||||
            type    => 'int',
 | 
			
		||||
            not_null=> 1
 | 
			
		||||
        }
 | 
			
		||||
    );
 | 
			
		||||
    $creator->pk('Word_ID');
 | 
			
		||||
    $creator->ai('Word_ID');
 | 
			
		||||
    $creator->unique({ $name . "_wordndx" => ['Word'] });
 | 
			
		||||
    $creator->create('force') or return;
 | 
			
		||||
 | 
			
		||||
# now create the handler for scores
 | 
			
		||||
    $creator = $self->{table}->creator( $name . '_Score_List' );
 | 
			
		||||
    $creator->cols(
 | 
			
		||||
        Word_ID => {
 | 
			
		||||
            pos      => 1,
 | 
			
		||||
            type     => 'int',
 | 
			
		||||
            not_null => 1,
 | 
			
		||||
            unsigned => 1
 | 
			
		||||
        },
 | 
			
		||||
        Item_ID => {
 | 
			
		||||
            pos      => 2,
 | 
			
		||||
            type     => 'int',
 | 
			
		||||
            not_null => 1,
 | 
			
		||||
            unsigned => 1
 | 
			
		||||
        },
 | 
			
		||||
        Score => {
 | 
			
		||||
            pos      => 3,
 | 
			
		||||
            type     => 'int',
 | 
			
		||||
            not_null => 1
 | 
			
		||||
        },
 | 
			
		||||
        Word_Pos => {
 | 
			
		||||
            pos      => 4,
 | 
			
		||||
            type     => 'int',
 | 
			
		||||
            not_null => 1
 | 
			
		||||
        }
 | 
			
		||||
    );
 | 
			
		||||
    $creator->index({ 'wndx' => ['Word_ID', 'Item_ID', 'Score'], 'itndx' => ['Item_ID'] });
 | 
			
		||||
    $creator->create('force') or return;
 | 
			
		||||
    return 1;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub post_create_table {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# creates the index tables..
 | 
			
		||||
#
 | 
			
		||||
    return $_[0]->add_search_driver(@_);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub post_drop_table {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# Remove the index tables.
 | 
			
		||||
#
 | 
			
		||||
    return $_[0]->drop_search_driver(@_);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub init_queries {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# Pre-load all our queries.
 | 
			
		||||
#
 | 
			
		||||
    my $self       = shift;
 | 
			
		||||
    my $queries    = shift;
 | 
			
		||||
 | 
			
		||||
    my $driver     = $self->{table}->{driver} or return $self->error ('NODRIVER', 'FATAL');
 | 
			
		||||
    my $table_name = $self->{table}->name()   or return $self->error('NOSCHEMA', 'FATAL');
 | 
			
		||||
    my $wtable     = $table_name . '_Word_List';
 | 
			
		||||
    my $seq        = $wtable . '_seq';
 | 
			
		||||
    my $stable     = $table_name . '_Score_List';
 | 
			
		||||
 | 
			
		||||
    my %ai_queries = (
 | 
			
		||||
        ins_word_ORACLE => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES ($seq.NEXTVAL, ?, ?)",
 | 
			
		||||
        ins_word_PG     => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES (NEXTVAL('$seq'), ?, ?)",
 | 
			
		||||
        ins_word        => "INSERT INTO $wtable (Word, Frequency) VALUES (?, ?)"
 | 
			
		||||
    );
 | 
			
		||||
    my %queries = (
 | 
			
		||||
        upd_word  => "UPDATE $wtable SET Frequency = ? WHERE  Word_ID = ?",
 | 
			
		||||
        sel_word  => "SELECT Word_ID,Word,Frequency FROM $wtable WHERE  Word = ?",
 | 
			
		||||
        sel_freq  => "SELECT Frequency FROM $wtable WHERE  Word_ID = ?",
 | 
			
		||||
        del_word  => "DELETE FROM $wtable WHERE  Word_ID = ?",
 | 
			
		||||
        mod_word  => "UPDATE $wtable SET Frequency = Frequency - ? WHERE Word_ID = ?",
 | 
			
		||||
        ins_scor  => "INSERT INTO $stable (Word_ID, Item_ID, Score, Word_Pos) VALUES (?, ?, ?, ?)",
 | 
			
		||||
        item_cnt  => "SELECT Word_ID, COUNT(*) FROM $stable WHERE Item_ID = ? GROUP BY Word_ID",
 | 
			
		||||
        scr_del   => "DELETE FROM $stable WHERE Item_ID = ?",
 | 
			
		||||
        dump_word => "DELETE FROM $wtable",
 | 
			
		||||
        dump_scor => "DELETE FROM $stable"
 | 
			
		||||
    );
 | 
			
		||||
    my $type = uc $self->{table}->{connect}->{driver};
 | 
			
		||||
    $self->{ins_word} = $driver->prepare($ai_queries{"ins_word_$type"} || $ai_queries{"ins_word"});
 | 
			
		||||
 | 
			
		||||
# check to see if the table exist
 | 
			
		||||
    $self->{table}->new_table( $wtable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
 | 
			
		||||
    $self->{table}->new_table( $stable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    if ($type eq 'MYSQL') {
 | 
			
		||||
        foreach my $query (keys %queries) {
 | 
			
		||||
            $self->{$query} = $driver->prepare_raw ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        foreach my $query (keys %queries) {
 | 
			
		||||
            $self->{$query} = $driver->prepare ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub post_add_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# indexes a single record
 | 
			
		||||
    my ($self, $rec, $insert_sth ) = @_;
 | 
			
		||||
 | 
			
		||||
# Only continue if we have weights and a primary key.
 | 
			
		||||
    my $tbl     = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    my %weights = $tbl->_weight_cols() or return;
 | 
			
		||||
    my ($pk)    = $tbl->pk();
 | 
			
		||||
    my $item_id = ( $tbl->ai() and $insert_sth ) ? $insert_sth->insert_id() : $rec->{$pk};
 | 
			
		||||
    my $index   = 0;
 | 
			
		||||
 | 
			
		||||
    $self->{init} or $self->init_queries;
 | 
			
		||||
 | 
			
		||||
# Go through each column and index it.
 | 
			
		||||
    foreach my $column ( keys %weights ) {
 | 
			
		||||
        my ($word_list, $rejected) = $self->_tokenize( $rec->{$column} );
 | 
			
		||||
        $word_list or next;
 | 
			
		||||
 | 
			
		||||
# Build a hash of word => frequency.
 | 
			
		||||
        my %words;
 | 
			
		||||
        foreach my $word (@{$word_list}) {
 | 
			
		||||
            $words{$word}++;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
# Add the words in, or update frequency.
 | 
			
		||||
        my %word_ids = ();
 | 
			
		||||
        while (my ($word, $freq) = each %words) {
 | 
			
		||||
            $self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
            my $word_r = $self->{sel_word}->fetchrow_arrayref; # Word_ID, Word, Frequency
 | 
			
		||||
            if ($word_r) {
 | 
			
		||||
                $word_r->[2] += $freq;
 | 
			
		||||
                $word_ids{$word} = $word_r->[0];
 | 
			
		||||
                $self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
            }
 | 
			
		||||
            else {
 | 
			
		||||
                $self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
                $word_ids{$word} = $self->{ins_word}->insert_id();
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
# now that we have the word ids, insert each of the word-points
 | 
			
		||||
        my $weight = $weights{$column};
 | 
			
		||||
        foreach my $word ( @{$word_list} ) {
 | 
			
		||||
            $self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
        }
 | 
			
		||||
        $index++;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub reindex_all {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $table   = shift;
 | 
			
		||||
    my $opts    = shift;
 | 
			
		||||
    my $tick    = $opts->{tick} || 0;
 | 
			
		||||
    my $max     = $opts->{max}  || 5000;
 | 
			
		||||
 | 
			
		||||
    my %weights     = $self->{table}->_weight_cols() or return;
 | 
			
		||||
    my @weight_list = keys %weights;
 | 
			
		||||
    my @weight_arr  = map { $weights{$_} } @weight_list;
 | 
			
		||||
    my ($pk)    = $self->{table}->pk();
 | 
			
		||||
    my $index   = 0;
 | 
			
		||||
    my $word_id = 1;
 | 
			
		||||
    $self->{init} or $self->init_queries;
 | 
			
		||||
    
 | 
			
		||||
# first nuke the current index
 | 
			
		||||
    $self->dump_index();
 | 
			
		||||
 | 
			
		||||
# Go through the table and index each field.
 | 
			
		||||
    my $iterations = 1;
 | 
			
		||||
    my $count = 0;
 | 
			
		||||
 | 
			
		||||
    while (1) {
 | 
			
		||||
        if ($max) {
 | 
			
		||||
            my $offset = ($iterations-1) * $max;
 | 
			
		||||
            $table->select_options ( "LIMIT $offset,$max");
 | 
			
		||||
        }
 | 
			
		||||
        my $cond     = $opts->{cond} || {};
 | 
			
		||||
        my $sth      = $table->select($cond, [ $pk, @weight_list] );
 | 
			
		||||
        my $done     = 1;
 | 
			
		||||
 | 
			
		||||
        while ( my $arrayref = $sth->fetchrow_arrayref() ) {
 | 
			
		||||
# the primary key value
 | 
			
		||||
            my $i       = 0;
 | 
			
		||||
            my $item_id = $arrayref->[($i++)];
 | 
			
		||||
            $index      = 0;
 | 
			
		||||
            $done       = 0;
 | 
			
		||||
 | 
			
		||||
# start going through the record data
 | 
			
		||||
            foreach my $weight ( @weight_arr ) {
 | 
			
		||||
                my ($word_list, $junk) = $self->_tokenize( $arrayref->[$i++]  );
 | 
			
		||||
                $word_list or next;
 | 
			
		||||
 | 
			
		||||
# Build a hash of word => frequency.
 | 
			
		||||
                my %words;
 | 
			
		||||
                foreach my $word (@{$word_list}) {
 | 
			
		||||
                    $words{$word}++;
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
# Add the words in, or update frequency.
 | 
			
		||||
                my %word_ids = ();
 | 
			
		||||
                while (my ($word, $freq) = each %words) {
 | 
			
		||||
                    $self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
                    my $word_r = $self->{sel_word}->fetchrow_arrayref; # WordID,Word,Freq
 | 
			
		||||
                    if ($word_r) {
 | 
			
		||||
                        $word_r->[2] += $freq;
 | 
			
		||||
                        $word_ids{$word} = $word_r->[0];
 | 
			
		||||
                        $self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
                    }
 | 
			
		||||
                    else {
 | 
			
		||||
                        $self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
                        $word_ids{$word} = $self->{ins_word}->insert_id();
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
# now that we have the word ids, insert each of the word-points
 | 
			
		||||
                foreach my $word ( @{$word_list} ) {
 | 
			
		||||
                    $self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
                }
 | 
			
		||||
                $index++;
 | 
			
		||||
            }
 | 
			
		||||
            if ($tick) {
 | 
			
		||||
                $count++;
 | 
			
		||||
                $count % $tick      or (print "$count ");
 | 
			
		||||
                $count % ($tick*10) or (print "\n");
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        return if ($done);
 | 
			
		||||
        $iterations++;
 | 
			
		||||
        return if (! $max);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub pre_delete_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# Delete a records index values.
 | 
			
		||||
#
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $where   = shift; 
 | 
			
		||||
 | 
			
		||||
    my $tbl     = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    my %weights = $tbl->_weight_cols() or return;
 | 
			
		||||
    my ($pk)    = $tbl->pk();
 | 
			
		||||
    my $q       = $tbl->select( $where, [ $pk ] );
 | 
			
		||||
 | 
			
		||||
    while ( my $aref = $q->fetchrow_arrayref() ) {
 | 
			
		||||
        my $item_id = $aref->[0] or next;
 | 
			
		||||
        my @weight_list = keys %weights;
 | 
			
		||||
        my $index   = 0;
 | 
			
		||||
        $self->{init} or $self->init_queries;
 | 
			
		||||
 | 
			
		||||
    # Get a frequency count for each word 
 | 
			
		||||
        $self->{item_cnt}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
 | 
			
		||||
    # Now go through and either decrement the freq, or remove the entry.
 | 
			
		||||
        while ( my ($word_id, $frequency) = $self->{item_cnt}->fetchrow_array() ) {
 | 
			
		||||
            $self->{sel_freq}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
            $self->debug( "Deleting frequencies for $word_id. decreasing by $frequency" ) if ($self->{_debug});
 | 
			
		||||
            if (my $freq = $self->{sel_freq}->fetchrow_arrayref) {
 | 
			
		||||
                if ($freq->[0] == $frequency) {
 | 
			
		||||
                    $self->{del_word}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
                }
 | 
			
		||||
                else {
 | 
			
		||||
                    $self->{mod_word}->execute($frequency, $word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    # Remove the listings from the scores table.
 | 
			
		||||
        $self->{scr_del}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
    }
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub post_update_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
    my ( $self, $set_cond, $where_cond, $tmp ) = @_;
 | 
			
		||||
 | 
			
		||||
# delete the previous record
 | 
			
		||||
    $self->pre_delete_record( $where_cond ) or return;
 | 
			
		||||
#
 | 
			
		||||
# the new record
 | 
			
		||||
    my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    my $q   = $tbl->select( $where_cond );
 | 
			
		||||
    while ( my $href = $q->fetchrow_hashref() ) {
 | 
			
		||||
        $self->post_add_record( $href );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return 1;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub reindex_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# reindexes a record. basically deletes all associated records from current db abnd does an index.
 | 
			
		||||
# it's safe to use this
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $rec     = shift;
 | 
			
		||||
 | 
			
		||||
    $self->delete_record($rec);
 | 
			
		||||
    $self->index_record($rec);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub dump_index {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    $self->{init} or $self->init_queries;
 | 
			
		||||
 | 
			
		||||
    $self->{dump_word}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
    $self->{dump_scor}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
sub debug_dumper {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# calls debug but also dumps all the messages
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $message = shift;
 | 
			
		||||
    my $level   = ref $_[0] ? 1 : shift;
 | 
			
		||||
 | 
			
		||||
    if ( $self->{_debug} >= $level ) {
 | 
			
		||||
        require GT::Dumper;
 | 
			
		||||
        $self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ ));
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub DESTROY {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Calls finish on init queries.
 | 
			
		||||
#
 | 
			
		||||
    my $self = shift;
 | 
			
		||||
    return unless ($self->{init});
 | 
			
		||||
    $self->{upd_word}->finish;
 | 
			
		||||
#   $self->{ins_word}->finish; will get finished automatically
 | 
			
		||||
    $self->{sel_word}->finish;
 | 
			
		||||
    $self->{sel_freq}->finish;
 | 
			
		||||
    $self->{del_word}->finish;
 | 
			
		||||
    $self->{mod_word}->finish;
 | 
			
		||||
    $self->{ins_scor}->finish;
 | 
			
		||||
    $self->{item_cnt}->finish;
 | 
			
		||||
    $self->{scr_del}->finish;
 | 
			
		||||
    $self->{dump_word}->finish;
 | 
			
		||||
    $self->{dump_scor}->finish;
 | 
			
		||||
    $self->{init} = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
1;
 | 
			
		||||
@@ -0,0 +1,604 @@
 | 
			
		||||
# ==================================================================
 | 
			
		||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
 | 
			
		||||
#
 | 
			
		||||
#   GT::SQL::Indexer
 | 
			
		||||
#   Author  : Aki Mimoto
 | 
			
		||||
#   CVS Info : 087,071,086,086,085      
 | 
			
		||||
#   $Id: Search.pm,v 1.18 2004/08/28 03:53:47 jagerman Exp $
 | 
			
		||||
#
 | 
			
		||||
# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
 | 
			
		||||
# ==================================================================
 | 
			
		||||
#
 | 
			
		||||
# Description:
 | 
			
		||||
#   Class used to make changes to tables and create tables.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
package GT::SQL::Search::INTERNAL::Search;
 | 
			
		||||
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
    use strict;
 | 
			
		||||
    use vars qw/@ISA $VERSION $DEBUG $ATTRIBS /;
 | 
			
		||||
    use GT::SQL::Search::Base::Search;
 | 
			
		||||
    @ISA = qw( GT::SQL::Search::Base::Search );
 | 
			
		||||
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Preamble information related to the object
 | 
			
		||||
 | 
			
		||||
    $DEBUG   = 0;
 | 
			
		||||
    $VERSION = sprintf "%d.%03d", q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/;
 | 
			
		||||
    $ATTRIBS = {
 | 
			
		||||
# the max number of links that can be handled by UNION before it should simply 
 | 
			
		||||
# shunt the searching pipe to NONINDEXED system
 | 
			
		||||
        'union_shunt_threshold'  => '5000',
 | 
			
		||||
        'phrase_shunt_threshold' => '1000',
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
################################################################################
 | 
			
		||||
# Internal functions
 | 
			
		||||
################################################################################
 | 
			
		||||
 | 
			
		||||
sub load {
 | 
			
		||||
    shift;
 | 
			
		||||
    return GT::SQL::Search::INTERNAL::Search->new(@_)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _query {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# this just checks to ensure that the words are not all search keywords
 | 
			
		||||
#
 | 
			
		||||
    my ( $self, $input, $buckets ) = @_;
 | 
			
		||||
 | 
			
		||||
# calculate wordids and frequencies
 | 
			
		||||
    foreach ( keys %$buckets ) {
 | 
			
		||||
        $buckets->{$_} = $self->get_wordids( $buckets->{$_}, ( /phrase/ ? "phrases" : "keywords" ) );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
# the following is a bit tricky and will be replaced however, if the number 
 | 
			
		||||
# of results from a union is more than the maximum shunt value, it will 
 | 
			
		||||
# simply do a nonindexed query
 | 
			
		||||
    if ( $buckets->{keywords} ) {
 | 
			
		||||
        my $rec       = _count_frequencies( $buckets->{keywords} );
 | 
			
		||||
        my $count     = 0;
 | 
			
		||||
        foreach ( values %$rec ) { $count +=  $_; }
 | 
			
		||||
        if ($count > $self->{union_shunt_threshold}) {
 | 
			
		||||
            $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
 | 
			
		||||
            return $self->alternate_driver_query( 'NONINDEXED', $input );
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Now test the phrases. Just due to how the phrase searching works, the queries
 | 
			
		||||
# can grow in size extremely rapidly, and slowdown the search. So the limit for
 | 
			
		||||
# phrase searching is separate as it requires a different cutoff value than
 | 
			
		||||
# the keyword search which is usually much lower!
 | 
			
		||||
    if ($buckets->{phrases}) {
 | 
			
		||||
        foreach my $phrase ( keys %{$buckets->{phrases} || {} } ) {
 | 
			
		||||
            my $rec       = _count_frequencies( $buckets->{phrases}->{$phrase}->{word_info} );
 | 
			
		||||
            my ( $count ) = sort values %$rec; # Get smallest frequency.
 | 
			
		||||
            if ( $count > $self->{phrase_shunt_threshold} ) {
 | 
			
		||||
                $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
 | 
			
		||||
                return $self->alternate_driver_query( 'NONINDEXED', $input );
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    if ($buckets->{phrases_must}) {
 | 
			
		||||
        foreach my $phrase ( keys %{$buckets->{phrases_must} || {} } ) {
 | 
			
		||||
            my $rec       = _count_frequencies( $buckets->{phrases_must}->{$phrase}->{word_info} );
 | 
			
		||||
            my ( $count ) = sort values %$rec; # Get smallest frequency.
 | 
			
		||||
            if ( $count > $self->{phrase_shunt_threshold} ) {
 | 
			
		||||
                $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
 | 
			
		||||
                return $self->alternate_driver_query( 'NONINDEXED', $input );
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    return $self->SUPER::_query( $input, $buckets );
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _count_frequencies {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
    my $word_info = shift;
 | 
			
		||||
    my $rec       = {};
 | 
			
		||||
    foreach my $word ( keys %$word_info ) {
 | 
			
		||||
        my $freq  = 0;
 | 
			
		||||
        foreach ( values %{$word_info->{$word}->{word_info}} ) {
 | 
			
		||||
            $freq += $_;
 | 
			
		||||
        }
 | 
			
		||||
        $rec->{$word} = $freq;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $rec;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _table_names {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# return the table names
 | 
			
		||||
#
 | 
			
		||||
    my $self    =  shift;
 | 
			
		||||
    my $table   = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
 | 
			
		||||
    my $wtable  = $table . '_Word_List';
 | 
			
		||||
    my $stable  = $table . '_Score_List';
 | 
			
		||||
 | 
			
		||||
    return ( $table, $wtable, $stable);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _word_infos {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# get the word ids and frequencies 
 | 
			
		||||
#
 | 
			
		||||
    my $self       = shift;
 | 
			
		||||
    my $word_infos = shift;
 | 
			
		||||
 | 
			
		||||
    my $rec        = {};
 | 
			
		||||
 | 
			
		||||
    foreach my $word ( keys %$word_infos ) {
 | 
			
		||||
        my $wi     = $word_infos->{$word}->{word_info};
 | 
			
		||||
        $rec->{$word} = [ map { [ $_, $wi->{$_} ] } keys %$wi ];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $rec;
 | 
			
		||||
    
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _union_query {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Takes a list of words and gets all words that match
 | 
			
		||||
# returns { itemid -> score } of hits that match
 | 
			
		||||
#
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $words   = shift;
 | 
			
		||||
    my $results = shift || {};
 | 
			
		||||
    my ( $query, $where, $db, $word_infos );
 | 
			
		||||
    my ( $table, $wtable, $stable) = $self->_table_names();
 | 
			
		||||
 | 
			
		||||
    $db         = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    $word_infos = $self->_word_infos( $words ) or return $results;
 | 
			
		||||
 | 
			
		||||
    return $results unless (keys %{$word_infos});
 | 
			
		||||
 | 
			
		||||
    $self->debug_dumper( "Getting words: ", $words) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# build the where clause
 | 
			
		||||
    my @word_ids;
 | 
			
		||||
    foreach my $word_synonym_list  ( values %$word_infos ) {
 | 
			
		||||
        next unless ( $word_synonym_list );
 | 
			
		||||
        foreach my $word_id ( @{$word_synonym_list }) {
 | 
			
		||||
            next unless ( ref $word_id eq 'ARRAY' ); # ensure it's a reference
 | 
			
		||||
            push @word_ids, $word_id->[0]; # we need to shed the word quantities
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $results unless ( @word_ids );
 | 
			
		||||
    $where = 'Word_ID IN(' . join(",", @word_ids) . ")";
 | 
			
		||||
 | 
			
		||||
# build the query
 | 
			
		||||
    $query = qq!
 | 
			
		||||
        SELECT Item_ID, SUM(Score)
 | 
			
		||||
            FROM $stable
 | 
			
		||||
            WHERE
 | 
			
		||||
                $where
 | 
			
		||||
            GROUP BY Item_ID
 | 
			
		||||
    !;
 | 
			
		||||
 | 
			
		||||
    $self->debug( "Union Query: $query" ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# prepare the query
 | 
			
		||||
    my $sth = $db->prepare( $query ) or return;
 | 
			
		||||
    $sth->execute() or return;
 | 
			
		||||
 | 
			
		||||
# get the results
 | 
			
		||||
    my %word_infos = $sth->fetchall_list;
 | 
			
		||||
 | 
			
		||||
# merge the current result set into found
 | 
			
		||||
    foreach my $item ( keys %{$results} ) {
 | 
			
		||||
        $word_infos{$item} += $results->{$item};
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    return \%word_infos;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _intersect_query {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Takes a list of words and gets all words that match all the keywords
 | 
			
		||||
# returns { itemid -> score } of hits that match
 | 
			
		||||
#
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $words   = shift;
 | 
			
		||||
    my $results = shift || {};
 | 
			
		||||
 | 
			
		||||
    $words or return $results;
 | 
			
		||||
    keys %{$words} or return $results;
 | 
			
		||||
 | 
			
		||||
    my ( $query, $where, $db, $word_infos, $word_hits );
 | 
			
		||||
    my ( $table, $wtable, $stable) = $self->_table_names();
 | 
			
		||||
 | 
			
		||||
# have we left any of our words out?
 | 
			
		||||
    $db         = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    $word_infos = $self->_word_infos( $words ) or return {};
 | 
			
		||||
    if ( keys %{$word_infos} < keys %{$words} ) {
 | 
			
		||||
        return {};
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    $self->debug_dumper( "Keyword Intersect words: ", $word_infos ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# take the words and get a hash of the word scores
 | 
			
		||||
    foreach my $word ( keys %{$word_infos} ) {
 | 
			
		||||
 | 
			
		||||
        my $total_freq = 0;
 | 
			
		||||
        foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
 | 
			
		||||
            $total_freq += $word_synonyms->[1];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        $word_hits->{$word} = $total_freq or return;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
# so now, sort out the words from lowest frequency to highest frequency
 | 
			
		||||
    my @search_order = sort { $word_hits->{$a} <=> $word_hits->{$b} } keys %{$word_hits};
 | 
			
		||||
 | 
			
		||||
    $self->debug_dumper( "Searching words in this order: ", \@search_order) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# find out how we're going to handle the searching, if the first elements
 | 
			
		||||
 | 
			
		||||
################################################################################
 | 
			
		||||
### The following part is for smaller intersect subsets
 | 
			
		||||
################################################################################
 | 
			
		||||
    my $intersect = $results;
 | 
			
		||||
    foreach my $word ( @search_order ) {
 | 
			
		||||
 | 
			
		||||
# setup the where clause to get all the words associated
 | 
			
		||||
        my $where   = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
 | 
			
		||||
 | 
			
		||||
# setup the intersect for the previous if required. for iterative intersecting
 | 
			
		||||
        if ( keys %{$intersect} ) {
 | 
			
		||||
            $where  .= " AND Item_ID in(" . join(",",keys %{$intersect}) . ")";
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
# make the database engine work a little bit
 | 
			
		||||
        $query = qq!
 | 
			
		||||
            SELECT Item_ID, SUM(Score) AS Score
 | 
			
		||||
            FROM $stable
 | 
			
		||||
            WHERE
 | 
			
		||||
                $where
 | 
			
		||||
            GROUP BY Item_ID
 | 
			
		||||
        !;
 | 
			
		||||
        $self->debug( "Intersect Query: $query" ) if ($self->{_debug});
 | 
			
		||||
        my $intersect_sth = $db->prepare( $query );
 | 
			
		||||
 | 
			
		||||
        $intersect_sth->execute();
 | 
			
		||||
 | 
			
		||||
# get a list of all the matches
 | 
			
		||||
        my $matches = $intersect_sth->fetchall_arrayref();
 | 
			
		||||
 | 
			
		||||
        $self->debug_dumper( "Matches found for $word: ", $matches ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# go through all the matches and intersect them
 | 
			
		||||
        my %tmp = ();
 | 
			
		||||
        foreach my $row ( @{$matches} ) {
 | 
			
		||||
            my ( $itemid, $score ) = @{$row};
 | 
			
		||||
            $intersect->{$itemid} ||= 0;
 | 
			
		||||
            $tmp{ $itemid } = $intersect->{$itemid} + $score;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
# inform the system of that development
 | 
			
		||||
        %tmp or return;
 | 
			
		||||
        $intersect = \%tmp;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $intersect;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _disjoin_query {
 | 
			
		||||
#------------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $words   = shift;
 | 
			
		||||
    my $results = shift || {};
 | 
			
		||||
    $words or return $results;
 | 
			
		||||
 | 
			
		||||
    my ( $query, $where, $db, $word_infos, $word_hits );
 | 
			
		||||
    my ( $table, $wtable, $stable) = $self->_table_names();
 | 
			
		||||
 | 
			
		||||
    $db = $self->{table}->{driver} or return $results;
 | 
			
		||||
 | 
			
		||||
# have we left any of our words out?
 | 
			
		||||
    $word_infos = $self->_word_infos( $words ) or return $results;
 | 
			
		||||
#   if ( keys %{$word_infos} < keys %{$words} ) {
 | 
			
		||||
#       return $results;
 | 
			
		||||
#   }
 | 
			
		||||
 | 
			
		||||
# take the words and get a hash of the word scores
 | 
			
		||||
    foreach my $word ( keys %{$word_infos} ) {
 | 
			
		||||
        my $total_freq = 0;
 | 
			
		||||
        foreach my $word_synonyms ( $word_infos->{$word} ) {
 | 
			
		||||
            $total_freq += ( $word_synonyms->[0] || 0 );
 | 
			
		||||
        }
 | 
			
		||||
# if the value is null this mean there is actually no results, whoops!
 | 
			
		||||
        $total_freq and $word_hits->{$word} = $total_freq;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
# so now, sort out the words from lowest frequency to highest frequency
 | 
			
		||||
    my @search_order = sort { $word_hits->{$b} <=> $word_hits->{$b} } keys %{$word_hits};
 | 
			
		||||
    $self->debug_dumper( "Disjoining words in the following order: ", \@search_order) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
################################################################################
 | 
			
		||||
### This following part is for smaller disjoin presets
 | 
			
		||||
################################################################################
 | 
			
		||||
    foreach my $word ( @search_order ) {
 | 
			
		||||
 | 
			
		||||
# setup the where clause to get all the words associated
 | 
			
		||||
        my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
 | 
			
		||||
 | 
			
		||||
# setup the intersect for the previous if required. for iterative intersecting
 | 
			
		||||
        if ( keys %{$results} ) {
 | 
			
		||||
            $where .= " AND Item_ID in(" . join(",", keys %{$results}) . ")";
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
# make the database engine work a little bit
 | 
			
		||||
        $query = qq!
 | 
			
		||||
            SELECT Item_ID
 | 
			
		||||
            FROM $stable
 | 
			
		||||
            WHERE
 | 
			
		||||
                $where
 | 
			
		||||
            GROUP BY Item_ID
 | 
			
		||||
        !;
 | 
			
		||||
        $self->debug($query) if ($self->{_debug});
 | 
			
		||||
        my $intersect_sth = $db->prepare( $query );
 | 
			
		||||
 | 
			
		||||
        $intersect_sth->execute();
 | 
			
		||||
 | 
			
		||||
# get a list of all the matches
 | 
			
		||||
        my $matches = $intersect_sth->fetchall_arrayref();
 | 
			
		||||
 | 
			
		||||
# strip the matches from the current result set
 | 
			
		||||
        foreach my $word ( map { $_->[0] } @{$matches}) {
 | 
			
		||||
            delete $results->{$word};
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $results;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _phrase_disjoin_query {
 | 
			
		||||
#------------------------------------------------------------
 | 
			
		||||
# subtracts the found phrases from the list
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $phrases = shift;
 | 
			
		||||
    my $results = shift || {};
 | 
			
		||||
    $phrases or return $results;
 | 
			
		||||
 | 
			
		||||
    foreach my $phrase ( values %{$phrases} ) {
 | 
			
		||||
        my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
 | 
			
		||||
 | 
			
		||||
# perform disjoin
 | 
			
		||||
        foreach my $itemid ( keys %{$temp} ) {
 | 
			
		||||
            $self->debug( "Deleting $itemid from list" ) if ($self->{_debug});
 | 
			
		||||
            delete $results->{$itemid};
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $results;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _phrase_intersect_query {
 | 
			
		||||
#------------------------------------------------------------
 | 
			
		||||
# intersects phrases together
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $phrases = shift;
 | 
			
		||||
    my $results = shift || {};
 | 
			
		||||
 | 
			
		||||
    $phrases or return $results;
 | 
			
		||||
 | 
			
		||||
    foreach my $phrase ( values %{$phrases} ) {
 | 
			
		||||
        my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
 | 
			
		||||
 | 
			
		||||
# perform intersect
 | 
			
		||||
        foreach my $itemid ( keys %{$temp} ) {
 | 
			
		||||
            $temp->{$itemid} += $results->{$itemid} || 0;
 | 
			
		||||
        }
 | 
			
		||||
        $results = $temp;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $results;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _phrase_query {
 | 
			
		||||
#------------------------------------------------------------
 | 
			
		||||
# this is a phrase union query
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $phrases = shift or return;
 | 
			
		||||
    my $results = shift || {};
 | 
			
		||||
 | 
			
		||||
    foreach my $phrase ( values %{$phrases} ) {
 | 
			
		||||
        $self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug});
 | 
			
		||||
        $results = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info}, $results );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $results;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _get_phrase {
 | 
			
		||||
#------------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $wordlist= shift;
 | 
			
		||||
    my $word_info = shift;
 | 
			
		||||
    my $results = shift || {};
 | 
			
		||||
 | 
			
		||||
    $wordlist or return $results;
 | 
			
		||||
 | 
			
		||||
    my ( $query, $where, $db, $word_infos, %word_hits );
 | 
			
		||||
    my ( $table, $wtable, $stable) = $self->_table_names();
 | 
			
		||||
    my ($pk) = $self->{table}->pk;
 | 
			
		||||
 | 
			
		||||
    $self->debug_dumper( "Getting words: ", $wordlist ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# get all the word ids that we want to handle   
 | 
			
		||||
    $db         = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    $word_infos = $self->_word_infos( $word_info ) or return;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    $self->debug_dumper( "Word infos: ", $word_infos ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# take the words and get a hash of the word scores
 | 
			
		||||
    foreach my $word ( keys %{$word_infos} ) {
 | 
			
		||||
 | 
			
		||||
        @{$word_infos->{$word} || []} or return;
 | 
			
		||||
 | 
			
		||||
        my $total_freq = 0;
 | 
			
		||||
        foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
 | 
			
		||||
            $total_freq += $word_synonyms->[1];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
# if the value is null this mean there is actually no results, whoops!
 | 
			
		||||
        $word_hits{$word} = $total_freq;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    $self->debug_dumper( "With synonyms tallied: ",  \%word_hits ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# so now, setup the order of search
 | 
			
		||||
    my $i = 0;
 | 
			
		||||
    my %word_order = map { $_ => $i++ } @{$wordlist};
 | 
			
		||||
    my @search_order = sort { $word_hits{$a} <=> $word_hits{$b} } keys %word_hits;
 | 
			
		||||
 | 
			
		||||
    $self->debug_dumper( "Word search order: ", \@search_order ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
################################################################################
 | 
			
		||||
### This following part is for smaller phrases
 | 
			
		||||
################################################################################
 | 
			
		||||
# start getting words in order of their frequency
 | 
			
		||||
    my %matches = ();
 | 
			
		||||
    my $index = 0;
 | 
			
		||||
    foreach my $word ( @search_order ) {
 | 
			
		||||
 | 
			
		||||
# setup the where clause for the individual words, firstly
 | 
			
		||||
        if ( keys %matches ) {
 | 
			
		||||
            my $vector  = $word_order{$word} - $index;
 | 
			
		||||
            $where = '(';
 | 
			
		||||
            $where =
 | 
			
		||||
                '(' .
 | 
			
		||||
                join(
 | 
			
		||||
                    " OR ",
 | 
			
		||||
                    map(
 | 
			
		||||
                        "Item_ID = $_ AND Word_Pos IN(" . join(",", map $_->[0] + $vector, @{$matches{$_}}) . ')',
 | 
			
		||||
                        keys %matches
 | 
			
		||||
                    )
 | 
			
		||||
                ) .
 | 
			
		||||
                ") AND ";
 | 
			
		||||
        }
 | 
			
		||||
        else {
 | 
			
		||||
            $where = '';
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        $where .= "Word_ID IN(" . ( join ",", map { $_->[0] || () } @{$word_infos->{$word}} or return $results ) . ')';
 | 
			
		||||
 | 
			
		||||
        $query = qq!
 | 
			
		||||
            SELECT 
 | 
			
		||||
                Item_ID, Score, Word_Pos
 | 
			
		||||
            FROM 
 | 
			
		||||
                $stable 
 | 
			
		||||
            WHERE
 | 
			
		||||
                $where
 | 
			
		||||
        !;
 | 
			
		||||
 | 
			
		||||
        $self->debug( "Phrase get for '$word': " . $query ) if ($self->{_debug});
 | 
			
		||||
        my $sth = $db->prepare( $query );
 | 
			
		||||
        $sth->execute();
 | 
			
		||||
 | 
			
		||||
        %matches = ();
 | 
			
		||||
 | 
			
		||||
        while (my $hit = $sth->fetchrow_arrayref) {
 | 
			
		||||
            push @{$matches{$hit->[0]}}, [ $hit->[2], $hit->[1] ];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
# If there are no values stored in %matches, it means that for
 | 
			
		||||
# this keyword, there have been no hits based upon position.
 | 
			
		||||
# In that case, terminate and return a null result
 | 
			
		||||
        keys %matches or last;
 | 
			
		||||
 | 
			
		||||
# where were we in the string?
 | 
			
		||||
        $index = $word_order{$word};
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
# now tally up all the scores and merge the new records in
 | 
			
		||||
    foreach my $itemid ( keys %matches ) {
 | 
			
		||||
        my $score = 0;
 | 
			
		||||
        foreach my $sub_total ( @{$matches{$itemid}} ) {
 | 
			
		||||
            $score += $sub_total->[1];
 | 
			
		||||
        }
 | 
			
		||||
        $results->{$itemid} += $score;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $results;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub get_wordids {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Get a list of words 
 | 
			
		||||
#
 | 
			
		||||
    my $self     = shift;
 | 
			
		||||
    my $elements = shift or return;
 | 
			
		||||
    my $mode     = lc shift || 'keywords';
 | 
			
		||||
 | 
			
		||||
    if ( $mode eq 'keywords' ) {
 | 
			
		||||
        $elements = $self->_get_wordid($elements);
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        foreach my $phrase ( keys %$elements ) {
 | 
			
		||||
            my $results = $self->_get_wordid({
 | 
			
		||||
                map { ($_ => { substring => 0 }) } @{$elements->{$phrase}->{phrase}}
 | 
			
		||||
            });
 | 
			
		||||
 | 
			
		||||
            $elements->{$phrase}->{word_info} = $results;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $elements;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _get_wordid {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Get a list of words 
 | 
			
		||||
#
 | 
			
		||||
    my $self  = shift;
 | 
			
		||||
    my $words = shift;
 | 
			
		||||
    my $tbl   = $self->{table};
 | 
			
		||||
    
 | 
			
		||||
    my ( $table, $wtable, $stable) = $self->_table_names();
 | 
			
		||||
 | 
			
		||||
    foreach my $word ( keys %$words ) {
 | 
			
		||||
        my $query =
 | 
			
		||||
            qq!SELECT Word_ID, Frequency FROM $wtable WHERE Word LIKE '! .
 | 
			
		||||
            quotemeta($word) .
 | 
			
		||||
            ( $words->{$word}->{substring} ? '%' : '' ) .
 | 
			
		||||
            "'";
 | 
			
		||||
        my $sth = $tbl->do_query($query) or next;
 | 
			
		||||
        my $tmp = { $sth->fetchall_list };
 | 
			
		||||
 | 
			
		||||
        $words->{$word}->{word_info} = $tmp;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return $words;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
##
 | 
			
		||||
# Internal Use
 | 
			
		||||
# $self->_cgi_to_hash ($in);
 | 
			
		||||
# --------------------------
 | 
			
		||||
#   Creates a hash ref from a cgi object.
 | 
			
		||||
##
 | 
			
		||||
sub _cgi_to_hash {
 | 
			
		||||
    my ($self, $cgi) = @_;
 | 
			
		||||
    $cgi and UNIVERSAL::can($cgi, 'param') or return $self->error(NODRIVER => 'FATAL');
 | 
			
		||||
    my @keys = $cgi->param;
 | 
			
		||||
    my $result = {};
 | 
			
		||||
    foreach my $key (@keys) {
 | 
			
		||||
        my @values = $cgi->param($key);
 | 
			
		||||
        if (@values == 1) { $result->{$key} = $values[0] }
 | 
			
		||||
        else              { $result->{$key} = \@values   }
 | 
			
		||||
    }
 | 
			
		||||
    return $result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
1;
 | 
			
		||||
		Reference in New Issue
	
	Block a user