412 lines
14 KiB
Perl
412 lines
14 KiB
Perl
|
# ====================================================================
|
||
|
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||
|
#
|
||
|
# GT::SQL::Search::INTERNAL::Indexer
|
||
|
# Author: Aki Mimoto
|
||
|
# CVS Info : 087,071,086,086,085
|
||
|
# $Id: Indexer.pm,v 1.11 2004/01/13 01:35:19 jagerman Exp $
|
||
|
#
|
||
|
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||
|
# ====================================================================
|
||
|
#
|
||
|
|
||
|
package GT::SQL::Search::INTERNAL::Indexer;
|
||
|
|
||
|
# ------------------------------------------------------------------------------
|
||
|
# Preamble information related to the object
|
||
|
use strict;
|
||
|
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG /;
|
||
|
use GT::SQL::Search::Base::Indexer;
|
||
|
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||
|
$DEBUG = 0;
|
||
|
$VERSION = sprintf "%d.%03d", q$Revision: 1.11 $ =~ /(\d+)\.(\d+)/;
|
||
|
|
||
|
sub load {
|
||
|
shift;
|
||
|
return GT::SQL::Search::INTERNAL::Indexer->new(@_)
|
||
|
}
|
||
|
|
||
|
sub drop_search_driver {
|
||
|
# ------------------------------------------------------------------------------
|
||
|
my $self = shift;
|
||
|
my $table = $self->{table}->name;
|
||
|
my $rc1 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Word_List");
|
||
|
my $rc2 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Score_List");
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
sub add_search_driver {
|
||
|
# ------------------------------------------------------------------------------
|
||
|
my $self = shift;
|
||
|
my $name = $self->{table}->name;
|
||
|
|
||
|
# first create the table that handles the words.
|
||
|
my $creator = $self->{table}->creator ( $name . "_Word_List" );
|
||
|
$creator->cols(
|
||
|
Word_ID => {
|
||
|
pos => 1,
|
||
|
type => 'int',
|
||
|
not_null => 1,
|
||
|
unsigned => 1
|
||
|
},
|
||
|
Word => {
|
||
|
pos => 2,
|
||
|
type => 'varchar',
|
||
|
not_null=> 1,
|
||
|
size => '50'
|
||
|
},
|
||
|
Frequency => {
|
||
|
pos => 3,
|
||
|
type => 'int',
|
||
|
not_null=> 1
|
||
|
}
|
||
|
);
|
||
|
$creator->pk('Word_ID');
|
||
|
$creator->ai('Word_ID');
|
||
|
$creator->unique({ $name . "_wordndx" => ['Word'] });
|
||
|
$creator->create('force') or return;
|
||
|
|
||
|
# now create the handler for scores
|
||
|
$creator = $self->{table}->creator( $name . '_Score_List' );
|
||
|
$creator->cols(
|
||
|
Word_ID => {
|
||
|
pos => 1,
|
||
|
type => 'int',
|
||
|
not_null => 1,
|
||
|
unsigned => 1
|
||
|
},
|
||
|
Item_ID => {
|
||
|
pos => 2,
|
||
|
type => 'int',
|
||
|
not_null => 1,
|
||
|
unsigned => 1
|
||
|
},
|
||
|
Score => {
|
||
|
pos => 3,
|
||
|
type => 'int',
|
||
|
not_null => 1
|
||
|
},
|
||
|
Word_Pos => {
|
||
|
pos => 4,
|
||
|
type => 'int',
|
||
|
not_null => 1
|
||
|
}
|
||
|
);
|
||
|
$creator->index({ 'wndx' => ['Word_ID', 'Item_ID', 'Score'], 'itndx' => ['Item_ID'] });
|
||
|
$creator->create('force') or return;
|
||
|
return 1;
|
||
|
|
||
|
}
|
||
|
|
||
|
sub post_create_table {
|
||
|
# ------------------------------------------------------------------------------
|
||
|
# creates the index tables..
|
||
|
#
|
||
|
return $_[0]->add_search_driver(@_);
|
||
|
}
|
||
|
|
||
|
sub post_drop_table {
|
||
|
# -------------------------------------------------------
|
||
|
# Remove the index tables.
|
||
|
#
|
||
|
return $_[0]->drop_search_driver(@_);
|
||
|
}
|
||
|
|
||
|
sub init_queries {
|
||
|
# -------------------------------------------------------
|
||
|
# Pre-load all our queries.
|
||
|
#
|
||
|
my $self = shift;
|
||
|
my $queries = shift;
|
||
|
|
||
|
my $driver = $self->{table}->{driver} or return $self->error ('NODRIVER', 'FATAL');
|
||
|
my $table_name = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
|
||
|
my $wtable = $table_name . '_Word_List';
|
||
|
my $seq = $wtable . '_seq';
|
||
|
my $stable = $table_name . '_Score_List';
|
||
|
|
||
|
my %ai_queries = (
|
||
|
ins_word_ORACLE => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES ($seq.NEXTVAL, ?, ?)",
|
||
|
ins_word_PG => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES (NEXTVAL('$seq'), ?, ?)",
|
||
|
ins_word => "INSERT INTO $wtable (Word, Frequency) VALUES (?, ?)"
|
||
|
);
|
||
|
my %queries = (
|
||
|
upd_word => "UPDATE $wtable SET Frequency = ? WHERE Word_ID = ?",
|
||
|
sel_word => "SELECT Word_ID,Word,Frequency FROM $wtable WHERE Word = ?",
|
||
|
sel_freq => "SELECT Frequency FROM $wtable WHERE Word_ID = ?",
|
||
|
del_word => "DELETE FROM $wtable WHERE Word_ID = ?",
|
||
|
mod_word => "UPDATE $wtable SET Frequency = Frequency - ? WHERE Word_ID = ?",
|
||
|
ins_scor => "INSERT INTO $stable (Word_ID, Item_ID, Score, Word_Pos) VALUES (?, ?, ?, ?)",
|
||
|
item_cnt => "SELECT Word_ID, COUNT(*) FROM $stable WHERE Item_ID = ? GROUP BY Word_ID",
|
||
|
scr_del => "DELETE FROM $stable WHERE Item_ID = ?",
|
||
|
dump_word => "DELETE FROM $wtable",
|
||
|
dump_scor => "DELETE FROM $stable"
|
||
|
);
|
||
|
my $type = uc $self->{table}->{connect}->{driver};
|
||
|
$self->{ins_word} = $driver->prepare($ai_queries{"ins_word_$type"} || $ai_queries{"ins_word"});
|
||
|
|
||
|
# check to see if the table exist
|
||
|
$self->{table}->new_table( $wtable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
|
||
|
$self->{table}->new_table( $stable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
|
||
|
|
||
|
|
||
|
if ($type eq 'MYSQL') {
|
||
|
foreach my $query (keys %queries) {
|
||
|
$self->{$query} = $driver->prepare_raw ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
foreach my $query (keys %queries) {
|
||
|
$self->{$query} = $driver->prepare ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub post_add_record {
|
||
|
# -------------------------------------------------------
|
||
|
# indexes a single record
|
||
|
my ($self, $rec, $insert_sth ) = @_;
|
||
|
|
||
|
# Only continue if we have weights and a primary key.
|
||
|
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||
|
my %weights = $tbl->_weight_cols() or return;
|
||
|
my ($pk) = $tbl->pk();
|
||
|
my $item_id = ( $tbl->ai() and $insert_sth ) ? $insert_sth->insert_id() : $rec->{$pk};
|
||
|
my $index = 0;
|
||
|
|
||
|
$self->{init} or $self->init_queries;
|
||
|
|
||
|
# Go through each column and index it.
|
||
|
foreach my $column ( keys %weights ) {
|
||
|
my ($word_list, $rejected) = $self->_tokenize( $rec->{$column} );
|
||
|
$word_list or next;
|
||
|
|
||
|
# Build a hash of word => frequency.
|
||
|
my %words;
|
||
|
foreach my $word (@{$word_list}) {
|
||
|
$words{$word}++;
|
||
|
}
|
||
|
|
||
|
# Add the words in, or update frequency.
|
||
|
my %word_ids = ();
|
||
|
while (my ($word, $freq) = each %words) {
|
||
|
$self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
my $word_r = $self->{sel_word}->fetchrow_arrayref; # Word_ID, Word, Frequency
|
||
|
if ($word_r) {
|
||
|
$word_r->[2] += $freq;
|
||
|
$word_ids{$word} = $word_r->[0];
|
||
|
$self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
else {
|
||
|
$self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
$word_ids{$word} = $self->{ins_word}->insert_id();
|
||
|
}
|
||
|
}
|
||
|
# now that we have the word ids, insert each of the word-points
|
||
|
my $weight = $weights{$column};
|
||
|
foreach my $word ( @{$word_list} ) {
|
||
|
$self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
$index++;
|
||
|
}
|
||
|
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
sub reindex_all {
|
||
|
# -------------------------------------------------------
|
||
|
my $self = shift;
|
||
|
my $table = shift;
|
||
|
my $opts = shift;
|
||
|
my $tick = $opts->{tick} || 0;
|
||
|
my $max = $opts->{max} || 5000;
|
||
|
|
||
|
my %weights = $self->{table}->_weight_cols() or return;
|
||
|
my @weight_list = keys %weights;
|
||
|
my @weight_arr = map { $weights{$_} } @weight_list;
|
||
|
my ($pk) = $self->{table}->pk();
|
||
|
my $index = 0;
|
||
|
my $word_id = 1;
|
||
|
$self->{init} or $self->init_queries;
|
||
|
|
||
|
# first nuke the current index
|
||
|
$self->dump_index();
|
||
|
|
||
|
# Go through the table and index each field.
|
||
|
my $iterations = 1;
|
||
|
my $count = 0;
|
||
|
|
||
|
while (1) {
|
||
|
if ($max) {
|
||
|
my $offset = ($iterations-1) * $max;
|
||
|
$table->select_options ( "LIMIT $offset,$max");
|
||
|
}
|
||
|
my $cond = $opts->{cond} || {};
|
||
|
my $sth = $table->select($cond, [ $pk, @weight_list] );
|
||
|
my $done = 1;
|
||
|
|
||
|
while ( my $arrayref = $sth->fetchrow_arrayref() ) {
|
||
|
# the primary key value
|
||
|
my $i = 0;
|
||
|
my $item_id = $arrayref->[($i++)];
|
||
|
$index = 0;
|
||
|
$done = 0;
|
||
|
|
||
|
# start going through the record data
|
||
|
foreach my $weight ( @weight_arr ) {
|
||
|
my ($word_list, $junk) = $self->_tokenize( $arrayref->[$i++] );
|
||
|
$word_list or next;
|
||
|
|
||
|
# Build a hash of word => frequency.
|
||
|
my %words;
|
||
|
foreach my $word (@{$word_list}) {
|
||
|
$words{$word}++;
|
||
|
}
|
||
|
|
||
|
# Add the words in, or update frequency.
|
||
|
my %word_ids = ();
|
||
|
while (my ($word, $freq) = each %words) {
|
||
|
$self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
my $word_r = $self->{sel_word}->fetchrow_arrayref; # WordID,Word,Freq
|
||
|
if ($word_r) {
|
||
|
$word_r->[2] += $freq;
|
||
|
$word_ids{$word} = $word_r->[0];
|
||
|
$self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
else {
|
||
|
$self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
$word_ids{$word} = $self->{ins_word}->insert_id();
|
||
|
}
|
||
|
}
|
||
|
# now that we have the word ids, insert each of the word-points
|
||
|
foreach my $word ( @{$word_list} ) {
|
||
|
$self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
$index++;
|
||
|
}
|
||
|
if ($tick) {
|
||
|
$count++;
|
||
|
$count % $tick or (print "$count ");
|
||
|
$count % ($tick*10) or (print "\n");
|
||
|
}
|
||
|
}
|
||
|
return if ($done);
|
||
|
$iterations++;
|
||
|
return if (! $max);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub pre_delete_record {
|
||
|
# -------------------------------------------------------
|
||
|
# Delete a records index values.
|
||
|
#
|
||
|
my $self = shift;
|
||
|
my $where = shift;
|
||
|
|
||
|
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||
|
my %weights = $tbl->_weight_cols() or return;
|
||
|
my ($pk) = $tbl->pk();
|
||
|
my $q = $tbl->select( $where, [ $pk ] );
|
||
|
|
||
|
while ( my $aref = $q->fetchrow_arrayref() ) {
|
||
|
my $item_id = $aref->[0] or next;
|
||
|
my @weight_list = keys %weights;
|
||
|
my $index = 0;
|
||
|
$self->{init} or $self->init_queries;
|
||
|
|
||
|
# Get a frequency count for each word
|
||
|
$self->{item_cnt}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
|
||
|
# Now go through and either decrement the freq, or remove the entry.
|
||
|
while ( my ($word_id, $frequency) = $self->{item_cnt}->fetchrow_array() ) {
|
||
|
$self->{sel_freq}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
$self->debug( "Deleting frequencies for $word_id. decreasing by $frequency" ) if ($self->{_debug});
|
||
|
if (my $freq = $self->{sel_freq}->fetchrow_arrayref) {
|
||
|
if ($freq->[0] == $frequency) {
|
||
|
$self->{del_word}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
else {
|
||
|
$self->{mod_word}->execute($frequency, $word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
# Remove the listings from the scores table.
|
||
|
$self->{scr_del}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
sub post_update_record {
|
||
|
# -------------------------------------------------------
|
||
|
my ( $self, $set_cond, $where_cond, $tmp ) = @_;
|
||
|
|
||
|
# delete the previous record
|
||
|
$self->pre_delete_record( $where_cond ) or return;
|
||
|
#
|
||
|
# the new record
|
||
|
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||
|
my $q = $tbl->select( $where_cond );
|
||
|
while ( my $href = $q->fetchrow_hashref() ) {
|
||
|
$self->post_add_record( $href );
|
||
|
}
|
||
|
|
||
|
return 1;
|
||
|
|
||
|
}
|
||
|
|
||
|
sub reindex_record {
|
||
|
# -------------------------------------------------------
|
||
|
# reindexes a record. basically deletes all associated records from current db abnd does an index.
|
||
|
# it's safe to use this
|
||
|
my $self = shift;
|
||
|
my $rec = shift;
|
||
|
|
||
|
$self->delete_record($rec);
|
||
|
$self->index_record($rec);
|
||
|
}
|
||
|
|
||
|
sub dump_index {
|
||
|
# -------------------------------------------------------
|
||
|
my $self = shift;
|
||
|
$self->{init} or $self->init_queries;
|
||
|
|
||
|
$self->{dump_word}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
$self->{dump_scor}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||
|
}
|
||
|
|
||
|
|
||
|
sub debug_dumper {
|
||
|
# ------------------------------------------------------------------------------
|
||
|
# calls debug but also dumps all the messages
|
||
|
my $self = shift;
|
||
|
my $message = shift;
|
||
|
my $level = ref $_[0] ? 1 : shift;
|
||
|
|
||
|
if ( $self->{_debug} >= $level ) {
|
||
|
require GT::Dumper;
|
||
|
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ ));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub DESTROY {
|
||
|
# ------------------------------------------------------------------------------
|
||
|
# Calls finish on init queries.
|
||
|
#
|
||
|
my $self = shift;
|
||
|
return unless ($self->{init});
|
||
|
$self->{upd_word}->finish;
|
||
|
# $self->{ins_word}->finish; will get finished automatically
|
||
|
$self->{sel_word}->finish;
|
||
|
$self->{sel_freq}->finish;
|
||
|
$self->{del_word}->finish;
|
||
|
$self->{mod_word}->finish;
|
||
|
$self->{ins_scor}->finish;
|
||
|
$self->{item_cnt}->finish;
|
||
|
$self->{scr_del}->finish;
|
||
|
$self->{dump_word}->finish;
|
||
|
$self->{dump_scor}->finish;
|
||
|
$self->{init} = 0;
|
||
|
}
|
||
|
|
||
|
1;
|