# ==================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::SQL::Search::INTERNAL::Indexer # Author: Aki Mimoto # CVS Info : 087,071,086,086,085 # $Id: Indexer.pm,v 1.11 2004/01/13 01:35:19 jagerman Exp $ # # Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. # ==================================================================== # package GT::SQL::Search::INTERNAL::Indexer; # ------------------------------------------------------------------------------ # Preamble information related to the object use strict; use vars qw/@ISA $ATTRIBS $VERSION $DEBUG /; use GT::SQL::Search::Base::Indexer; @ISA = qw/ GT::SQL::Search::Base::Indexer /; $DEBUG = 0; $VERSION = sprintf "%d.%03d", q$Revision: 1.11 $ =~ /(\d+)\.(\d+)/; sub load { shift; return GT::SQL::Search::INTERNAL::Indexer->new(@_) } sub drop_search_driver { # ------------------------------------------------------------------------------ my $self = shift; my $table = $self->{table}->name; my $rc1 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Word_List"); my $rc2 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Score_List"); return 1; } sub add_search_driver { # ------------------------------------------------------------------------------ my $self = shift; my $name = $self->{table}->name; # first create the table that handles the words. my $creator = $self->{table}->creator ( $name . "_Word_List" ); $creator->cols( Word_ID => { pos => 1, type => 'int', not_null => 1, unsigned => 1 }, Word => { pos => 2, type => 'varchar', not_null=> 1, size => '50' }, Frequency => { pos => 3, type => 'int', not_null=> 1 } ); $creator->pk('Word_ID'); $creator->ai('Word_ID'); $creator->unique({ $name . "_wordndx" => ['Word'] }); $creator->create('force') or return; # now create the handler for scores $creator = $self->{table}->creator( $name . '_Score_List' ); $creator->cols( Word_ID => { pos => 1, type => 'int', not_null => 1, unsigned => 1 }, Item_ID => { pos => 2, type => 'int', not_null => 1, unsigned => 1 }, Score => { pos => 3, type => 'int', not_null => 1 }, Word_Pos => { pos => 4, type => 'int', not_null => 1 } ); $creator->index({ 'wndx' => ['Word_ID', 'Item_ID', 'Score'], 'itndx' => ['Item_ID'] }); $creator->create('force') or return; return 1; } sub post_create_table { # ------------------------------------------------------------------------------ # creates the index tables.. # return $_[0]->add_search_driver(@_); } sub post_drop_table { # ------------------------------------------------------- # Remove the index tables. # return $_[0]->drop_search_driver(@_); } sub init_queries { # ------------------------------------------------------- # Pre-load all our queries. # my $self = shift; my $queries = shift; my $driver = $self->{table}->{driver} or return $self->error ('NODRIVER', 'FATAL'); my $table_name = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL'); my $wtable = $table_name . '_Word_List'; my $seq = $wtable . '_seq'; my $stable = $table_name . '_Score_List'; my %ai_queries = ( ins_word_ORACLE => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES ($seq.NEXTVAL, ?, ?)", ins_word_PG => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES (NEXTVAL('$seq'), ?, ?)", ins_word => "INSERT INTO $wtable (Word, Frequency) VALUES (?, ?)" ); my %queries = ( upd_word => "UPDATE $wtable SET Frequency = ? WHERE Word_ID = ?", sel_word => "SELECT Word_ID,Word,Frequency FROM $wtable WHERE Word = ?", sel_freq => "SELECT Frequency FROM $wtable WHERE Word_ID = ?", del_word => "DELETE FROM $wtable WHERE Word_ID = ?", mod_word => "UPDATE $wtable SET Frequency = Frequency - ? WHERE Word_ID = ?", ins_scor => "INSERT INTO $stable (Word_ID, Item_ID, Score, Word_Pos) VALUES (?, ?, ?, ?)", item_cnt => "SELECT Word_ID, COUNT(*) FROM $stable WHERE Item_ID = ? GROUP BY Word_ID", scr_del => "DELETE FROM $stable WHERE Item_ID = ?", dump_word => "DELETE FROM $wtable", dump_scor => "DELETE FROM $stable" ); my $type = uc $self->{table}->{connect}->{driver}; $self->{ins_word} = $driver->prepare($ai_queries{"ins_word_$type"} || $ai_queries{"ins_word"}); # check to see if the table exist $self->{table}->new_table( $wtable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error); $self->{table}->new_table( $stable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error); if ($type eq 'MYSQL') { foreach my $query (keys %queries) { $self->{$query} = $driver->prepare_raw ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error); } } else { foreach my $query (keys %queries) { $self->{$query} = $driver->prepare ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error); } } } sub post_add_record { # ------------------------------------------------------- # indexes a single record my ($self, $rec, $insert_sth ) = @_; # Only continue if we have weights and a primary key. my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' ); my %weights = $tbl->_weight_cols() or return; my ($pk) = $tbl->pk(); my $item_id = ( $tbl->ai() and $insert_sth ) ? $insert_sth->insert_id() : $rec->{$pk}; my $index = 0; $self->{init} or $self->init_queries; # Go through each column and index it. foreach my $column ( keys %weights ) { my ($word_list, $rejected) = $self->_tokenize( $rec->{$column} ); $word_list or next; # Build a hash of word => frequency. my %words; foreach my $word (@{$word_list}) { $words{$word}++; } # Add the words in, or update frequency. my %word_ids = (); while (my ($word, $freq) = each %words) { $self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); my $word_r = $self->{sel_word}->fetchrow_arrayref; # Word_ID, Word, Frequency if ($word_r) { $word_r->[2] += $freq; $word_ids{$word} = $word_r->[0]; $self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); } else { $self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); $word_ids{$word} = $self->{ins_word}->insert_id(); } } # now that we have the word ids, insert each of the word-points my $weight = $weights{$column}; foreach my $word ( @{$word_list} ) { $self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); } $index++; } return 1; } sub reindex_all { # ------------------------------------------------------- my $self = shift; my $table = shift; my $opts = shift; my $tick = $opts->{tick} || 0; my $max = $opts->{max} || 5000; my %weights = $self->{table}->_weight_cols() or return; my @weight_list = keys %weights; my @weight_arr = map { $weights{$_} } @weight_list; my ($pk) = $self->{table}->pk(); my $index = 0; my $word_id = 1; $self->{init} or $self->init_queries; # first nuke the current index $self->dump_index(); # Go through the table and index each field. my $iterations = 1; my $count = 0; while (1) { if ($max) { my $offset = ($iterations-1) * $max; $table->select_options ( "LIMIT $offset,$max"); } my $cond = $opts->{cond} || {}; my $sth = $table->select($cond, [ $pk, @weight_list] ); my $done = 1; while ( my $arrayref = $sth->fetchrow_arrayref() ) { # the primary key value my $i = 0; my $item_id = $arrayref->[($i++)]; $index = 0; $done = 0; # start going through the record data foreach my $weight ( @weight_arr ) { my ($word_list, $junk) = $self->_tokenize( $arrayref->[$i++] ); $word_list or next; # Build a hash of word => frequency. my %words; foreach my $word (@{$word_list}) { $words{$word}++; } # Add the words in, or update frequency. my %word_ids = (); while (my ($word, $freq) = each %words) { $self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); my $word_r = $self->{sel_word}->fetchrow_arrayref; # WordID,Word,Freq if ($word_r) { $word_r->[2] += $freq; $word_ids{$word} = $word_r->[0]; $self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); } else { $self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); $word_ids{$word} = $self->{ins_word}->insert_id(); } } # now that we have the word ids, insert each of the word-points foreach my $word ( @{$word_list} ) { $self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); } $index++; } if ($tick) { $count++; $count % $tick or (print "$count "); $count % ($tick*10) or (print "\n"); } } return if ($done); $iterations++; return if (! $max); } } sub pre_delete_record { # ------------------------------------------------------- # Delete a records index values. # my $self = shift; my $where = shift; my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' ); my %weights = $tbl->_weight_cols() or return; my ($pk) = $tbl->pk(); my $q = $tbl->select( $where, [ $pk ] ); while ( my $aref = $q->fetchrow_arrayref() ) { my $item_id = $aref->[0] or next; my @weight_list = keys %weights; my $index = 0; $self->{init} or $self->init_queries; # Get a frequency count for each word $self->{item_cnt}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); # Now go through and either decrement the freq, or remove the entry. while ( my ($word_id, $frequency) = $self->{item_cnt}->fetchrow_array() ) { $self->{sel_freq}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); $self->debug( "Deleting frequencies for $word_id. decreasing by $frequency" ) if ($self->{_debug}); if (my $freq = $self->{sel_freq}->fetchrow_arrayref) { if ($freq->[0] == $frequency) { $self->{del_word}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); } else { $self->{mod_word}->execute($frequency, $word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); } } } # Remove the listings from the scores table. $self->{scr_del}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr); } return 1; } sub post_update_record { # ------------------------------------------------------- my ( $self, $set_cond, $where_cond, $tmp ) = @_; # delete the previous record $self->pre_delete_record( $where_cond ) or return; # # the new record my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' ); my $q = $tbl->select( $where_cond ); while ( my $href = $q->fetchrow_hashref() ) { $self->post_add_record( $href ); } return 1; } sub reindex_record { # ------------------------------------------------------- # reindexes a record. basically deletes all associated records from current db abnd does an index. # it's safe to use this my $self = shift; my $rec = shift; $self->delete_record($rec); $self->index_record($rec); } sub dump_index { # ------------------------------------------------------- my $self = shift; $self->{init} or $self->init_queries; $self->{dump_word}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr); $self->{dump_scor}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr); } sub debug_dumper { # ------------------------------------------------------------------------------ # calls debug but also dumps all the messages my $self = shift; my $message = shift; my $level = ref $_[0] ? 1 : shift; if ( $self->{_debug} >= $level ) { require GT::Dumper; $self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )); } } sub DESTROY { # ------------------------------------------------------------------------------ # Calls finish on init queries. # my $self = shift; return unless ($self->{init}); $self->{upd_word}->finish; # $self->{ins_word}->finish; will get finished automatically $self->{sel_word}->finish; $self->{sel_freq}->finish; $self->{del_word}->finish; $self->{mod_word}->finish; $self->{ins_scor}->finish; $self->{item_cnt}->finish; $self->{scr_del}->finish; $self->{dump_word}->finish; $self->{dump_scor}->finish; $self->{init} = 0; } 1;