# ================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::SQL::Indexer # Author : Aki Mimoto # CVS Info : 087,071,086,086,085 # $Id: Search.pm,v 1.18 2004/08/28 03:53:47 jagerman Exp $ # # Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. # ================================================================== # # Description: # Class used to make changes to tables and create tables. # package GT::SQL::Search::INTERNAL::Search; # ------------------------------------------------------------------------------ use strict; use vars qw/@ISA $VERSION $DEBUG $ATTRIBS /; use GT::SQL::Search::Base::Search; @ISA = qw( GT::SQL::Search::Base::Search ); # ------------------------------------------------------------------------------ # Preamble information related to the object $DEBUG = 0; $VERSION = sprintf "%d.%03d", q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/; $ATTRIBS = { # the max number of links that can be handled by UNION before it should simply # shunt the searching pipe to NONINDEXED system 'union_shunt_threshold' => '5000', 'phrase_shunt_threshold' => '1000', }; ################################################################################ # Internal functions ################################################################################ sub load { shift; return GT::SQL::Search::INTERNAL::Search->new(@_) } sub _query { # ------------------------------------------------------------------------------ # this just checks to ensure that the words are not all search keywords # my ( $self, $input, $buckets ) = @_; # calculate wordids and frequencies foreach ( keys %$buckets ) { $buckets->{$_} = $self->get_wordids( $buckets->{$_}, ( /phrase/ ? "phrases" : "keywords" ) ); } # the following is a bit tricky and will be replaced however, if the number # of results from a union is more than the maximum shunt value, it will # simply do a nonindexed query if ( $buckets->{keywords} ) { my $rec = _count_frequencies( $buckets->{keywords} ); my $count = 0; foreach ( values %$rec ) { $count += $_; } if ($count > $self->{union_shunt_threshold}) { $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug}); return $self->alternate_driver_query( 'NONINDEXED', $input ); } } # Now test the phrases. Just due to how the phrase searching works, the queries # can grow in size extremely rapidly, and slowdown the search. So the limit for # phrase searching is separate as it requires a different cutoff value than # the keyword search which is usually much lower! if ($buckets->{phrases}) { foreach my $phrase ( keys %{$buckets->{phrases} || {} } ) { my $rec = _count_frequencies( $buckets->{phrases}->{$phrase}->{word_info} ); my ( $count ) = sort values %$rec; # Get smallest frequency. if ( $count > $self->{phrase_shunt_threshold} ) { $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug}); return $self->alternate_driver_query( 'NONINDEXED', $input ); } } } if ($buckets->{phrases_must}) { foreach my $phrase ( keys %{$buckets->{phrases_must} || {} } ) { my $rec = _count_frequencies( $buckets->{phrases_must}->{$phrase}->{word_info} ); my ( $count ) = sort values %$rec; # Get smallest frequency. if ( $count > $self->{phrase_shunt_threshold} ) { $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug}); return $self->alternate_driver_query( 'NONINDEXED', $input ); } } } return $self->SUPER::_query( $input, $buckets ); } sub _count_frequencies { # ------------------------------------------------------------------------------ my $word_info = shift; my $rec = {}; foreach my $word ( keys %$word_info ) { my $freq = 0; foreach ( values %{$word_info->{$word}->{word_info}} ) { $freq += $_; } $rec->{$word} = $freq; } return $rec; } sub _table_names { # ------------------------------------------------------------------------------ # return the table names # my $self = shift; my $table = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL'); my $wtable = $table . '_Word_List'; my $stable = $table . '_Score_List'; return ( $table, $wtable, $stable); } sub _word_infos { # ------------------------------------------------------------------------------ # get the word ids and frequencies # my $self = shift; my $word_infos = shift; my $rec = {}; foreach my $word ( keys %$word_infos ) { my $wi = $word_infos->{$word}->{word_info}; $rec->{$word} = [ map { [ $_, $wi->{$_} ] } keys %$wi ]; } return $rec; } sub _union_query { # ------------------------------------------------------------------------------ # Takes a list of words and gets all words that match # returns { itemid -> score } of hits that match # my $self = shift; my $words = shift; my $results = shift || {}; my ( $query, $where, $db, $word_infos ); my ( $table, $wtable, $stable) = $self->_table_names(); $db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' ); $word_infos = $self->_word_infos( $words ) or return $results; return $results unless (keys %{$word_infos}); $self->debug_dumper( "Getting words: ", $words) if ($self->{_debug}); # build the where clause my @word_ids; foreach my $word_synonym_list ( values %$word_infos ) { next unless ( $word_synonym_list ); foreach my $word_id ( @{$word_synonym_list }) { next unless ( ref $word_id eq 'ARRAY' ); # ensure it's a reference push @word_ids, $word_id->[0]; # we need to shed the word quantities } } return $results unless ( @word_ids ); $where = 'Word_ID IN(' . join(",", @word_ids) . ")"; # build the query $query = qq! SELECT Item_ID, SUM(Score) FROM $stable WHERE $where GROUP BY Item_ID !; $self->debug( "Union Query: $query" ) if ($self->{_debug}); # prepare the query my $sth = $db->prepare( $query ) or return; $sth->execute() or return; # get the results my %word_infos = $sth->fetchall_list; # merge the current result set into found foreach my $item ( keys %{$results} ) { $word_infos{$item} += $results->{$item}; }; return \%word_infos; } sub _intersect_query { # ------------------------------------------------------------------------------ # Takes a list of words and gets all words that match all the keywords # returns { itemid -> score } of hits that match # my $self = shift; my $words = shift; my $results = shift || {}; $words or return $results; keys %{$words} or return $results; my ( $query, $where, $db, $word_infos, $word_hits ); my ( $table, $wtable, $stable) = $self->_table_names(); # have we left any of our words out? $db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' ); $word_infos = $self->_word_infos( $words ) or return {}; if ( keys %{$word_infos} < keys %{$words} ) { return {}; } $self->debug_dumper( "Keyword Intersect words: ", $word_infos ) if ($self->{_debug}); # take the words and get a hash of the word scores foreach my $word ( keys %{$word_infos} ) { my $total_freq = 0; foreach my $word_synonyms ( @{$word_infos->{$word}} ) { $total_freq += $word_synonyms->[1]; } $word_hits->{$word} = $total_freq or return; } # so now, sort out the words from lowest frequency to highest frequency my @search_order = sort { $word_hits->{$a} <=> $word_hits->{$b} } keys %{$word_hits}; $self->debug_dumper( "Searching words in this order: ", \@search_order) if ($self->{_debug}); # find out how we're going to handle the searching, if the first elements ################################################################################ ### The following part is for smaller intersect subsets ################################################################################ my $intersect = $results; foreach my $word ( @search_order ) { # setup the where clause to get all the words associated my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")"; # setup the intersect for the previous if required. for iterative intersecting if ( keys %{$intersect} ) { $where .= " AND Item_ID in(" . join(",",keys %{$intersect}) . ")"; } # make the database engine work a little bit $query = qq! SELECT Item_ID, SUM(Score) AS Score FROM $stable WHERE $where GROUP BY Item_ID !; $self->debug( "Intersect Query: $query" ) if ($self->{_debug}); my $intersect_sth = $db->prepare( $query ); $intersect_sth->execute(); # get a list of all the matches my $matches = $intersect_sth->fetchall_arrayref(); $self->debug_dumper( "Matches found for $word: ", $matches ) if ($self->{_debug}); # go through all the matches and intersect them my %tmp = (); foreach my $row ( @{$matches} ) { my ( $itemid, $score ) = @{$row}; $intersect->{$itemid} ||= 0; $tmp{ $itemid } = $intersect->{$itemid} + $score; } # inform the system of that development %tmp or return; $intersect = \%tmp; } return $intersect; } sub _disjoin_query { #------------------------------------------------------------ my $self = shift; my $words = shift; my $results = shift || {}; $words or return $results; my ( $query, $where, $db, $word_infos, $word_hits ); my ( $table, $wtable, $stable) = $self->_table_names(); $db = $self->{table}->{driver} or return $results; # have we left any of our words out? $word_infos = $self->_word_infos( $words ) or return $results; # if ( keys %{$word_infos} < keys %{$words} ) { # return $results; # } # take the words and get a hash of the word scores foreach my $word ( keys %{$word_infos} ) { my $total_freq = 0; foreach my $word_synonyms ( $word_infos->{$word} ) { $total_freq += ( $word_synonyms->[0] || 0 ); } # if the value is null this mean there is actually no results, whoops! $total_freq and $word_hits->{$word} = $total_freq; } # so now, sort out the words from lowest frequency to highest frequency my @search_order = sort { $word_hits->{$b} <=> $word_hits->{$b} } keys %{$word_hits}; $self->debug_dumper( "Disjoining words in the following order: ", \@search_order) if ($self->{_debug}); ################################################################################ ### This following part is for smaller disjoin presets ################################################################################ foreach my $word ( @search_order ) { # setup the where clause to get all the words associated my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")"; # setup the intersect for the previous if required. for iterative intersecting if ( keys %{$results} ) { $where .= " AND Item_ID in(" . join(",", keys %{$results}) . ")"; } # make the database engine work a little bit $query = qq! SELECT Item_ID FROM $stable WHERE $where GROUP BY Item_ID !; $self->debug($query) if ($self->{_debug}); my $intersect_sth = $db->prepare( $query ); $intersect_sth->execute(); # get a list of all the matches my $matches = $intersect_sth->fetchall_arrayref(); # strip the matches from the current result set foreach my $word ( map { $_->[0] } @{$matches}) { delete $results->{$word}; } } return $results; } sub _phrase_disjoin_query { #------------------------------------------------------------ # subtracts the found phrases from the list my $self = shift; my $phrases = shift; my $results = shift || {}; $phrases or return $results; foreach my $phrase ( values %{$phrases} ) { my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} ); # perform disjoin foreach my $itemid ( keys %{$temp} ) { $self->debug( "Deleting $itemid from list" ) if ($self->{_debug}); delete $results->{$itemid}; } } return $results; } sub _phrase_intersect_query { #------------------------------------------------------------ # intersects phrases together my $self = shift; my $phrases = shift; my $results = shift || {}; $phrases or return $results; foreach my $phrase ( values %{$phrases} ) { my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} ); # perform intersect foreach my $itemid ( keys %{$temp} ) { $temp->{$itemid} += $results->{$itemid} || 0; } $results = $temp; } return $results; } sub _phrase_query { #------------------------------------------------------------ # this is a phrase union query my $self = shift; my $phrases = shift or return; my $results = shift || {}; foreach my $phrase ( values %{$phrases} ) { $self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug}); $results = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info}, $results ); } return $results; } sub _get_phrase { #------------------------------------------------------------ my $self = shift; my $wordlist= shift; my $word_info = shift; my $results = shift || {}; $wordlist or return $results; my ( $query, $where, $db, $word_infos, %word_hits ); my ( $table, $wtable, $stable) = $self->_table_names(); my ($pk) = $self->{table}->pk; $self->debug_dumper( "Getting words: ", $wordlist ) if ($self->{_debug}); # get all the word ids that we want to handle $db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' ); $word_infos = $self->_word_infos( $word_info ) or return; $self->debug_dumper( "Word infos: ", $word_infos ) if ($self->{_debug}); # take the words and get a hash of the word scores foreach my $word ( keys %{$word_infos} ) { @{$word_infos->{$word} || []} or return; my $total_freq = 0; foreach my $word_synonyms ( @{$word_infos->{$word}} ) { $total_freq += $word_synonyms->[1]; } # if the value is null this mean there is actually no results, whoops! $word_hits{$word} = $total_freq; } $self->debug_dumper( "With synonyms tallied: ", \%word_hits ) if ($self->{_debug}); # so now, setup the order of search my $i = 0; my %word_order = map { $_ => $i++ } @{$wordlist}; my @search_order = sort { $word_hits{$a} <=> $word_hits{$b} } keys %word_hits; $self->debug_dumper( "Word search order: ", \@search_order ) if ($self->{_debug}); ################################################################################ ### This following part is for smaller phrases ################################################################################ # start getting words in order of their frequency my %matches = (); my $index = 0; foreach my $word ( @search_order ) { # setup the where clause for the individual words, firstly if ( keys %matches ) { my $vector = $word_order{$word} - $index; $where = '('; $where = '(' . join( " OR ", map( "Item_ID = $_ AND Word_Pos IN(" . join(",", map $_->[0] + $vector, @{$matches{$_}}) . ')', keys %matches ) ) . ") AND "; } else { $where = ''; } $where .= "Word_ID IN(" . ( join ",", map { $_->[0] || () } @{$word_infos->{$word}} or return $results ) . ')'; $query = qq! SELECT Item_ID, Score, Word_Pos FROM $stable WHERE $where !; $self->debug( "Phrase get for '$word': " . $query ) if ($self->{_debug}); my $sth = $db->prepare( $query ); $sth->execute(); %matches = (); while (my $hit = $sth->fetchrow_arrayref) { push @{$matches{$hit->[0]}}, [ $hit->[2], $hit->[1] ]; } # If there are no values stored in %matches, it means that for # this keyword, there have been no hits based upon position. # In that case, terminate and return a null result keys %matches or last; # where were we in the string? $index = $word_order{$word}; } # now tally up all the scores and merge the new records in foreach my $itemid ( keys %matches ) { my $score = 0; foreach my $sub_total ( @{$matches{$itemid}} ) { $score += $sub_total->[1]; } $results->{$itemid} += $score; } return $results; } sub get_wordids { # ------------------------------------------------------------------------------ # Get a list of words # my $self = shift; my $elements = shift or return; my $mode = lc shift || 'keywords'; if ( $mode eq 'keywords' ) { $elements = $self->_get_wordid($elements); } else { foreach my $phrase ( keys %$elements ) { my $results = $self->_get_wordid({ map { ($_ => { substring => 0 }) } @{$elements->{$phrase}->{phrase}} }); $elements->{$phrase}->{word_info} = $results; } } return $elements; } sub _get_wordid { # ------------------------------------------------------------------------------ # Get a list of words # my $self = shift; my $words = shift; my $tbl = $self->{table}; my ( $table, $wtable, $stable) = $self->_table_names(); foreach my $word ( keys %$words ) { my $query = qq!SELECT Word_ID, Frequency FROM $wtable WHERE Word LIKE '! . quotemeta($word) . ( $words->{$word}->{substring} ? '%' : '' ) . "'"; my $sth = $tbl->do_query($query) or next; my $tmp = { $sth->fetchall_list }; $words->{$word}->{word_info} = $tmp; } return $words; } ## # Internal Use # $self->_cgi_to_hash ($in); # -------------------------- # Creates a hash ref from a cgi object. ## sub _cgi_to_hash { my ($self, $cgi) = @_; $cgi and UNIVERSAL::can($cgi, 'param') or return $self->error(NODRIVER => 'FATAL'); my @keys = $cgi->param; my $result = {}; foreach my $key (@keys) { my @values = $cgi->param($key); if (@values == 1) { $result->{$key} = $values[0] } else { $result->{$key} = \@values } } return $result; } 1;