First pass at adding key files
This commit is contained in:
@ -0,0 +1,604 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Indexer
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.18 2004/08/28 03:53:47 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to make changes to tables and create tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::INTERNAL::Search;
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $VERSION $DEBUG $ATTRIBS /;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/;
|
||||
$ATTRIBS = {
|
||||
# the max number of links that can be handled by UNION before it should simply
|
||||
# shunt the searching pipe to NONINDEXED system
|
||||
'union_shunt_threshold' => '5000',
|
||||
'phrase_shunt_threshold' => '1000',
|
||||
};
|
||||
|
||||
|
||||
################################################################################
|
||||
# Internal functions
|
||||
################################################################################
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::INTERNAL::Search->new(@_)
|
||||
}
|
||||
|
||||
sub _query {
|
||||
# ------------------------------------------------------------------------------
|
||||
# this just checks to ensure that the words are not all search keywords
|
||||
#
|
||||
my ( $self, $input, $buckets ) = @_;
|
||||
|
||||
# calculate wordids and frequencies
|
||||
foreach ( keys %$buckets ) {
|
||||
$buckets->{$_} = $self->get_wordids( $buckets->{$_}, ( /phrase/ ? "phrases" : "keywords" ) );
|
||||
}
|
||||
|
||||
# the following is a bit tricky and will be replaced however, if the number
|
||||
# of results from a union is more than the maximum shunt value, it will
|
||||
# simply do a nonindexed query
|
||||
if ( $buckets->{keywords} ) {
|
||||
my $rec = _count_frequencies( $buckets->{keywords} );
|
||||
my $count = 0;
|
||||
foreach ( values %$rec ) { $count += $_; }
|
||||
if ($count > $self->{union_shunt_threshold}) {
|
||||
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
|
||||
return $self->alternate_driver_query( 'NONINDEXED', $input );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Now test the phrases. Just due to how the phrase searching works, the queries
|
||||
# can grow in size extremely rapidly, and slowdown the search. So the limit for
|
||||
# phrase searching is separate as it requires a different cutoff value than
|
||||
# the keyword search which is usually much lower!
|
||||
if ($buckets->{phrases}) {
|
||||
foreach my $phrase ( keys %{$buckets->{phrases} || {} } ) {
|
||||
my $rec = _count_frequencies( $buckets->{phrases}->{$phrase}->{word_info} );
|
||||
my ( $count ) = sort values %$rec; # Get smallest frequency.
|
||||
if ( $count > $self->{phrase_shunt_threshold} ) {
|
||||
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
|
||||
return $self->alternate_driver_query( 'NONINDEXED', $input );
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($buckets->{phrases_must}) {
|
||||
foreach my $phrase ( keys %{$buckets->{phrases_must} || {} } ) {
|
||||
my $rec = _count_frequencies( $buckets->{phrases_must}->{$phrase}->{word_info} );
|
||||
my ( $count ) = sort values %$rec; # Get smallest frequency.
|
||||
if ( $count > $self->{phrase_shunt_threshold} ) {
|
||||
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
|
||||
return $self->alternate_driver_query( 'NONINDEXED', $input );
|
||||
}
|
||||
}
|
||||
}
|
||||
return $self->SUPER::_query( $input, $buckets );
|
||||
}
|
||||
|
||||
sub _count_frequencies {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $word_info = shift;
|
||||
my $rec = {};
|
||||
foreach my $word ( keys %$word_info ) {
|
||||
my $freq = 0;
|
||||
foreach ( values %{$word_info->{$word}->{word_info}} ) {
|
||||
$freq += $_;
|
||||
}
|
||||
$rec->{$word} = $freq;
|
||||
}
|
||||
|
||||
return $rec;
|
||||
}
|
||||
|
||||
sub _table_names {
|
||||
# ------------------------------------------------------------------------------
|
||||
# return the table names
|
||||
#
|
||||
my $self = shift;
|
||||
my $table = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
|
||||
my $wtable = $table . '_Word_List';
|
||||
my $stable = $table . '_Score_List';
|
||||
|
||||
return ( $table, $wtable, $stable);
|
||||
}
|
||||
|
||||
sub _word_infos {
|
||||
# ------------------------------------------------------------------------------
|
||||
# get the word ids and frequencies
|
||||
#
|
||||
my $self = shift;
|
||||
my $word_infos = shift;
|
||||
|
||||
my $rec = {};
|
||||
|
||||
foreach my $word ( keys %$word_infos ) {
|
||||
my $wi = $word_infos->{$word}->{word_info};
|
||||
$rec->{$word} = [ map { [ $_, $wi->{$_} ] } keys %$wi ];
|
||||
}
|
||||
|
||||
return $rec;
|
||||
|
||||
}
|
||||
|
||||
sub _union_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Takes a list of words and gets all words that match
|
||||
# returns { itemid -> score } of hits that match
|
||||
#
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $results = shift || {};
|
||||
my ( $query, $where, $db, $word_infos );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
|
||||
$word_infos = $self->_word_infos( $words ) or return $results;
|
||||
|
||||
return $results unless (keys %{$word_infos});
|
||||
|
||||
$self->debug_dumper( "Getting words: ", $words) if ($self->{_debug});
|
||||
|
||||
# build the where clause
|
||||
my @word_ids;
|
||||
foreach my $word_synonym_list ( values %$word_infos ) {
|
||||
next unless ( $word_synonym_list );
|
||||
foreach my $word_id ( @{$word_synonym_list }) {
|
||||
next unless ( ref $word_id eq 'ARRAY' ); # ensure it's a reference
|
||||
push @word_ids, $word_id->[0]; # we need to shed the word quantities
|
||||
}
|
||||
}
|
||||
|
||||
return $results unless ( @word_ids );
|
||||
$where = 'Word_ID IN(' . join(",", @word_ids) . ")";
|
||||
|
||||
# build the query
|
||||
$query = qq!
|
||||
SELECT Item_ID, SUM(Score)
|
||||
FROM $stable
|
||||
WHERE
|
||||
$where
|
||||
GROUP BY Item_ID
|
||||
!;
|
||||
|
||||
$self->debug( "Union Query: $query" ) if ($self->{_debug});
|
||||
|
||||
# prepare the query
|
||||
my $sth = $db->prepare( $query ) or return;
|
||||
$sth->execute() or return;
|
||||
|
||||
# get the results
|
||||
my %word_infos = $sth->fetchall_list;
|
||||
|
||||
# merge the current result set into found
|
||||
foreach my $item ( keys %{$results} ) {
|
||||
$word_infos{$item} += $results->{$item};
|
||||
};
|
||||
|
||||
return \%word_infos;
|
||||
}
|
||||
|
||||
sub _intersect_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Takes a list of words and gets all words that match all the keywords
|
||||
# returns { itemid -> score } of hits that match
|
||||
#
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $results = shift || {};
|
||||
|
||||
$words or return $results;
|
||||
keys %{$words} or return $results;
|
||||
|
||||
my ( $query, $where, $db, $word_infos, $word_hits );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
# have we left any of our words out?
|
||||
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
|
||||
$word_infos = $self->_word_infos( $words ) or return {};
|
||||
if ( keys %{$word_infos} < keys %{$words} ) {
|
||||
return {};
|
||||
}
|
||||
|
||||
$self->debug_dumper( "Keyword Intersect words: ", $word_infos ) if ($self->{_debug});
|
||||
|
||||
# take the words and get a hash of the word scores
|
||||
foreach my $word ( keys %{$word_infos} ) {
|
||||
|
||||
my $total_freq = 0;
|
||||
foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
|
||||
$total_freq += $word_synonyms->[1];
|
||||
}
|
||||
|
||||
$word_hits->{$word} = $total_freq or return;
|
||||
|
||||
}
|
||||
|
||||
# so now, sort out the words from lowest frequency to highest frequency
|
||||
my @search_order = sort { $word_hits->{$a} <=> $word_hits->{$b} } keys %{$word_hits};
|
||||
|
||||
$self->debug_dumper( "Searching words in this order: ", \@search_order) if ($self->{_debug});
|
||||
|
||||
# find out how we're going to handle the searching, if the first elements
|
||||
|
||||
################################################################################
|
||||
### The following part is for smaller intersect subsets
|
||||
################################################################################
|
||||
my $intersect = $results;
|
||||
foreach my $word ( @search_order ) {
|
||||
|
||||
# setup the where clause to get all the words associated
|
||||
my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
|
||||
|
||||
# setup the intersect for the previous if required. for iterative intersecting
|
||||
if ( keys %{$intersect} ) {
|
||||
$where .= " AND Item_ID in(" . join(",",keys %{$intersect}) . ")";
|
||||
}
|
||||
|
||||
# make the database engine work a little bit
|
||||
$query = qq!
|
||||
SELECT Item_ID, SUM(Score) AS Score
|
||||
FROM $stable
|
||||
WHERE
|
||||
$where
|
||||
GROUP BY Item_ID
|
||||
!;
|
||||
$self->debug( "Intersect Query: $query" ) if ($self->{_debug});
|
||||
my $intersect_sth = $db->prepare( $query );
|
||||
|
||||
$intersect_sth->execute();
|
||||
|
||||
# get a list of all the matches
|
||||
my $matches = $intersect_sth->fetchall_arrayref();
|
||||
|
||||
$self->debug_dumper( "Matches found for $word: ", $matches ) if ($self->{_debug});
|
||||
|
||||
# go through all the matches and intersect them
|
||||
my %tmp = ();
|
||||
foreach my $row ( @{$matches} ) {
|
||||
my ( $itemid, $score ) = @{$row};
|
||||
$intersect->{$itemid} ||= 0;
|
||||
$tmp{ $itemid } = $intersect->{$itemid} + $score;
|
||||
}
|
||||
|
||||
# inform the system of that development
|
||||
%tmp or return;
|
||||
$intersect = \%tmp;
|
||||
}
|
||||
|
||||
return $intersect;
|
||||
}
|
||||
|
||||
sub _disjoin_query {
|
||||
#------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $results = shift || {};
|
||||
$words or return $results;
|
||||
|
||||
my ( $query, $where, $db, $word_infos, $word_hits );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
$db = $self->{table}->{driver} or return $results;
|
||||
|
||||
# have we left any of our words out?
|
||||
$word_infos = $self->_word_infos( $words ) or return $results;
|
||||
# if ( keys %{$word_infos} < keys %{$words} ) {
|
||||
# return $results;
|
||||
# }
|
||||
|
||||
# take the words and get a hash of the word scores
|
||||
foreach my $word ( keys %{$word_infos} ) {
|
||||
my $total_freq = 0;
|
||||
foreach my $word_synonyms ( $word_infos->{$word} ) {
|
||||
$total_freq += ( $word_synonyms->[0] || 0 );
|
||||
}
|
||||
# if the value is null this mean there is actually no results, whoops!
|
||||
$total_freq and $word_hits->{$word} = $total_freq;
|
||||
}
|
||||
|
||||
# so now, sort out the words from lowest frequency to highest frequency
|
||||
my @search_order = sort { $word_hits->{$b} <=> $word_hits->{$b} } keys %{$word_hits};
|
||||
$self->debug_dumper( "Disjoining words in the following order: ", \@search_order) if ($self->{_debug});
|
||||
|
||||
################################################################################
|
||||
### This following part is for smaller disjoin presets
|
||||
################################################################################
|
||||
foreach my $word ( @search_order ) {
|
||||
|
||||
# setup the where clause to get all the words associated
|
||||
my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
|
||||
|
||||
# setup the intersect for the previous if required. for iterative intersecting
|
||||
if ( keys %{$results} ) {
|
||||
$where .= " AND Item_ID in(" . join(",", keys %{$results}) . ")";
|
||||
}
|
||||
|
||||
# make the database engine work a little bit
|
||||
$query = qq!
|
||||
SELECT Item_ID
|
||||
FROM $stable
|
||||
WHERE
|
||||
$where
|
||||
GROUP BY Item_ID
|
||||
!;
|
||||
$self->debug($query) if ($self->{_debug});
|
||||
my $intersect_sth = $db->prepare( $query );
|
||||
|
||||
$intersect_sth->execute();
|
||||
|
||||
# get a list of all the matches
|
||||
my $matches = $intersect_sth->fetchall_arrayref();
|
||||
|
||||
# strip the matches from the current result set
|
||||
foreach my $word ( map { $_->[0] } @{$matches}) {
|
||||
delete $results->{$word};
|
||||
}
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _phrase_disjoin_query {
|
||||
#------------------------------------------------------------
|
||||
# subtracts the found phrases from the list
|
||||
my $self = shift;
|
||||
my $phrases = shift;
|
||||
my $results = shift || {};
|
||||
$phrases or return $results;
|
||||
|
||||
foreach my $phrase ( values %{$phrases} ) {
|
||||
my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
|
||||
|
||||
# perform disjoin
|
||||
foreach my $itemid ( keys %{$temp} ) {
|
||||
$self->debug( "Deleting $itemid from list" ) if ($self->{_debug});
|
||||
delete $results->{$itemid};
|
||||
}
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _phrase_intersect_query {
|
||||
#------------------------------------------------------------
|
||||
# intersects phrases together
|
||||
my $self = shift;
|
||||
my $phrases = shift;
|
||||
my $results = shift || {};
|
||||
|
||||
$phrases or return $results;
|
||||
|
||||
foreach my $phrase ( values %{$phrases} ) {
|
||||
my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
|
||||
|
||||
# perform intersect
|
||||
foreach my $itemid ( keys %{$temp} ) {
|
||||
$temp->{$itemid} += $results->{$itemid} || 0;
|
||||
}
|
||||
$results = $temp;
|
||||
|
||||
}
|
||||
|
||||
return $results;
|
||||
|
||||
}
|
||||
|
||||
sub _phrase_query {
|
||||
#------------------------------------------------------------
|
||||
# this is a phrase union query
|
||||
my $self = shift;
|
||||
my $phrases = shift or return;
|
||||
my $results = shift || {};
|
||||
|
||||
foreach my $phrase ( values %{$phrases} ) {
|
||||
$self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug});
|
||||
$results = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info}, $results );
|
||||
}
|
||||
|
||||
return $results;
|
||||
|
||||
}
|
||||
|
||||
sub _get_phrase {
|
||||
#------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $wordlist= shift;
|
||||
my $word_info = shift;
|
||||
my $results = shift || {};
|
||||
|
||||
$wordlist or return $results;
|
||||
|
||||
my ( $query, $where, $db, $word_infos, %word_hits );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
my ($pk) = $self->{table}->pk;
|
||||
|
||||
$self->debug_dumper( "Getting words: ", $wordlist ) if ($self->{_debug});
|
||||
|
||||
# get all the word ids that we want to handle
|
||||
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
|
||||
$word_infos = $self->_word_infos( $word_info ) or return;
|
||||
|
||||
|
||||
$self->debug_dumper( "Word infos: ", $word_infos ) if ($self->{_debug});
|
||||
|
||||
# take the words and get a hash of the word scores
|
||||
foreach my $word ( keys %{$word_infos} ) {
|
||||
|
||||
@{$word_infos->{$word} || []} or return;
|
||||
|
||||
my $total_freq = 0;
|
||||
foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
|
||||
$total_freq += $word_synonyms->[1];
|
||||
}
|
||||
|
||||
# if the value is null this mean there is actually no results, whoops!
|
||||
$word_hits{$word} = $total_freq;
|
||||
}
|
||||
|
||||
$self->debug_dumper( "With synonyms tallied: ", \%word_hits ) if ($self->{_debug});
|
||||
|
||||
# so now, setup the order of search
|
||||
my $i = 0;
|
||||
my %word_order = map { $_ => $i++ } @{$wordlist};
|
||||
my @search_order = sort { $word_hits{$a} <=> $word_hits{$b} } keys %word_hits;
|
||||
|
||||
$self->debug_dumper( "Word search order: ", \@search_order ) if ($self->{_debug});
|
||||
|
||||
################################################################################
|
||||
### This following part is for smaller phrases
|
||||
################################################################################
|
||||
# start getting words in order of their frequency
|
||||
my %matches = ();
|
||||
my $index = 0;
|
||||
foreach my $word ( @search_order ) {
|
||||
|
||||
# setup the where clause for the individual words, firstly
|
||||
if ( keys %matches ) {
|
||||
my $vector = $word_order{$word} - $index;
|
||||
$where = '(';
|
||||
$where =
|
||||
'(' .
|
||||
join(
|
||||
" OR ",
|
||||
map(
|
||||
"Item_ID = $_ AND Word_Pos IN(" . join(",", map $_->[0] + $vector, @{$matches{$_}}) . ')',
|
||||
keys %matches
|
||||
)
|
||||
) .
|
||||
") AND ";
|
||||
}
|
||||
else {
|
||||
$where = '';
|
||||
}
|
||||
|
||||
$where .= "Word_ID IN(" . ( join ",", map { $_->[0] || () } @{$word_infos->{$word}} or return $results ) . ')';
|
||||
|
||||
$query = qq!
|
||||
SELECT
|
||||
Item_ID, Score, Word_Pos
|
||||
FROM
|
||||
$stable
|
||||
WHERE
|
||||
$where
|
||||
!;
|
||||
|
||||
$self->debug( "Phrase get for '$word': " . $query ) if ($self->{_debug});
|
||||
my $sth = $db->prepare( $query );
|
||||
$sth->execute();
|
||||
|
||||
%matches = ();
|
||||
|
||||
while (my $hit = $sth->fetchrow_arrayref) {
|
||||
push @{$matches{$hit->[0]}}, [ $hit->[2], $hit->[1] ];
|
||||
}
|
||||
|
||||
# If there are no values stored in %matches, it means that for
|
||||
# this keyword, there have been no hits based upon position.
|
||||
# In that case, terminate and return a null result
|
||||
keys %matches or last;
|
||||
|
||||
# where were we in the string?
|
||||
$index = $word_order{$word};
|
||||
}
|
||||
|
||||
# now tally up all the scores and merge the new records in
|
||||
foreach my $itemid ( keys %matches ) {
|
||||
my $score = 0;
|
||||
foreach my $sub_total ( @{$matches{$itemid}} ) {
|
||||
$score += $sub_total->[1];
|
||||
}
|
||||
$results->{$itemid} += $score;
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub get_wordids {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Get a list of words
|
||||
#
|
||||
my $self = shift;
|
||||
my $elements = shift or return;
|
||||
my $mode = lc shift || 'keywords';
|
||||
|
||||
if ( $mode eq 'keywords' ) {
|
||||
$elements = $self->_get_wordid($elements);
|
||||
}
|
||||
else {
|
||||
foreach my $phrase ( keys %$elements ) {
|
||||
my $results = $self->_get_wordid({
|
||||
map { ($_ => { substring => 0 }) } @{$elements->{$phrase}->{phrase}}
|
||||
});
|
||||
|
||||
$elements->{$phrase}->{word_info} = $results;
|
||||
}
|
||||
}
|
||||
|
||||
return $elements;
|
||||
}
|
||||
|
||||
sub _get_wordid {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Get a list of words
|
||||
#
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $tbl = $self->{table};
|
||||
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
foreach my $word ( keys %$words ) {
|
||||
my $query =
|
||||
qq!SELECT Word_ID, Frequency FROM $wtable WHERE Word LIKE '! .
|
||||
quotemeta($word) .
|
||||
( $words->{$word}->{substring} ? '%' : '' ) .
|
||||
"'";
|
||||
my $sth = $tbl->do_query($query) or next;
|
||||
my $tmp = { $sth->fetchall_list };
|
||||
|
||||
$words->{$word}->{word_info} = $tmp;
|
||||
}
|
||||
|
||||
return $words;
|
||||
}
|
||||
|
||||
##
|
||||
# Internal Use
|
||||
# $self->_cgi_to_hash ($in);
|
||||
# --------------------------
|
||||
# Creates a hash ref from a cgi object.
|
||||
##
|
||||
sub _cgi_to_hash {
|
||||
my ($self, $cgi) = @_;
|
||||
$cgi and UNIVERSAL::can($cgi, 'param') or return $self->error(NODRIVER => 'FATAL');
|
||||
my @keys = $cgi->param;
|
||||
my $result = {};
|
||||
foreach my $key (@keys) {
|
||||
my @values = $cgi->param($key);
|
||||
if (@values == 1) { $result->{$key} = $values[0] }
|
||||
else { $result->{$key} = \@values }
|
||||
}
|
||||
return $result;
|
||||
}
|
||||
|
||||
1;
|
Reference in New Issue
Block a user