First pass at adding key files

2024-06-17 21:49:12 +10:00
commit aa25e9347f
1274 changed files with 392549 additions and 0 deletions
--- a/site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/INTERNAL/Search.pm
+++ b/site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/INTERNAL/Search.pm
@ -0,0 +1,604 @@
+# ==================================================================
+# Gossamer Threads Module Library - http://gossamer-threads.com/
+#
+#   GT::SQL::Indexer
+#   Author  : Aki Mimoto
+#   CVS Info : 087,071,086,086,085      
+#   $Id: Search.pm,v 1.18 2004/08/28 03:53:47 jagerman Exp $
+#
+# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
+# ==================================================================
+#
+# Description:
+#   Class used to make changes to tables and create tables.
+#
+
+package GT::SQL::Search::INTERNAL::Search;
+
+# ------------------------------------------------------------------------------
+    use strict;
+    use vars qw/@ISA $VERSION $DEBUG $ATTRIBS /;
+    use GT::SQL::Search::Base::Search;
+    @ISA = qw( GT::SQL::Search::Base::Search );
+
+# ------------------------------------------------------------------------------
+# Preamble information related to the object
+
+    $DEBUG   = 0;
+    $VERSION = sprintf "%d.%03d", q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/;
+    $ATTRIBS = {
+# the max number of links that can be handled by UNION before it should simply 
+# shunt the searching pipe to NONINDEXED system
+        'union_shunt_threshold'  => '5000',
+        'phrase_shunt_threshold' => '1000',
+    };
+
+
+################################################################################
+# Internal functions
+################################################################################
+
+sub load {
+    shift;
+    return GT::SQL::Search::INTERNAL::Search->new(@_)
+}
+
+sub _query {
+# ------------------------------------------------------------------------------
+# this just checks to ensure that the words are not all search keywords
+#
+    my ( $self, $input, $buckets ) = @_;
+
+# calculate wordids and frequencies
+    foreach ( keys %$buckets ) {
+        $buckets->{$_} = $self->get_wordids( $buckets->{$_}, ( /phrase/ ? "phrases" : "keywords" ) );
+    }
+
+# the following is a bit tricky and will be replaced however, if the number 
+# of results from a union is more than the maximum shunt value, it will 
+# simply do a nonindexed query
+    if ( $buckets->{keywords} ) {
+        my $rec       = _count_frequencies( $buckets->{keywords} );
+        my $count     = 0;
+        foreach ( values %$rec ) { $count +=  $_; }
+        if ($count > $self->{union_shunt_threshold}) {
+            $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
+            return $self->alternate_driver_query( 'NONINDEXED', $input );
+        }
+    }
+
+
+# Now test the phrases. Just due to how the phrase searching works, the queries
+# can grow in size extremely rapidly, and slowdown the search. So the limit for
+# phrase searching is separate as it requires a different cutoff value than
+# the keyword search which is usually much lower!
+    if ($buckets->{phrases}) {
+        foreach my $phrase ( keys %{$buckets->{phrases} || {} } ) {
+            my $rec       = _count_frequencies( $buckets->{phrases}->{$phrase}->{word_info} );
+            my ( $count ) = sort values %$rec; # Get smallest frequency.
+            if ( $count > $self->{phrase_shunt_threshold} ) {
+                $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
+                return $self->alternate_driver_query( 'NONINDEXED', $input );
+            }
+        }
+    }
+    if ($buckets->{phrases_must}) {
+        foreach my $phrase ( keys %{$buckets->{phrases_must} || {} } ) {
+            my $rec       = _count_frequencies( $buckets->{phrases_must}->{$phrase}->{word_info} );
+            my ( $count ) = sort values %$rec; # Get smallest frequency.
+            if ( $count > $self->{phrase_shunt_threshold} ) {
+                $self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
+                return $self->alternate_driver_query( 'NONINDEXED', $input );
+            }
+        }
+    }
+    return $self->SUPER::_query( $input, $buckets );
+}
+
+sub _count_frequencies {
+# ------------------------------------------------------------------------------
+    my $word_info = shift;
+    my $rec       = {};
+    foreach my $word ( keys %$word_info ) {
+        my $freq  = 0;
+        foreach ( values %{$word_info->{$word}->{word_info}} ) {
+            $freq += $_;
+        }
+        $rec->{$word} = $freq;
+    }
+
+    return $rec;
+}
+
+sub _table_names {
+# ------------------------------------------------------------------------------
+# return the table names
+#
+    my $self    =  shift;
+    my $table   = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
+    my $wtable  = $table . '_Word_List';
+    my $stable  = $table . '_Score_List';
+
+    return ( $table, $wtable, $stable);
+}
+
+sub _word_infos {
+# ------------------------------------------------------------------------------
+# get the word ids and frequencies 
+#
+    my $self       = shift;
+    my $word_infos = shift;
+
+    my $rec        = {};
+
+    foreach my $word ( keys %$word_infos ) {
+        my $wi     = $word_infos->{$word}->{word_info};
+        $rec->{$word} = [ map { [ $_, $wi->{$_} ] } keys %$wi ];
+    }
+
+    return $rec;
+    
+}
+
+sub _union_query {
+# ------------------------------------------------------------------------------
+# Takes a list of words and gets all words that match
+# returns { itemid -> score } of hits that match
+#
+    my $self    = shift;
+    my $words   = shift;
+    my $results = shift || {};
+    my ( $query, $where, $db, $word_infos );
+    my ( $table, $wtable, $stable) = $self->_table_names();
+
+    $db         = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
+    $word_infos = $self->_word_infos( $words ) or return $results;
+
+    return $results unless (keys %{$word_infos});
+
+    $self->debug_dumper( "Getting words: ", $words) if ($self->{_debug});
+
+# build the where clause
+    my @word_ids;
+    foreach my $word_synonym_list  ( values %$word_infos ) {
+        next unless ( $word_synonym_list );
+        foreach my $word_id ( @{$word_synonym_list }) {
+            next unless ( ref $word_id eq 'ARRAY' ); # ensure it's a reference
+            push @word_ids, $word_id->[0]; # we need to shed the word quantities
+        }
+    }
+
+    return $results unless ( @word_ids );
+    $where = 'Word_ID IN(' . join(",", @word_ids) . ")";
+
+# build the query
+    $query = qq!
+        SELECT Item_ID, SUM(Score)
+            FROM $stable
+            WHERE
+                $where
+            GROUP BY Item_ID
+    !;
+
+    $self->debug( "Union Query: $query" ) if ($self->{_debug});
+
+# prepare the query
+    my $sth = $db->prepare( $query ) or return;
+    $sth->execute() or return;
+
+# get the results
+    my %word_infos = $sth->fetchall_list;
+
+# merge the current result set into found
+    foreach my $item ( keys %{$results} ) {
+        $word_infos{$item} += $results->{$item};
+    };
+
+    return \%word_infos;
+}
+
+sub _intersect_query {
+# ------------------------------------------------------------------------------
+# Takes a list of words and gets all words that match all the keywords
+# returns { itemid -> score } of hits that match
+#
+    my $self    = shift;
+    my $words   = shift;
+    my $results = shift || {};
+
+    $words or return $results;
+    keys %{$words} or return $results;
+
+    my ( $query, $where, $db, $word_infos, $word_hits );
+    my ( $table, $wtable, $stable) = $self->_table_names();
+
+# have we left any of our words out?
+    $db         = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
+    $word_infos = $self->_word_infos( $words ) or return {};
+    if ( keys %{$word_infos} < keys %{$words} ) {
+        return {};
+    }
+
+    $self->debug_dumper( "Keyword Intersect words: ", $word_infos ) if ($self->{_debug});
+
+# take the words and get a hash of the word scores
+    foreach my $word ( keys %{$word_infos} ) {
+
+        my $total_freq = 0;
+        foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
+            $total_freq += $word_synonyms->[1];
+        }
+
+        $word_hits->{$word} = $total_freq or return;
+
+    }
+
+# so now, sort out the words from lowest frequency to highest frequency
+    my @search_order = sort { $word_hits->{$a} <=> $word_hits->{$b} } keys %{$word_hits};
+
+    $self->debug_dumper( "Searching words in this order: ", \@search_order) if ($self->{_debug});
+
+# find out how we're going to handle the searching, if the first elements
+
+################################################################################
+### The following part is for smaller intersect subsets
+################################################################################
+    my $intersect = $results;
+    foreach my $word ( @search_order ) {
+
+# setup the where clause to get all the words associated
+        my $where   = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
+
+# setup the intersect for the previous if required. for iterative intersecting
+        if ( keys %{$intersect} ) {
+            $where  .= " AND Item_ID in(" . join(",",keys %{$intersect}) . ")";
+        }
+
+# make the database engine work a little bit
+        $query = qq!
+            SELECT Item_ID, SUM(Score) AS Score
+            FROM $stable
+            WHERE
+                $where
+            GROUP BY Item_ID
+        !;
+        $self->debug( "Intersect Query: $query" ) if ($self->{_debug});
+        my $intersect_sth = $db->prepare( $query );
+
+        $intersect_sth->execute();
+
+# get a list of all the matches
+        my $matches = $intersect_sth->fetchall_arrayref();
+
+        $self->debug_dumper( "Matches found for $word: ", $matches ) if ($self->{_debug});
+
+# go through all the matches and intersect them
+        my %tmp = ();
+        foreach my $row ( @{$matches} ) {
+            my ( $itemid, $score ) = @{$row};
+            $intersect->{$itemid} ||= 0;
+            $tmp{ $itemid } = $intersect->{$itemid} + $score;
+        }
+
+# inform the system of that development
+        %tmp or return;
+        $intersect = \%tmp;
+    }
+
+    return $intersect;
+}
+
+sub _disjoin_query {
+#------------------------------------------------------------
+    my $self    = shift;
+    my $words   = shift;
+    my $results = shift || {};
+    $words or return $results;
+
+    my ( $query, $where, $db, $word_infos, $word_hits );
+    my ( $table, $wtable, $stable) = $self->_table_names();
+
+    $db = $self->{table}->{driver} or return $results;
+
+# have we left any of our words out?
+    $word_infos = $self->_word_infos( $words ) or return $results;
+#   if ( keys %{$word_infos} < keys %{$words} ) {
+#       return $results;
+#   }
+
+# take the words and get a hash of the word scores
+    foreach my $word ( keys %{$word_infos} ) {
+        my $total_freq = 0;
+        foreach my $word_synonyms ( $word_infos->{$word} ) {
+            $total_freq += ( $word_synonyms->[0] || 0 );
+        }
+# if the value is null this mean there is actually no results, whoops!
+        $total_freq and $word_hits->{$word} = $total_freq;
+    }
+
+# so now, sort out the words from lowest frequency to highest frequency
+    my @search_order = sort { $word_hits->{$b} <=> $word_hits->{$b} } keys %{$word_hits};
+    $self->debug_dumper( "Disjoining words in the following order: ", \@search_order) if ($self->{_debug});
+
+################################################################################
+### This following part is for smaller disjoin presets
+################################################################################
+    foreach my $word ( @search_order ) {
+
+# setup the where clause to get all the words associated
+        my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
+
+# setup the intersect for the previous if required. for iterative intersecting
+        if ( keys %{$results} ) {
+            $where .= " AND Item_ID in(" . join(",", keys %{$results}) . ")";
+        }
+
+# make the database engine work a little bit
+        $query = qq!
+            SELECT Item_ID
+            FROM $stable
+            WHERE
+                $where
+            GROUP BY Item_ID
+        !;
+        $self->debug($query) if ($self->{_debug});
+        my $intersect_sth = $db->prepare( $query );
+
+        $intersect_sth->execute();
+
+# get a list of all the matches
+        my $matches = $intersect_sth->fetchall_arrayref();
+
+# strip the matches from the current result set
+        foreach my $word ( map { $_->[0] } @{$matches}) {
+            delete $results->{$word};
+        }
+    }
+
+    return $results;
+}
+
+sub _phrase_disjoin_query {
+#------------------------------------------------------------
+# subtracts the found phrases from the list
+    my $self    = shift;
+    my $phrases = shift;
+    my $results = shift || {};
+    $phrases or return $results;
+
+    foreach my $phrase ( values %{$phrases} ) {
+        my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
+
+# perform disjoin
+        foreach my $itemid ( keys %{$temp} ) {
+            $self->debug( "Deleting $itemid from list" ) if ($self->{_debug});
+            delete $results->{$itemid};
+        }
+    }
+
+    return $results;
+}
+
+sub _phrase_intersect_query {
+#------------------------------------------------------------
+# intersects phrases together
+    my $self    = shift;
+    my $phrases = shift;
+    my $results = shift || {};
+
+    $phrases or return $results;
+
+    foreach my $phrase ( values %{$phrases} ) {
+        my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
+
+# perform intersect
+        foreach my $itemid ( keys %{$temp} ) {
+            $temp->{$itemid} += $results->{$itemid} || 0;
+        }
+        $results = $temp;
+
+    }
+
+    return $results;
+
+}
+
+sub _phrase_query {
+#------------------------------------------------------------
+# this is a phrase union query
+    my $self    = shift;
+    my $phrases = shift or return;
+    my $results = shift || {};
+
+    foreach my $phrase ( values %{$phrases} ) {
+        $self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug});
+        $results = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info}, $results );
+    }
+
+    return $results;
+
+}
+
+sub _get_phrase {
+#------------------------------------------------------------
+    my $self    = shift;
+    my $wordlist= shift;
+    my $word_info = shift;
+    my $results = shift || {};
+
+    $wordlist or return $results;
+
+    my ( $query, $where, $db, $word_infos, %word_hits );
+    my ( $table, $wtable, $stable) = $self->_table_names();
+    my ($pk) = $self->{table}->pk;
+
+    $self->debug_dumper( "Getting words: ", $wordlist ) if ($self->{_debug});
+
+# get all the word ids that we want to handle   
+    $db         = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
+    $word_infos = $self->_word_infos( $word_info ) or return;
+
+
+    $self->debug_dumper( "Word infos: ", $word_infos ) if ($self->{_debug});
+
+# take the words and get a hash of the word scores
+    foreach my $word ( keys %{$word_infos} ) {
+
+        @{$word_infos->{$word} || []} or return;
+
+        my $total_freq = 0;
+        foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
+            $total_freq += $word_synonyms->[1];
+        }
+
+# if the value is null this mean there is actually no results, whoops!
+        $word_hits{$word} = $total_freq;
+    }
+
+    $self->debug_dumper( "With synonyms tallied: ",  \%word_hits ) if ($self->{_debug});
+
+# so now, setup the order of search
+    my $i = 0;
+    my %word_order = map { $_ => $i++ } @{$wordlist};
+    my @search_order = sort { $word_hits{$a} <=> $word_hits{$b} } keys %word_hits;
+
+    $self->debug_dumper( "Word search order: ", \@search_order ) if ($self->{_debug});
+
+################################################################################
+### This following part is for smaller phrases
+################################################################################
+# start getting words in order of their frequency
+    my %matches = ();
+    my $index = 0;
+    foreach my $word ( @search_order ) {
+
+# setup the where clause for the individual words, firstly
+        if ( keys %matches ) {
+            my $vector  = $word_order{$word} - $index;
+            $where = '(';
+            $where =
+                '(' .
+                join(
+                    " OR ",
+                    map(
+                        "Item_ID = $_ AND Word_Pos IN(" . join(",", map $_->[0] + $vector, @{$matches{$_}}) . ')',
+                        keys %matches
+                    )
+                ) .
+                ") AND ";
+        }
+        else {
+            $where = '';
+        }
+
+        $where .= "Word_ID IN(" . ( join ",", map { $_->[0] || () } @{$word_infos->{$word}} or return $results ) . ')';
+
+        $query = qq!
+            SELECT 
+                Item_ID, Score, Word_Pos
+            FROM 
+                $stable 
+            WHERE
+                $where
+        !;
+
+        $self->debug( "Phrase get for '$word': " . $query ) if ($self->{_debug});
+        my $sth = $db->prepare( $query );
+        $sth->execute();
+
+        %matches = ();
+
+        while (my $hit = $sth->fetchrow_arrayref) {
+            push @{$matches{$hit->[0]}}, [ $hit->[2], $hit->[1] ];
+        }
+
+# If there are no values stored in %matches, it means that for
+# this keyword, there have been no hits based upon position.
+# In that case, terminate and return a null result
+        keys %matches or last;
+
+# where were we in the string?
+        $index = $word_order{$word};
+    }
+
+# now tally up all the scores and merge the new records in
+    foreach my $itemid ( keys %matches ) {
+        my $score = 0;
+        foreach my $sub_total ( @{$matches{$itemid}} ) {
+            $score += $sub_total->[1];
+        }
+        $results->{$itemid} += $score;
+    }
+
+    return $results;
+}
+
+sub get_wordids {
+# ------------------------------------------------------------------------------
+# Get a list of words 
+#
+    my $self     = shift;
+    my $elements = shift or return;
+    my $mode     = lc shift || 'keywords';
+
+    if ( $mode eq 'keywords' ) {
+        $elements = $self->_get_wordid($elements);
+    }
+    else {
+        foreach my $phrase ( keys %$elements ) {
+            my $results = $self->_get_wordid({
+                map { ($_ => { substring => 0 }) } @{$elements->{$phrase}->{phrase}}
+            });
+
+            $elements->{$phrase}->{word_info} = $results;
+        }
+    }
+
+    return $elements;
+}
+
+sub _get_wordid {
+# ------------------------------------------------------------------------------
+# Get a list of words 
+#
+    my $self  = shift;
+    my $words = shift;
+    my $tbl   = $self->{table};
+    
+    my ( $table, $wtable, $stable) = $self->_table_names();
+
+    foreach my $word ( keys %$words ) {
+        my $query =
+            qq!SELECT Word_ID, Frequency FROM $wtable WHERE Word LIKE '! .
+            quotemeta($word) .
+            ( $words->{$word}->{substring} ? '%' : '' ) .
+            "'";
+        my $sth = $tbl->do_query($query) or next;
+        my $tmp = { $sth->fetchall_list };
+
+        $words->{$word}->{word_info} = $tmp;
+    }
+
+    return $words;
+}
+
+##
+# Internal Use
+# $self->_cgi_to_hash ($in);
+# --------------------------
+#   Creates a hash ref from a cgi object.
+##
+sub _cgi_to_hash {
+    my ($self, $cgi) = @_;
+    $cgi and UNIVERSAL::can($cgi, 'param') or return $self->error(NODRIVER => 'FATAL');
+    my @keys = $cgi->param;
+    my $result = {};
+    foreach my $key (@keys) {
+        my @values = $cgi->param($key);
+        if (@values == 1) { $result->{$key} = $values[0] }
+        else              { $result->{$key} = \@values   }
+    }
+    return $result;
+}
+
+1;