discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/Base/Search.pm

573 lines
18 KiB
Perl
Raw Permalink Normal View History

2024-06-17 11:49:12 +00:00
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::Base
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.25 2004/08/28 03:53:46 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Base classes upon which all search drivers are based
#
package GT::SQL::Search::Base::Search;
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
use GT::Base;
use GT::SQL::Search::Base::Common;
@ISA = qw( GT::Base GT::SQL::Search::Base::Common);
#--------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.25 $ =~ /(\d+)\.(\d+)/;
@ISA = qw/ GT::Base /;
$ATTRIBS = {
'stopwords' => $STOPWORDS,
'mh' => 25,
'nh' => 1,
'ww' => undef,
'ma' => undef,
'bool' => undef,
'substring' => 0,
'query' => '',
'sb' => 'score',
'so' => '',
'score_col' => 'SCORE',
'score_sort'=> 0,
'debug' => 0,
'_debug' => 0,
# query related
'db' => undef,
'table' => undef,
'filter' => undef,
'callback' => undef,
# strict matching of indexed words, accents on words do count
'sm' => 0,
'min_word_size' => 3,
'max_word_size' => 50,
};
sub init {
#--------------------------------------------------------------------------------
# Initialises the Search object
#
my $self = shift;
my $input = $self->common_param(@_);
$self->set($input);
# now handle filters...,
my $tbl = $self->{table};
my $cols = $tbl->cols();
my %filters = map {
(my $tmp = $_) =~ s/-[lg]t$//;
exists $cols->{$tmp} ? ($_ => $input->{$_}) : ()
} keys %{$input};
if ( keys %filters ) {
$self->debug_dumper( "INIT: Creating Filters: ", \%filters ) if ( $self->{_debug} );
$self->filter(\%filters);
}
$self->{table}->connect;
}
sub query {
#--------------------------------------------------------------------------------
# Returns a sth based on a query
#
# Options:
# - paging
# mh : max hits
# nh : number hit (or page of hits)
#
# - searching
# ww : whole word
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
# substring : search for substrings of words
# bool : 'and' => and search, 'or' => or search, '' => regular query
# query : the string of things to ask for
#
# - filtering
# field_name : value # Find all rows with field_name = value
# field_name : ">value" # Find all rows with field_name > value.
# field_name : "<value" # Find all rows with field_name < value.
# field_name-gt : value # Find all rows with field_name > value.
# field_name-lt : value # Find all rows with field_name < value.
#
# Parameters:
# ( $CGI ) : a single cgi object
# ( $HASH ) : a hash of the parameters
#
my $self = shift;
# find out what sort of a parameter we're dealing with
my $input = $self->common_param(@_);
# add additional parameters if required
foreach my $parameter ( keys %{$ATTRIBS} ) {
if ( not exists $input->{$parameter} ) {
$input->{$parameter} = $self->{$parameter};
}
}
# parse query...,
$self->debug( "Search Query: $$input{query}", 1 ) if ($self->{_debug});
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
$self->{'rejected_keywords'} = $rejected;
# setup the additional input parameters
$query = $self->_preset_options( $query, $input );
$self->debug_dumper( "Set the pre-options: ", $query ) if ($self->{_debug});
# now sort into distinct buckets
my $buckets = &_create_buckets( $query );
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
return $self->_query($input, $buckets);
}
sub _query {
#--------------------------------------------------------------------------------
my ( $self, $input, $buckets ) = @_;
# now handle the separate possibilities
my $results = {};
# query can have phrases
$results = $self->_phrase_query( $buckets->{phrases}, $results );
$self->debug_dumper("Did phrase union query. Current result set and scores: ", $results ) if ($self->{_debug});
# query have keywords
$results = $self->_union_query( $buckets->{keywords}, $results );
$self->debug_dumper("Did keyword union query. Current result set and scores: ", $results ) if ($self->{_debug});
# query must have phrases
$results = $self->_phrase_intersect_query( $buckets->{phrases_must}, $results );
$self->debug_dumper("Did phrase intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
# query must have keywords
$results = $self->_intersect_query( $buckets->{keywords_must}, $results );
$self->debug_dumper("Did keyword intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
# query cannot have keywords
$results = $self->_disjoin_query( $buckets->{keywords_cannot}, $results );
$self->debug_dumper("Did keyword disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
# query cannot have phrases
$results = $self->_phrase_disjoin_query( $buckets->{phrases_cannot}, $results);
$self->debug_dumper("Did phrase disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
# now handle filters
my $cols = $self->{'table'}->cols();
my %filters = map {
(my $tmp = $_) =~ s/-[lg]t$//;
$cols->{$tmp} ? ($_ => $input->{$_}) : ()
} keys %{$input};
if (keys %filters) {
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
$results = $self->filter(\%filters, $results);
}
elsif ($self->{filter}) {
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
$results = $self->_filter_query( $self->{filter}, $results );
}
else {
$self->debug( "No filters being used.") if ($self->{_debug});
}
# now this query should probably clear the filters once it's been used, so i'll dothat here
$self->{filter} = undef;
# now run through a callback function if needed.
if ($self->{callback}) {
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
return $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
}
$self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
$results = $self->{callback}->($self, $results);
$self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
}
# so how many hits did we get?
$self->{table}->{last_hits} = ( $self->{rows} = scalar($results ? keys %{$results} : ()) );
# and now create a search sth object to handle all this
return $self->sth( $results );
}
sub sth {
#--------------------------------------------------------------------------------
my $self = shift;
my $results = shift;
require GT::SQL::Search::Base::STH;
my $sth = GT::SQL::Search::STH->new(
'results' => $results,
'db' => $self->{table}->{driver},
# pass the following attributes down to the STH handler
map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
);
return $sth;
}
sub rows {
#--------------------------------------------------------------------------------
# after a query is run, returns the number of rows
my $self = shift;
return $self->{rows} || 0;
}
sub _add_filters {
#--------------------------------------------------------------------------------
# creates the filter object
my $self = shift;
my $filter;
# find out how we're calling the parameters
if ( ref $_[0] eq 'GT::SQL::Condition' ) {
$filter = shift;
}
elsif ( ref $_[0] eq 'HASH' ) {
# setup the query condition using the build_query condition method
# build the condition object
my %opts = %{ shift() || {} };
delete $opts{query};
$filter = $self->{table}->build_query_cond( \%opts, $self->{table}{schema}{cols} );
}
else {
return $self->error ('BADARGS', 'FATAL', "Invalid argument: $_[0] passed to _add_filter");
}
# Use ref, as someone can pass in filter => 1 and mess things up.
ref $self->{filter} ? $self->{filter}->add ($filter) : ($self->{filter} = $filter);
$self->debug_dumper( "Filters: ", $self->{filter} ) if ($self->{_debug});
return $self->{filter};
}
sub _preset_options {
#--------------------------------------------------------------------------------
# sets up word parameters
my $self = shift;
my $query = shift or return;
my $input = shift or return $query;
# whole word searching
if ( defined $input->{'ww'} or defined $self->{'ww'}) {
if ( defined $input->{'ww'} ? $input->{'ww'} : $self->{'ww'} ) {
for ( keys %{$query} ) { $query->{$_}->{mode} = 'substring'; }
}
}
# substring searching
if ( defined $input->{'substring'} or defined $self->{'substring'}) {
if ( defined $input->{'substring'} ? $input->{'substring'} : $self->{'substring'} ) {
for ( keys %{$query} ) { $query->{$_}->{'substring'} = 1; }
}
}
if ( defined $input->{'ma'} or defined $self->{'ma'} ) {
# each keyword must be included
if ( defined $input->{'ma'} ? $input->{'ma'} : $self->{'ma'} ) {
for ( keys %{$query} ) {
next if $query->{$_}->{mode} eq 'cannot';
$query->{$_}->{mode} = 'must';
}
}
# each word can be included but is not necessary
else {
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
}
}
# some more and or searches, only if user hasn't put +word -word
if ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'and' ) {
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
for ( keys %{$query} ) {
next if $query->{$_}->{mode} eq 'cannot';
$query->{$_}->{mode} = 'must';
}
}
}
elsif ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'or' ) {
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
}
}
return $query;
}
sub _phrase_query { $_[1] }
sub _union_query { $_[1] }
sub _phrase_intersect_query { $_[1] }
sub _intersect_query { $_[1] }
sub _disjoin_query { $_[1] }
sub _phrase_disjoin_query { $_[1] }
sub filter {
#--------------------------------------------------------------------------------
# adds a filter
#
my $self = shift;
# add filters..,
my $filters = $self->_add_filters( shift );
my $results = shift;
# see if we need to execute a search, otherwise just return the current filterset
defined $results or return $results;
# start doing the filter stuff
return $self->_filter_query( $filters, $results );
}
sub _parse_query_string {
#------------------------------------------------------------
# from Mastering Regular Expressions altered a fair bit
# takes a space delimited string and breaks it up.
#
my $self = shift;
my $text = shift;
my %words = ();
my %reject = ();
my %mode = (
'+' => 'must',
'-' => 'cannot',
'<' => 'greater',
'>' => 'less'
);
# work on the individual elements
my @new = ();
while ( $text =~ m{
# the first part groups the phrase inside the quotes.
# see explanation of this pattern in MRE
([+-]?"[^\"\\]*(?:\\.[^\"\\]*)*)" ?
| (\+?[\w\x80-\xFF\-\*]+),?
| ' '
}gx ) {
my $match = lc $+;
# strip out buffering spaces
$match =~ s/^\s+//; $match =~ s/\s+$//;
# don't bother trying if there is nothing there
next unless $match;
# find out the searching mode
my ($mode, $substring, $phrase);
if (my $m = $mode{substr($match,0,1)}) {
$match = substr($match,1);
$mode = $m;
}
# do we need to substring match?
if ( substr( $match, -1, 1 ) eq "*" ) {
$match = substr($match,0,length($match)-1);
$substring = 1;
}
# find out if we're dealing with a phrase
if ( substr($match,0,1) eq '"' ) {
$self->debug( "Dealing with a phrase: $match" ) if ($self->{_debug});
$match = substr($match,1);
# however, we want to make sure it's a phrase and not something else
my ( $word_list, $rejected ) = $self->_tokenize( $match );
$self->debug_dumper( "Phrase reduced to: ", $word_list ) if ($self->{_debug});
$self->debug_dumper( "Phrase words rejected: ", $rejected ) if ($self->{_debug});
my $word_count = @$word_list;
if ( $word_count > 1 ) { $phrase = $word_list } # ok, standard phrase
elsif ($word_count == 1) { $match = $word_list->[0] } # just a standard word, don't worry about searching by phrase
}
# make sure we can use this word
if ( not $phrase and my $code = $self->_check_word( $match, $self->{stopwords} ) ) {
$reject{ $match } = $code;
next;
}
# now, see if we should toss this word
$words{$match} = {
mode => $mode,
phrase => $phrase,
substring => $substring,
keyword => not $phrase,
};
}
# words is a hashref of:
# {
# word => {
# paramaters => 'values'
# },
# word1 => {
# ...
# },
# ...
# }
#
return( \%words, \%reject );
}
sub _filter_query {
#--------------------------------------------------------------------------------
# get the results from the filter
#
my $self = shift;
my $filters = shift;
my $results = shift or return {};
keys %{$results} or return $results;
my $table = $self->{table};
my $tname = $table->name();
# setup the where clause
my $where = $filters->sql() or return $results;
my ($pk) = $table->pk;
$where .= qq! AND $pk IN (! . join(',', keys %$results) . ')';
# now do the filter
my $query = qq!
SELECT $pk
FROM
$tname
WHERE
$where
!;
$self->debug( "Filter Query: $query" ) if ($self->{_debug});
my $sth = $self->{table}->{driver}->prepare($query);
$sth->execute();
# get all the results
my $aref = $sth->fetchall_arrayref;
return {
map {
$_->[0] => $results->{$_->[0]}
} @$aref
};
}
sub _create_buckets {
#------------------------------------------------------------
# takes the output from _parse_query_string and creates a
# bucket hash of all the different types of searching
# possible
my $query = shift or return;
my %buckets;
# put each word in the appropriate hash bucket
foreach my $parameter ( keys %{$query} ) {
my $word_data = $query->{$parameter};
# the following is slower, however, done that way to be syntatically legible
if ( $word_data->{'phrase'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'} =~ /(must|cannot)/ ))) {
$buckets{"phrases_$1"}->{$parameter} = $word_data;
}
elsif ( $word_data->{'phrase'} ) {
$buckets{'phrases'}->{$parameter} = $word_data;
}
elsif ( $word_data->{'keyword'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'}=~ /(must|cannot|greater|less)/) )) {
$buckets{"keywords_$1"}->{$parameter} = $word_data;
}
else {
$buckets{'keywords'}->{$parameter} = $word_data;
}
}
return \%buckets;
}
sub alternate_driver_query {
#--------------------------------------------------------------------------------
my ( $self, $drivername, $input ) = @_;
$drivername = uc $drivername;
require GT::SQL::Search;
my $driver = GT::SQL::Search->load_search({ %$input, table => $self->{table}, driver => $drivername });
my $sth = $driver->query( $input );
foreach ( qw( rows rejected_keywords ) ) { $self->{$_} = $driver->{$_}; }
return $sth;
}
sub clean_sb {
# -------------------------------------------------------------------------------
# Convert the sort by, sort order into an sql string.
#
my ($class, $sb, $so) = @_;
my $output = '';
return $output unless ($sb);
# Remove score attribute, used only for internal indexes.
$sb =~ s/^\s*score\b//;
$sb =~ s/,?\s*\bscore\b//;
if ($sb and not ref $sb) {
if ($sb =~ /^[\w\s,]+$/) {
if ($sb =~ /\s(?:asc|desc)/i) {
$output = 'ORDER BY ' . $sb;
}
else {
$output = 'ORDER BY ' . $sb . ' ' . $so;
}
}
else {
$class->error('BADSB', 'WARN', $sb);
}
}
elsif (ref $sb eq 'ARRAY') {
foreach ( @$sb ) {
/^[\w\s,]+$/ or $class->error( 'BADSB', 'WARN', $sb ), next;
}
$output = 'ORDER BY ' . join(',', @$sb);
}
return $output;
}
sub debug_dumper {
#--------------------------------------------------------------------------------
# calls debug but also dumps all the messages
my $self = shift;
my $message = shift;
my $level = ref $_[0] ? 1 : defined $_[0] ? shift : 0;
if ( $self->{_debug} >= $level ) {
require GT::Dumper;
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )) if ($self->{_debug});
}
}
1;