573 lines
18 KiB
Perl
573 lines
18 KiB
Perl
# ==================================================================
|
|
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
|
#
|
|
# GT::SQL::Search::Base
|
|
# Author : Aki Mimoto
|
|
# CVS Info : 087,071,086,086,085
|
|
# $Id: Search.pm,v 1.25 2004/08/28 03:53:46 jagerman Exp $
|
|
#
|
|
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
|
# ==================================================================
|
|
#
|
|
# Description:
|
|
# Base classes upon which all search drivers are based
|
|
#
|
|
|
|
package GT::SQL::Search::Base::Search;
|
|
|
|
|
|
use strict;
|
|
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
|
|
use GT::Base;
|
|
use GT::SQL::Search::Base::Common;
|
|
@ISA = qw( GT::Base GT::SQL::Search::Base::Common);
|
|
|
|
#--------------------------------------------------------------------------------
|
|
# Preamble information related to the object
|
|
|
|
$DEBUG = 0;
|
|
$VERSION = sprintf "%d.%03d", q$Revision: 1.25 $ =~ /(\d+)\.(\d+)/;
|
|
@ISA = qw/ GT::Base /;
|
|
|
|
$ATTRIBS = {
|
|
'stopwords' => $STOPWORDS,
|
|
'mh' => 25,
|
|
'nh' => 1,
|
|
'ww' => undef,
|
|
'ma' => undef,
|
|
'bool' => undef,
|
|
'substring' => 0,
|
|
'query' => '',
|
|
'sb' => 'score',
|
|
'so' => '',
|
|
'score_col' => 'SCORE',
|
|
'score_sort'=> 0,
|
|
'debug' => 0,
|
|
'_debug' => 0,
|
|
|
|
# query related
|
|
'db' => undef,
|
|
'table' => undef,
|
|
'filter' => undef,
|
|
'callback' => undef,
|
|
|
|
# strict matching of indexed words, accents on words do count
|
|
'sm' => 0,
|
|
'min_word_size' => 3,
|
|
'max_word_size' => 50,
|
|
};
|
|
|
|
sub init {
|
|
#--------------------------------------------------------------------------------
|
|
# Initialises the Search object
|
|
#
|
|
my $self = shift;
|
|
my $input = $self->common_param(@_);
|
|
|
|
$self->set($input);
|
|
|
|
# now handle filters...,
|
|
my $tbl = $self->{table};
|
|
my $cols = $tbl->cols();
|
|
my %filters = map {
|
|
(my $tmp = $_) =~ s/-[lg]t$//;
|
|
exists $cols->{$tmp} ? ($_ => $input->{$_}) : ()
|
|
} keys %{$input};
|
|
|
|
if ( keys %filters ) {
|
|
$self->debug_dumper( "INIT: Creating Filters: ", \%filters ) if ( $self->{_debug} );
|
|
$self->filter(\%filters);
|
|
}
|
|
|
|
$self->{table}->connect;
|
|
}
|
|
|
|
sub query {
|
|
#--------------------------------------------------------------------------------
|
|
# Returns a sth based on a query
|
|
#
|
|
# Options:
|
|
# - paging
|
|
# mh : max hits
|
|
# nh : number hit (or page of hits)
|
|
#
|
|
# - searching
|
|
# ww : whole word
|
|
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
|
# substring : search for substrings of words
|
|
# bool : 'and' => and search, 'or' => or search, '' => regular query
|
|
# query : the string of things to ask for
|
|
#
|
|
# - filtering
|
|
# field_name : value # Find all rows with field_name = value
|
|
# field_name : ">value" # Find all rows with field_name > value.
|
|
# field_name : "<value" # Find all rows with field_name < value.
|
|
# field_name-gt : value # Find all rows with field_name > value.
|
|
# field_name-lt : value # Find all rows with field_name < value.
|
|
#
|
|
# Parameters:
|
|
# ( $CGI ) : a single cgi object
|
|
# ( $HASH ) : a hash of the parameters
|
|
#
|
|
my $self = shift;
|
|
# find out what sort of a parameter we're dealing with
|
|
my $input = $self->common_param(@_);
|
|
|
|
# add additional parameters if required
|
|
foreach my $parameter ( keys %{$ATTRIBS} ) {
|
|
if ( not exists $input->{$parameter} ) {
|
|
$input->{$parameter} = $self->{$parameter};
|
|
}
|
|
}
|
|
|
|
# parse query...,
|
|
$self->debug( "Search Query: $$input{query}", 1 ) if ($self->{_debug});
|
|
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
|
|
|
|
$self->{'rejected_keywords'} = $rejected;
|
|
|
|
# setup the additional input parameters
|
|
$query = $self->_preset_options( $query, $input );
|
|
|
|
$self->debug_dumper( "Set the pre-options: ", $query ) if ($self->{_debug});
|
|
|
|
# now sort into distinct buckets
|
|
my $buckets = &_create_buckets( $query );
|
|
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
|
|
|
|
return $self->_query($input, $buckets);
|
|
}
|
|
|
|
sub _query {
|
|
#--------------------------------------------------------------------------------
|
|
my ( $self, $input, $buckets ) = @_;
|
|
|
|
# now handle the separate possibilities
|
|
my $results = {};
|
|
|
|
# query can have phrases
|
|
$results = $self->_phrase_query( $buckets->{phrases}, $results );
|
|
$self->debug_dumper("Did phrase union query. Current result set and scores: ", $results ) if ($self->{_debug});
|
|
|
|
# query have keywords
|
|
$results = $self->_union_query( $buckets->{keywords}, $results );
|
|
$self->debug_dumper("Did keyword union query. Current result set and scores: ", $results ) if ($self->{_debug});
|
|
|
|
# query must have phrases
|
|
$results = $self->_phrase_intersect_query( $buckets->{phrases_must}, $results );
|
|
$self->debug_dumper("Did phrase intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
|
|
|
|
# query must have keywords
|
|
$results = $self->_intersect_query( $buckets->{keywords_must}, $results );
|
|
$self->debug_dumper("Did keyword intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
|
|
|
|
# query cannot have keywords
|
|
$results = $self->_disjoin_query( $buckets->{keywords_cannot}, $results );
|
|
$self->debug_dumper("Did keyword disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
|
|
|
|
# query cannot have phrases
|
|
$results = $self->_phrase_disjoin_query( $buckets->{phrases_cannot}, $results);
|
|
$self->debug_dumper("Did phrase disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
|
|
|
|
# now handle filters
|
|
my $cols = $self->{'table'}->cols();
|
|
my %filters = map {
|
|
(my $tmp = $_) =~ s/-[lg]t$//;
|
|
$cols->{$tmp} ? ($_ => $input->{$_}) : ()
|
|
} keys %{$input};
|
|
|
|
if (keys %filters) {
|
|
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
|
|
$results = $self->filter(\%filters, $results);
|
|
}
|
|
elsif ($self->{filter}) {
|
|
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
|
|
$results = $self->_filter_query( $self->{filter}, $results );
|
|
}
|
|
else {
|
|
$self->debug( "No filters being used.") if ($self->{_debug});
|
|
}
|
|
|
|
# now this query should probably clear the filters once it's been used, so i'll dothat here
|
|
$self->{filter} = undef;
|
|
|
|
# now run through a callback function if needed.
|
|
if ($self->{callback}) {
|
|
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
|
|
return $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
|
|
}
|
|
$self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
|
$results = $self->{callback}->($self, $results);
|
|
$self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
|
}
|
|
|
|
# so how many hits did we get?
|
|
$self->{table}->{last_hits} = ( $self->{rows} = scalar($results ? keys %{$results} : ()) );
|
|
|
|
# and now create a search sth object to handle all this
|
|
return $self->sth( $results );
|
|
}
|
|
|
|
sub sth {
|
|
#--------------------------------------------------------------------------------
|
|
my $self = shift;
|
|
my $results = shift;
|
|
|
|
require GT::SQL::Search::Base::STH;
|
|
my $sth = GT::SQL::Search::STH->new(
|
|
'results' => $results,
|
|
'db' => $self->{table}->{driver},
|
|
# pass the following attributes down to the STH handler
|
|
map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
|
|
);
|
|
|
|
return $sth;
|
|
}
|
|
|
|
sub rows {
|
|
#--------------------------------------------------------------------------------
|
|
# after a query is run, returns the number of rows
|
|
my $self = shift;
|
|
return $self->{rows} || 0;
|
|
}
|
|
|
|
sub _add_filters {
|
|
#--------------------------------------------------------------------------------
|
|
# creates the filter object
|
|
my $self = shift;
|
|
my $filter;
|
|
|
|
# find out how we're calling the parameters
|
|
if ( ref $_[0] eq 'GT::SQL::Condition' ) {
|
|
$filter = shift;
|
|
}
|
|
elsif ( ref $_[0] eq 'HASH' ) {
|
|
|
|
|
|
# setup the query condition using the build_query condition method
|
|
# build the condition object
|
|
my %opts = %{ shift() || {} };
|
|
delete $opts{query};
|
|
|
|
$filter = $self->{table}->build_query_cond( \%opts, $self->{table}{schema}{cols} );
|
|
|
|
}
|
|
else {
|
|
return $self->error ('BADARGS', 'FATAL', "Invalid argument: $_[0] passed to _add_filter");
|
|
}
|
|
|
|
# Use ref, as someone can pass in filter => 1 and mess things up.
|
|
|
|
ref $self->{filter} ? $self->{filter}->add ($filter) : ($self->{filter} = $filter);
|
|
$self->debug_dumper( "Filters: ", $self->{filter} ) if ($self->{_debug});
|
|
|
|
return $self->{filter};
|
|
|
|
}
|
|
|
|
sub _preset_options {
|
|
#--------------------------------------------------------------------------------
|
|
# sets up word parameters
|
|
my $self = shift;
|
|
my $query = shift or return;
|
|
my $input = shift or return $query;
|
|
|
|
# whole word searching
|
|
if ( defined $input->{'ww'} or defined $self->{'ww'}) {
|
|
if ( defined $input->{'ww'} ? $input->{'ww'} : $self->{'ww'} ) {
|
|
for ( keys %{$query} ) { $query->{$_}->{mode} = 'substring'; }
|
|
}
|
|
}
|
|
|
|
# substring searching
|
|
if ( defined $input->{'substring'} or defined $self->{'substring'}) {
|
|
if ( defined $input->{'substring'} ? $input->{'substring'} : $self->{'substring'} ) {
|
|
for ( keys %{$query} ) { $query->{$_}->{'substring'} = 1; }
|
|
}
|
|
}
|
|
|
|
if ( defined $input->{'ma'} or defined $self->{'ma'} ) {
|
|
# each keyword must be included
|
|
if ( defined $input->{'ma'} ? $input->{'ma'} : $self->{'ma'} ) {
|
|
for ( keys %{$query} ) {
|
|
next if $query->{$_}->{mode} eq 'cannot';
|
|
$query->{$_}->{mode} = 'must';
|
|
}
|
|
}
|
|
# each word can be included but is not necessary
|
|
else {
|
|
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
|
|
}
|
|
}
|
|
|
|
# some more and or searches, only if user hasn't put +word -word
|
|
if ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'and' ) {
|
|
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
|
|
for ( keys %{$query} ) {
|
|
next if $query->{$_}->{mode} eq 'cannot';
|
|
$query->{$_}->{mode} = 'must';
|
|
}
|
|
}
|
|
}
|
|
elsif ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'or' ) {
|
|
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
|
|
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
|
|
}
|
|
}
|
|
|
|
return $query;
|
|
}
|
|
|
|
sub _phrase_query { $_[1] }
|
|
sub _union_query { $_[1] }
|
|
sub _phrase_intersect_query { $_[1] }
|
|
sub _intersect_query { $_[1] }
|
|
sub _disjoin_query { $_[1] }
|
|
sub _phrase_disjoin_query { $_[1] }
|
|
|
|
sub filter {
|
|
#--------------------------------------------------------------------------------
|
|
# adds a filter
|
|
#
|
|
my $self = shift;
|
|
|
|
# add filters..,
|
|
my $filters = $self->_add_filters( shift );
|
|
my $results = shift;
|
|
|
|
# see if we need to execute a search, otherwise just return the current filterset
|
|
defined $results or return $results;
|
|
|
|
# start doing the filter stuff
|
|
return $self->_filter_query( $filters, $results );
|
|
}
|
|
|
|
sub _parse_query_string {
|
|
#------------------------------------------------------------
|
|
# from Mastering Regular Expressions altered a fair bit
|
|
# takes a space delimited string and breaks it up.
|
|
#
|
|
my $self = shift;
|
|
my $text = shift;
|
|
|
|
my %words = ();
|
|
my %reject = ();
|
|
my %mode = (
|
|
'+' => 'must',
|
|
'-' => 'cannot',
|
|
'<' => 'greater',
|
|
'>' => 'less'
|
|
);
|
|
|
|
# work on the individual elements
|
|
my @new = ();
|
|
while ( $text =~ m{
|
|
# the first part groups the phrase inside the quotes.
|
|
# see explanation of this pattern in MRE
|
|
([+-]?"[^\"\\]*(?:\\.[^\"\\]*)*)" ?
|
|
| (\+?[\w\x80-\xFF\-\*]+),?
|
|
| ' '
|
|
}gx ) {
|
|
|
|
my $match = lc $+;
|
|
|
|
# strip out buffering spaces
|
|
$match =~ s/^\s+//; $match =~ s/\s+$//;
|
|
|
|
# don't bother trying if there is nothing there
|
|
next unless $match;
|
|
|
|
# find out the searching mode
|
|
my ($mode, $substring, $phrase);
|
|
if (my $m = $mode{substr($match,0,1)}) {
|
|
$match = substr($match,1);
|
|
$mode = $m;
|
|
}
|
|
|
|
# do we need to substring match?
|
|
if ( substr( $match, -1, 1 ) eq "*" ) {
|
|
$match = substr($match,0,length($match)-1);
|
|
$substring = 1;
|
|
}
|
|
|
|
# find out if we're dealing with a phrase
|
|
if ( substr($match,0,1) eq '"' ) {
|
|
$self->debug( "Dealing with a phrase: $match" ) if ($self->{_debug});
|
|
|
|
$match = substr($match,1);
|
|
|
|
# however, we want to make sure it's a phrase and not something else
|
|
my ( $word_list, $rejected ) = $self->_tokenize( $match );
|
|
$self->debug_dumper( "Phrase reduced to: ", $word_list ) if ($self->{_debug});
|
|
$self->debug_dumper( "Phrase words rejected: ", $rejected ) if ($self->{_debug});
|
|
my $word_count = @$word_list;
|
|
|
|
if ( $word_count > 1 ) { $phrase = $word_list } # ok, standard phrase
|
|
elsif ($word_count == 1) { $match = $word_list->[0] } # just a standard word, don't worry about searching by phrase
|
|
}
|
|
|
|
# make sure we can use this word
|
|
if ( not $phrase and my $code = $self->_check_word( $match, $self->{stopwords} ) ) {
|
|
$reject{ $match } = $code;
|
|
next;
|
|
}
|
|
|
|
# now, see if we should toss this word
|
|
$words{$match} = {
|
|
mode => $mode,
|
|
phrase => $phrase,
|
|
substring => $substring,
|
|
keyword => not $phrase,
|
|
};
|
|
}
|
|
|
|
# words is a hashref of:
|
|
# {
|
|
# word => {
|
|
# paramaters => 'values'
|
|
# },
|
|
# word1 => {
|
|
# ...
|
|
# },
|
|
# ...
|
|
# }
|
|
#
|
|
return( \%words, \%reject );
|
|
|
|
}
|
|
|
|
|
|
sub _filter_query {
|
|
#--------------------------------------------------------------------------------
|
|
# get the results from the filter
|
|
#
|
|
my $self = shift;
|
|
my $filters = shift;
|
|
my $results = shift or return {};
|
|
keys %{$results} or return $results;
|
|
|
|
my $table = $self->{table};
|
|
my $tname = $table->name();
|
|
|
|
# setup the where clause
|
|
my $where = $filters->sql() or return $results;
|
|
my ($pk) = $table->pk;
|
|
$where .= qq! AND $pk IN (! . join(',', keys %$results) . ')';
|
|
|
|
# now do the filter
|
|
my $query = qq!
|
|
SELECT $pk
|
|
FROM
|
|
$tname
|
|
WHERE
|
|
$where
|
|
!;
|
|
$self->debug( "Filter Query: $query" ) if ($self->{_debug});
|
|
my $sth = $self->{table}->{driver}->prepare($query);
|
|
$sth->execute();
|
|
|
|
# get all the results
|
|
my $aref = $sth->fetchall_arrayref;
|
|
return {
|
|
map {
|
|
$_->[0] => $results->{$_->[0]}
|
|
} @$aref
|
|
};
|
|
}
|
|
|
|
sub _create_buckets {
|
|
#------------------------------------------------------------
|
|
# takes the output from _parse_query_string and creates a
|
|
# bucket hash of all the different types of searching
|
|
# possible
|
|
my $query = shift or return;
|
|
|
|
my %buckets;
|
|
|
|
# put each word in the appropriate hash bucket
|
|
foreach my $parameter ( keys %{$query} ) {
|
|
|
|
my $word_data = $query->{$parameter};
|
|
|
|
# the following is slower, however, done that way to be syntatically legible
|
|
if ( $word_data->{'phrase'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'} =~ /(must|cannot)/ ))) {
|
|
$buckets{"phrases_$1"}->{$parameter} = $word_data;
|
|
}
|
|
elsif ( $word_data->{'phrase'} ) {
|
|
$buckets{'phrases'}->{$parameter} = $word_data;
|
|
}
|
|
elsif ( $word_data->{'keyword'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'}=~ /(must|cannot|greater|less)/) )) {
|
|
$buckets{"keywords_$1"}->{$parameter} = $word_data;
|
|
}
|
|
else {
|
|
$buckets{'keywords'}->{$parameter} = $word_data;
|
|
}
|
|
|
|
}
|
|
|
|
return \%buckets;
|
|
}
|
|
|
|
sub alternate_driver_query {
|
|
#--------------------------------------------------------------------------------
|
|
my ( $self, $drivername, $input ) = @_;
|
|
|
|
$drivername = uc $drivername;
|
|
require GT::SQL::Search;
|
|
my $driver = GT::SQL::Search->load_search({ %$input, table => $self->{table}, driver => $drivername });
|
|
my $sth = $driver->query( $input );
|
|
foreach ( qw( rows rejected_keywords ) ) { $self->{$_} = $driver->{$_}; }
|
|
return $sth;
|
|
|
|
}
|
|
|
|
sub clean_sb {
|
|
# -------------------------------------------------------------------------------
|
|
# Convert the sort by, sort order into an sql string.
|
|
#
|
|
my ($class, $sb, $so) = @_;
|
|
my $output = '';
|
|
|
|
return $output unless ($sb);
|
|
|
|
# Remove score attribute, used only for internal indexes.
|
|
$sb =~ s/^\s*score\b//;
|
|
$sb =~ s/,?\s*\bscore\b//;
|
|
|
|
if ($sb and not ref $sb) {
|
|
if ($sb =~ /^[\w\s,]+$/) {
|
|
if ($sb =~ /\s(?:asc|desc)/i) {
|
|
$output = 'ORDER BY ' . $sb;
|
|
}
|
|
else {
|
|
$output = 'ORDER BY ' . $sb . ' ' . $so;
|
|
}
|
|
}
|
|
else {
|
|
$class->error('BADSB', 'WARN', $sb);
|
|
}
|
|
}
|
|
elsif (ref $sb eq 'ARRAY') {
|
|
foreach ( @$sb ) {
|
|
/^[\w\s,]+$/ or $class->error( 'BADSB', 'WARN', $sb ), next;
|
|
}
|
|
$output = 'ORDER BY ' . join(',', @$sb);
|
|
}
|
|
return $output;
|
|
}
|
|
|
|
sub debug_dumper {
|
|
#--------------------------------------------------------------------------------
|
|
# calls debug but also dumps all the messages
|
|
my $self = shift;
|
|
my $message = shift;
|
|
my $level = ref $_[0] ? 1 : defined $_[0] ? shift : 0;
|
|
|
|
if ( $self->{_debug} >= $level ) {
|
|
require GT::Dumper;
|
|
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )) if ($self->{_debug});
|
|
}
|
|
}
|
|
|
|
1;
|