# ================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::SQL::Search::Base # Author : Aki Mimoto # CVS Info : 087,071,086,086,085 # $Id: Search.pm,v 1.25 2004/08/28 03:53:46 jagerman Exp $ # # Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. # ================================================================== # # Description: # Base classes upon which all search drivers are based # package GT::SQL::Search::Base::Search; use strict; use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /; use GT::Base; use GT::SQL::Search::Base::Common; @ISA = qw( GT::Base GT::SQL::Search::Base::Common); #-------------------------------------------------------------------------------- # Preamble information related to the object $DEBUG = 0; $VERSION = sprintf "%d.%03d", q$Revision: 1.25 $ =~ /(\d+)\.(\d+)/; @ISA = qw/ GT::Base /; $ATTRIBS = { 'stopwords' => $STOPWORDS, 'mh' => 25, 'nh' => 1, 'ww' => undef, 'ma' => undef, 'bool' => undef, 'substring' => 0, 'query' => '', 'sb' => 'score', 'so' => '', 'score_col' => 'SCORE', 'score_sort'=> 0, 'debug' => 0, '_debug' => 0, # query related 'db' => undef, 'table' => undef, 'filter' => undef, 'callback' => undef, # strict matching of indexed words, accents on words do count 'sm' => 0, 'min_word_size' => 3, 'max_word_size' => 50, }; sub init { #-------------------------------------------------------------------------------- # Initialises the Search object # my $self = shift; my $input = $self->common_param(@_); $self->set($input); # now handle filters..., my $tbl = $self->{table}; my $cols = $tbl->cols(); my %filters = map { (my $tmp = $_) =~ s/-[lg]t$//; exists $cols->{$tmp} ? ($_ => $input->{$_}) : () } keys %{$input}; if ( keys %filters ) { $self->debug_dumper( "INIT: Creating Filters: ", \%filters ) if ( $self->{_debug} ); $self->filter(\%filters); } $self->{table}->connect; } sub query { #-------------------------------------------------------------------------------- # Returns a sth based on a query # # Options: # - paging # mh : max hits # nh : number hit (or page of hits) # # - searching # ww : whole word # ma : 1 => OR match, 0 => AND match, undefined => QUERY # substring : search for substrings of words # bool : 'and' => and search, 'or' => or search, '' => regular query # query : the string of things to ask for # # - filtering # field_name : value # Find all rows with field_name = value # field_name : ">value" # Find all rows with field_name > value. # field_name : " value. # field_name-lt : value # Find all rows with field_name < value. # # Parameters: # ( $CGI ) : a single cgi object # ( $HASH ) : a hash of the parameters # my $self = shift; # find out what sort of a parameter we're dealing with my $input = $self->common_param(@_); # add additional parameters if required foreach my $parameter ( keys %{$ATTRIBS} ) { if ( not exists $input->{$parameter} ) { $input->{$parameter} = $self->{$parameter}; } } # parse query..., $self->debug( "Search Query: $$input{query}", 1 ) if ($self->{_debug}); my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} ); $self->{'rejected_keywords'} = $rejected; # setup the additional input parameters $query = $self->_preset_options( $query, $input ); $self->debug_dumper( "Set the pre-options: ", $query ) if ($self->{_debug}); # now sort into distinct buckets my $buckets = &_create_buckets( $query ); $self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug}); return $self->_query($input, $buckets); } sub _query { #-------------------------------------------------------------------------------- my ( $self, $input, $buckets ) = @_; # now handle the separate possibilities my $results = {}; # query can have phrases $results = $self->_phrase_query( $buckets->{phrases}, $results ); $self->debug_dumper("Did phrase union query. Current result set and scores: ", $results ) if ($self->{_debug}); # query have keywords $results = $self->_union_query( $buckets->{keywords}, $results ); $self->debug_dumper("Did keyword union query. Current result set and scores: ", $results ) if ($self->{_debug}); # query must have phrases $results = $self->_phrase_intersect_query( $buckets->{phrases_must}, $results ); $self->debug_dumper("Did phrase intersect query. Current result set and scores: ", $results ) if ($self->{_debug}); # query must have keywords $results = $self->_intersect_query( $buckets->{keywords_must}, $results ); $self->debug_dumper("Did keyword intersect query. Current result set and scores: ", $results ) if ($self->{_debug}); # query cannot have keywords $results = $self->_disjoin_query( $buckets->{keywords_cannot}, $results ); $self->debug_dumper("Did keyword disjoin query. Current result set and scores: ", $results ) if ($self->{_debug}); # query cannot have phrases $results = $self->_phrase_disjoin_query( $buckets->{phrases_cannot}, $results); $self->debug_dumper("Did phrase disjoin query. Current result set and scores: ", $results ) if ($self->{_debug}); # now handle filters my $cols = $self->{'table'}->cols(); my %filters = map { (my $tmp = $_) =~ s/-[lg]t$//; $cols->{$tmp} ? ($_ => $input->{$_}) : () } keys %{$input}; if (keys %filters) { $self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug}); $results = $self->filter(\%filters, $results); } elsif ($self->{filter}) { $self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug}); $results = $self->_filter_query( $self->{filter}, $results ); } else { $self->debug( "No filters being used.") if ($self->{_debug}); } # now this query should probably clear the filters once it's been used, so i'll dothat here $self->{filter} = undef; # now run through a callback function if needed. if ($self->{callback}) { unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') { return $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!"); } $self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug}); $results = $self->{callback}->($self, $results); $self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug}); } # so how many hits did we get? $self->{table}->{last_hits} = ( $self->{rows} = scalar($results ? keys %{$results} : ()) ); # and now create a search sth object to handle all this return $self->sth( $results ); } sub sth { #-------------------------------------------------------------------------------- my $self = shift; my $results = shift; require GT::SQL::Search::Base::STH; my $sth = GT::SQL::Search::STH->new( 'results' => $results, 'db' => $self->{table}->{driver}, # pass the following attributes down to the STH handler map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /) ); return $sth; } sub rows { #-------------------------------------------------------------------------------- # after a query is run, returns the number of rows my $self = shift; return $self->{rows} || 0; } sub _add_filters { #-------------------------------------------------------------------------------- # creates the filter object my $self = shift; my $filter; # find out how we're calling the parameters if ( ref $_[0] eq 'GT::SQL::Condition' ) { $filter = shift; } elsif ( ref $_[0] eq 'HASH' ) { # setup the query condition using the build_query condition method # build the condition object my %opts = %{ shift() || {} }; delete $opts{query}; $filter = $self->{table}->build_query_cond( \%opts, $self->{table}{schema}{cols} ); } else { return $self->error ('BADARGS', 'FATAL', "Invalid argument: $_[0] passed to _add_filter"); } # Use ref, as someone can pass in filter => 1 and mess things up. ref $self->{filter} ? $self->{filter}->add ($filter) : ($self->{filter} = $filter); $self->debug_dumper( "Filters: ", $self->{filter} ) if ($self->{_debug}); return $self->{filter}; } sub _preset_options { #-------------------------------------------------------------------------------- # sets up word parameters my $self = shift; my $query = shift or return; my $input = shift or return $query; # whole word searching if ( defined $input->{'ww'} or defined $self->{'ww'}) { if ( defined $input->{'ww'} ? $input->{'ww'} : $self->{'ww'} ) { for ( keys %{$query} ) { $query->{$_}->{mode} = 'substring'; } } } # substring searching if ( defined $input->{'substring'} or defined $self->{'substring'}) { if ( defined $input->{'substring'} ? $input->{'substring'} : $self->{'substring'} ) { for ( keys %{$query} ) { $query->{$_}->{'substring'} = 1; } } } if ( defined $input->{'ma'} or defined $self->{'ma'} ) { # each keyword must be included if ( defined $input->{'ma'} ? $input->{'ma'} : $self->{'ma'} ) { for ( keys %{$query} ) { next if $query->{$_}->{mode} eq 'cannot'; $query->{$_}->{mode} = 'must'; } } # each word can be included but is not necessary else { for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; } } } # some more and or searches, only if user hasn't put +word -word if ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'and' ) { unless ($input->{query} =~ /(?:^|\s)[+-]\w/) { for ( keys %{$query} ) { next if $query->{$_}->{mode} eq 'cannot'; $query->{$_}->{mode} = 'must'; } } } elsif ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'or' ) { unless ($input->{query} =~ /(?:^|\s)[+-]\w/) { for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; } } } return $query; } sub _phrase_query { $_[1] } sub _union_query { $_[1] } sub _phrase_intersect_query { $_[1] } sub _intersect_query { $_[1] } sub _disjoin_query { $_[1] } sub _phrase_disjoin_query { $_[1] } sub filter { #-------------------------------------------------------------------------------- # adds a filter # my $self = shift; # add filters.., my $filters = $self->_add_filters( shift ); my $results = shift; # see if we need to execute a search, otherwise just return the current filterset defined $results or return $results; # start doing the filter stuff return $self->_filter_query( $filters, $results ); } sub _parse_query_string { #------------------------------------------------------------ # from Mastering Regular Expressions altered a fair bit # takes a space delimited string and breaks it up. # my $self = shift; my $text = shift; my %words = (); my %reject = (); my %mode = ( '+' => 'must', '-' => 'cannot', '<' => 'greater', '>' => 'less' ); # work on the individual elements my @new = (); while ( $text =~ m{ # the first part groups the phrase inside the quotes. # see explanation of this pattern in MRE ([+-]?"[^\"\\]*(?:\\.[^\"\\]*)*)" ? | (\+?[\w\x80-\xFF\-\*]+),? | ' ' }gx ) { my $match = lc $+; # strip out buffering spaces $match =~ s/^\s+//; $match =~ s/\s+$//; # don't bother trying if there is nothing there next unless $match; # find out the searching mode my ($mode, $substring, $phrase); if (my $m = $mode{substr($match,0,1)}) { $match = substr($match,1); $mode = $m; } # do we need to substring match? if ( substr( $match, -1, 1 ) eq "*" ) { $match = substr($match,0,length($match)-1); $substring = 1; } # find out if we're dealing with a phrase if ( substr($match,0,1) eq '"' ) { $self->debug( "Dealing with a phrase: $match" ) if ($self->{_debug}); $match = substr($match,1); # however, we want to make sure it's a phrase and not something else my ( $word_list, $rejected ) = $self->_tokenize( $match ); $self->debug_dumper( "Phrase reduced to: ", $word_list ) if ($self->{_debug}); $self->debug_dumper( "Phrase words rejected: ", $rejected ) if ($self->{_debug}); my $word_count = @$word_list; if ( $word_count > 1 ) { $phrase = $word_list } # ok, standard phrase elsif ($word_count == 1) { $match = $word_list->[0] } # just a standard word, don't worry about searching by phrase } # make sure we can use this word if ( not $phrase and my $code = $self->_check_word( $match, $self->{stopwords} ) ) { $reject{ $match } = $code; next; } # now, see if we should toss this word $words{$match} = { mode => $mode, phrase => $phrase, substring => $substring, keyword => not $phrase, }; } # words is a hashref of: # { # word => { # paramaters => 'values' # }, # word1 => { # ... # }, # ... # } # return( \%words, \%reject ); } sub _filter_query { #-------------------------------------------------------------------------------- # get the results from the filter # my $self = shift; my $filters = shift; my $results = shift or return {}; keys %{$results} or return $results; my $table = $self->{table}; my $tname = $table->name(); # setup the where clause my $where = $filters->sql() or return $results; my ($pk) = $table->pk; $where .= qq! AND $pk IN (! . join(',', keys %$results) . ')'; # now do the filter my $query = qq! SELECT $pk FROM $tname WHERE $where !; $self->debug( "Filter Query: $query" ) if ($self->{_debug}); my $sth = $self->{table}->{driver}->prepare($query); $sth->execute(); # get all the results my $aref = $sth->fetchall_arrayref; return { map { $_->[0] => $results->{$_->[0]} } @$aref }; } sub _create_buckets { #------------------------------------------------------------ # takes the output from _parse_query_string and creates a # bucket hash of all the different types of searching # possible my $query = shift or return; my %buckets; # put each word in the appropriate hash bucket foreach my $parameter ( keys %{$query} ) { my $word_data = $query->{$parameter}; # the following is slower, however, done that way to be syntatically legible if ( $word_data->{'phrase'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'} =~ /(must|cannot)/ ))) { $buckets{"phrases_$1"}->{$parameter} = $word_data; } elsif ( $word_data->{'phrase'} ) { $buckets{'phrases'}->{$parameter} = $word_data; } elsif ( $word_data->{'keyword'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'}=~ /(must|cannot|greater|less)/) )) { $buckets{"keywords_$1"}->{$parameter} = $word_data; } else { $buckets{'keywords'}->{$parameter} = $word_data; } } return \%buckets; } sub alternate_driver_query { #-------------------------------------------------------------------------------- my ( $self, $drivername, $input ) = @_; $drivername = uc $drivername; require GT::SQL::Search; my $driver = GT::SQL::Search->load_search({ %$input, table => $self->{table}, driver => $drivername }); my $sth = $driver->query( $input ); foreach ( qw( rows rejected_keywords ) ) { $self->{$_} = $driver->{$_}; } return $sth; } sub clean_sb { # ------------------------------------------------------------------------------- # Convert the sort by, sort order into an sql string. # my ($class, $sb, $so) = @_; my $output = ''; return $output unless ($sb); # Remove score attribute, used only for internal indexes. $sb =~ s/^\s*score\b//; $sb =~ s/,?\s*\bscore\b//; if ($sb and not ref $sb) { if ($sb =~ /^[\w\s,]+$/) { if ($sb =~ /\s(?:asc|desc)/i) { $output = 'ORDER BY ' . $sb; } else { $output = 'ORDER BY ' . $sb . ' ' . $so; } } else { $class->error('BADSB', 'WARN', $sb); } } elsif (ref $sb eq 'ARRAY') { foreach ( @$sb ) { /^[\w\s,]+$/ or $class->error( 'BADSB', 'WARN', $sb ), next; } $output = 'ORDER BY ' . join(',', @$sb); } return $output; } sub debug_dumper { #-------------------------------------------------------------------------------- # calls debug but also dumps all the messages my $self = shift; my $message = shift; my $level = ref $_[0] ? 1 : defined $_[0] ? shift : 0; if ( $self->{_debug} >= $level ) { require GT::Dumper; $self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )) if ($self->{_debug}); } } 1;