# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
#   GT::SQL::Search::LUCENE::Search
#   Author  : Scott Beck
#   CVS Info : 087,071,086,086,085
#   $Id: Search.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
# ==================================================================
#
# Description:
#   Class used to search indexed tables.
#

package GT::SQL::Search::LUCENE::Search;
# ------------------------------------------------------------------------------
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS $ERRORS $ERROR_MESSAGE /;
use Lucene;
use GT::TempFile;
use GT::SQL::Search::LUCENE::STH;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );

# ------------------------------------------------------------------------------
# Preamble information related to the object

$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
$ERRORS = {
    SEARCH_ERROR => "Error searching: %s",
    QUERY_ERROR  => "Query error: %s"
};
$ERROR_MESSAGE = 'GT::SQL';

sub load {
# --------------------------------------------------
# Factory entry point: discards the invocant and returns a new
# GT::SQL::Search::LUCENE::Search built from the remaining arguments.
#
    shift;
    return GT::SQL::Search::LUCENE::Search->new(@_)
}

sub _get_path {
# --------------------------------------------------
# Returns the index directory path: the temp directory reported by
# GT::TempFile::find_tmpdir() joined with the table name.
#
    my $self = shift;
    my $name = $self->{table}->name;
    my $tmpdir = GT::TempFile::find_tmpdir();
    my $path = $tmpdir . '/' . $name;
    $path = $1 if $path =~ /(.*)/; # XXX untaint
    return $path;
}

sub _get_store {
# --------------------------------------------------
# Returns the Lucene FSDirectory for this table's index path. $create is
# passed straight through to getDirectory().
#
    my ($self, $create) = @_;
    my $path = $self->_get_path;
    return Lucene::Store::FSDirectory->getDirectory($path, $create);
}

sub query {
# --------------------------------------------------
# Returns a sth based on a query
#
# Options:
#   - paging
#       mh          : max hits
#       nh          : number hit (or page of hits)
#
#   - searching
#       ww          : whole word
#       ma          : 1 => OR match, 0 => AND match, undefined => QUERY
#       substring   : search for substrings of words
#       bool        : 'and' => and search, 'or' => or search, '' => regular query
#       query       : the string of things to ask for
#
#   - filtering
#       field_name    : value
#           Find all rows with field_name = value
#       field_name    : ">value"
#           Find all rows with field_name > value.
#       field_name    : "<value"
#           Find all rows with field_name < value.
#       field_name-gt : value
#           Find all rows with field_name > value.
#       field_name-lt : value
#           Find all rows with field_name < value.
#
# Parameters:
#   ( $CGI )  : a single cgi object
#   ( $HASH ) : a hash of the parameters
#
# Returns:
#   A GT::SQL::Search::LUCENE::STH object (see sth()). When the index cannot
#   be opened, the query cannot be parsed, or the callback is not a code ref,
#   an empty statement handle reporting zero rows is returned.
#
    my $self = shift;

# create an easily accessible argument hash
    my $args = $self->common_param(@_);
    my $tbl  = $self->{table};

    my $query = delete $args->{query} || $self->{query} || '';

# parse query
    $self->debug( "Search Query: $query" ) if ($self->{_debug});
    my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
    $self->{rejected_keywords} = $rejected;

# setup the additional input parameters
    $query_struct = $self->_preset_options( $query_struct, $args );

# now sort into distinct buckets
    my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
    $self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});

# with the buckets, it's now possible to create a query string
# that can be passed directly into the Lucene search.
    my $query_string = '';
    foreach my $search_type ( keys %$buckets ) {
        my $bucket = $buckets->{$search_type};
        foreach my $token ( keys %$bucket ) {
            next unless $token;
            my $properties = $bucket->{$token} or next;

            # escape characters that are query syntax, and drop bare
            # lowercase "or"/"and" words from the token
            $token =~ s/(["()])/\\$1/g;
            $token =~ s/\b(or|and)\b/ /g;
            my $e = ' ';

            # handle boolean operations
            $properties->{mode} ||= '';
            if ( $properties->{mode} eq 'must' ) {
                $e .= '+';
            }
            elsif ( $properties->{mode} eq 'cannot' ) {
                $e .= '-';
            }

            # deal with phrase vs keyword
            if ( $properties->{phrase} ) {
                # NOTE(review): when the token starts or ends with a quote
                # nothing at all is appended for it - confirm this is intended.
                $e .= '"' . $token . '"' unless $token =~ /^"|"$/;
            }
            else {
                $e .= $token;

                # substring match
                if ($properties->{mode} ne 'substring') {
                    $e .= '*' if $properties->{substring};
                }
            }
            $query_string .= $e;
        }
    }

# calculate the cursor constraints
    foreach my $k (qw( nh mh so sb )) {
        next if defined $args->{$k};
        $args->{$k} = $self->{$k} || '';
    }
    $args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
    $args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
    $args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score'; # Score is the default
    $args->{so} = (defined $args->{so} and $args->{so} =~ /^(asc|desc)(?:end)?$/i) ? lc($1) : 'asc';

# pages are 1-based; a page of 0 would yield a negative offset and negative
# document indexes below, so treat it as the first page
    $args->{nh} = 1 if $args->{nh} < 1;

    my %weights = $tbl->_weight_cols();
    my @sortfields;
    my $do_mysql_sort = 0;
    for (ref($args->{sb}) eq 'ARRAY' ? @{$args->{sb}} : $args->{sb}) {
        if (!exists $weights{$_}) {
            # not an indexed column: any sort other than score is flagged
            # so it can be handed down to the statement handle as db_sort
            $do_mysql_sort = 1 if $_ ne 'score';
            next;
        }
        push @sortfields, Lucene::Search::SortField->new($_, $args->{so} ne 'asc');
    }
    my $sort = @sortfields
        ? Lucene::Search::Sort->new(@sortfields)
        : Lucene::Search::Sort->RELEVANCE;

    my $store    = $self->_get_store(0);
    my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new;
    my $searcher = eval { Lucene::Search::IndexSearcher->new($store); };
    if ($@) {
        $self->{_debug} and $self->error('SEARCH_ERROR', 'WARN', "$@");
        $self->{rows} = 0; # don't report a row count left over from an earlier query
        return $self->sth({}, 0); # no hits
    }

    # Random default field, it's not used
    my $parser = Lucene::MultiFieldQueryParser->new((keys %weights)[0], $analyzer);
    my $pquery = eval { $parser->parse($query_string, [keys %weights], $analyzer); };
    if ($@) {
        $self->{_debug} and $self->error('QUERY_ERROR', 'WARN', "$@");
        $self->{rows} = 0; # don't report a row count left over from an earlier query
        return $self->sth({}, 0); # no hits
    }
    my $hits = $searcher->search($pquery, $sort);
    my $num_hits = $hits->length;

## Setup a limit only if there is no callback. The callback argument requires a full results list
    my ($offset, $max_hits) = (0, $num_hits);
    unless ($self->{callback} or $do_mysql_sort) {
        $offset   = ( $args->{nh} - 1 ) * $args->{mh};
        $max_hits = $offset + $args->{mh};
    }
    $max_hits = $num_hits if $max_hits > $num_hits;

    # collect primary key => score for the selected window of hits
    my ($pk) = $self->{table}->pk;
    my $results = {};
    for (my $i = $offset; $i < $max_hits; ++$i) {
        my $doc   = $hits->doc($i);
        my $value = $doc->get($pk);
        my $score = $hits->score($i);
        $results->{$value} = $score;
    }

# now handle filters
    my $cols = $self->{'table'}->cols();
    my %filters = map {
        (my $tmp = $_) =~ s/-[lg]t$//;
        $cols->{$tmp} ? ($_ => $args->{$_}) : ()
    } keys %{$args};
    if (keys %filters) {
        $self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
        $results = $self->filter(\%filters, $results);
    }
    elsif ($self->{filter}) {
        $self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
        $results = $self->_filter_query( $self->{filter}, $results );
    }
    else {
        $self->debug( "No filters being used.") if ($self->{_debug});
    }

# now this query should probably clear the filters once it's been used, so I'll do that here
    $self->{filter} = undef;

# now run through a callback function if needed.
    if ($self->{callback}) {
        unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
            $self->{_debug} and $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
            $self->{rows} = 0; # don't report a row count left over from an earlier query
            return $self->sth({}, 0); # no hits
        }
        $self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
        $results = $self->{callback}->($self, $results);
        $self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
    }

    $self->{rows} = $num_hits;

    return $self->sth($results, $do_mysql_sort);
}

sub sth {
#--------------------------------------------------------------------------------
# Builds the statement handle for a result set.
#
# Parameters:
#   $results : hash ref of primary key => score
#   $db_sort : passed to the handle as 'db_sort'
#
# The handle's 'hits' value is taken from $self->{rows}, so callers set that
# before calling.
#
    my ($self, $results, $db_sort) = @_;

    my $sth = GT::SQL::Search::LUCENE::STH->new(
        'results' => $results,
        'hits'    => $self->{rows},
        'db'      => $self->{table}->{driver},
        'db_sort' => $db_sort,
# pass the following attributes down to the STH handler
        map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
    );

    return $sth;
}

1;