261 lines
8.9 KiB
Perl
261 lines
8.9 KiB
Perl
|
# ==================================================================
|
||
|
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||
|
#
|
||
|
#   GT::SQL::Search::LUCENE::Search
|
||
|
# Author : Scott Beck
|
||
|
# CVS Info : 087,071,086,086,085
|
||
|
# $Id: Search.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
|
||
|
#
|
||
|
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||
|
# ==================================================================
|
||
|
#
|
||
|
# Description:
|
||
|
# Class used to search indexed tables.
|
||
|
#
|
||
|
|
||
|
package GT::SQL::Search::LUCENE::Search;
|
||
|
# ------------------------------------------------------------------------------
|
||
|
use strict;
|
||
|
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS $ERRORS $ERROR_MESSAGE /;
|
||
|
use Lucene;
|
||
|
use GT::TempFile;
|
||
|
use GT::SQL::Search::LUCENE::STH;
|
||
|
use GT::SQL::Search::Base::Search;
|
||
|
@ISA = qw( GT::SQL::Search::Base::Search );
|
||
|
|
||
|
# ------------------------------------------------------------------------------
|
||
|
# Preamble information related to the object
|
||
|
|
||
|
# Package-level defaults: debugging starts switched off, and the version
# number is derived from the CVS revision keyword.
$DEBUG   = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;

# Error reporting setup: the value stored in $ERROR_MESSAGE, and the
# printf-style message templates for the error codes raised by this module.
$ERROR_MESSAGE = 'GT::SQL';
$ERRORS        = {
    SEARCH_ERROR => "Error searching: %s",
    QUERY_ERROR  => "Query error: %s"
};
|
||
|
|
||
|
sub load {
# --------------------------------------------------
# Factory entry point. The invocant (first argument) is discarded and every
# remaining argument is handed to the constructor of this search class.
#
    my ($invocant, @ctor_args) = @_;
    return GT::SQL::Search::LUCENE::Search->new(@ctor_args);
}
|
||
|
|
||
|
sub _get_path {
# --------------------------------------------------
# Returns the filesystem path used for this table's index:
# the directory reported by GT::TempFile::find_tmpdir(), a slash, then the
# table name.
#
    my ($self) = @_;

    my $table_name = $self->{table}->name;
    my $base_dir   = GT::TempFile::find_tmpdir();
    my $tainted    = $base_dir . '/' . $table_name;

    # XXX untaint: pass the value through a capturing match. The pattern
    # always matches, but '.' stops at the first newline.
    my ($clean) = $tainted =~ /(.*)/;
    return $clean;
}
|
||
|
|
||
|
sub _get_store {
# --------------------------------------------------
# Returns the Lucene FSDirectory for this table's index path.
#
# Parameters:
#   $create : forwarded unchanged as the second argument of getDirectory()
#
    my $self   = shift;
    my $create = shift;

    return Lucene::Store::FSDirectory->getDirectory(scalar($self->_get_path), $create);
}
|
||
|
|
||
|
sub query {
# --------------------------------------------------
# Returns a sth based on a query
#
# Options:
#   - paging
#       mh          : max hits
#       nh          : number hit (or page of hits); values below 1 are
#                     treated as page 1
#
#   - searching
#       ww          : whole word
#       ma          : 1 => OR match, 0 => AND match, undefined => QUERY
#       substring   : search for substrings of words
#       bool        : 'and' => and search, 'or' => or search, '' => regular query
#       query       : the string of things to ask for
#
#   - filtering
#       field_name  : value    # Find all rows with field_name = value
#       field_name  : ">value" # Find all rows with field_name > value.
#       field_name  : "<value" # Find all rows with field_name < value.
#       field_name-gt : value  # Find all rows with field_name > value.
#       field_name-lt : value  # Find all rows with field_name < value.
#
# Parameters:
#   ( $CGI )  : a single cgi object
#   ( $HASH ) : a hash of the parameters
#
# Returns:
#   The object built by $self->sth(). When the index cannot be opened, the
#   query cannot be parsed, or the callback is not a code ref, an sth with
#   an empty result set (and a hit count of 0) is returned instead.
#
    my $self = shift;

    # create an easily accessible argument hash
    my $args = $self->common_param(@_);
    my $tbl  = $self->{table};

    my $query = delete($args->{query}) || $self->{query} || '';

    # parse query
    $self->debug( "Search Query: $query" ) if ($self->{_debug});

    my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );

    $self->{rejected_keywords} = $rejected;

    # setup the additional input parameters
    $query_struct = $self->_preset_options( $query_struct, $args );

    # now sort into distinct buckets
    my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
    $self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});

    # with the buckets, it's now possible to create a query string
    # that can be passed directly into the Lucene search.
    my $query_string = _lucene_query_string( $buckets );

    # calculate the cursor constraints, falling back on the object's settings
    foreach my $k (qw( nh mh so sb )) {
        next if defined $args->{$k};
        $args->{$k} = $self->{$k} || '';
    }
    $args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;

    # "0" passes the digit check above but would produce a negative offset
    # into the hit list, so treat it as the first page.
    $args->{nh} = 1 if $args->{nh} < 1;

    $args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
    $args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score';

    # Score is the default
    $args->{so} = (defined $args->{so} and $args->{so} =~ /^(asc|desc)(?:end)?$/i) ? lc($1) : 'asc';

    # Weighted columns can be sorted by Lucene itself. Any other sort column
    # (except 'score') is flagged so that sth() is told to sort in the database.
    my %weights = $tbl->_weight_cols();
    my @sortfields;
    my $do_mysql_sort = 0;
    for my $sort_col (ref($args->{sb}) eq 'ARRAY' ? @{$args->{sb}} : $args->{sb}) {
        if (!exists $weights{$sort_col}) {
            $do_mysql_sort = 1 if $sort_col ne 'score';
            next;
        }
        push @sortfields, Lucene::Search::SortField->new($sort_col, $args->{so} ne 'asc');
    }
    my $sort = @sortfields ? Lucene::Search::Sort->new(@sortfields) : Lucene::Search::Sort->RELEVANCE;

    my $store    = $self->_get_store(0);
    my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new;
    my $searcher = eval { Lucene::Search::IndexSearcher->new($store); };
    if ($@) {
        $self->{_debug} and $self->error('SEARCH_ERROR', 'WARN', "$@");
        $self->{rows} = 0; # don't report the hit count of an earlier query
        return $self->sth({}, 0); # no hits
    }

    # Random default field, it's not used
    my $parser = Lucene::MultiFieldQueryParser->new((keys %weights)[0], $analyzer);
    my $pquery = eval { $parser->parse($query_string, [keys %weights], $analyzer); };
    if ($@) {
        $self->{_debug} and $self->error('QUERY_ERROR', 'WARN', "$@");
        $self->{rows} = 0; # don't report the hit count of an earlier query
        return $self->sth({}, 0); # no hits
    }

    my $hits     = $searcher->search($pquery, $sort);
    my $num_hits = $hits->length;

    ## Setup a limit only if there is no callback. The callback argument requires a full results list
    my ($offset, $max_hits) = (0, $num_hits);
    unless ($self->{callback} or $do_mysql_sort) {
        $offset   = ( $args->{nh} - 1 ) * $args->{mh};
        $max_hits = $offset + $args->{mh};
    }
    $max_hits = $num_hits if $max_hits > $num_hits;

    # collect primary key => score for the selected window of hits
    my ($pk) = $self->{table}->pk;
    my $results = {};
    for (my $i = $offset; $i < $max_hits; ++$i) {
        my $doc   = $hits->doc($i);
        my $value = $doc->get($pk);
        my $score = $hits->score($i);
        $results->{$value} = $score;
    }

    # now handle filters: any argument named after a table column (optionally
    # suffixed with -lt / -gt) is treated as a filter
    my $cols = $self->{'table'}->cols();
    my %filters = map {
        (my $tmp = $_) =~ s/-[lg]t$//;
        $cols->{$tmp} ? ($_ => $args->{$_}) : ()
    } keys %{$args};

    if (keys %filters) {
        $self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
        $results = $self->filter(\%filters, $results);
    }
    elsif ($self->{filter}) {
        $self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
        $results = $self->_filter_query( $self->{filter}, $results );
    }
    else {
        $self->debug( "No filters being used.") if ($self->{_debug});
    }

    # now this query should probably clear the filters once it's been used, so i'll do that here
    $self->{filter} = undef;

    # now run through a callback function if needed.
    if ($self->{callback}) {
        unless (ref $self->{callback} eq 'CODE') {
            $self->{_debug} and $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
            $self->{rows} = 0; # don't report the hit count of an earlier query
            return $self->sth({}, 0); # no hits
        }
        $self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
        $results = $self->{callback}->($self, $results);
        $self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
    }

    # rows holds the number of hits Lucene reported, not the size of the
    # (possibly paged, filtered or callback-processed) result hash
    $self->{rows} = $num_hits;

    return $self->sth($results, $do_mysql_sort);
}

sub _lucene_query_string {
# --------------------------------------------------
# Private helper for query(): flattens the bucket structure (search type =>
# { token => properties }) into one query string for the Lucene parser.
# Called as a plain function, not as a method.
#
    my $buckets = shift;

    my $query_string = '';

    foreach my $search_type ( keys %$buckets ) {
        my $bucket = $buckets->{$search_type};
        foreach my $token ( keys %$bucket ) {
            next unless $token;
            my $properties = $bucket->{$token} or next;

            # backslash-escape quotes and parentheses, and blank out the
            # bare lowercase words "or" / "and"
            $token =~ s/(["()])/\\$1/g;
            $token =~ s/\b(or|and)\b/ /g;

            my $e = ' ';

            # handle boolean operations
            $properties->{mode} ||= '';
            if ( $properties->{mode} eq 'must' ) {
                $e .= '+';
            }
            elsif ( $properties->{mode} eq 'cannot' ) {
                $e .= '-';
            }

            # deal with phrase vs keyword
            if ( $properties->{phrase} ) {
                # the phrase is only appended (wrapped in quotes) when the
                # escaped token does not itself begin or end with a quote
                $e .= '"' . $token . '"' unless $token =~ /^"|"$/;
            }
            else {
                $e .= $token;

                # substring match
                if ($properties->{mode} ne 'substring') {
                    $e .= '*' if $properties->{substring};
                }
            }

            $query_string .= $e;
        }
    }

    return $query_string;
}
|
||
|
|
||
|
sub sth {
#--------------------------------------------------------------------------------
# Wraps a result set in a GT::SQL::Search::LUCENE::STH object.
#
# Parameters:
#   $results : passed to the STH constructor as 'results'
#   $db_sort : passed to the STH constructor as 'db_sort'
#
# The constructor also receives 'hits' (the object's rows attribute), 'db'
# (the table's driver entry) and a fixed list of search attributes.
#
    my $self = shift;
    my ($results, $db_sort) = @_;

    # attributes handed down unchanged to the STH handler, in this order
    my @inherited;
    foreach my $attr (qw/ table sb so score_col score_sort nh mh rows _debug /) {
        push @inherited, $attr => $self->{$attr};
    }

    return GT::SQL::Search::LUCENE::STH->new(
        'results' => $results,
        'hits'    => $self->{rows},
        'db'      => $self->{table}->{driver},
        'db_sort' => $db_sort,
        @inherited
    );
}
|
||
|
|
||
|
1;
|