discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/LUCENE/Search.pm
2024-06-17 21:49:12 +10:00

261 lines
8.9 KiB
Perl

# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::LUCENE::Search
# Author : Scott Beck
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to search indexed tables.
#
package GT::SQL::Search::LUCENE::Search;
# ------------------------------------------------------------------------------
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS $ERRORS $ERROR_MESSAGE /;
use Lucene;
use GT::TempFile;
use GT::SQL::Search::LUCENE::STH;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );
# ------------------------------------------------------------------------------
# Preamble information related to the object
# Package-level debug setting, initialised to 0. query() itself consults
# $self->{_debug}; NOTE(review): presumably the base class derives that from
# $DEBUG -- confirm.
$DEBUG = 0;
# Built from the CVS "Revision" keyword: the major and minor numbers are
# captured and formatted with "%d.%03d", so revision 1.2 becomes "1.002".
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
# Error codes raised by this package, each mapped to a sprintf-style message
# template; query() reports them via $self->error(CODE, 'WARN', $detail).
$ERRORS = {
SEARCH_ERROR => "Error searching: %s",
QUERY_ERROR => "Query error: %s"
};
# NOTE(review): presumably names the package whose error table is consulted
# alongside $ERRORS -- confirm against the GT base classes.
$ERROR_MESSAGE = 'GT::SQL';
sub load {
# ------------------------------------------------------------------------------
# Factory entry point. Whatever the method was invoked on is discarded and a
# GT::SQL::Search::LUCENE::Search object is always constructed; the remaining
# arguments are handed to new() untouched.
#
    my $invocant = shift;    # deliberately unused

    return GT::SQL::Search::LUCENE::Search->new(@_);
}
sub _get_path {
# ------------------------------------------------------------------------------
# Returns the location of this table's index on disk, built as
# "<temp dir>/<table name>".
#
    my ($self) = @_;

    my $table_name = $self->{table}->name;
    my $base_dir   = GT::TempFile::find_tmpdir();
    my $index_dir  = "$base_dir/$table_name";

    # XXX untaint: the value is passed through a capture so that it may be
    # used under taint mode. No validation is performed here.
    if ($index_dir =~ /(.*)/) {
        $index_dir = $1;
    }

    return $index_dir;
}
sub _get_store {
# ------------------------------------------------------------------------------
# Returns the Lucene::Store::FSDirectory for this table's index.
#
#   $create : forwarded unchanged as the second argument of getDirectory();
#             query() passes 0 here.
#
    my $self   = shift;
    my $create = shift;

    my $index_dir = $self->_get_path;

    return Lucene::Store::FSDirectory->getDirectory($index_dir, $create);
}
sub query {
# --------------------------------------------------
# Returns a sth based on a query
#
# Options:
#   - paging
#       mh          : max hits (per page, defaults to 25)
#       nh          : number hit (or page of hits, 1-based, defaults to 1)
#
#   - sorting
#       sb          : field to sort by (defaults to 'score')
#       so          : 'asc' or 'desc' (defaults to 'asc')
#
#   - searching
#       ww          : whole word
#       ma          : 1 => OR match, 0 => AND match, undefined => QUERY
#       substring   : search for substrings of words
#       bool        : 'and' => and search, 'or' => or search, '' => regular query
#       query       : the string of things to ask for
#
#   - filtering
#       field_name    : value     # Find all rows with field_name = value
#       field_name    : ">value"  # Find all rows with field_name > value.
#       field_name    : "<value"  # Find all rows with field_name < value.
#       field_name-gt : value     # Find all rows with field_name > value.
#       field_name-lt : value     # Find all rows with field_name < value.
#
# Parameters:
#   ( $CGI )  : a single cgi object
#   ( $HASH ) : a hash of the parameters
#
# Returns:
#   The handle produced by $self->sth(). When the index cannot be opened, the
#   query cannot be parsed, or the callback is not a code ref, an empty handle
#   is returned (and the error is only reported when _debug is set).
#
# Side effects:
#   Sets $self->{rejected_keywords} and $self->{rows}; clears $self->{filter}
#   once the search has run.
#
    my $self = shift;

    # create an easily accessible argument hash
    my $args = $self->common_param(@_);
    my $tbl  = $self->{table};

    my $query = delete $args->{query} || $self->{query} || '';

    # parse query
    $self->debug( "Search Query: $query" ) if ($self->{_debug});
    my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
    $self->{rejected_keywords} = $rejected;

    # setup the additional input parameters
    $query_struct = $self->_preset_options( $query_struct, $args );

    # now sort into distinct buckets
    my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
    $self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});

    # with the buckets, it's now possible to create a query string
    # that can be passed directly into the Lucene search.
    my $query_string = '';
    foreach my $search_type ( keys %$buckets ) {
        my $bucket = $buckets->{$search_type};
        foreach my $token ( keys %$bucket ) {
            next unless $token;
            my $properties = $bucket->{$token} or next;

            # escape quotes and parentheses, and drop bare boolean words so
            # they are not handed to the Lucene parser as part of the term
            $token =~ s/(["()])/\\$1/g;
            $token =~ s/\b(or|and)\b/ /g;

            my $e = ' ';

            # handle boolean operations
            $properties->{mode} ||= '';
            if ( $properties->{mode} eq 'must' ) {
                $e .= '+';
            }
            elsif ( $properties->{mode} eq 'cannot' ) {
                $e .= '-';
            }

            # deal with phrase vs keyword
            if ( $properties->{phrase} ) {
                $e .= '"' . $token . '"' unless $token =~ /^"|"$/;
            }
            else {
                $e .= $token;

                # substring match
                if ($properties->{mode} ne 'substring') {
                    $e .= '*' if $properties->{substring};
                }
            }

            $query_string .= $e;
        }
    }

    # calculate the cursor constraints, falling back on the object's own
    # settings for anything the caller did not supply
    foreach my $k (qw( nh mh so sb )) {
        next if defined $args->{$k};
        $args->{$k} = $self->{$k} || '';
    }
    $args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
    $args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;

    # Pages are 1-based. A page of 0 passes the digit check above but would
    # produce a negative offset below, so treat it as the first page.
    $args->{nh} = 1 if $args->{nh} < 1;

    # Score is the default sort field, ascending the default order
    $args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score';
    $args->{so} = (defined $args->{so} and $args->{so} =~ /^(asc|desc)(?:end)?$/i) ? lc($1) : 'asc';

    # Fields known to the index can be sorted by Lucene itself; anything else
    # (other than 'score') is flagged so that the handle sorts via the database.
    # NOTE(review): after the validation above sb is always a plain string, so
    # the ARRAY branch is never taken -- confirm whether multi-field sorting
    # was meant to be supported.
    my %weights = $tbl->_weight_cols();
    my @sortfields;
    my $do_mysql_sort = 0;
    for my $field (ref($args->{sb}) eq 'ARRAY' ? @{$args->{sb}} : $args->{sb}) {
        if (!exists $weights{$field}) {
            $do_mysql_sort = 1 if $field ne 'score';
            next;
        }
        push @sortfields, Lucene::Search::SortField->new($field, $args->{so} ne 'asc');
    }
    my $sort = @sortfields ? Lucene::Search::Sort->new(@sortfields) : Lucene::Search::Sort->RELEVANCE;

    my $store    = $self->_get_store(0);
    my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new;
    my $searcher = eval { Lucene::Search::IndexSearcher->new($store) };
    if ($@) {
        $self->{_debug} and $self->error('SEARCH_ERROR', 'WARN', "$@");
        $self->{rows} = 0;          # do not report a previous query's hit count
        return $self->sth({}, 0);   # no hits
    }

    # Random default field, it's not used
    my $parser = Lucene::MultiFieldQueryParser->new((keys %weights)[0], $analyzer);
    my $pquery = eval { $parser->parse($query_string, [keys %weights], $analyzer) };
    my $parse_error = $@;
    if ($parse_error or !defined $pquery) {
        # A parser that hands back nothing is treated like one that died, so
        # an undefined query is never passed on to the searcher.
        $parse_error ||= 'parser returned no query';
        $self->{_debug} and $self->error('QUERY_ERROR', 'WARN', "$parse_error");
        $self->{rows} = 0;          # do not report a previous query's hit count
        return $self->sth({}, 0);   # no hits
    }

    my $hits     = $searcher->search($pquery, $sort);
    my $num_hits = $hits->length;

    ## Setup a limit only if there is no callback. The callback argument requires a full results list
    ## (the same goes for a database-side sort).
    my ($offset, $max_hits) = (0, $num_hits);
    unless ($self->{callback} or $do_mysql_sort) {
        $offset   = ( $args->{nh} - 1 ) * $args->{mh};
        $max_hits = $offset + $args->{mh};
    }
    $max_hits = $num_hits if $max_hits > $num_hits;

    # collect primary key => score for the selected window of hits
    my ($pk) = $self->{table}->pk;
    my $results = {};
    for (my $i = $offset; $i < $max_hits; ++$i) {
        my $doc   = $hits->doc($i);
        my $value = $doc->get($pk);

        # a document without a primary key value cannot be mapped to a row
        next unless defined $value;

        $results->{$value} = $hits->score($i);
    }

    # now handle filters: any argument named after a table column (optionally
    # suffixed with -lt / -gt) is treated as a filter on that column
    my $cols = $self->{'table'}->cols();
    my %filters = map {
        (my $tmp = $_) =~ s/-[lg]t$//;
        $cols->{$tmp} ? ($_ => $args->{$_}) : ()
    } keys %{$args};

    if (keys %filters) {
        $self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
        $results = $self->filter(\%filters, $results);
    }
    elsif ($self->{filter}) {
        $self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
        $results = $self->_filter_query( $self->{filter}, $results );
    }
    else {
        $self->debug( "No filters being used.") if ($self->{_debug});
    }

    # now this query should probably clear the filters once it's been used, so i'll do that here
    $self->{filter} = undef;

    # now run through a callback function if needed.
    if ($self->{callback}) {
        unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
            $self->{_debug} and $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
            $self->{rows} = 0;          # do not report a previous query's hit count
            return $self->sth({}, 0);   # no hits
        }
        $self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
        $results = $self->{callback}->($self, $results);
        $self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
    }

    # rows is the total number of Lucene hits, not the size of the page
    $self->{rows} = $num_hits;
    return $self->sth($results, $do_mysql_sort);
}
sub sth {
#--------------------------------------------------------------------------------
# Wraps a result set in a GT::SQL::Search::LUCENE::STH handle.
#
#   $results : passed through as 'results' (query() supplies a hash ref of
#              primary key => score)
#   $db_sort : passed through as 'db_sort'
#
# The hit count is taken from $self->{rows} and the database driver from the
# table object.
#
    my $self    = shift;
    my $results = shift;
    my $db_sort = shift;

    my $hit_count = $self->{rows};
    my $driver    = $self->{table}->{driver};

    # pass the following attributes down to the STH handler
    my @inherited;
    foreach my $attr (qw/ table sb so score_col score_sort nh mh rows _debug /) {
        push @inherited, $attr, $self->{$attr};
    }

    my $handle = GT::SQL::Search::LUCENE::STH->new(
        'results' => $results,
        'hits'    => $hit_count,
        'db'      => $driver,
        'db_sort' => $db_sort,
        @inherited
    );

    return $handle;
}
1;