First pass at adding key files
This commit is contained in:
@ -0,0 +1,239 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::LUCENE::Indexer
|
||||
# Author: Scott Beck
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Indexer.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
package GT::SQL::Search::LUCENE::Indexer;
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
|
||||
use Lucene;
|
||||
use GT::SQL::Search::Base::Indexer;
|
||||
use GT::TempFile;
|
||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
|
||||
$ERRORS = {
|
||||
INDEX_CORRUPT => 'Could not create an Indexer, this probably means your index is corrupted and you should rebuild it. The error was: %s',
|
||||
DELETE_FAILED => 'Could not delete some records: %s'
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
|
||||
sub load {
|
||||
my $class = shift;
|
||||
return $class->new(@_)
|
||||
}
|
||||
|
||||
sub _get_path {
|
||||
my $self = shift;
|
||||
my $name = $self->{table}->name;
|
||||
my $tmpdir = GT::TempFile::find_tmpdir();
|
||||
my $path = $tmpdir . '/' . $name;
|
||||
$path = $1 if $path =~ /(.*)/; # XXX untaint
|
||||
return $path;
|
||||
}
|
||||
|
||||
sub _get_store {
|
||||
my ($self, $create) = @_;
|
||||
my $path = $self->_get_path;
|
||||
return Lucene::Store::FSDirectory->getDirectory($path, $create);
|
||||
}
|
||||
|
||||
sub _get_indexer {
|
||||
my ($self, $create) = @_;
|
||||
my %weights = $self->{table}->_weight_cols() or return $self->error(NOWEIGHTS => 'WARN');
|
||||
|
||||
my ($pk) = $self->{table}->pk;
|
||||
if (!$pk) {
|
||||
return $self->error('NOPRIMARYKEY','WARN');
|
||||
}
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
|
||||
my $store = $self->_get_store($create);
|
||||
|
||||
my $iw;
|
||||
eval { $iw = new Lucene::Index::IndexWriter($store, $analyzer, $create); };
|
||||
if ($@) {
|
||||
return $self->error('INDEX_CORRUPT', 'WARN', "$@");
|
||||
}
|
||||
return $iw;
|
||||
}
|
||||
|
||||
sub drop_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $path = $self->_get_path;
|
||||
require File::Tools;
|
||||
File::Tools::deldir($path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub add_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
$self->_get_indexer(1) or return;
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub post_create_table {
|
||||
# ------------------------------------------------------------------------------
|
||||
# creates the index tables..
|
||||
#
|
||||
return $_[0]->add_search_driver(@_);
|
||||
}
|
||||
|
||||
sub post_drop_table {
|
||||
# -------------------------------------------------------
|
||||
# Remove the index tables.
|
||||
#
|
||||
return $_[0]->drop_search_driver(@_);
|
||||
}
|
||||
|
||||
|
||||
sub post_add_record {
|
||||
# -------------------------------------------------------
|
||||
# indexes a single record
|
||||
my ($self, $rec, $insert_sth, $no_optimize) = @_;
|
||||
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my %weights = $tbl->_weight_cols() or return;
|
||||
|
||||
my $indexer = $self->_get_indexer(0) or return $self->{_debug} ? () : 1;
|
||||
my $doc = new Lucene::Document;
|
||||
my ($pk) = $self->{table}->pk;
|
||||
delete $weights{$pk};
|
||||
for my $column_name (keys %weights) {
|
||||
my $field = Lucene::Document::Field->UnStored($column_name, $rec->{$column_name});
|
||||
$field->setBoost($weights{$column_name});
|
||||
$doc->add($field);
|
||||
}
|
||||
$doc->add(Lucene::Document::Field->Keyword($pk, ($tbl->ai && $insert_sth ? $insert_sth->insert_id : $rec->{$pk})));
|
||||
$indexer->addDocument($doc);
|
||||
$indexer->optimize if !$no_optimize;
|
||||
$indexer->close;
|
||||
undef $indexer;
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub reindex_all {
|
||||
# -------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $table = shift;
|
||||
my $opts = shift;
|
||||
my $tick = $opts->{tick} || 0;
|
||||
my $max = $opts->{max} || 5000;
|
||||
|
||||
my $indexer = $self->_get_indexer(1) or return $self->{_debug} ? () : 1; # clobbers the old one
|
||||
$indexer->close;
|
||||
undef $indexer;
|
||||
|
||||
my %weights = $self->{table}->_weight_cols() or return;
|
||||
my @weight_list = keys %weights;
|
||||
my ($pk) = $self->{table}->pk();
|
||||
|
||||
# Go through the table and index each field.
|
||||
my $iterations = 1;
|
||||
my $count = 0;
|
||||
|
||||
while (1) {
|
||||
if ($max) {
|
||||
my $offset = ($iterations-1) * $max;
|
||||
$table->select_options("LIMIT $offset,$max");
|
||||
}
|
||||
my $cond = $opts->{cond} || {};
|
||||
my $sth = $table->select($cond, [$pk, @weight_list]);
|
||||
my $done = 1;
|
||||
|
||||
while (my $rec = $sth->fetchrow_hashref() ) {
|
||||
$self->post_add_record($rec, undef, 1);
|
||||
$done = 0;
|
||||
if ($tick) {
|
||||
$count++;
|
||||
$count % $tick or (print "$count ");
|
||||
$count % ($tick*10) or (print "\n");
|
||||
}
|
||||
}
|
||||
last if $done;
|
||||
$iterations++;
|
||||
last if !$max;
|
||||
}
|
||||
$indexer = $self->_get_indexer(0) or return;
|
||||
$indexer->optimize;
|
||||
$indexer->close;
|
||||
undef $indexer;
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub pre_delete_record {
|
||||
# -------------------------------------------------------
|
||||
# Delete a records index values.
|
||||
#
|
||||
my ($self, $where) = @_;
|
||||
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my ($pk) = $tbl->pk();
|
||||
my $q = $tbl->select($where, [$pk]);
|
||||
|
||||
my $reader = eval { Lucene::Index::IndexReader->open($self->_get_store(0)); };
|
||||
if ($@) {
|
||||
return $self->{_debug} ? $self->error('INDEX_CORRUPT', 'WARN', "$@") : 1;
|
||||
}
|
||||
|
||||
my @errors;
|
||||
while (my ($item_id) = $q->fetchrow) {
|
||||
my $t = new Lucene::Index::Term($pk => $item_id);
|
||||
eval { $reader->deleteDocuments($t); };
|
||||
if ($@) {
|
||||
push @errors, "$@";
|
||||
}
|
||||
}
|
||||
$reader->close;
|
||||
undef $reader;
|
||||
if (@errors) {
|
||||
return $self->{_debug} ? $self->error('DELETE_FAILED', 'WARN', join(", ", @errors)) : 1;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub post_update_record {
|
||||
# -------------------------------------------------------
|
||||
my ( $self, $set_cond, $where_cond, $tmp ) = @_;
|
||||
|
||||
# delete the previous record
|
||||
eval {
|
||||
$self->pre_delete_record($where_cond) or return $self->{_debug} ? () : 1;
|
||||
};
|
||||
#
|
||||
# the new record
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my ($pk) = $tbl->pk();
|
||||
my %weights = $self->{table}->_weight_cols();
|
||||
my @weight_list = keys %weights;
|
||||
my $q = $tbl->select($where_cond, [$pk, @weight_list]);
|
||||
while (my $href = $q->fetchrow_hashref) {
|
||||
$self->post_add_record($href);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
sub reindex_record {
|
||||
# -------------------------------------------------------
|
||||
# reindexes a record. basically deletes all associated records from current db abnd does an index.
|
||||
# it's safe to use this
|
||||
my ($self, $rec) = @_;
|
||||
|
||||
$self->delete_record($rec);
|
||||
$self->index_record($rec);
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,206 @@
|
||||
NAME
|
||||
Lucene -- API to the C++ port of the Lucene search engine
|
||||
|
||||
SYNOPSIS
|
||||
Initialize/Empty Lucene index
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 1);
|
||||
|
||||
my $tmp_writer = new Lucene::Index::IndexWriter($store, $analyzer, 1);
|
||||
$tmp_writer->close;
|
||||
undef $tmp_writer;
|
||||
|
||||
Choose your Analyzer (string tokenizer)
|
||||
# lowercases text and splits it at non-letter characters
|
||||
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
|
||||
# same as before and removes stop words
|
||||
my $analyzer = Lucene::Analysis::StopAnalyzer();
|
||||
# splits text at whitespace characters
|
||||
my $analyzer = Lucene::Analysis::WhitespaceAnalyzer();
|
||||
# lowercases text, tokenized it based on a grammer that
|
||||
# leaves named authorities intact (e-mails, company names,
|
||||
# web hostnames, IP addresses, etc) and removed stop words
|
||||
my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer();
|
||||
|
||||
Choose your Store (storage engine)
|
||||
# in-memory storage
|
||||
my $store = new Lucene::Store::RAMDirectory();
|
||||
# disk-based storage
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||||
|
||||
Open and configure an IndexWriter
|
||||
my $writer = new Lucene::Index::IndexWriter($store, $analyzer, 0);
|
||||
# optional settings for power users
|
||||
$writer->setMergeFactor(100);
|
||||
$writer->setUseCompoundFile(0);
|
||||
$writer->setMaxFieldLength(255);
|
||||
$writer->setMinMergeDocs(10);
|
||||
$writer->setMaxMergeDocs(100);
|
||||
|
||||
Create Documents and add Fields
|
||||
my $doc = new Lucene::Document;
|
||||
# field gets analyzed, indexed and stored
|
||||
$doc->add(Lucene::Document::Field->Text("content", $content));
|
||||
# field gets indexed and stored
|
||||
$doc->add(Lucene::Document::Field->Keyword("isbn", $isbn));
|
||||
# field gets just stored
|
||||
$doc->add(Lucene::Document::Field->UnIndexed("sales_rank", $sales_rank));
|
||||
# field gets analyzed and indexed
|
||||
$doc->add(Lucene::Document::Field->UnStored("categories", $categories));
|
||||
|
||||
Add Documents to an IndexWriter
|
||||
$writer->addDocument($doc);
|
||||
|
||||
Optimize your index and close the IndexWriter
|
||||
$writer->optimize();
|
||||
$writer->close();
|
||||
undef $writer;
|
||||
|
||||
Delete Documents
|
||||
my $reader = Lucene::Index::IndexReader->open($store);
|
||||
my $term = new Lucene::Index::Term("isbn", $isbn);
|
||||
$reader->deleteDocuments($term);
|
||||
$reader->close();
|
||||
undef $reader;
|
||||
|
||||
Query index
|
||||
# initalize searcher and parser
|
||||
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||||
my $searcher = new Lucene::Search::IndexSearcher($store);
|
||||
my $parser = new Lucene::QueryParser("default_field", $analyzer);
|
||||
|
||||
# build a query on the default field
|
||||
my $query = $parser->parse("perl");
|
||||
|
||||
# build a query on another field
|
||||
my $query = $parser->parse("title:cookbook");
|
||||
|
||||
# define a sort on one field
|
||||
my $sortfield = new Lucene::Search::SortField("unixtime");
|
||||
my $reversed_sortfield = new Lucene::Search::SortField("unixtime", 1);
|
||||
my $sort = new Lucene::Search::Sort($sortfield);
|
||||
|
||||
# define a sort on two fields
|
||||
my $sort = new Lucene::Search::Sort($sortfield1, $sortfield2);
|
||||
|
||||
# use Lucene's INDEXORDER or RELEVANCE sort
|
||||
my $sort = Lucene::Search::Sort->INDEXORDER;
|
||||
my $sort = Lucene::Search::Sort->RELEVANCE;
|
||||
|
||||
# query index and get results
|
||||
my $hits = $searcher->search($query);
|
||||
my $sorted_hits = $searcher->search($query, $sort);
|
||||
|
||||
# get number of results
|
||||
my $num_hits = $hits->length();
|
||||
|
||||
# get fields and ranking score for each hit
|
||||
for (my $i = 0; $i < $num_hits; $i++) {
|
||||
my $doc = $hits->doc($i);
|
||||
my $score = $hits->score($i);
|
||||
my $title = $doc->get("title");
|
||||
my $isbn = $doc->get("isbn");
|
||||
}
|
||||
|
||||
# free memory and close searcher
|
||||
undef $hits;
|
||||
undef $query;
|
||||
undef $parser;
|
||||
undef $analyzer;
|
||||
$searcher->close();
|
||||
undef $fsdir;
|
||||
undef $searcher;
|
||||
}
|
||||
|
||||
Close your Store
|
||||
$store->close;
|
||||
undef $store;
|
||||
|
||||
DESCRIPTION
|
||||
Like it or not Apache Lucene has become the de-facto standard for
|
||||
open-source high-performance search. It has a large user-base, is well
|
||||
documented and has plenty of committers. Unfortunately Apache Lucene is
|
||||
entirely written in Java and therefore of relatively little use for perl
|
||||
programmers. Fortunately in the recent years a group of C++ programmers
|
||||
led by Ben van Klinken decided to port Java Lucene to C++.
|
||||
|
||||
The purpose of the module is to export the C++ Lucene API to perl and at
|
||||
the same time be as close as possible to the original Java API. This has
|
||||
the combined advantage of providing perl programmers with a
|
||||
well-documented API and giving them access to a C++ search engine
|
||||
library that is supposedly faster than the original.
|
||||
|
||||
CHARACTER SUPPORT
|
||||
Currently only ISO 8859-1 (Latin-1) characters are supported. Obviously
|
||||
this included all ASCII characters.
|
||||
|
||||
INDEX COMPATIBLITY
|
||||
For the moment indices produced by this module are not compatible with
|
||||
those from Apache Lucene. The reason for this is that this module uses
|
||||
1-byte character encoding as opposed to 2-byte (widechar) encoding with
|
||||
Apache Lucene.
|
||||
|
||||
INSTALLATION
|
||||
This module requires the clucene library to be installed. The best way
|
||||
to get it is to go to the following page
|
||||
|
||||
http://sourceforge.net/projects/clucene/
|
||||
|
||||
and download the latest STABLE clucene-core version. Currently it is
|
||||
clucene-core-0.9.15. Make sure you compile it in ASCII mode and install
|
||||
it in your standard library path.
|
||||
|
||||
On a Linux platform this goes as follows:
|
||||
|
||||
wget http://kent.dl.sourceforge.net/sourceforge/clucene/clucene-core-0.9.15.tar.gz
|
||||
cd clucene-core-0.9.15
|
||||
./autogen.sh
|
||||
./configure --disable-debug --prefix=/usr --exec-prefix=/usr --enable-ascii
|
||||
make
|
||||
make check
|
||||
(as root) make install
|
||||
|
||||
To install the perl module itself, run the following commands:
|
||||
|
||||
perl Makefile.PL
|
||||
make
|
||||
make test
|
||||
(as root) make install
|
||||
|
||||
AUTHOR
|
||||
Thomas Busch <tbusch at cpan dot org>
|
||||
|
||||
COPYRIGHT AND LICENSE
|
||||
Copyright (c) 2006 Thomas Busch
|
||||
|
||||
This library is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
SEE ALSO
|
||||
Plucene - a pure-Perl implementation of Lucene
|
||||
|
||||
KinoSearch - a search engine library inspired by Lucene
|
||||
|
||||
DISCLAIMER OF WARRANTY
|
||||
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
|
||||
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
|
||||
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
|
||||
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
|
||||
NECESSARY SERVICING, REPAIR, OR CORRECTION.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
|
||||
TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
115
site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/LUCENE/STH.pm
Normal file
115
site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/LUCENE/STH.pm
Normal file
@ -0,0 +1,115 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::STH
|
||||
# Author: Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: STH.pm,v 1.1 2006/12/07 07:04:51 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
package GT::SQL::Search::LUCENE::STH;
|
||||
#--------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE /;
|
||||
require GT::SQL::Search::Base::STH;
|
||||
|
||||
@ISA = ('GT::SQL::Search::STH');
|
||||
$ATTRIBS = {
|
||||
'db_sort' => 1,
|
||||
'hits' => undef
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL::Search::STH';
|
||||
|
||||
|
||||
|
||||
sub init {
|
||||
#--------------------------------------------------------------------------------
|
||||
# GT::SQL::Search::STH expects a full set of results in $self->{results}. For
|
||||
# Lucene the only time a full set of results is there is when we are sorting
|
||||
# on a field that is not weighted, otherwise the results in $self->{results} is
|
||||
# the proper page and number of results.
|
||||
my $self = shift;
|
||||
|
||||
$self->set(@_);
|
||||
|
||||
--$self->{nh} if $self->{nh};
|
||||
|
||||
# Here we allow hits to override our concept of rows. This is only useful
|
||||
# when !$self->{db_sort}
|
||||
$self->{rows} = $self->{hits}
|
||||
? $self->{hits}
|
||||
: $self->{results}
|
||||
? scalar(keys %{$self->{results}})
|
||||
: 0;
|
||||
|
||||
if ($self->{db_sort}) {
|
||||
$self->get_db_sorted_results;
|
||||
}
|
||||
else {
|
||||
$self->get_sorted_results;
|
||||
}
|
||||
}
|
||||
|
||||
sub get_sorted_results {
|
||||
# Just sorts the results out of $self->{results} which should have been setup
|
||||
# by a search driver
|
||||
my ($self) = @_;
|
||||
my $results = $self->{results};
|
||||
$self->{index} = 0;
|
||||
$self->{max_index} = $self->{mh} - 1;
|
||||
$self->{'order'} = [ sort {
|
||||
( $results->{$b} || return 0 ) <=> ( $results->{$a} || 0 )
|
||||
} keys %{$results} ];
|
||||
$self->debug_dumper( "Results will be presented in the following order: ", $self->{'order'} ) if ($self->{_debug});
|
||||
}
|
||||
|
||||
sub get_db_sorted_results {
|
||||
# This assumes $self->{results} has a full result set, i.e. without any LIMIT
|
||||
# It then selects the result set using the SQL driver to do the sorting. This
|
||||
# is for Search modules which can not handle their own sorting
|
||||
my ($self) = @_;
|
||||
|
||||
my $sb = GT::SQL::Search::Base::Search->clean_sb($self->{sb}, $self->{so});
|
||||
$self->{index} = $self->{nh} * $self->{mh} || 0;
|
||||
$self->{max_index} = $self->{index} + $self->{mh} - 1; # index + mh is the first record which should not be returned.
|
||||
if ($self->{max_index} > $self->{rows}) {
|
||||
$self->{max_index} = $self->{rows};
|
||||
$self->{rows} = $self->{rows} - $self->{index};
|
||||
$self->{rows} < 0 ? $self->{rows} = 0 : 0;
|
||||
}
|
||||
|
||||
else {
|
||||
$self->{rows} = $self->{mh};
|
||||
}
|
||||
my ( $table, $pk ) = $self->_table_info();
|
||||
my ( $query, $where, $st, $limit );
|
||||
|
||||
$where = qq!$pk in(! . join( ",", keys %{$self->{results}}) . ')';
|
||||
$self->{mh} and $limit = qq!LIMIT $self->{index}, $self->{rows}!;
|
||||
$query = qq!
|
||||
SELECT $pk
|
||||
FROM $table
|
||||
WHERE $where
|
||||
$sb
|
||||
$limit
|
||||
!;
|
||||
$self->debug( "Row fetch query: $query" ) if ($self->{_debug});
|
||||
my $sth = $self->{table}->{driver}->prepare( $query );
|
||||
$sth->execute();
|
||||
|
||||
$self->{index} = 0;
|
||||
$self->{max_hits} = $self->{rows};
|
||||
|
||||
# Fetch the results in sorted order
|
||||
my $order = $sth->fetchall_arrayref();
|
||||
$sth->finish();
|
||||
|
||||
$self->{'order'} = [ map { $_->[0] } @{$order} ];
|
||||
}
|
||||
|
||||
1;
|
||||
|
@ -0,0 +1,260 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::LUCENE::Search
|
||||
# Author : Scott Beck
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to search indexed tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::LUCENE::Search;
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS $ERRORS $ERROR_MESSAGE /;
|
||||
use Lucene;
|
||||
use GT::TempFile;
|
||||
use GT::SQL::Search::LUCENE::STH;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
|
||||
$ERRORS = {
|
||||
SEARCH_ERROR => "Error searching: %s",
|
||||
QUERY_ERROR => "Query error: %s"
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::LUCENE::Search->new(@_)
|
||||
}
|
||||
|
||||
sub _get_path {
|
||||
my $self = shift;
|
||||
my $name = $self->{table}->name;
|
||||
my $tmpdir = GT::TempFile::find_tmpdir();
|
||||
my $path = $tmpdir . '/' . $name;
|
||||
$path = $1 if $path =~ /(.*)/; # XXX untaint
|
||||
return $path;
|
||||
}
|
||||
|
||||
sub _get_store {
|
||||
my ($self, $create) = @_;
|
||||
my $path = $self->_get_path;
|
||||
return Lucene::Store::FSDirectory->getDirectory($path, $create);
|
||||
}
|
||||
|
||||
sub query {
|
||||
# --------------------------------------------------
|
||||
# Returns a sth based on a query
|
||||
#
|
||||
# Options:
|
||||
# - paging
|
||||
# mh : max hits
|
||||
# nh : number hit (or page of hits)
|
||||
#
|
||||
# - searching
|
||||
# ww : whole word
|
||||
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
||||
# substring : search for substrings of words
|
||||
# bool : 'and' => and search, 'or' => or search, '' => regular query
|
||||
# query : the string of things to ask for
|
||||
#
|
||||
# - filtering
|
||||
# field_name : value # Find all rows with field_name = value
|
||||
# field_name : ">value" # Find all rows with field_name > value.
|
||||
# field_name : "<value" # Find all rows with field_name < value.
|
||||
# field_name-gt : value # Find all rows with field_name > value.
|
||||
# field_name-lt : value # Find all rows with field_name < value.
|
||||
#
|
||||
# Parameters:
|
||||
# ( $CGI ) : a single cgi object
|
||||
# ( $HASH ) : a hash of the parameters
|
||||
#
|
||||
my $self = shift;
|
||||
|
||||
# create an easily accessible argument hash
|
||||
my $args = $self->common_param(@_);
|
||||
my $tbl = $self->{table};
|
||||
|
||||
# see if we can setup the filtering constraints
|
||||
my $filter = { %$args };
|
||||
my $query = delete $args->{query} || $self->{query} || '';
|
||||
my $ftr_cond;
|
||||
|
||||
# parse query
|
||||
$self->debug( "Search Query: $query" ) if ($self->{_debug});
|
||||
|
||||
my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
|
||||
|
||||
$self->{rejected_keywords} = $rejected;
|
||||
|
||||
# setup the additional input parameters
|
||||
$query_struct = $self->_preset_options( $query_struct, $args );
|
||||
|
||||
# now sort into distinct buckets
|
||||
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
|
||||
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
|
||||
|
||||
# with the buckets, it's now possible to create a query string
|
||||
# that can be passed directly into the Lucene search.
|
||||
my $query_string = '';
|
||||
|
||||
foreach my $search_type ( keys %$buckets ) {
|
||||
my $bucket = $buckets->{$search_type};
|
||||
foreach my $token ( keys %$bucket ) {
|
||||
next unless $token;
|
||||
my $properties = $bucket->{$token} or next;
|
||||
$token =~ s/(["()])/\\$1/g;
|
||||
$token =~ s/\b(or|and)\b/ /g;
|
||||
|
||||
my $e = ' ';
|
||||
|
||||
# handle boolean operations
|
||||
$properties->{mode} ||= '';
|
||||
if ( $properties->{mode} eq 'must' ) {
|
||||
$e .= '+';
|
||||
}
|
||||
elsif ( $properties->{mode} eq 'cannot' ) {
|
||||
$e .= '-';
|
||||
}
|
||||
|
||||
# deal with phrase vs keyword
|
||||
if ( $properties->{phrase} ) {
|
||||
$e .= '"' . $token . '"' unless $token =~ /^"|"$/;
|
||||
}
|
||||
else {
|
||||
$e .= $token;
|
||||
|
||||
# substring match
|
||||
if ($properties->{mode} ne 'substring') {
|
||||
$e .= '*' if $properties->{substring};
|
||||
}
|
||||
}
|
||||
|
||||
$query_string .= $e;
|
||||
}
|
||||
}
|
||||
|
||||
# calculate the cursor constraints
|
||||
foreach my $k (qw( nh mh so sb )) {
|
||||
next if defined $args->{$k};
|
||||
$args->{$k} = $self->{$k} || '';
|
||||
}
|
||||
$args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
|
||||
$args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
|
||||
$args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score';
|
||||
|
||||
# Score is the default
|
||||
$args->{so} = (defined $args->{so} and $args->{so} =~ /^(asc|desc)(?:end)?$/i) ? lc($1) : 'asc';
|
||||
|
||||
my %weights = $tbl->_weight_cols();
|
||||
my @sortfields;
|
||||
my $do_mysql_sort = 0;
|
||||
for (ref($args->{sb}) eq 'ARRAY' ? @{$args->{sb}} : $args->{sb}) {
|
||||
if (!exists $weights{$_}) {
|
||||
$do_mysql_sort = 1 if $_ ne 'score';
|
||||
next;
|
||||
}
|
||||
push @sortfields, new Lucene::Search::SortField($_, $args->{so} ne 'asc');
|
||||
}
|
||||
my $sort = @sortfields ? new Lucene::Search::Sort(@sortfields) : Lucene::Search::Sort->RELEVANCE;
|
||||
my $store = $self->_get_store(0);
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
|
||||
my $searcher = eval { new Lucene::Search::IndexSearcher($store); };
|
||||
if ($@) {
|
||||
$self->{_debug} and $self->error('SEARCH_ERROR', 'WARN', "$@");
|
||||
return $self->sth({}, 0); # no hits
|
||||
}
|
||||
# Random default field, it's not used
|
||||
my $parser = new Lucene::MultiFieldQueryParser((keys %weights)[0], $analyzer);
|
||||
my $pquery = eval { $parser->parse($query_string, [keys %weights], $analyzer); };
|
||||
if ($@) {
|
||||
$self->{_debug} and $self->error('QUERY_ERROR', 'WARN', "$@");
|
||||
return $self->sth({}, 0); # no hits
|
||||
}
|
||||
my $hits = $searcher->search($pquery, $sort);
|
||||
my $num_hits = $hits->length;
|
||||
|
||||
## Setup a limit only if there is no callback. The callback argument requires a full results list
|
||||
my ($offset, $max_hits) = (0, $num_hits);
|
||||
unless ($self->{callback} or $do_mysql_sort) {
|
||||
$offset = ( $args->{nh} - 1 ) * $args->{mh};
|
||||
$max_hits = $offset + $args->{mh};
|
||||
}
|
||||
$max_hits = $num_hits if $max_hits > $num_hits;
|
||||
my ($pk) = $self->{table}->pk;
|
||||
my @indexes;
|
||||
my $results = {};
|
||||
for (my $i= $offset; $i < $max_hits; ++$i) {
|
||||
my $doc = $hits->doc($i);
|
||||
my $value = $doc->get($pk);
|
||||
my $score = $hits->score($i);
|
||||
$results->{$value} = $score;
|
||||
}
|
||||
|
||||
# now handle filters
|
||||
my $cols = $self->{'table'}->cols();
|
||||
my %filters = map {
|
||||
(my $tmp = $_) =~ s/-[lg]t$//;
|
||||
$cols->{$tmp} ? ($_ => $args->{$_}) : ()
|
||||
} keys %{$args};
|
||||
|
||||
if (keys %filters) {
|
||||
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
|
||||
$results = $self->filter(\%filters, $results);
|
||||
}
|
||||
elsif ($self->{filter}) {
|
||||
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
|
||||
$results = $self->_filter_query( $self->{filter}, $results );
|
||||
}
|
||||
else {
|
||||
$self->debug( "No filters being used.") if ($self->{_debug});
|
||||
}
|
||||
|
||||
# now this query should probably clear the filters once it's been used, so i'll dothat here
|
||||
$self->{filter} = undef;
|
||||
|
||||
# now run through a callback function if needed.
|
||||
if ($self->{callback}) {
|
||||
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
|
||||
$self->{_debug} and $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
|
||||
return $self->sth({}, 0); # no hits
|
||||
}
|
||||
$self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
$results = $self->{callback}->($self, $results);
|
||||
$self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
}
|
||||
|
||||
$self->{rows} = $num_hits;
|
||||
|
||||
return $self->sth($results, $do_mysql_sort);
|
||||
}
|
||||
|
||||
sub sth {
|
||||
#--------------------------------------------------------------------------------
|
||||
my ($self, $results, $db_sort) = @_;
|
||||
|
||||
my $sth = GT::SQL::Search::LUCENE::STH->new(
|
||||
'results' => $results,
|
||||
'hits' => $self->{rows},
|
||||
'db' => $self->{table}->{driver},
|
||||
'db_sort' => $db_sort,
|
||||
# pass the following attributes down to the STH handler
|
||||
map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
|
||||
);
|
||||
|
||||
return $sth;
|
||||
}
|
||||
|
||||
1;
|
Reference in New Issue
Block a user