First pass at adding key files
This commit is contained in:
		@@ -0,0 +1,239 @@
 | 
			
		||||
# ====================================================================
 | 
			
		||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
 | 
			
		||||
#
 | 
			
		||||
#   GT::SQL::Search::LUCENE::Indexer
 | 
			
		||||
#   Author: Scott Beck
 | 
			
		||||
#   CVS Info : 087,071,086,086,085      
 | 
			
		||||
#   $Id: Indexer.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
 | 
			
		||||
#
 | 
			
		||||
# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
 | 
			
		||||
# ====================================================================
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
package GT::SQL::Search::LUCENE::Indexer;
 | 
			
		||||
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Preamble information related to the object
 | 
			
		||||
use strict;
 | 
			
		||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
 | 
			
		||||
use Lucene;
 | 
			
		||||
use GT::SQL::Search::Base::Indexer;
 | 
			
		||||
use GT::TempFile;
 | 
			
		||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
 | 
			
		||||
$DEBUG   = 0;
 | 
			
		||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
 | 
			
		||||
$ERRORS = {
 | 
			
		||||
    INDEX_CORRUPT => 'Could not create an Indexer, this probably means your index is corrupted and you should rebuild it. The error was: %s',
 | 
			
		||||
    DELETE_FAILED => 'Could not delete some records: %s'
 | 
			
		||||
};
 | 
			
		||||
$ERROR_MESSAGE = 'GT::SQL';
 | 
			
		||||
 | 
			
		||||
sub load {
 | 
			
		||||
    my $class = shift;
 | 
			
		||||
    return $class->new(@_)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _get_path {
 | 
			
		||||
    my $self = shift;
 | 
			
		||||
    my $name    = $self->{table}->name;
 | 
			
		||||
    my $tmpdir  = GT::TempFile::find_tmpdir();
 | 
			
		||||
    my $path = $tmpdir . '/' . $name;
 | 
			
		||||
    $path = $1 if $path =~ /(.*)/; # XXX untaint
 | 
			
		||||
    return $path;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _get_store {
 | 
			
		||||
    my ($self, $create) = @_;
 | 
			
		||||
    my $path = $self->_get_path;
 | 
			
		||||
    return Lucene::Store::FSDirectory->getDirectory($path, $create);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _get_indexer {
 | 
			
		||||
    my ($self, $create) = @_;
 | 
			
		||||
    my %weights = $self->{table}->_weight_cols() or return $self->error(NOWEIGHTS => 'WARN');
 | 
			
		||||
 | 
			
		||||
    my ($pk) = $self->{table}->pk;
 | 
			
		||||
    if (!$pk) {
 | 
			
		||||
        return $self->error('NOPRIMARYKEY','WARN');
 | 
			
		||||
    }
 | 
			
		||||
    my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
 | 
			
		||||
    my $store = $self->_get_store($create);
 | 
			
		||||
 | 
			
		||||
    my $iw;
 | 
			
		||||
    eval { $iw = new Lucene::Index::IndexWriter($store, $analyzer, $create); };
 | 
			
		||||
    if ($@) {
 | 
			
		||||
        return $self->error('INDEX_CORRUPT', 'WARN', "$@");
 | 
			
		||||
    }
 | 
			
		||||
    return $iw;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub drop_search_driver {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $path = $self->_get_path;
 | 
			
		||||
    require File::Tools;
 | 
			
		||||
    File::Tools::deldir($path);
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub add_search_driver {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    $self->_get_indexer(1) or return;
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub post_create_table {
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# creates the index tables..
 | 
			
		||||
#
 | 
			
		||||
    return $_[0]->add_search_driver(@_);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub post_drop_table {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# Remove the index tables.
 | 
			
		||||
#
 | 
			
		||||
    return $_[0]->drop_search_driver(@_);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
sub post_add_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# indexes a single record
 | 
			
		||||
    my ($self, $rec, $insert_sth, $no_optimize) = @_;
 | 
			
		||||
 | 
			
		||||
    my $tbl     = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    my %weights = $tbl->_weight_cols() or return;
 | 
			
		||||
 | 
			
		||||
    my $indexer = $self->_get_indexer(0) or return $self->{_debug} ? () : 1;
 | 
			
		||||
    my $doc = new Lucene::Document;
 | 
			
		||||
    my ($pk) = $self->{table}->pk;
 | 
			
		||||
    delete $weights{$pk};
 | 
			
		||||
    for my $column_name (keys %weights) {
 | 
			
		||||
        my $field = Lucene::Document::Field->UnStored($column_name, $rec->{$column_name});
 | 
			
		||||
        $field->setBoost($weights{$column_name});
 | 
			
		||||
        $doc->add($field);
 | 
			
		||||
    }
 | 
			
		||||
    $doc->add(Lucene::Document::Field->Keyword($pk, ($tbl->ai && $insert_sth ? $insert_sth->insert_id : $rec->{$pk})));
 | 
			
		||||
    $indexer->addDocument($doc);
 | 
			
		||||
    $indexer->optimize if !$no_optimize;
 | 
			
		||||
    $indexer->close;
 | 
			
		||||
    undef $indexer;
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub reindex_all {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
    my $table   = shift;
 | 
			
		||||
    my $opts    = shift;
 | 
			
		||||
    my $tick    = $opts->{tick} || 0;
 | 
			
		||||
    my $max     = $opts->{max}  || 5000;
 | 
			
		||||
 | 
			
		||||
    my $indexer = $self->_get_indexer(1) or return $self->{_debug} ? () : 1; # clobbers the old one
 | 
			
		||||
    $indexer->close;
 | 
			
		||||
    undef $indexer;
 | 
			
		||||
 | 
			
		||||
    my %weights     = $self->{table}->_weight_cols() or return;
 | 
			
		||||
    my @weight_list = keys %weights;
 | 
			
		||||
    my ($pk)    = $self->{table}->pk();
 | 
			
		||||
 | 
			
		||||
# Go through the table and index each field.
 | 
			
		||||
    my $iterations = 1;
 | 
			
		||||
    my $count = 0;
 | 
			
		||||
 | 
			
		||||
    while (1) {
 | 
			
		||||
        if ($max) {
 | 
			
		||||
            my $offset = ($iterations-1) * $max;
 | 
			
		||||
            $table->select_options("LIMIT $offset,$max");
 | 
			
		||||
        }
 | 
			
		||||
        my $cond     = $opts->{cond} || {};
 | 
			
		||||
        my $sth      = $table->select($cond, [$pk, @weight_list]);
 | 
			
		||||
        my $done     = 1;
 | 
			
		||||
 | 
			
		||||
        while (my $rec = $sth->fetchrow_hashref() ) {
 | 
			
		||||
            $self->post_add_record($rec, undef, 1);
 | 
			
		||||
            $done = 0;
 | 
			
		||||
            if ($tick) {
 | 
			
		||||
                $count++;
 | 
			
		||||
                $count % $tick      or (print "$count ");
 | 
			
		||||
                $count % ($tick*10) or (print "\n");
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        last if $done;
 | 
			
		||||
        $iterations++;
 | 
			
		||||
        last if !$max;
 | 
			
		||||
    }
 | 
			
		||||
    $indexer = $self->_get_indexer(0) or return;
 | 
			
		||||
    $indexer->optimize;
 | 
			
		||||
    $indexer->close;
 | 
			
		||||
    undef $indexer;
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub pre_delete_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# Delete a records index values.
 | 
			
		||||
#
 | 
			
		||||
    my ($self, $where) = @_; 
 | 
			
		||||
 | 
			
		||||
    my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    my ($pk) = $tbl->pk();
 | 
			
		||||
    my $q = $tbl->select($where, [$pk]);
 | 
			
		||||
 | 
			
		||||
    my $reader = eval { Lucene::Index::IndexReader->open($self->_get_store(0)); };
 | 
			
		||||
    if ($@) {
 | 
			
		||||
        return $self->{_debug} ? $self->error('INDEX_CORRUPT', 'WARN', "$@") : 1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    my @errors;
 | 
			
		||||
    while (my ($item_id) = $q->fetchrow) {
 | 
			
		||||
        my $t =  new Lucene::Index::Term($pk => $item_id);
 | 
			
		||||
        eval { $reader->deleteDocuments($t); };
 | 
			
		||||
        if ($@) {
 | 
			
		||||
            push @errors, "$@";
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    $reader->close;
 | 
			
		||||
    undef $reader;
 | 
			
		||||
    if (@errors) {
 | 
			
		||||
        return $self->{_debug} ? $self->error('DELETE_FAILED', 'WARN', join(", ", @errors)) : 1;
 | 
			
		||||
    }
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub post_update_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
    my ( $self, $set_cond, $where_cond, $tmp ) = @_;
 | 
			
		||||
 | 
			
		||||
# delete the previous record
 | 
			
		||||
    eval {
 | 
			
		||||
        $self->pre_delete_record($where_cond) or return $self->{_debug} ? () : 1;
 | 
			
		||||
    };
 | 
			
		||||
#
 | 
			
		||||
# the new record
 | 
			
		||||
    my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
 | 
			
		||||
    my ($pk) = $tbl->pk();
 | 
			
		||||
    my %weights = $self->{table}->_weight_cols();
 | 
			
		||||
    my @weight_list = keys %weights;
 | 
			
		||||
    my $q = $tbl->select($where_cond, [$pk, @weight_list]);
 | 
			
		||||
    while (my $href = $q->fetchrow_hashref) {
 | 
			
		||||
        $self->post_add_record($href);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return 1;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub reindex_record {
 | 
			
		||||
# -------------------------------------------------------
 | 
			
		||||
# reindexes a record. basically deletes all associated records from current db abnd does an index.
 | 
			
		||||
# it's safe to use this
 | 
			
		||||
    my ($self, $rec) = @_;
 | 
			
		||||
 | 
			
		||||
    $self->delete_record($rec);
 | 
			
		||||
    $self->index_record($rec);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
1;
 | 
			
		||||
@@ -0,0 +1,206 @@
 | 
			
		||||
NAME
 | 
			
		||||
    Lucene -- API to the C++ port of the Lucene search engine
 | 
			
		||||
 | 
			
		||||
SYNOPSIS
 | 
			
		||||
  Initialize/Empty Lucene index
 | 
			
		||||
      my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
 | 
			
		||||
      my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 1);
 | 
			
		||||
 | 
			
		||||
      my $tmp_writer = new Lucene::Index::IndexWriter($store, $analyzer, 1);
 | 
			
		||||
      $tmp_writer->close;
 | 
			
		||||
      undef $tmp_writer;
 | 
			
		||||
 | 
			
		||||
  Choose your Analyzer (string tokenizer)
 | 
			
		||||
      # lowercases text and splits it at non-letter characters 
 | 
			
		||||
      my $analyzer = Lucene::Analysis::SimpleAnalyzer();
 | 
			
		||||
      # same as before and removes stop words
 | 
			
		||||
      my $analyzer = Lucene::Analysis::StopAnalyzer();
 | 
			
		||||
      # splits text at whitespace characters
 | 
			
		||||
      my $analyzer = Lucene::Analysis::WhitespaceAnalyzer();
 | 
			
		||||
      # lowercases text, tokenized it based on a grammer that 
 | 
			
		||||
      # leaves named authorities intact (e-mails, company names,
 | 
			
		||||
      # web hostnames, IP addresses, etc) and removed stop words
 | 
			
		||||
      my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer();
 | 
			
		||||
 | 
			
		||||
  Choose your Store (storage engine)
 | 
			
		||||
      # in-memory storage
 | 
			
		||||
      my $store = new Lucene::Store::RAMDirectory();
 | 
			
		||||
      # disk-based storage
 | 
			
		||||
      my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
 | 
			
		||||
 | 
			
		||||
  Open and configure an IndexWriter
 | 
			
		||||
      my $writer = new Lucene::Index::IndexWriter($store, $analyzer, 0);
 | 
			
		||||
      # optional settings for power users
 | 
			
		||||
      $writer->setMergeFactor(100);
 | 
			
		||||
      $writer->setUseCompoundFile(0);
 | 
			
		||||
      $writer->setMaxFieldLength(255);
 | 
			
		||||
      $writer->setMinMergeDocs(10);
 | 
			
		||||
      $writer->setMaxMergeDocs(100);
 | 
			
		||||
 | 
			
		||||
  Create Documents and add Fields
 | 
			
		||||
      my $doc = new Lucene::Document;
 | 
			
		||||
      # field gets analyzed, indexed and stored
 | 
			
		||||
      $doc->add(Lucene::Document::Field->Text("content", $content));
 | 
			
		||||
      # field gets indexed and stored
 | 
			
		||||
      $doc->add(Lucene::Document::Field->Keyword("isbn", $isbn));
 | 
			
		||||
      # field gets just stored
 | 
			
		||||
      $doc->add(Lucene::Document::Field->UnIndexed("sales_rank", $sales_rank));
 | 
			
		||||
      # field gets analyzed and indexed 
 | 
			
		||||
      $doc->add(Lucene::Document::Field->UnStored("categories", $categories));
 | 
			
		||||
 | 
			
		||||
  Add Documents to an IndexWriter
 | 
			
		||||
      $writer->addDocument($doc);
 | 
			
		||||
 | 
			
		||||
  Optimize your index and close the IndexWriter
 | 
			
		||||
      $writer->optimize();
 | 
			
		||||
      $writer->close();
 | 
			
		||||
      undef $writer;
 | 
			
		||||
 | 
			
		||||
  Delete Documents
 | 
			
		||||
      my $reader = Lucene::Index::IndexReader->open($store);
 | 
			
		||||
      my $term = new Lucene::Index::Term("isbn", $isbn);
 | 
			
		||||
      $reader->deleteDocuments($term);
 | 
			
		||||
      $reader->close();
 | 
			
		||||
      undef $reader;
 | 
			
		||||
 | 
			
		||||
  Query index
 | 
			
		||||
      # initalize searcher and parser
 | 
			
		||||
      my $analyzer = Lucene::Analysis::SimpleAnalyzer();
 | 
			
		||||
      my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
 | 
			
		||||
      my $searcher = new Lucene::Search::IndexSearcher($store);
 | 
			
		||||
      my $parser = new Lucene::QueryParser("default_field", $analyzer);
 | 
			
		||||
 | 
			
		||||
      # build a query on the default field
 | 
			
		||||
      my $query = $parser->parse("perl");
 | 
			
		||||
 | 
			
		||||
      # build a query on another field
 | 
			
		||||
      my $query = $parser->parse("title:cookbook");
 | 
			
		||||
 | 
			
		||||
      # define a sort on one field
 | 
			
		||||
      my $sortfield = new Lucene::Search::SortField("unixtime"); 
 | 
			
		||||
      my $reversed_sortfield = new Lucene::Search::SortField("unixtime", 1);
 | 
			
		||||
      my $sort = new Lucene::Search::Sort($sortfield);
 | 
			
		||||
 | 
			
		||||
      # define a sort on two fields
 | 
			
		||||
      my $sort = new Lucene::Search::Sort($sortfield1, $sortfield2);
 | 
			
		||||
 | 
			
		||||
      # use Lucene's INDEXORDER or RELEVANCE sort
 | 
			
		||||
      my $sort = Lucene::Search::Sort->INDEXORDER;
 | 
			
		||||
      my $sort = Lucene::Search::Sort->RELEVANCE;
 | 
			
		||||
 | 
			
		||||
      # query index and get results
 | 
			
		||||
      my $hits = $searcher->search($query);
 | 
			
		||||
      my $sorted_hits = $searcher->search($query, $sort);
 | 
			
		||||
 | 
			
		||||
      # get number of results
 | 
			
		||||
      my $num_hits = $hits->length();
 | 
			
		||||
 | 
			
		||||
      # get fields and ranking score for each hit
 | 
			
		||||
      for (my $i = 0; $i < $num_hits; $i++) {
 | 
			
		||||
        my $doc = $hits->doc($i);
 | 
			
		||||
        my $score = $hits->score($i);
 | 
			
		||||
        my $title = $doc->get("title");
 | 
			
		||||
        my $isbn = $doc->get("isbn");
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      # free memory and close searcher
 | 
			
		||||
      undef $hits;
 | 
			
		||||
      undef $query;
 | 
			
		||||
      undef $parser;
 | 
			
		||||
      undef $analyzer;
 | 
			
		||||
      $searcher->close();
 | 
			
		||||
      undef $fsdir;
 | 
			
		||||
      undef $searcher;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  Close your Store
 | 
			
		||||
      $store->close;
 | 
			
		||||
      undef $store;
 | 
			
		||||
 | 
			
		||||
DESCRIPTION
 | 
			
		||||
    Like it or not Apache Lucene has become the de-facto standard for
 | 
			
		||||
    open-source high-performance search. It has a large user-base, is well
 | 
			
		||||
    documented and has plenty of committers. Unfortunately Apache Lucene is
 | 
			
		||||
    entirely written in Java and therefore of relatively little use for perl
 | 
			
		||||
    programmers. Fortunately in the recent years a group of C++ programmers
 | 
			
		||||
    led by Ben van Klinken decided to port Java Lucene to C++.
 | 
			
		||||
 | 
			
		||||
    The purpose of the module is to export the C++ Lucene API to perl and at
 | 
			
		||||
    the same time be as close as possible to the original Java API. This has
 | 
			
		||||
    the combined advantage of providing perl programmers with a
 | 
			
		||||
    well-documented API and giving them access to a C++ search engine
 | 
			
		||||
    library that is supposedly faster than the original.
 | 
			
		||||
 | 
			
		||||
CHARACTER SUPPORT
 | 
			
		||||
    Currently only ISO 8859-1 (Latin-1) characters are supported. Obviously
 | 
			
		||||
    this included all ASCII characters.
 | 
			
		||||
 | 
			
		||||
INDEX COMPATIBLITY
 | 
			
		||||
    For the moment indices produced by this module are not compatible with
 | 
			
		||||
    those from Apache Lucene. The reason for this is that this module uses
 | 
			
		||||
    1-byte character encoding as opposed to 2-byte (widechar) encoding with
 | 
			
		||||
    Apache Lucene.
 | 
			
		||||
 | 
			
		||||
INSTALLATION
 | 
			
		||||
    This module requires the clucene library to be installed. The best way
 | 
			
		||||
    to get it is to go to the following page
 | 
			
		||||
 | 
			
		||||
        http://sourceforge.net/projects/clucene/
 | 
			
		||||
 | 
			
		||||
    and download the latest STABLE clucene-core version. Currently it is
 | 
			
		||||
    clucene-core-0.9.15. Make sure you compile it in ASCII mode and install
 | 
			
		||||
    it in your standard library path.
 | 
			
		||||
 | 
			
		||||
    On a Linux platform this goes as follows:
 | 
			
		||||
 | 
			
		||||
        wget http://kent.dl.sourceforge.net/sourceforge/clucene/clucene-core-0.9.15.tar.gz
 | 
			
		||||
        cd clucene-core-0.9.15
 | 
			
		||||
        ./autogen.sh
 | 
			
		||||
        ./configure --disable-debug --prefix=/usr --exec-prefix=/usr --enable-ascii
 | 
			
		||||
        make
 | 
			
		||||
        make check
 | 
			
		||||
        (as root) make install
 | 
			
		||||
 | 
			
		||||
    To install the perl module itself, run the following commands:
 | 
			
		||||
 | 
			
		||||
        perl Makefile.PL
 | 
			
		||||
        make
 | 
			
		||||
        make test
 | 
			
		||||
        (as root) make install
 | 
			
		||||
 | 
			
		||||
AUTHOR
 | 
			
		||||
    Thomas Busch <tbusch at cpan dot org>
 | 
			
		||||
 | 
			
		||||
COPYRIGHT AND LICENSE
 | 
			
		||||
    Copyright (c) 2006 Thomas Busch
 | 
			
		||||
 | 
			
		||||
    This library is free software; you can redistribute it and/or modify it
 | 
			
		||||
    under the same terms as Perl itself.
 | 
			
		||||
 | 
			
		||||
SEE ALSO
 | 
			
		||||
    Plucene - a pure-Perl implementation of Lucene
 | 
			
		||||
 | 
			
		||||
    KinoSearch - a search engine library inspired by Lucene
 | 
			
		||||
 | 
			
		||||
DISCLAIMER OF WARRANTY
 | 
			
		||||
    BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 | 
			
		||||
    FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
 | 
			
		||||
    OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
 | 
			
		||||
    PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
 | 
			
		||||
    EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 | 
			
		||||
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
 | 
			
		||||
    ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
 | 
			
		||||
    YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
 | 
			
		||||
    NECESSARY SERVICING, REPAIR, OR CORRECTION.
 | 
			
		||||
 | 
			
		||||
    IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
 | 
			
		||||
    WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
 | 
			
		||||
    REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
 | 
			
		||||
    TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
 | 
			
		||||
    CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
 | 
			
		||||
    SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
 | 
			
		||||
    RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
 | 
			
		||||
    FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
 | 
			
		||||
    SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 | 
			
		||||
    DAMAGES.
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										115
									
								
								site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/LUCENE/STH.pm
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/LUCENE/STH.pm
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,115 @@
 | 
			
		||||
# ====================================================================
 | 
			
		||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
 | 
			
		||||
#
 | 
			
		||||
#   GT::SQL::Search::STH
 | 
			
		||||
#   Author: Aki Mimoto
 | 
			
		||||
#   CVS Info : 087,071,086,086,085      
 | 
			
		||||
#   CVS Info : 087,071,086,086,085      
 | 
			
		||||
#   $Id: STH.pm,v 1.1 2006/12/07 07:04:51 aki Exp $
 | 
			
		||||
#
 | 
			
		||||
# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
 | 
			
		||||
# ====================================================================
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
package GT::SQL::Search::LUCENE::STH;
 | 
			
		||||
#--------------------------------------------------------------------------------
 | 
			
		||||
use strict;
 | 
			
		||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE /;
 | 
			
		||||
require GT::SQL::Search::Base::STH;
 | 
			
		||||
 | 
			
		||||
@ISA = ('GT::SQL::Search::STH');
 | 
			
		||||
$ATTRIBS = {
 | 
			
		||||
    'db_sort' => 1,
 | 
			
		||||
    'hits'    => undef
 | 
			
		||||
};
 | 
			
		||||
$ERROR_MESSAGE = 'GT::SQL::Search::STH';
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
sub init {
 | 
			
		||||
#--------------------------------------------------------------------------------
 | 
			
		||||
# GT::SQL::Search::STH expects a full set of results in $self->{results}. For
 | 
			
		||||
# Lucene the only time a full set of results is there is when we are sorting
 | 
			
		||||
# on a field that is not weighted, otherwise the results in $self->{results} is
 | 
			
		||||
# the proper page and number of results.
 | 
			
		||||
    my $self    = shift;
 | 
			
		||||
 | 
			
		||||
    $self->set(@_);
 | 
			
		||||
 | 
			
		||||
    --$self->{nh} if $self->{nh};
 | 
			
		||||
 | 
			
		||||
    # Here we allow hits to override our concept of rows. This is only useful
 | 
			
		||||
    # when !$self->{db_sort}
 | 
			
		||||
    $self->{rows} = $self->{hits}
 | 
			
		||||
        ? $self->{hits}
 | 
			
		||||
        : $self->{results}
 | 
			
		||||
            ? scalar(keys %{$self->{results}})
 | 
			
		||||
            : 0;
 | 
			
		||||
 | 
			
		||||
    if ($self->{db_sort}) {
 | 
			
		||||
        $self->get_db_sorted_results;
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        $self->get_sorted_results;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub get_sorted_results {
 | 
			
		||||
# Just sorts the results out of $self->{results} which should have been setup
 | 
			
		||||
# by a search driver
 | 
			
		||||
    my ($self) = @_;
 | 
			
		||||
    my $results = $self->{results};
 | 
			
		||||
    $self->{index} = 0;
 | 
			
		||||
    $self->{max_index} = $self->{mh} - 1;
 | 
			
		||||
    $self->{'order'}  = [ sort { 
 | 
			
		||||
                                    ( $results->{$b} || return 0 ) <=> ( $results->{$a} || 0 )
 | 
			
		||||
                              } keys %{$results} ];
 | 
			
		||||
    $self->debug_dumper( "Results will be presented in the following order: ", $self->{'order'} ) if ($self->{_debug});
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub get_db_sorted_results {
 | 
			
		||||
# This assumes $self->{results} has a full result set, i.e. without any LIMIT
 | 
			
		||||
# It then selects the result set using the SQL driver to do the sorting. This
 | 
			
		||||
# is for Search modules which can not handle their own sorting
 | 
			
		||||
    my ($self) = @_;
 | 
			
		||||
 | 
			
		||||
    my $sb = GT::SQL::Search::Base::Search->clean_sb($self->{sb}, $self->{so});
 | 
			
		||||
    $self->{index}  = $self->{nh} * $self->{mh} || 0;
 | 
			
		||||
    $self->{max_index} = $self->{index} + $self->{mh} - 1; # index + mh is the first record which should not be returned.
 | 
			
		||||
    if ($self->{max_index} > $self->{rows}) {
 | 
			
		||||
        $self->{max_index}  = $self->{rows};
 | 
			
		||||
        $self->{rows}       = $self->{rows} - $self->{index};
 | 
			
		||||
        $self->{rows} < 0 ? $self->{rows} = 0 : 0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    else {
 | 
			
		||||
        $self->{rows}       = $self->{mh};
 | 
			
		||||
    }
 | 
			
		||||
    my ( $table, $pk ) = $self->_table_info();
 | 
			
		||||
    my ( $query, $where, $st, $limit );
 | 
			
		||||
 | 
			
		||||
    $where      = qq!$pk in(! . join( ",", keys %{$self->{results}}) . ')';
 | 
			
		||||
    $self->{mh} and $limit = qq!LIMIT $self->{index}, $self->{rows}!;
 | 
			
		||||
    $query      = qq!
 | 
			
		||||
        SELECT $pk
 | 
			
		||||
        FROM   $table
 | 
			
		||||
        WHERE  $where
 | 
			
		||||
        $sb
 | 
			
		||||
        $limit
 | 
			
		||||
    !;
 | 
			
		||||
    $self->debug( "Row fetch query: $query" ) if ($self->{_debug});
 | 
			
		||||
    my $sth        = $self->{table}->{driver}->prepare( $query );
 | 
			
		||||
    $sth->execute();
 | 
			
		||||
 | 
			
		||||
    $self->{index}    = 0;
 | 
			
		||||
    $self->{max_hits} = $self->{rows};
 | 
			
		||||
 | 
			
		||||
    # Fetch the results in sorted order
 | 
			
		||||
    my $order = $sth->fetchall_arrayref();
 | 
			
		||||
    $sth->finish();
 | 
			
		||||
 | 
			
		||||
    $self->{'order'}  = [ map { $_->[0] } @{$order} ];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
1;
 | 
			
		||||
 | 
			
		||||
@@ -0,0 +1,260 @@
 | 
			
		||||
# ==================================================================
 | 
			
		||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
 | 
			
		||||
#
 | 
			
		||||
#   GT::Search::LUCENE::Search
 | 
			
		||||
#   Author  : Scott Beck
 | 
			
		||||
#   CVS Info : 087,071,086,086,085      
 | 
			
		||||
#   $Id: Search.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
 | 
			
		||||
#
 | 
			
		||||
# Copyright (c) 2004 Gossamer Threads Inc.  All Rights Reserved.
 | 
			
		||||
# ==================================================================
 | 
			
		||||
#
 | 
			
		||||
# Description:
 | 
			
		||||
#   Class used to search indexed tables.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
package GT::SQL::Search::LUCENE::Search;
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
use strict;
 | 
			
		||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS $ERRORS $ERROR_MESSAGE /;
 | 
			
		||||
use Lucene;
 | 
			
		||||
use GT::TempFile;
 | 
			
		||||
use GT::SQL::Search::LUCENE::STH;
 | 
			
		||||
use GT::SQL::Search::Base::Search;
 | 
			
		||||
@ISA = qw( GT::SQL::Search::Base::Search );
 | 
			
		||||
 | 
			
		||||
# ------------------------------------------------------------------------------
 | 
			
		||||
# Preamble information related to the object
 | 
			
		||||
 | 
			
		||||
$DEBUG   = 0;
 | 
			
		||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
 | 
			
		||||
$ERRORS = {
 | 
			
		||||
    SEARCH_ERROR => "Error searching: %s",
 | 
			
		||||
    QUERY_ERROR  => "Query error: %s"
 | 
			
		||||
};
 | 
			
		||||
$ERROR_MESSAGE = 'GT::SQL';
 | 
			
		||||
 | 
			
		||||
sub load {
 | 
			
		||||
    shift;
 | 
			
		||||
    return GT::SQL::Search::LUCENE::Search->new(@_)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _get_path {
 | 
			
		||||
    my $self = shift;
 | 
			
		||||
    my $name    = $self->{table}->name;
 | 
			
		||||
    my $tmpdir  = GT::TempFile::find_tmpdir();
 | 
			
		||||
    my $path = $tmpdir . '/' . $name;
 | 
			
		||||
    $path = $1 if $path =~ /(.*)/; # XXX untaint
 | 
			
		||||
    return $path;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub _get_store {
 | 
			
		||||
    my ($self, $create) = @_;
 | 
			
		||||
    my $path = $self->_get_path;
 | 
			
		||||
    return Lucene::Store::FSDirectory->getDirectory($path, $create);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub query {
 | 
			
		||||
# --------------------------------------------------
 | 
			
		||||
# Returns a sth based on a query
 | 
			
		||||
#
 | 
			
		||||
# Options:
 | 
			
		||||
#        - paging
 | 
			
		||||
#           mh            : max hits
 | 
			
		||||
#           nh            : number hit (or page of hits)
 | 
			
		||||
#
 | 
			
		||||
#        - searching
 | 
			
		||||
#           ww            : whole word
 | 
			
		||||
#           ma            : 1 => OR match, 0 => AND match, undefined => QUERY
 | 
			
		||||
#           substring     : search for substrings of words
 | 
			
		||||
#           bool          : 'and' => and search, 'or' => or search, '' => regular query
 | 
			
		||||
#           query         : the string of things to ask for
 | 
			
		||||
#
 | 
			
		||||
#        - filtering
 | 
			
		||||
#           field_name    : value       # Find all rows with field_name = value
 | 
			
		||||
#           field_name    : ">value"    # Find all rows with field_name > value.
 | 
			
		||||
#           field_name    : "<value"    # Find all rows with field_name < value.
 | 
			
		||||
#           field_name-gt : value       # Find all rows with field_name > value.
 | 
			
		||||
#           field_name-lt : value       # Find all rows with field_name < value.
 | 
			
		||||
#
 | 
			
		||||
# Parameters:
 | 
			
		||||
#        ( $CGI ) : a single cgi object
 | 
			
		||||
#        ( $HASH ) : a hash of the parameters
 | 
			
		||||
#
 | 
			
		||||
    my $self = shift;
 | 
			
		||||
 | 
			
		||||
# create an easily accessible argument hash
 | 
			
		||||
    my $args = $self->common_param(@_);
 | 
			
		||||
    my $tbl = $self->{table};
 | 
			
		||||
 | 
			
		||||
# see if we can setup the filtering constraints
 | 
			
		||||
    my $filter = { %$args }; 
 | 
			
		||||
    my $query  = delete $args->{query} || $self->{query} || '';
 | 
			
		||||
    my $ftr_cond;
 | 
			
		||||
 | 
			
		||||
# parse query
 | 
			
		||||
    $self->debug( "Search Query: $query" ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
    my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
 | 
			
		||||
 | 
			
		||||
    $self->{rejected_keywords} = $rejected;
 | 
			
		||||
 | 
			
		||||
# setup the additional input parameters
 | 
			
		||||
    $query_struct = $self->_preset_options( $query_struct, $args );
 | 
			
		||||
 | 
			
		||||
# now sort into distinct buckets
 | 
			
		||||
    my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
 | 
			
		||||
    $self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
 | 
			
		||||
 | 
			
		||||
# with the buckets, it's now possible to create a query string
 | 
			
		||||
# that can be passed directly into the Lucene search.
 | 
			
		||||
    my $query_string = '';
 | 
			
		||||
 | 
			
		||||
    foreach my $search_type ( keys %$buckets ) {
 | 
			
		||||
        my $bucket = $buckets->{$search_type};
 | 
			
		||||
        foreach my $token ( keys %$bucket ) {
 | 
			
		||||
            next unless $token;
 | 
			
		||||
            my $properties = $bucket->{$token} or next;
 | 
			
		||||
            $token =~ s/(["()])/\\$1/g;
 | 
			
		||||
            $token =~ s/\b(or|and)\b/ /g;
 | 
			
		||||
 | 
			
		||||
            my $e = ' ';
 | 
			
		||||
 | 
			
		||||
# handle boolean operations
 | 
			
		||||
            $properties->{mode} ||= '';
 | 
			
		||||
            if ( $properties->{mode} eq 'must' ) {
 | 
			
		||||
                $e .= '+';
 | 
			
		||||
            }
 | 
			
		||||
            elsif ( $properties->{mode} eq 'cannot' ) {
 | 
			
		||||
                $e .= '-';
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
# deal with phrase vs keyword
 | 
			
		||||
            if ( $properties->{phrase} ) {
 | 
			
		||||
                $e .= '"' . $token . '"' unless $token =~ /^"|"$/;
 | 
			
		||||
            }
 | 
			
		||||
            else {
 | 
			
		||||
                $e .= $token;
 | 
			
		||||
 | 
			
		||||
# substring match
 | 
			
		||||
                if ($properties->{mode} ne 'substring') {
 | 
			
		||||
                    $e .= '*' if $properties->{substring};
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            $query_string .= $e;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
# calculate the cursor constraints
 | 
			
		||||
    foreach my $k (qw( nh mh so sb )) {
 | 
			
		||||
        next if defined $args->{$k};
 | 
			
		||||
        $args->{$k} = $self->{$k} || '';
 | 
			
		||||
    }
 | 
			
		||||
    $args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
 | 
			
		||||
    $args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
 | 
			
		||||
    $args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ )  ? $1 : 'score';
 | 
			
		||||
 | 
			
		||||
    # Score is the default
 | 
			
		||||
    $args->{so} = (defined $args->{so} and $args->{so} =~ /^(asc|desc)(?:end)?$/i) ? lc($1) : 'asc';
 | 
			
		||||
 | 
			
		||||
    my %weights = $tbl->_weight_cols();
 | 
			
		||||
    my @sortfields;
 | 
			
		||||
    my $do_mysql_sort = 0;
 | 
			
		||||
    for (ref($args->{sb}) eq 'ARRAY' ? @{$args->{sb}} : $args->{sb}) {
 | 
			
		||||
        if (!exists $weights{$_}) {
 | 
			
		||||
            $do_mysql_sort = 1 if $_ ne 'score';
 | 
			
		||||
            next;
 | 
			
		||||
        }
 | 
			
		||||
        push @sortfields, new Lucene::Search::SortField($_, $args->{so} ne 'asc');
 | 
			
		||||
    }
 | 
			
		||||
    my $sort = @sortfields ? new Lucene::Search::Sort(@sortfields) : Lucene::Search::Sort->RELEVANCE;
 | 
			
		||||
    my $store = $self->_get_store(0);
 | 
			
		||||
    my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
 | 
			
		||||
    my $searcher = eval { new Lucene::Search::IndexSearcher($store); };
 | 
			
		||||
    if ($@) {
 | 
			
		||||
        $self->{_debug} and $self->error('SEARCH_ERROR', 'WARN', "$@");
 | 
			
		||||
        return $self->sth({}, 0); # no hits
 | 
			
		||||
    }
 | 
			
		||||
    # Random default field, it's not used
 | 
			
		||||
    my $parser = new Lucene::MultiFieldQueryParser((keys %weights)[0], $analyzer);
 | 
			
		||||
    my $pquery = eval { $parser->parse($query_string, [keys %weights], $analyzer); };
 | 
			
		||||
    if ($@) {
 | 
			
		||||
        $self->{_debug} and $self->error('QUERY_ERROR', 'WARN', "$@");
 | 
			
		||||
        return $self->sth({}, 0); # no hits
 | 
			
		||||
    }
 | 
			
		||||
    my $hits = $searcher->search($pquery, $sort);
 | 
			
		||||
    my $num_hits = $hits->length;
 | 
			
		||||
 | 
			
		||||
## Setup a limit only if there is no callback. The callback argument requires a full results list
 | 
			
		||||
    my ($offset, $max_hits) = (0, $num_hits);
 | 
			
		||||
    unless ($self->{callback} or $do_mysql_sort) {
 | 
			
		||||
        $offset = ( $args->{nh} - 1 ) * $args->{mh};
 | 
			
		||||
        $max_hits = $offset + $args->{mh};
 | 
			
		||||
    }
 | 
			
		||||
    $max_hits = $num_hits if $max_hits > $num_hits;
 | 
			
		||||
    my ($pk) = $self->{table}->pk;
 | 
			
		||||
    my @indexes;
 | 
			
		||||
    my $results = {};
 | 
			
		||||
    for (my $i= $offset; $i < $max_hits; ++$i) {
 | 
			
		||||
        my $doc = $hits->doc($i);
 | 
			
		||||
        my $value = $doc->get($pk);
 | 
			
		||||
        my $score = $hits->score($i);
 | 
			
		||||
        $results->{$value} = $score;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
# now handle filters
 | 
			
		||||
    my $cols    = $self->{'table'}->cols();
 | 
			
		||||
    my %filters = map {
 | 
			
		||||
        (my $tmp = $_) =~ s/-[lg]t$//;
 | 
			
		||||
        $cols->{$tmp} ? ($_ => $args->{$_}) : ()
 | 
			
		||||
    } keys %{$args};
 | 
			
		||||
 | 
			
		||||
    if (keys %filters) {
 | 
			
		||||
        $self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
 | 
			
		||||
        $results = $self->filter(\%filters, $results);
 | 
			
		||||
    }
 | 
			
		||||
    elsif ($self->{filter}) {
 | 
			
		||||
        $self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
 | 
			
		||||
        $results = $self->_filter_query( $self->{filter}, $results );
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        $self->debug( "No filters being used.") if ($self->{_debug});
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
# now this query should probably clear the filters once it's been used, so i'll dothat here
 | 
			
		||||
    $self->{filter} = undef;
 | 
			
		||||
 | 
			
		||||
# now run through a callback function if needed.
 | 
			
		||||
    if ($self->{callback}) {
 | 
			
		||||
        unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
 | 
			
		||||
            $self->{_debug} and $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
 | 
			
		||||
            return $self->sth({}, 0); # no hits
 | 
			
		||||
        }
 | 
			
		||||
        $self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
 | 
			
		||||
        $results = $self->{callback}->($self, $results);
 | 
			
		||||
        $self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    $self->{rows} = $num_hits;
 | 
			
		||||
 | 
			
		||||
    return $self->sth($results, $do_mysql_sort);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub sth {
 | 
			
		||||
#--------------------------------------------------------------------------------
 | 
			
		||||
    my ($self, $results, $db_sort) = @_;
 | 
			
		||||
 | 
			
		||||
    my $sth = GT::SQL::Search::LUCENE::STH->new(
 | 
			
		||||
        'results' => $results,
 | 
			
		||||
        'hits'    => $self->{rows},
 | 
			
		||||
        'db'      => $self->{table}->{driver},
 | 
			
		||||
        'db_sort' => $db_sort,
 | 
			
		||||
# pass the following attributes down to the STH handler
 | 
			
		||||
        map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    return $sth;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
1;
 | 
			
		||||
		Reference in New Issue
	
	Block a user