discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/GT/SQL/Search/LUCENE/Indexer.pm
2024-06-17 21:49:12 +10:00

240 lines
6.9 KiB
Perl

# ====================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::LUCENE::Indexer
# Author: Scott Beck
# CVS Info : 087,071,086,086,085
# $Id: Indexer.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ====================================================================
#
package GT::SQL::Search::LUCENE::Indexer;
# ------------------------------------------------------------------------------
# Preamble information related to the object
use strict;
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
use Lucene;
use GT::SQL::Search::Base::Indexer;
use GT::TempFile;
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
# Package-level debug setting, initialised to 0. Nothing in this file reads it
# directly (presumably the inherited GT debug machinery does - confirm).
$DEBUG = 0;
# Version string derived from the CVS Revision keyword: the major and minor
# numbers are captured separately and the minor is zero-padded to three digits,
# so revision "1.2" yields "1.002".
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
# Message templates for the error codes this module passes to $self->error().
# Each template has one %s placeholder; the call sites in this file supply one
# extra argument (the exception text) along with the code.
$ERRORS = {
INDEX_CORRUPT => 'Could not create an Indexer, this probably means your index is corrupted and you should rebuild it. The error was: %s',
DELETE_FAILED => 'Could not delete some records: %s'
};
# NOTE(review): codes such as NOWEIGHTS, NOPRIMARYKEY and NODRIVER are used in
# this file but not defined above; this setting looks like it points error()
# at the shared GT::SQL message table - confirm against the base class.
$ERROR_MESSAGE = 'GT::SQL';
sub load {
# ------------------------------------------------------------------------------
# Factory entry point: hands every argument after the class name to new() and
# returns whatever the constructor returns.
#
    my ($class, @ctor_args) = @_;
    return $class->new(@ctor_args);
}
sub _get_path {
# ------------------------------------------------------------------------------
# Returns the directory used for this table's index: the directory reported by
# GT::TempFile::find_tmpdir(), a slash, and the table name.
#
    my ($self) = @_;
    my $table_name = $self->{table}->name;
    my $candidate  = GT::TempFile::find_tmpdir() . '/' . $table_name;

    # XXX untaint: the catch-all capture exists only to launder the value.
    my ($path) = $candidate =~ /(.*)/;
    return $path;
}
sub _get_store {
# ------------------------------------------------------------------------------
# Returns the Lucene FSDirectory for this table's index path. $create is
# handed to getDirectory() unchanged.
#
    my $self   = shift;
    my $create = shift;
    return Lucene::Store::FSDirectory->getDirectory($self->_get_path, $create);
}
sub _get_indexer {
# ------------------------------------------------------------------------------
# Builds a Lucene IndexWriter over this table's index directory.
#
#   $create - handed to both the directory lookup and the writer constructor.
#             Callers in this module pass 1 to start a fresh index (clobbering
#             any old one) and 0 to open the existing index.
#
# Returns the writer on success. Otherwise returns the result of
# $self->error(...) with NOWEIGHTS (no weighted columns), NOPRIMARYKEY (no
# primary key) or INDEX_CORRUPT (the index could not be opened).
#
    my ($self, $create) = @_;

    # Only the emptiness of the weight list matters here.
    my %weights = $self->{table}->_weight_cols()
        or return $self->error(NOWEIGHTS => 'WARN');

    my ($pk) = $self->{table}->pk;
    if (!$pk) {
        return $self->error('NOPRIMARYKEY', 'WARN');
    }

    # Everything that touches the index lives inside the eval, so a failure to
    # open the directory is reported as INDEX_CORRUPT as well instead of
    # escaping as an exception. Callers rely on "_get_indexer(...) or return".
    my $iw = eval {
        my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new;
        my $store    = $self->_get_store($create);
        Lucene::Index::IndexWriter->new($store, $analyzer, $create);
    };
    if ($@) {
        return $self->error('INDEX_CORRUPT', 'WARN', "$@");
    }
    return $iw;
}
sub drop_search_driver {
# ------------------------------------------------------------------------------
# Removes this table's index directory from disk. Always returns 1; the result
# of deldir() is not inspected.
#
# NOTE(review): the other modules loaded by this file live under GT:: -
# confirm that File::Tools (rather than GT::File::Tools) is the intended
# provider of deldir().
#
    my ($self) = @_;
    my $index_dir = $self->_get_path;
    require File::Tools;
    File::Tools::deldir($index_dir);
    return 1;
}
sub add_search_driver {
# ------------------------------------------------------------------------------
# Creates a fresh, empty index for this table.
#
# Returns 1 on success, or nothing when the writer could not be created
# (_get_indexer has already reported the error in that case).
#
    my ($self) = @_;
    my $writer = $self->_get_indexer(1) or return;

    # Close the writer explicitly, as reindex_all does after creating an index,
    # instead of leaving the handle to be cleaned up whenever it is destroyed.
    $writer->close;
    undef $writer;
    return 1;
}
sub post_create_table {
# ------------------------------------------------------------------------------
# Hook run after the table has been created: sets up the search index by
# delegating to add_search_driver().
#
# The full argument list, invocant included, is forwarded as-is, so the
# delegate receives the object a second time as its first explicit argument.
#
    my $self = $_[0];
    return $self->add_search_driver(@_);
}
sub post_drop_table {
# -------------------------------------------------------
# Hook run after the table has been dropped: removes the search index by
# delegating to drop_search_driver().
#
# The full argument list, invocant included, is forwarded as-is, so the
# delegate receives the object a second time as its first explicit argument.
#
    my $self = $_[0];
    return $self->drop_search_driver(@_);
}
sub post_add_record {
# -------------------------------------------------------
# Indexes a single record.
#
#   $rec         - hash ref of column name => value for the row
#   $insert_sth  - optional statement handle; when the table reports an
#                  auto-increment key ($tbl->ai) and a handle is given, its
#                  insert_id is used as the document key instead of
#                  $rec->{$pk}
#   $no_optimize - true to skip the optimize pass after adding the document
#
# Returns 1 on success and nothing when the table has no weighted columns.
# When no writer can be opened it returns 1, or an empty list if
# $self->{_debug} is set. An exception raised while building or adding the
# document is re-thrown after the writer has been closed.
#
    my ($self, $rec, $insert_sth, $no_optimize) = @_;

    my $tbl     = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
    my %weights = $tbl->_weight_cols() or return;
    my $indexer = $self->_get_indexer(0) or return $self->{_debug} ? () : 1;

    # The primary key is stored as a keyword field below, never as weighted text.
    my ($pk) = $tbl->pk;
    delete $weights{$pk};

    my $added = eval {
        my $doc = Lucene::Document->new;
        for my $column_name (keys %weights) {
            my $field = Lucene::Document::Field->UnStored($column_name, $rec->{$column_name});
            $field->setBoost($weights{$column_name});
            $doc->add($field);
        }
        $doc->add(Lucene::Document::Field->Keyword($pk, ($tbl->ai && $insert_sth ? $insert_sth->insert_id : $rec->{$pk})));
        $indexer->addDocument($doc);
        $indexer->optimize if !$no_optimize;
        1;
    };
    if (!$added) {
        my $failure = $@;
        # Release the writer before propagating, so a failed add does not
        # leave the index open. A secondary close error is deliberately
        # ignored in favour of the original one.
        eval { $indexer->close };
        undef $indexer;
        die $failure;
    }

    $indexer->close;
    undef $indexer;
    return 1;
}
sub reindex_all {
# -------------------------------------------------------
# Rebuilds the whole index from the rows of $table.
#
#   $table - table object the rows are selected from
#   $opts  - hash ref of options:
#              tick - print a running count every N records (0 = silent)
#              max  - rows fetched per page (defaults to 5000)
#              cond - condition passed to select() (defaults to {})
#
# Returns 1 on success. Returns nothing if the table has no weighted columns
# or the final writer cannot be opened. If the initial writer cannot be
# created it returns 1, or an empty list when $self->{_debug} is set.
#
    my ($self, $table, $opts) = @_;
    my $tick      = $opts->{tick} || 0;
    my $page_size = $opts->{max}  || 5000;

    # Opening with the create flag clobbers the old index; the writer itself
    # is not needed yet because post_add_record opens its own.
    my $writer = $self->_get_indexer(1) or return $self->{_debug} ? () : 1;
    $writer->close;
    undef $writer;

    my %weights = $self->{table}->_weight_cols() or return;
    my @columns = keys %weights;
    my ($pk)    = $self->{table}->pk();

    # Walk the table one page at a time, indexing every row.
    my $seen = 0;
    my $page = 0;
    PAGE: while (1) {
        if ($page_size) {
            my $offset = $page * $page_size;
            $table->select_options("LIMIT $offset,$page_size");
        }
        my $cond = $opts->{cond} || {};
        my $sth  = $table->select($cond, [$pk, @columns]);

        my $fetched_any = 0;
        while (my $rec = $sth->fetchrow_hashref()) {
            # Optimizing is deferred to a single pass at the end.
            $self->post_add_record($rec, undef, 1);
            $fetched_any = 1;
            next unless $tick;
            $seen++;
            print "$seen " unless $seen % $tick;
            print "\n" unless $seen % ($tick * 10);
        }

        # An empty page means the table is exhausted.
        last PAGE unless $fetched_any;
        $page++;
        last PAGE unless $page_size;
    }

    $writer = $self->_get_indexer(0) or return;
    $writer->optimize;
    $writer->close;
    undef $writer;
    return 1;
}
sub pre_delete_record {
# -------------------------------------------------------
# Removes from the index the documents of every row matching $where.
#
# The matching primary keys are read from the table, and each one is deleted
# from the index by term. Failures are reported through $self->error() only
# when $self->{_debug} is set; otherwise the method returns 1 regardless.
#
    my ($self, $where) = @_;
    my $tbl     = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
    my ($pk)    = $tbl->pk();
    my $matches = $tbl->select($where, [$pk]);

    my $reader = eval { Lucene::Index::IndexReader->open($self->_get_store(0)); };
    if (my $open_error = $@) {
        return 1 unless $self->{_debug};
        return $self->error('INDEX_CORRUPT', 'WARN', "$open_error");
    }

    # Keep going after a failed delete and report all failures together.
    my @failures;
    while (my ($id) = $matches->fetchrow) {
        my $term = Lucene::Index::Term->new($pk => $id);
        eval { $reader->deleteDocuments($term); };
        push @failures, "$@" if $@;
    }
    $reader->close;
    undef $reader;

    return 1 unless @failures;
    return 1 unless $self->{_debug};
    return $self->error('DELETE_FAILED', 'WARN', join(", ", @failures));
}
sub post_update_record {
# -------------------------------------------------------
# Refreshes the index after rows have been updated: the documents of every
# row matching $where_cond are dropped, then those rows are read back from
# the table and indexed again.
#
#   $set_cond   - not used by this driver
#   $where_cond - condition selecting the updated rows
#   $tmp        - not used by this driver
#
# Returns 1. If pre_delete_record reports failure, the method returns early:
# an empty list when $self->{_debug} is set, 1 otherwise. An exception thrown
# by pre_delete_record is swallowed and the rows are re-added anyway.
#
    my ( $self, $set_cond, $where_cond, $tmp ) = @_;

    # Delete the previous documents. A "return" inside an eval block only
    # leaves the eval, so the outcome is captured here and acted on outside.
    my $deleted = eval { $self->pre_delete_record($where_cond) };
    if (!$@ && !$deleted) {
        return $self->{_debug} ? () : 1;
    }

    # Index the rows again from their current contents.
    my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
    my ($pk) = $tbl->pk();
    my %weights = $tbl->_weight_cols();
    my @weight_list = keys %weights;
    my $q = $tbl->select($where_cond, [$pk, @weight_list]);
    while (my $href = $q->fetchrow_hashref) {
        $self->post_add_record($href);
    }
    return 1;
}
sub reindex_record {
# -------------------------------------------------------
# Reindexes a record: removes whatever is currently indexed for it, then
# indexes it again. The value of the index_record() call is returned.
#
# NOTE(review): delete_record() and index_record() are not defined in this
# file - presumably inherited from the base indexer; confirm.
#
    my $self = shift;
    my $rec  = shift;
    $self->delete_record($rec);
    return $self->index_record($rec);
}
1;