First pass at adding key files
This commit is contained in:
@ -0,0 +1,82 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::Base::Common
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Common.pm,v 1.8 2004/10/13 21:45:02 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Base classes upon which all search drivers are based
|
||||
#
|
||||
package GT::SQL::Search::Base::Common;
|
||||
|
||||
use strict;
|
||||
use Exporter;
|
||||
use vars qw/ @ISA @EXPORT $STOPWORDS /;
|
||||
|
||||
@ISA = qw( Exporter );
|
||||
@EXPORT = qw( &_tokenize &_check_word $STOPWORDS );
|
||||
|
||||
$STOPWORDS = { map { $_ => 1 } qw/
|
||||
of about or all several also she among since an some and such are than
|
||||
as that at the be them because there been these between they both this
|
||||
but those by to do toward during towards each upon either for from was
|
||||
had were has what have when he where her which his while however with if
|
||||
within in would into you your is it its many more most must on re it
|
||||
test not above add am pm jan january feb february mar march apr april
|
||||
may jun june jul july aug august sep sept september oct october nov
|
||||
november dec december find & > < we http com www inc other
|
||||
including
|
||||
/ };
|
||||
|
||||
sub _tokenize {
|
||||
#--------------------------------------------------------------------------------
|
||||
# takes a strings and chops it up into little bits
|
||||
my $self = shift;
|
||||
my $text = shift;
|
||||
my ( @words, $i, %rejected, $word, $code );
|
||||
|
||||
# split on any non-word (includes accents) characters
|
||||
@words = split /[^\w\x80-\xFF\-]+/, lc $text;
|
||||
$self->debug_dumper( "Words: ", \@words ) if ($self->{_debug});
|
||||
|
||||
# drop all words that are too small, etc.
|
||||
$i = 0;
|
||||
while ( $i <= $#words ) {
|
||||
$word = $words[ $i ];
|
||||
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
|
||||
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
|
||||
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
|
||||
splice( @words, $i, 1 );
|
||||
$rejected{$word} = $self->{'rejections'}->{$code};
|
||||
}
|
||||
else {
|
||||
$i++; # Words ok.
|
||||
}
|
||||
}
|
||||
$self->debug_dumper( "Accepted Words: ", \@words ) if ($self->{_debug});
|
||||
$self->debug_dumper( "Rejected Words: ", \%rejected ) if ($self->{_debug});
|
||||
|
||||
return ( \@words, \%rejected );
|
||||
}
|
||||
|
||||
sub _check_word {
|
||||
#--------------------------------------------------------------------------------
|
||||
# Returns an error code if it is an invalid word, otherwise returns nothing.
|
||||
#
|
||||
my $self = shift;
|
||||
my $word = shift;
|
||||
my $code;
|
||||
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
|
||||
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
|
||||
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
|
||||
return $code;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,78 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::Base::Indexer
|
||||
# Author: Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Indexer.pm,v 1.4 2004/01/13 01:35:19 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
# Description:
|
||||
#
|
||||
#
|
||||
|
||||
package GT::SQL::Search::Base::Indexer;
|
||||
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
|
||||
use GT::Base;
|
||||
use GT::SQL::Search::Base::Common;
|
||||
|
||||
#--------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.4 $ =~ /(\d+)\.(\d+)/;
|
||||
@ISA = qw/GT::Base GT::SQL::Search::Base::Common/;
|
||||
$ATTRIBS = {
|
||||
driver => undef,
|
||||
stopwords => $STOPWORDS,
|
||||
rejections => {
|
||||
STOPWORD => "is a stopword",
|
||||
TOOSMALL => "is too small a word",
|
||||
TOOBIG => "is too big a word"
|
||||
},
|
||||
table => '',
|
||||
init => 0,
|
||||
debug => 0,
|
||||
min_word_size => 3,
|
||||
max_word_size => 50,
|
||||
};
|
||||
|
||||
sub drop_search_driver { 1 }
|
||||
sub add_search_driver { 1 }
|
||||
|
||||
# found in GT::SQL::Creator
|
||||
sub pre_create_table { 1 }
|
||||
sub post_create_table { 1 }
|
||||
|
||||
# GT::SQL::Editor
|
||||
sub pre_add_column { 1 }
|
||||
sub post_add_column { 1 }
|
||||
|
||||
sub pre_delete_column { 1 }
|
||||
sub post_delete_column { 1 }
|
||||
|
||||
sub pre_drop_table { 1 }
|
||||
sub post_drop_table { 1 }
|
||||
|
||||
# GT::SQL::Table
|
||||
sub pre_add_record { 1 }
|
||||
sub post_add_record { 1 }
|
||||
|
||||
sub pre_update_record { 1 }
|
||||
sub post_update_record { 1 }
|
||||
|
||||
sub pre_delete_record { 1 }
|
||||
sub post_delete_record { 1 }
|
||||
|
||||
sub pre_delete_all_records { 1 }
|
||||
sub post_delete_all_records { 1 }
|
||||
|
||||
sub driver_ok { 1 }
|
||||
|
||||
sub reindex_all { 1 }
|
||||
|
||||
1;
|
@ -0,0 +1,287 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::STH
|
||||
# Author: Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: STH.pm,v 1.25 2004/08/28 03:53:46 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
package GT::SQL::Search::STH;
|
||||
#--------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE /;
|
||||
use GT::Base;
|
||||
|
||||
@ISA = ('GT::Base');
|
||||
$ATTRIBS = {
|
||||
'_debug' => 0,
|
||||
'sth' => undef,
|
||||
'results' => {},
|
||||
'db' => undef,
|
||||
'table' => undef,
|
||||
'index' => 0,
|
||||
'order' => [],
|
||||
'sb' => 'score',
|
||||
'so' => '',
|
||||
'score_col' => 'SCORE',
|
||||
'score_sort'=> 0,
|
||||
'nh' => 0,
|
||||
'mh' => 0
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
$ERRORS = {
|
||||
BADSB => 'Invalid character found in so: "%s"',
|
||||
};
|
||||
|
||||
sub init {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
|
||||
# setup the options
|
||||
$self->set(@_);
|
||||
|
||||
# correct a few of the values
|
||||
--$self->{nh} if $self->{nh};
|
||||
|
||||
my $sth;
|
||||
my $results = $self->{results};
|
||||
$self->{rows} = scalar( $results ? keys %{$results} : 0 );
|
||||
|
||||
# if we have asked to have sorting by another column (non score), create the part of the query that handles taht
|
||||
$self->debug( "Setting up sorting for GT::SQL::Search::STH" ) if ($self->{_debug});
|
||||
my $sb;
|
||||
|
||||
# clean up the sort by columns.
|
||||
unless ($self->{'score_sort'}) {
|
||||
$sb = GT::SQL::Search::Base::Search->clean_sb($self->{sb}, $self->{so});
|
||||
}
|
||||
|
||||
# setup the max hits and the offsets
|
||||
$self->{index} = $self->{nh} * $self->{mh} || 0;
|
||||
$self->{max_index} = $self->{index} + $self->{mh} - 1; # index + mh is the first record which should not be returned.
|
||||
|
||||
if ( $self->{max_index} > $self->{rows} ) {
|
||||
$self->{max_index} = $self->{rows};
|
||||
$self->{rows} = $self->{rows} - $self->{index};
|
||||
$self->{rows} < 0 ? $self->{rows} = 0 : 0;
|
||||
}
|
||||
|
||||
else {
|
||||
$self->{rows} = $self->{mh};
|
||||
}
|
||||
|
||||
# if we are sorting by another column, handle that
|
||||
if ( $sb and (keys %{$self->{results}})) {
|
||||
my ( $table, $pk ) = $self->_table_info();
|
||||
my ( $query, $where, $st, $limit );
|
||||
|
||||
$where = qq!$pk in(! . join( ",", keys %{$self->{results}}) . ')';
|
||||
$self->{mh} and $limit = qq!LIMIT $self->{index}, $self->{rows}!;
|
||||
$query = qq!
|
||||
SELECT $pk
|
||||
FROM $table
|
||||
WHERE $where
|
||||
$sb
|
||||
$limit
|
||||
!;
|
||||
$self->debug( "Row fetch query: $query" ) if ($self->{_debug});
|
||||
$sth = $self->{table}->{driver}->prepare( $query );
|
||||
$sth->execute();
|
||||
|
||||
# fix the counts
|
||||
$self->{index} = 0;
|
||||
$self->{max_hits} = $self->{rows};
|
||||
|
||||
# now return them
|
||||
my $order = $sth->fetchall_arrayref();
|
||||
$sth->finish();
|
||||
|
||||
$self->{'order'} = [ map { $_->[0] } @{$order} ];
|
||||
}
|
||||
else {
|
||||
$self->{'order'} = [ sort {
|
||||
( $results->{$b} || return 0 ) <=> ( $results->{$a} || 0 )
|
||||
} keys %{$results} ];
|
||||
$self->debug_dumper( "Results will be presented in the following order: ", $self->{'order'} ) if ($self->{_debug});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
sub cache_results {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
|
||||
my $results = $self->{'results'};
|
||||
my ($sth, @records, $i, %horder, @order, $in_list);
|
||||
my $table = $self->{table};
|
||||
my $tname = $table->name();
|
||||
my ($pk) = $self->{table}->pk;
|
||||
|
||||
use GT::SQL::Condition;
|
||||
|
||||
# we know what we're doing here so shut off warns (complains about uninit'd values in range
|
||||
# if thee aren't enough elements in the order array)
|
||||
my $w = $^W; $^W = 0;
|
||||
@order = grep $_, @{$self->{'order'} || []}[$self->{index}..$self->{max_index}] or return [];
|
||||
$^W = $w;
|
||||
|
||||
$i = 0; %horder = ( map { ( $_ => $i++) } @order );
|
||||
$in_list = join ( ",", @order );
|
||||
my $query = qq|
|
||||
SELECT *
|
||||
FROM
|
||||
$tname
|
||||
WHERE
|
||||
$pk IN($in_list)
|
||||
|;
|
||||
|
||||
# the following is left commented out as...
|
||||
# if $tbl->select is used $table->hits() will not
|
||||
# return an accurate count of the number of all the hits. instead, will return
|
||||
# a value up to mh. $tbl->hits() is important because the value is used
|
||||
# in toolbar calculations
|
||||
#
|
||||
# $sth = $table->select( GT::SQL::Condition->new( $pk, 'IN', \"($in_list)" ) );
|
||||
$sth = $table->do_query( $query );
|
||||
|
||||
while ( my $href = $sth->fetchrow_hashref() ) {
|
||||
$records[$horder{$href->{$pk}}] = \%$href
|
||||
}
|
||||
|
||||
return \@records;
|
||||
|
||||
}
|
||||
|
||||
sub fetchrow_array {
|
||||
#--------------------------------------------------------------------------------
|
||||
return @{ $_[0]->fetchrow_arrayref() || [] };
|
||||
}
|
||||
|
||||
sub fetchrow_arrayref {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $records = $self->{cache} ||= $self->cache_results;
|
||||
my $href = shift @$records or return;
|
||||
return $self->_hash_to_array($href);
|
||||
}
|
||||
|
||||
sub fetchrow_hashref {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
|
||||
my $results = $self->{'results'};
|
||||
my $records = $self->{cache} ||= $self->cache_results;
|
||||
my $table = $self->{table};
|
||||
my ($pk) = $self->{table}->pk;
|
||||
|
||||
my $href = shift @$records or return;
|
||||
|
||||
$href->{$self->{'score_col'}} = ( $self->{score} = $results->{$href->{$pk}} );
|
||||
|
||||
return $href;
|
||||
|
||||
}
|
||||
|
||||
sub fetchall_hashref {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my @results;
|
||||
while (my $res = $self->fetchrow_hashref) {
|
||||
push @results, $res;
|
||||
}
|
||||
return \@results;
|
||||
}
|
||||
|
||||
sub fetchall_list {
|
||||
#--------------------------------------------------------------------------------
|
||||
return { map { @$_ } @{shift->fetchall_arrayref} }
|
||||
}
|
||||
|
||||
sub fetchall_arrayref {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
|
||||
$self->{order} or return [];
|
||||
my $results = $self->{results};
|
||||
my ($pk) = $self->{table}->pk;
|
||||
my $scol = $self->{score_col};
|
||||
|
||||
|
||||
if (!$self->{allref_cache}) {
|
||||
$self->{allref_cache} ||= $self->cache_results;
|
||||
|
||||
for my $i ( 0 .. $#{$self->{allref_cache}} ) {
|
||||
my $element = $self->{allref_cache}->[$i];
|
||||
if ( $_[0] eq 'HASH' ) {
|
||||
$element->{$scol} = $results->{$element->{$pk}};
|
||||
}
|
||||
else {
|
||||
$element->{$scol} = $self->_hash_to_array( $element->{$scol} );
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
my $records = $self->{allref_cache};
|
||||
|
||||
return $records;
|
||||
}
|
||||
|
||||
sub score {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
return $self->{score};
|
||||
}
|
||||
|
||||
sub _hash_to_array {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $href = shift or return;
|
||||
|
||||
my $results = $self->{'results'};
|
||||
my $table = $self->{table};
|
||||
my $cols = $table->cols();
|
||||
my $ordercols = ( $self->{ordercols} ||= [ sort { $cols->{$a}->{pos} <=> $cols->{$b}->{pos} } keys %$cols ] );
|
||||
my ($pk) = $self->{table}->pk;
|
||||
my $aref = [ map { $href->{$_} } @$ordercols, ( $self->{score} = $results->{$href->{$pk}} ) ];
|
||||
|
||||
return $aref;
|
||||
}
|
||||
|
||||
sub rows {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
return $self->{rows};
|
||||
}
|
||||
|
||||
sub _table_info {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $table = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
|
||||
my ($pk) = $self->{table}->pk;
|
||||
return ( $table, $pk );
|
||||
}
|
||||
|
||||
sub DESTROY {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
$self->{'sth'} and $self->{'sth'}->finish();
|
||||
}
|
||||
|
||||
sub debug_dumper {
|
||||
#--------------------------------------------------------------------------------
|
||||
# calls debug but also dumps all the messages
|
||||
my $self = shift;
|
||||
my $message = shift;
|
||||
my $level = ref $_[0] ? 1 : shift;
|
||||
|
||||
if ( $self->{_debug} >= $level ) {
|
||||
require GT::Dumper;
|
||||
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )) if ($self->{_debug});
|
||||
}
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,572 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::Base
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.25 2004/08/28 03:53:46 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Base classes upon which all search drivers are based
|
||||
#
|
||||
|
||||
package GT::SQL::Search::Base::Search;
|
||||
|
||||
|
||||
use strict;
|
||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
|
||||
use GT::Base;
|
||||
use GT::SQL::Search::Base::Common;
|
||||
@ISA = qw( GT::Base GT::SQL::Search::Base::Common);
|
||||
|
||||
#--------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.25 $ =~ /(\d+)\.(\d+)/;
|
||||
@ISA = qw/ GT::Base /;
|
||||
|
||||
$ATTRIBS = {
|
||||
'stopwords' => $STOPWORDS,
|
||||
'mh' => 25,
|
||||
'nh' => 1,
|
||||
'ww' => undef,
|
||||
'ma' => undef,
|
||||
'bool' => undef,
|
||||
'substring' => 0,
|
||||
'query' => '',
|
||||
'sb' => 'score',
|
||||
'so' => '',
|
||||
'score_col' => 'SCORE',
|
||||
'score_sort'=> 0,
|
||||
'debug' => 0,
|
||||
'_debug' => 0,
|
||||
|
||||
# query related
|
||||
'db' => undef,
|
||||
'table' => undef,
|
||||
'filter' => undef,
|
||||
'callback' => undef,
|
||||
|
||||
# strict matching of indexed words, accents on words do count
|
||||
'sm' => 0,
|
||||
'min_word_size' => 3,
|
||||
'max_word_size' => 50,
|
||||
};
|
||||
|
||||
sub init {
|
||||
#--------------------------------------------------------------------------------
|
||||
# Initialises the Search object
|
||||
#
|
||||
my $self = shift;
|
||||
my $input = $self->common_param(@_);
|
||||
|
||||
$self->set($input);
|
||||
|
||||
# now handle filters...,
|
||||
my $tbl = $self->{table};
|
||||
my $cols = $tbl->cols();
|
||||
my %filters = map {
|
||||
(my $tmp = $_) =~ s/-[lg]t$//;
|
||||
exists $cols->{$tmp} ? ($_ => $input->{$_}) : ()
|
||||
} keys %{$input};
|
||||
|
||||
if ( keys %filters ) {
|
||||
$self->debug_dumper( "INIT: Creating Filters: ", \%filters ) if ( $self->{_debug} );
|
||||
$self->filter(\%filters);
|
||||
}
|
||||
|
||||
$self->{table}->connect;
|
||||
}
|
||||
|
||||
sub query {
|
||||
#--------------------------------------------------------------------------------
|
||||
# Returns a sth based on a query
|
||||
#
|
||||
# Options:
|
||||
# - paging
|
||||
# mh : max hits
|
||||
# nh : number hit (or page of hits)
|
||||
#
|
||||
# - searching
|
||||
# ww : whole word
|
||||
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
||||
# substring : search for substrings of words
|
||||
# bool : 'and' => and search, 'or' => or search, '' => regular query
|
||||
# query : the string of things to ask for
|
||||
#
|
||||
# - filtering
|
||||
# field_name : value # Find all rows with field_name = value
|
||||
# field_name : ">value" # Find all rows with field_name > value.
|
||||
# field_name : "<value" # Find all rows with field_name < value.
|
||||
# field_name-gt : value # Find all rows with field_name > value.
|
||||
# field_name-lt : value # Find all rows with field_name < value.
|
||||
#
|
||||
# Parameters:
|
||||
# ( $CGI ) : a single cgi object
|
||||
# ( $HASH ) : a hash of the parameters
|
||||
#
|
||||
my $self = shift;
|
||||
# find out what sort of a parameter we're dealing with
|
||||
my $input = $self->common_param(@_);
|
||||
|
||||
# add additional parameters if required
|
||||
foreach my $parameter ( keys %{$ATTRIBS} ) {
|
||||
if ( not exists $input->{$parameter} ) {
|
||||
$input->{$parameter} = $self->{$parameter};
|
||||
}
|
||||
}
|
||||
|
||||
# parse query...,
|
||||
$self->debug( "Search Query: $$input{query}", 1 ) if ($self->{_debug});
|
||||
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
|
||||
|
||||
$self->{'rejected_keywords'} = $rejected;
|
||||
|
||||
# setup the additional input parameters
|
||||
$query = $self->_preset_options( $query, $input );
|
||||
|
||||
$self->debug_dumper( "Set the pre-options: ", $query ) if ($self->{_debug});
|
||||
|
||||
# now sort into distinct buckets
|
||||
my $buckets = &_create_buckets( $query );
|
||||
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
|
||||
|
||||
return $self->_query($input, $buckets);
|
||||
}
|
||||
|
||||
sub _query {
|
||||
#--------------------------------------------------------------------------------
|
||||
my ( $self, $input, $buckets ) = @_;
|
||||
|
||||
# now handle the separate possibilities
|
||||
my $results = {};
|
||||
|
||||
# query can have phrases
|
||||
$results = $self->_phrase_query( $buckets->{phrases}, $results );
|
||||
$self->debug_dumper("Did phrase union query. Current result set and scores: ", $results ) if ($self->{_debug});
|
||||
|
||||
# query have keywords
|
||||
$results = $self->_union_query( $buckets->{keywords}, $results );
|
||||
$self->debug_dumper("Did keyword union query. Current result set and scores: ", $results ) if ($self->{_debug});
|
||||
|
||||
# query must have phrases
|
||||
$results = $self->_phrase_intersect_query( $buckets->{phrases_must}, $results );
|
||||
$self->debug_dumper("Did phrase intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
|
||||
|
||||
# query must have keywords
|
||||
$results = $self->_intersect_query( $buckets->{keywords_must}, $results );
|
||||
$self->debug_dumper("Did keyword intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
|
||||
|
||||
# query cannot have keywords
|
||||
$results = $self->_disjoin_query( $buckets->{keywords_cannot}, $results );
|
||||
$self->debug_dumper("Did keyword disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
|
||||
|
||||
# query cannot have phrases
|
||||
$results = $self->_phrase_disjoin_query( $buckets->{phrases_cannot}, $results);
|
||||
$self->debug_dumper("Did phrase disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
|
||||
|
||||
# now handle filters
|
||||
my $cols = $self->{'table'}->cols();
|
||||
my %filters = map {
|
||||
(my $tmp = $_) =~ s/-[lg]t$//;
|
||||
$cols->{$tmp} ? ($_ => $input->{$_}) : ()
|
||||
} keys %{$input};
|
||||
|
||||
if (keys %filters) {
|
||||
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
|
||||
$results = $self->filter(\%filters, $results);
|
||||
}
|
||||
elsif ($self->{filter}) {
|
||||
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
|
||||
$results = $self->_filter_query( $self->{filter}, $results );
|
||||
}
|
||||
else {
|
||||
$self->debug( "No filters being used.") if ($self->{_debug});
|
||||
}
|
||||
|
||||
# now this query should probably clear the filters once it's been used, so i'll dothat here
|
||||
$self->{filter} = undef;
|
||||
|
||||
# now run through a callback function if needed.
|
||||
if ($self->{callback}) {
|
||||
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
|
||||
return $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
|
||||
}
|
||||
$self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
$results = $self->{callback}->($self, $results);
|
||||
$self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
}
|
||||
|
||||
# so how many hits did we get?
|
||||
$self->{table}->{last_hits} = ( $self->{rows} = scalar($results ? keys %{$results} : ()) );
|
||||
|
||||
# and now create a search sth object to handle all this
|
||||
return $self->sth( $results );
|
||||
}
|
||||
|
||||
sub sth {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $results = shift;
|
||||
|
||||
require GT::SQL::Search::Base::STH;
|
||||
my $sth = GT::SQL::Search::STH->new(
|
||||
'results' => $results,
|
||||
'db' => $self->{table}->{driver},
|
||||
# pass the following attributes down to the STH handler
|
||||
map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
|
||||
);
|
||||
|
||||
return $sth;
|
||||
}
|
||||
|
||||
sub rows {
|
||||
#--------------------------------------------------------------------------------
|
||||
# after a query is run, returns the number of rows
|
||||
my $self = shift;
|
||||
return $self->{rows} || 0;
|
||||
}
|
||||
|
||||
sub _add_filters {
|
||||
#--------------------------------------------------------------------------------
|
||||
# creates the filter object
|
||||
my $self = shift;
|
||||
my $filter;
|
||||
|
||||
# find out how we're calling the parameters
|
||||
if ( ref $_[0] eq 'GT::SQL::Condition' ) {
|
||||
$filter = shift;
|
||||
}
|
||||
elsif ( ref $_[0] eq 'HASH' ) {
|
||||
|
||||
|
||||
# setup the query condition using the build_query condition method
|
||||
# build the condition object
|
||||
my %opts = %{ shift() || {} };
|
||||
delete $opts{query};
|
||||
|
||||
$filter = $self->{table}->build_query_cond( \%opts, $self->{table}{schema}{cols} );
|
||||
|
||||
}
|
||||
else {
|
||||
return $self->error ('BADARGS', 'FATAL', "Invalid argument: $_[0] passed to _add_filter");
|
||||
}
|
||||
|
||||
# Use ref, as someone can pass in filter => 1 and mess things up.
|
||||
|
||||
ref $self->{filter} ? $self->{filter}->add ($filter) : ($self->{filter} = $filter);
|
||||
$self->debug_dumper( "Filters: ", $self->{filter} ) if ($self->{_debug});
|
||||
|
||||
return $self->{filter};
|
||||
|
||||
}
|
||||
|
||||
sub _preset_options {
|
||||
#--------------------------------------------------------------------------------
|
||||
# sets up word parameters
|
||||
my $self = shift;
|
||||
my $query = shift or return;
|
||||
my $input = shift or return $query;
|
||||
|
||||
# whole word searching
|
||||
if ( defined $input->{'ww'} or defined $self->{'ww'}) {
|
||||
if ( defined $input->{'ww'} ? $input->{'ww'} : $self->{'ww'} ) {
|
||||
for ( keys %{$query} ) { $query->{$_}->{mode} = 'substring'; }
|
||||
}
|
||||
}
|
||||
|
||||
# substring searching
|
||||
if ( defined $input->{'substring'} or defined $self->{'substring'}) {
|
||||
if ( defined $input->{'substring'} ? $input->{'substring'} : $self->{'substring'} ) {
|
||||
for ( keys %{$query} ) { $query->{$_}->{'substring'} = 1; }
|
||||
}
|
||||
}
|
||||
|
||||
if ( defined $input->{'ma'} or defined $self->{'ma'} ) {
|
||||
# each keyword must be included
|
||||
if ( defined $input->{'ma'} ? $input->{'ma'} : $self->{'ma'} ) {
|
||||
for ( keys %{$query} ) {
|
||||
next if $query->{$_}->{mode} eq 'cannot';
|
||||
$query->{$_}->{mode} = 'must';
|
||||
}
|
||||
}
|
||||
# each word can be included but is not necessary
|
||||
else {
|
||||
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
|
||||
}
|
||||
}
|
||||
|
||||
# some more and or searches, only if user hasn't put +word -word
|
||||
if ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'and' ) {
|
||||
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
|
||||
for ( keys %{$query} ) {
|
||||
next if $query->{$_}->{mode} eq 'cannot';
|
||||
$query->{$_}->{mode} = 'must';
|
||||
}
|
||||
}
|
||||
}
|
||||
elsif ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'or' ) {
|
||||
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
|
||||
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
|
||||
}
|
||||
}
|
||||
|
||||
return $query;
|
||||
}
|
||||
|
||||
sub _phrase_query { $_[1] }
|
||||
sub _union_query { $_[1] }
|
||||
sub _phrase_intersect_query { $_[1] }
|
||||
sub _intersect_query { $_[1] }
|
||||
sub _disjoin_query { $_[1] }
|
||||
sub _phrase_disjoin_query { $_[1] }
|
||||
|
||||
sub filter {
|
||||
#--------------------------------------------------------------------------------
|
||||
# adds a filter
|
||||
#
|
||||
my $self = shift;
|
||||
|
||||
# add filters..,
|
||||
my $filters = $self->_add_filters( shift );
|
||||
my $results = shift;
|
||||
|
||||
# see if we need to execute a search, otherwise just return the current filterset
|
||||
defined $results or return $results;
|
||||
|
||||
# start doing the filter stuff
|
||||
return $self->_filter_query( $filters, $results );
|
||||
}
|
||||
|
||||
sub _parse_query_string {
|
||||
#------------------------------------------------------------
|
||||
# from Mastering Regular Expressions altered a fair bit
|
||||
# takes a space delimited string and breaks it up.
|
||||
#
|
||||
my $self = shift;
|
||||
my $text = shift;
|
||||
|
||||
my %words = ();
|
||||
my %reject = ();
|
||||
my %mode = (
|
||||
'+' => 'must',
|
||||
'-' => 'cannot',
|
||||
'<' => 'greater',
|
||||
'>' => 'less'
|
||||
);
|
||||
|
||||
# work on the individual elements
|
||||
my @new = ();
|
||||
while ( $text =~ m{
|
||||
# the first part groups the phrase inside the quotes.
|
||||
# see explanation of this pattern in MRE
|
||||
([+-]?"[^\"\\]*(?:\\.[^\"\\]*)*)" ?
|
||||
| (\+?[\w\x80-\xFF\-\*]+),?
|
||||
| ' '
|
||||
}gx ) {
|
||||
|
||||
my $match = lc $+;
|
||||
|
||||
# strip out buffering spaces
|
||||
$match =~ s/^\s+//; $match =~ s/\s+$//;
|
||||
|
||||
# don't bother trying if there is nothing there
|
||||
next unless $match;
|
||||
|
||||
# find out the searching mode
|
||||
my ($mode, $substring, $phrase);
|
||||
if (my $m = $mode{substr($match,0,1)}) {
|
||||
$match = substr($match,1);
|
||||
$mode = $m;
|
||||
}
|
||||
|
||||
# do we need to substring match?
|
||||
if ( substr( $match, -1, 1 ) eq "*" ) {
|
||||
$match = substr($match,0,length($match)-1);
|
||||
$substring = 1;
|
||||
}
|
||||
|
||||
# find out if we're dealing with a phrase
|
||||
if ( substr($match,0,1) eq '"' ) {
|
||||
$self->debug( "Dealing with a phrase: $match" ) if ($self->{_debug});
|
||||
|
||||
$match = substr($match,1);
|
||||
|
||||
# however, we want to make sure it's a phrase and not something else
|
||||
my ( $word_list, $rejected ) = $self->_tokenize( $match );
|
||||
$self->debug_dumper( "Phrase reduced to: ", $word_list ) if ($self->{_debug});
|
||||
$self->debug_dumper( "Phrase words rejected: ", $rejected ) if ($self->{_debug});
|
||||
my $word_count = @$word_list;
|
||||
|
||||
if ( $word_count > 1 ) { $phrase = $word_list } # ok, standard phrase
|
||||
elsif ($word_count == 1) { $match = $word_list->[0] } # just a standard word, don't worry about searching by phrase
|
||||
}
|
||||
|
||||
# make sure we can use this word
|
||||
if ( not $phrase and my $code = $self->_check_word( $match, $self->{stopwords} ) ) {
|
||||
$reject{ $match } = $code;
|
||||
next;
|
||||
}
|
||||
|
||||
# now, see if we should toss this word
|
||||
$words{$match} = {
|
||||
mode => $mode,
|
||||
phrase => $phrase,
|
||||
substring => $substring,
|
||||
keyword => not $phrase,
|
||||
};
|
||||
}
|
||||
|
||||
# words is a hashref of:
|
||||
# {
|
||||
# word => {
|
||||
# paramaters => 'values'
|
||||
# },
|
||||
# word1 => {
|
||||
# ...
|
||||
# },
|
||||
# ...
|
||||
# }
|
||||
#
|
||||
return( \%words, \%reject );
|
||||
|
||||
}
|
||||
|
||||
|
||||
sub _filter_query {
|
||||
#--------------------------------------------------------------------------------
|
||||
# get the results from the filter
|
||||
#
|
||||
my $self = shift;
|
||||
my $filters = shift;
|
||||
my $results = shift or return {};
|
||||
keys %{$results} or return $results;
|
||||
|
||||
my $table = $self->{table};
|
||||
my $tname = $table->name();
|
||||
|
||||
# setup the where clause
|
||||
my $where = $filters->sql() or return $results;
|
||||
my ($pk) = $table->pk;
|
||||
$where .= qq! AND $pk IN (! . join(',', keys %$results) . ')';
|
||||
|
||||
# now do the filter
|
||||
my $query = qq!
|
||||
SELECT $pk
|
||||
FROM
|
||||
$tname
|
||||
WHERE
|
||||
$where
|
||||
!;
|
||||
$self->debug( "Filter Query: $query" ) if ($self->{_debug});
|
||||
my $sth = $self->{table}->{driver}->prepare($query);
|
||||
$sth->execute();
|
||||
|
||||
# get all the results
|
||||
my $aref = $sth->fetchall_arrayref;
|
||||
return {
|
||||
map {
|
||||
$_->[0] => $results->{$_->[0]}
|
||||
} @$aref
|
||||
};
|
||||
}
|
||||
|
||||
sub _create_buckets {
|
||||
#------------------------------------------------------------
|
||||
# takes the output from _parse_query_string and creates a
|
||||
# bucket hash of all the different types of searching
|
||||
# possible
|
||||
my $query = shift or return;
|
||||
|
||||
my %buckets;
|
||||
|
||||
# put each word in the appropriate hash bucket
|
||||
foreach my $parameter ( keys %{$query} ) {
|
||||
|
||||
my $word_data = $query->{$parameter};
|
||||
|
||||
# the following is slower, however, done that way to be syntatically legible
|
||||
if ( $word_data->{'phrase'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'} =~ /(must|cannot)/ ))) {
|
||||
$buckets{"phrases_$1"}->{$parameter} = $word_data;
|
||||
}
|
||||
elsif ( $word_data->{'phrase'} ) {
|
||||
$buckets{'phrases'}->{$parameter} = $word_data;
|
||||
}
|
||||
elsif ( $word_data->{'keyword'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'}=~ /(must|cannot|greater|less)/) )) {
|
||||
$buckets{"keywords_$1"}->{$parameter} = $word_data;
|
||||
}
|
||||
else {
|
||||
$buckets{'keywords'}->{$parameter} = $word_data;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return \%buckets;
|
||||
}
|
||||
|
||||
sub alternate_driver_query {
|
||||
#--------------------------------------------------------------------------------
|
||||
my ( $self, $drivername, $input ) = @_;
|
||||
|
||||
$drivername = uc $drivername;
|
||||
require GT::SQL::Search;
|
||||
my $driver = GT::SQL::Search->load_search({ %$input, table => $self->{table}, driver => $drivername });
|
||||
my $sth = $driver->query( $input );
|
||||
foreach ( qw( rows rejected_keywords ) ) { $self->{$_} = $driver->{$_}; }
|
||||
return $sth;
|
||||
|
||||
}
|
||||
|
||||
sub clean_sb {
|
||||
# -------------------------------------------------------------------------------
|
||||
# Convert the sort by, sort order into an sql string.
|
||||
#
|
||||
my ($class, $sb, $so) = @_;
|
||||
my $output = '';
|
||||
|
||||
return $output unless ($sb);
|
||||
|
||||
# Remove score attribute, used only for internal indexes.
|
||||
$sb =~ s/^\s*score\b//;
|
||||
$sb =~ s/,?\s*\bscore\b//;
|
||||
|
||||
if ($sb and not ref $sb) {
|
||||
if ($sb =~ /^[\w\s,]+$/) {
|
||||
if ($sb =~ /\s(?:asc|desc)/i) {
|
||||
$output = 'ORDER BY ' . $sb;
|
||||
}
|
||||
else {
|
||||
$output = 'ORDER BY ' . $sb . ' ' . $so;
|
||||
}
|
||||
}
|
||||
else {
|
||||
$class->error('BADSB', 'WARN', $sb);
|
||||
}
|
||||
}
|
||||
elsif (ref $sb eq 'ARRAY') {
|
||||
foreach ( @$sb ) {
|
||||
/^[\w\s,]+$/ or $class->error( 'BADSB', 'WARN', $sb ), next;
|
||||
}
|
||||
$output = 'ORDER BY ' . join(',', @$sb);
|
||||
}
|
||||
return $output;
|
||||
}
|
||||
|
||||
sub debug_dumper {
|
||||
#--------------------------------------------------------------------------------
|
||||
# calls debug but also dumps all the messages
|
||||
my $self = shift;
|
||||
my $message = shift;
|
||||
my $level = ref $_[0] ? 1 : defined $_[0] ? shift : 0;
|
||||
|
||||
if ( $self->{_debug} >= $level ) {
|
||||
require GT::Dumper;
|
||||
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )) if ($self->{_debug});
|
||||
}
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,411 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::INTERNAL::Indexer
|
||||
# Author: Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Indexer.pm,v 1.11 2004/01/13 01:35:19 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
package GT::SQL::Search::INTERNAL::Indexer;
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG /;
|
||||
use GT::SQL::Search::Base::Indexer;
|
||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.11 $ =~ /(\d+)\.(\d+)/;
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::INTERNAL::Indexer->new(@_)
|
||||
}
|
||||
|
||||
sub drop_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $table = $self->{table}->name;
|
||||
my $rc1 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Word_List");
|
||||
my $rc2 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Score_List");
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub add_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $name = $self->{table}->name;
|
||||
|
||||
# first create the table that handles the words.
|
||||
my $creator = $self->{table}->creator ( $name . "_Word_List" );
|
||||
$creator->cols(
|
||||
Word_ID => {
|
||||
pos => 1,
|
||||
type => 'int',
|
||||
not_null => 1,
|
||||
unsigned => 1
|
||||
},
|
||||
Word => {
|
||||
pos => 2,
|
||||
type => 'varchar',
|
||||
not_null=> 1,
|
||||
size => '50'
|
||||
},
|
||||
Frequency => {
|
||||
pos => 3,
|
||||
type => 'int',
|
||||
not_null=> 1
|
||||
}
|
||||
);
|
||||
$creator->pk('Word_ID');
|
||||
$creator->ai('Word_ID');
|
||||
$creator->unique({ $name . "_wordndx" => ['Word'] });
|
||||
$creator->create('force') or return;
|
||||
|
||||
# now create the handler for scores
|
||||
$creator = $self->{table}->creator( $name . '_Score_List' );
|
||||
$creator->cols(
|
||||
Word_ID => {
|
||||
pos => 1,
|
||||
type => 'int',
|
||||
not_null => 1,
|
||||
unsigned => 1
|
||||
},
|
||||
Item_ID => {
|
||||
pos => 2,
|
||||
type => 'int',
|
||||
not_null => 1,
|
||||
unsigned => 1
|
||||
},
|
||||
Score => {
|
||||
pos => 3,
|
||||
type => 'int',
|
||||
not_null => 1
|
||||
},
|
||||
Word_Pos => {
|
||||
pos => 4,
|
||||
type => 'int',
|
||||
not_null => 1
|
||||
}
|
||||
);
|
||||
$creator->index({ 'wndx' => ['Word_ID', 'Item_ID', 'Score'], 'itndx' => ['Item_ID'] });
|
||||
$creator->create('force') or return;
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
sub post_create_table {
|
||||
# ------------------------------------------------------------------------------
|
||||
# creates the index tables..
|
||||
#
|
||||
return $_[0]->add_search_driver(@_);
|
||||
}
|
||||
|
||||
sub post_drop_table {
|
||||
# -------------------------------------------------------
|
||||
# Remove the index tables.
|
||||
#
|
||||
return $_[0]->drop_search_driver(@_);
|
||||
}
|
||||
|
||||
sub init_queries {
|
||||
# -------------------------------------------------------
|
||||
# Pre-load all our queries.
|
||||
#
|
||||
my $self = shift;
|
||||
my $queries = shift;
|
||||
|
||||
my $driver = $self->{table}->{driver} or return $self->error ('NODRIVER', 'FATAL');
|
||||
my $table_name = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
|
||||
my $wtable = $table_name . '_Word_List';
|
||||
my $seq = $wtable . '_seq';
|
||||
my $stable = $table_name . '_Score_List';
|
||||
|
||||
my %ai_queries = (
|
||||
ins_word_ORACLE => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES ($seq.NEXTVAL, ?, ?)",
|
||||
ins_word_PG => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES (NEXTVAL('$seq'), ?, ?)",
|
||||
ins_word => "INSERT INTO $wtable (Word, Frequency) VALUES (?, ?)"
|
||||
);
|
||||
my %queries = (
|
||||
upd_word => "UPDATE $wtable SET Frequency = ? WHERE Word_ID = ?",
|
||||
sel_word => "SELECT Word_ID,Word,Frequency FROM $wtable WHERE Word = ?",
|
||||
sel_freq => "SELECT Frequency FROM $wtable WHERE Word_ID = ?",
|
||||
del_word => "DELETE FROM $wtable WHERE Word_ID = ?",
|
||||
mod_word => "UPDATE $wtable SET Frequency = Frequency - ? WHERE Word_ID = ?",
|
||||
ins_scor => "INSERT INTO $stable (Word_ID, Item_ID, Score, Word_Pos) VALUES (?, ?, ?, ?)",
|
||||
item_cnt => "SELECT Word_ID, COUNT(*) FROM $stable WHERE Item_ID = ? GROUP BY Word_ID",
|
||||
scr_del => "DELETE FROM $stable WHERE Item_ID = ?",
|
||||
dump_word => "DELETE FROM $wtable",
|
||||
dump_scor => "DELETE FROM $stable"
|
||||
);
|
||||
my $type = uc $self->{table}->{connect}->{driver};
|
||||
$self->{ins_word} = $driver->prepare($ai_queries{"ins_word_$type"} || $ai_queries{"ins_word"});
|
||||
|
||||
# check to see if the table exist
|
||||
$self->{table}->new_table( $wtable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
|
||||
$self->{table}->new_table( $stable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
|
||||
|
||||
|
||||
if ($type eq 'MYSQL') {
|
||||
foreach my $query (keys %queries) {
|
||||
$self->{$query} = $driver->prepare_raw ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
|
||||
}
|
||||
}
|
||||
else {
|
||||
foreach my $query (keys %queries) {
|
||||
$self->{$query} = $driver->prepare ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub post_add_record {
|
||||
# -------------------------------------------------------
|
||||
# indexes a single record
|
||||
my ($self, $rec, $insert_sth ) = @_;
|
||||
|
||||
# Only continue if we have weights and a primary key.
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my %weights = $tbl->_weight_cols() or return;
|
||||
my ($pk) = $tbl->pk();
|
||||
my $item_id = ( $tbl->ai() and $insert_sth ) ? $insert_sth->insert_id() : $rec->{$pk};
|
||||
my $index = 0;
|
||||
|
||||
$self->{init} or $self->init_queries;
|
||||
|
||||
# Go through each column and index it.
|
||||
foreach my $column ( keys %weights ) {
|
||||
my ($word_list, $rejected) = $self->_tokenize( $rec->{$column} );
|
||||
$word_list or next;
|
||||
|
||||
# Build a hash of word => frequency.
|
||||
my %words;
|
||||
foreach my $word (@{$word_list}) {
|
||||
$words{$word}++;
|
||||
}
|
||||
|
||||
# Add the words in, or update frequency.
|
||||
my %word_ids = ();
|
||||
while (my ($word, $freq) = each %words) {
|
||||
$self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
my $word_r = $self->{sel_word}->fetchrow_arrayref; # Word_ID, Word, Frequency
|
||||
if ($word_r) {
|
||||
$word_r->[2] += $freq;
|
||||
$word_ids{$word} = $word_r->[0];
|
||||
$self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
else {
|
||||
$self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
$word_ids{$word} = $self->{ins_word}->insert_id();
|
||||
}
|
||||
}
|
||||
# now that we have the word ids, insert each of the word-points
|
||||
my $weight = $weights{$column};
|
||||
foreach my $word ( @{$word_list} ) {
|
||||
$self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
$index++;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub reindex_all {
|
||||
# -------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $table = shift;
|
||||
my $opts = shift;
|
||||
my $tick = $opts->{tick} || 0;
|
||||
my $max = $opts->{max} || 5000;
|
||||
|
||||
my %weights = $self->{table}->_weight_cols() or return;
|
||||
my @weight_list = keys %weights;
|
||||
my @weight_arr = map { $weights{$_} } @weight_list;
|
||||
my ($pk) = $self->{table}->pk();
|
||||
my $index = 0;
|
||||
my $word_id = 1;
|
||||
$self->{init} or $self->init_queries;
|
||||
|
||||
# first nuke the current index
|
||||
$self->dump_index();
|
||||
|
||||
# Go through the table and index each field.
|
||||
my $iterations = 1;
|
||||
my $count = 0;
|
||||
|
||||
while (1) {
|
||||
if ($max) {
|
||||
my $offset = ($iterations-1) * $max;
|
||||
$table->select_options ( "LIMIT $offset,$max");
|
||||
}
|
||||
my $cond = $opts->{cond} || {};
|
||||
my $sth = $table->select($cond, [ $pk, @weight_list] );
|
||||
my $done = 1;
|
||||
|
||||
while ( my $arrayref = $sth->fetchrow_arrayref() ) {
|
||||
# the primary key value
|
||||
my $i = 0;
|
||||
my $item_id = $arrayref->[($i++)];
|
||||
$index = 0;
|
||||
$done = 0;
|
||||
|
||||
# start going through the record data
|
||||
foreach my $weight ( @weight_arr ) {
|
||||
my ($word_list, $junk) = $self->_tokenize( $arrayref->[$i++] );
|
||||
$word_list or next;
|
||||
|
||||
# Build a hash of word => frequency.
|
||||
my %words;
|
||||
foreach my $word (@{$word_list}) {
|
||||
$words{$word}++;
|
||||
}
|
||||
|
||||
# Add the words in, or update frequency.
|
||||
my %word_ids = ();
|
||||
while (my ($word, $freq) = each %words) {
|
||||
$self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
my $word_r = $self->{sel_word}->fetchrow_arrayref; # WordID,Word,Freq
|
||||
if ($word_r) {
|
||||
$word_r->[2] += $freq;
|
||||
$word_ids{$word} = $word_r->[0];
|
||||
$self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
else {
|
||||
$self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
$word_ids{$word} = $self->{ins_word}->insert_id();
|
||||
}
|
||||
}
|
||||
# now that we have the word ids, insert each of the word-points
|
||||
foreach my $word ( @{$word_list} ) {
|
||||
$self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
$index++;
|
||||
}
|
||||
if ($tick) {
|
||||
$count++;
|
||||
$count % $tick or (print "$count ");
|
||||
$count % ($tick*10) or (print "\n");
|
||||
}
|
||||
}
|
||||
return if ($done);
|
||||
$iterations++;
|
||||
return if (! $max);
|
||||
}
|
||||
}
|
||||
|
||||
sub pre_delete_record {
|
||||
# -------------------------------------------------------
|
||||
# Delete a records index values.
|
||||
#
|
||||
my $self = shift;
|
||||
my $where = shift;
|
||||
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my %weights = $tbl->_weight_cols() or return;
|
||||
my ($pk) = $tbl->pk();
|
||||
my $q = $tbl->select( $where, [ $pk ] );
|
||||
|
||||
while ( my $aref = $q->fetchrow_arrayref() ) {
|
||||
my $item_id = $aref->[0] or next;
|
||||
my @weight_list = keys %weights;
|
||||
my $index = 0;
|
||||
$self->{init} or $self->init_queries;
|
||||
|
||||
# Get a frequency count for each word
|
||||
$self->{item_cnt}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
|
||||
# Now go through and either decrement the freq, or remove the entry.
|
||||
while ( my ($word_id, $frequency) = $self->{item_cnt}->fetchrow_array() ) {
|
||||
$self->{sel_freq}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
$self->debug( "Deleting frequencies for $word_id. decreasing by $frequency" ) if ($self->{_debug});
|
||||
if (my $freq = $self->{sel_freq}->fetchrow_arrayref) {
|
||||
if ($freq->[0] == $frequency) {
|
||||
$self->{del_word}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
else {
|
||||
$self->{mod_word}->execute($frequency, $word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
}
|
||||
}
|
||||
# Remove the listings from the scores table.
|
||||
$self->{scr_del}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub post_update_record {
|
||||
# -------------------------------------------------------
|
||||
my ( $self, $set_cond, $where_cond, $tmp ) = @_;
|
||||
|
||||
# delete the previous record
|
||||
$self->pre_delete_record( $where_cond ) or return;
|
||||
#
|
||||
# the new record
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my $q = $tbl->select( $where_cond );
|
||||
while ( my $href = $q->fetchrow_hashref() ) {
|
||||
$self->post_add_record( $href );
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
sub reindex_record {
|
||||
# -------------------------------------------------------
|
||||
# reindexes a record. basically deletes all associated records from current db abnd does an index.
|
||||
# it's safe to use this
|
||||
my $self = shift;
|
||||
my $rec = shift;
|
||||
|
||||
$self->delete_record($rec);
|
||||
$self->index_record($rec);
|
||||
}
|
||||
|
||||
sub dump_index {
|
||||
# -------------------------------------------------------
|
||||
my $self = shift;
|
||||
$self->{init} or $self->init_queries;
|
||||
|
||||
$self->{dump_word}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
$self->{dump_scor}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
|
||||
}
|
||||
|
||||
|
||||
sub debug_dumper {
|
||||
# ------------------------------------------------------------------------------
|
||||
# calls debug but also dumps all the messages
|
||||
my $self = shift;
|
||||
my $message = shift;
|
||||
my $level = ref $_[0] ? 1 : shift;
|
||||
|
||||
if ( $self->{_debug} >= $level ) {
|
||||
require GT::Dumper;
|
||||
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ ));
|
||||
}
|
||||
}
|
||||
|
||||
sub DESTROY {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Calls finish on init queries.
|
||||
#
|
||||
my $self = shift;
|
||||
return unless ($self->{init});
|
||||
$self->{upd_word}->finish;
|
||||
# $self->{ins_word}->finish; will get finished automatically
|
||||
$self->{sel_word}->finish;
|
||||
$self->{sel_freq}->finish;
|
||||
$self->{del_word}->finish;
|
||||
$self->{mod_word}->finish;
|
||||
$self->{ins_scor}->finish;
|
||||
$self->{item_cnt}->finish;
|
||||
$self->{scr_del}->finish;
|
||||
$self->{dump_word}->finish;
|
||||
$self->{dump_scor}->finish;
|
||||
$self->{init} = 0;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,604 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Indexer
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.18 2004/08/28 03:53:47 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to make changes to tables and create tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::INTERNAL::Search;
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $VERSION $DEBUG $ATTRIBS /;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/;
|
||||
$ATTRIBS = {
|
||||
# the max number of links that can be handled by UNION before it should simply
|
||||
# shunt the searching pipe to NONINDEXED system
|
||||
'union_shunt_threshold' => '5000',
|
||||
'phrase_shunt_threshold' => '1000',
|
||||
};
|
||||
|
||||
|
||||
################################################################################
|
||||
# Internal functions
|
||||
################################################################################
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::INTERNAL::Search->new(@_)
|
||||
}
|
||||
|
||||
sub _query {
|
||||
# ------------------------------------------------------------------------------
|
||||
# this just checks to ensure that the words are not all search keywords
|
||||
#
|
||||
my ( $self, $input, $buckets ) = @_;
|
||||
|
||||
# calculate wordids and frequencies
|
||||
foreach ( keys %$buckets ) {
|
||||
$buckets->{$_} = $self->get_wordids( $buckets->{$_}, ( /phrase/ ? "phrases" : "keywords" ) );
|
||||
}
|
||||
|
||||
# the following is a bit tricky and will be replaced however, if the number
|
||||
# of results from a union is more than the maximum shunt value, it will
|
||||
# simply do a nonindexed query
|
||||
if ( $buckets->{keywords} ) {
|
||||
my $rec = _count_frequencies( $buckets->{keywords} );
|
||||
my $count = 0;
|
||||
foreach ( values %$rec ) { $count += $_; }
|
||||
if ($count > $self->{union_shunt_threshold}) {
|
||||
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
|
||||
return $self->alternate_driver_query( 'NONINDEXED', $input );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Now test the phrases. Just due to how the phrase searching works, the queries
|
||||
# can grow in size extremely rapidly, and slowdown the search. So the limit for
|
||||
# phrase searching is separate as it requires a different cutoff value than
|
||||
# the keyword search which is usually much lower!
|
||||
if ($buckets->{phrases}) {
|
||||
foreach my $phrase ( keys %{$buckets->{phrases} || {} } ) {
|
||||
my $rec = _count_frequencies( $buckets->{phrases}->{$phrase}->{word_info} );
|
||||
my ( $count ) = sort values %$rec; # Get smallest frequency.
|
||||
if ( $count > $self->{phrase_shunt_threshold} ) {
|
||||
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
|
||||
return $self->alternate_driver_query( 'NONINDEXED', $input );
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($buckets->{phrases_must}) {
|
||||
foreach my $phrase ( keys %{$buckets->{phrases_must} || {} } ) {
|
||||
my $rec = _count_frequencies( $buckets->{phrases_must}->{$phrase}->{word_info} );
|
||||
my ( $count ) = sort values %$rec; # Get smallest frequency.
|
||||
if ( $count > $self->{phrase_shunt_threshold} ) {
|
||||
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
|
||||
return $self->alternate_driver_query( 'NONINDEXED', $input );
|
||||
}
|
||||
}
|
||||
}
|
||||
return $self->SUPER::_query( $input, $buckets );
|
||||
}
|
||||
|
||||
sub _count_frequencies {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $word_info = shift;
|
||||
my $rec = {};
|
||||
foreach my $word ( keys %$word_info ) {
|
||||
my $freq = 0;
|
||||
foreach ( values %{$word_info->{$word}->{word_info}} ) {
|
||||
$freq += $_;
|
||||
}
|
||||
$rec->{$word} = $freq;
|
||||
}
|
||||
|
||||
return $rec;
|
||||
}
|
||||
|
||||
sub _table_names {
|
||||
# ------------------------------------------------------------------------------
|
||||
# return the table names
|
||||
#
|
||||
my $self = shift;
|
||||
my $table = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
|
||||
my $wtable = $table . '_Word_List';
|
||||
my $stable = $table . '_Score_List';
|
||||
|
||||
return ( $table, $wtable, $stable);
|
||||
}
|
||||
|
||||
sub _word_infos {
|
||||
# ------------------------------------------------------------------------------
|
||||
# get the word ids and frequencies
|
||||
#
|
||||
my $self = shift;
|
||||
my $word_infos = shift;
|
||||
|
||||
my $rec = {};
|
||||
|
||||
foreach my $word ( keys %$word_infos ) {
|
||||
my $wi = $word_infos->{$word}->{word_info};
|
||||
$rec->{$word} = [ map { [ $_, $wi->{$_} ] } keys %$wi ];
|
||||
}
|
||||
|
||||
return $rec;
|
||||
|
||||
}
|
||||
|
||||
sub _union_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Takes a list of words and gets all words that match
|
||||
# returns { itemid -> score } of hits that match
|
||||
#
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $results = shift || {};
|
||||
my ( $query, $where, $db, $word_infos );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
|
||||
$word_infos = $self->_word_infos( $words ) or return $results;
|
||||
|
||||
return $results unless (keys %{$word_infos});
|
||||
|
||||
$self->debug_dumper( "Getting words: ", $words) if ($self->{_debug});
|
||||
|
||||
# build the where clause
|
||||
my @word_ids;
|
||||
foreach my $word_synonym_list ( values %$word_infos ) {
|
||||
next unless ( $word_synonym_list );
|
||||
foreach my $word_id ( @{$word_synonym_list }) {
|
||||
next unless ( ref $word_id eq 'ARRAY' ); # ensure it's a reference
|
||||
push @word_ids, $word_id->[0]; # we need to shed the word quantities
|
||||
}
|
||||
}
|
||||
|
||||
return $results unless ( @word_ids );
|
||||
$where = 'Word_ID IN(' . join(",", @word_ids) . ")";
|
||||
|
||||
# build the query
|
||||
$query = qq!
|
||||
SELECT Item_ID, SUM(Score)
|
||||
FROM $stable
|
||||
WHERE
|
||||
$where
|
||||
GROUP BY Item_ID
|
||||
!;
|
||||
|
||||
$self->debug( "Union Query: $query" ) if ($self->{_debug});
|
||||
|
||||
# prepare the query
|
||||
my $sth = $db->prepare( $query ) or return;
|
||||
$sth->execute() or return;
|
||||
|
||||
# get the results
|
||||
my %word_infos = $sth->fetchall_list;
|
||||
|
||||
# merge the current result set into found
|
||||
foreach my $item ( keys %{$results} ) {
|
||||
$word_infos{$item} += $results->{$item};
|
||||
};
|
||||
|
||||
return \%word_infos;
|
||||
}
|
||||
|
||||
sub _intersect_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Takes a list of words and gets all words that match all the keywords
|
||||
# returns { itemid -> score } of hits that match
|
||||
#
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $results = shift || {};
|
||||
|
||||
$words or return $results;
|
||||
keys %{$words} or return $results;
|
||||
|
||||
my ( $query, $where, $db, $word_infos, $word_hits );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
# have we left any of our words out?
|
||||
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
|
||||
$word_infos = $self->_word_infos( $words ) or return {};
|
||||
if ( keys %{$word_infos} < keys %{$words} ) {
|
||||
return {};
|
||||
}
|
||||
|
||||
$self->debug_dumper( "Keyword Intersect words: ", $word_infos ) if ($self->{_debug});
|
||||
|
||||
# take the words and get a hash of the word scores
|
||||
foreach my $word ( keys %{$word_infos} ) {
|
||||
|
||||
my $total_freq = 0;
|
||||
foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
|
||||
$total_freq += $word_synonyms->[1];
|
||||
}
|
||||
|
||||
$word_hits->{$word} = $total_freq or return;
|
||||
|
||||
}
|
||||
|
||||
# so now, sort out the words from lowest frequency to highest frequency
|
||||
my @search_order = sort { $word_hits->{$a} <=> $word_hits->{$b} } keys %{$word_hits};
|
||||
|
||||
$self->debug_dumper( "Searching words in this order: ", \@search_order) if ($self->{_debug});
|
||||
|
||||
# find out how we're going to handle the searching, if the first elements
|
||||
|
||||
################################################################################
|
||||
### The following part is for smaller intersect subsets
|
||||
################################################################################
|
||||
my $intersect = $results;
|
||||
foreach my $word ( @search_order ) {
|
||||
|
||||
# setup the where clause to get all the words associated
|
||||
my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
|
||||
|
||||
# setup the intersect for the previous if required. for iterative intersecting
|
||||
if ( keys %{$intersect} ) {
|
||||
$where .= " AND Item_ID in(" . join(",",keys %{$intersect}) . ")";
|
||||
}
|
||||
|
||||
# make the database engine work a little bit
|
||||
$query = qq!
|
||||
SELECT Item_ID, SUM(Score) AS Score
|
||||
FROM $stable
|
||||
WHERE
|
||||
$where
|
||||
GROUP BY Item_ID
|
||||
!;
|
||||
$self->debug( "Intersect Query: $query" ) if ($self->{_debug});
|
||||
my $intersect_sth = $db->prepare( $query );
|
||||
|
||||
$intersect_sth->execute();
|
||||
|
||||
# get a list of all the matches
|
||||
my $matches = $intersect_sth->fetchall_arrayref();
|
||||
|
||||
$self->debug_dumper( "Matches found for $word: ", $matches ) if ($self->{_debug});
|
||||
|
||||
# go through all the matches and intersect them
|
||||
my %tmp = ();
|
||||
foreach my $row ( @{$matches} ) {
|
||||
my ( $itemid, $score ) = @{$row};
|
||||
$intersect->{$itemid} ||= 0;
|
||||
$tmp{ $itemid } = $intersect->{$itemid} + $score;
|
||||
}
|
||||
|
||||
# inform the system of that development
|
||||
%tmp or return;
|
||||
$intersect = \%tmp;
|
||||
}
|
||||
|
||||
return $intersect;
|
||||
}
|
||||
|
||||
sub _disjoin_query {
|
||||
#------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $results = shift || {};
|
||||
$words or return $results;
|
||||
|
||||
my ( $query, $where, $db, $word_infos, $word_hits );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
$db = $self->{table}->{driver} or return $results;
|
||||
|
||||
# have we left any of our words out?
|
||||
$word_infos = $self->_word_infos( $words ) or return $results;
|
||||
# if ( keys %{$word_infos} < keys %{$words} ) {
|
||||
# return $results;
|
||||
# }
|
||||
|
||||
# take the words and get a hash of the word scores
|
||||
foreach my $word ( keys %{$word_infos} ) {
|
||||
my $total_freq = 0;
|
||||
foreach my $word_synonyms ( $word_infos->{$word} ) {
|
||||
$total_freq += ( $word_synonyms->[0] || 0 );
|
||||
}
|
||||
# if the value is null this mean there is actually no results, whoops!
|
||||
$total_freq and $word_hits->{$word} = $total_freq;
|
||||
}
|
||||
|
||||
# so now, sort out the words from lowest frequency to highest frequency
|
||||
my @search_order = sort { $word_hits->{$b} <=> $word_hits->{$b} } keys %{$word_hits};
|
||||
$self->debug_dumper( "Disjoining words in the following order: ", \@search_order) if ($self->{_debug});
|
||||
|
||||
################################################################################
|
||||
### This following part is for smaller disjoin presets
|
||||
################################################################################
|
||||
foreach my $word ( @search_order ) {
|
||||
|
||||
# setup the where clause to get all the words associated
|
||||
my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
|
||||
|
||||
# setup the intersect for the previous if required. for iterative intersecting
|
||||
if ( keys %{$results} ) {
|
||||
$where .= " AND Item_ID in(" . join(",", keys %{$results}) . ")";
|
||||
}
|
||||
|
||||
# make the database engine work a little bit
|
||||
$query = qq!
|
||||
SELECT Item_ID
|
||||
FROM $stable
|
||||
WHERE
|
||||
$where
|
||||
GROUP BY Item_ID
|
||||
!;
|
||||
$self->debug($query) if ($self->{_debug});
|
||||
my $intersect_sth = $db->prepare( $query );
|
||||
|
||||
$intersect_sth->execute();
|
||||
|
||||
# get a list of all the matches
|
||||
my $matches = $intersect_sth->fetchall_arrayref();
|
||||
|
||||
# strip the matches from the current result set
|
||||
foreach my $word ( map { $_->[0] } @{$matches}) {
|
||||
delete $results->{$word};
|
||||
}
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _phrase_disjoin_query {
|
||||
#------------------------------------------------------------
|
||||
# subtracts the found phrases from the list
|
||||
my $self = shift;
|
||||
my $phrases = shift;
|
||||
my $results = shift || {};
|
||||
$phrases or return $results;
|
||||
|
||||
foreach my $phrase ( values %{$phrases} ) {
|
||||
my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
|
||||
|
||||
# perform disjoin
|
||||
foreach my $itemid ( keys %{$temp} ) {
|
||||
$self->debug( "Deleting $itemid from list" ) if ($self->{_debug});
|
||||
delete $results->{$itemid};
|
||||
}
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _phrase_intersect_query {
|
||||
#------------------------------------------------------------
|
||||
# intersects phrases together
|
||||
my $self = shift;
|
||||
my $phrases = shift;
|
||||
my $results = shift || {};
|
||||
|
||||
$phrases or return $results;
|
||||
|
||||
foreach my $phrase ( values %{$phrases} ) {
|
||||
my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
|
||||
|
||||
# perform intersect
|
||||
foreach my $itemid ( keys %{$temp} ) {
|
||||
$temp->{$itemid} += $results->{$itemid} || 0;
|
||||
}
|
||||
$results = $temp;
|
||||
|
||||
}
|
||||
|
||||
return $results;
|
||||
|
||||
}
|
||||
|
||||
sub _phrase_query {
|
||||
#------------------------------------------------------------
|
||||
# this is a phrase union query
|
||||
my $self = shift;
|
||||
my $phrases = shift or return;
|
||||
my $results = shift || {};
|
||||
|
||||
foreach my $phrase ( values %{$phrases} ) {
|
||||
$self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug});
|
||||
$results = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info}, $results );
|
||||
}
|
||||
|
||||
return $results;
|
||||
|
||||
}
|
||||
|
||||
sub _get_phrase {
|
||||
#------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $wordlist= shift;
|
||||
my $word_info = shift;
|
||||
my $results = shift || {};
|
||||
|
||||
$wordlist or return $results;
|
||||
|
||||
my ( $query, $where, $db, $word_infos, %word_hits );
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
my ($pk) = $self->{table}->pk;
|
||||
|
||||
$self->debug_dumper( "Getting words: ", $wordlist ) if ($self->{_debug});
|
||||
|
||||
# get all the word ids that we want to handle
|
||||
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
|
||||
$word_infos = $self->_word_infos( $word_info ) or return;
|
||||
|
||||
|
||||
$self->debug_dumper( "Word infos: ", $word_infos ) if ($self->{_debug});
|
||||
|
||||
# take the words and get a hash of the word scores
|
||||
foreach my $word ( keys %{$word_infos} ) {
|
||||
|
||||
@{$word_infos->{$word} || []} or return;
|
||||
|
||||
my $total_freq = 0;
|
||||
foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
|
||||
$total_freq += $word_synonyms->[1];
|
||||
}
|
||||
|
||||
# if the value is null this mean there is actually no results, whoops!
|
||||
$word_hits{$word} = $total_freq;
|
||||
}
|
||||
|
||||
$self->debug_dumper( "With synonyms tallied: ", \%word_hits ) if ($self->{_debug});
|
||||
|
||||
# so now, setup the order of search
|
||||
my $i = 0;
|
||||
my %word_order = map { $_ => $i++ } @{$wordlist};
|
||||
my @search_order = sort { $word_hits{$a} <=> $word_hits{$b} } keys %word_hits;
|
||||
|
||||
$self->debug_dumper( "Word search order: ", \@search_order ) if ($self->{_debug});
|
||||
|
||||
################################################################################
|
||||
### This following part is for smaller phrases
|
||||
################################################################################
|
||||
# start getting words in order of their frequency
|
||||
my %matches = ();
|
||||
my $index = 0;
|
||||
foreach my $word ( @search_order ) {
|
||||
|
||||
# setup the where clause for the individual words, firstly
|
||||
if ( keys %matches ) {
|
||||
my $vector = $word_order{$word} - $index;
|
||||
$where = '(';
|
||||
$where =
|
||||
'(' .
|
||||
join(
|
||||
" OR ",
|
||||
map(
|
||||
"Item_ID = $_ AND Word_Pos IN(" . join(",", map $_->[0] + $vector, @{$matches{$_}}) . ')',
|
||||
keys %matches
|
||||
)
|
||||
) .
|
||||
") AND ";
|
||||
}
|
||||
else {
|
||||
$where = '';
|
||||
}
|
||||
|
||||
$where .= "Word_ID IN(" . ( join ",", map { $_->[0] || () } @{$word_infos->{$word}} or return $results ) . ')';
|
||||
|
||||
$query = qq!
|
||||
SELECT
|
||||
Item_ID, Score, Word_Pos
|
||||
FROM
|
||||
$stable
|
||||
WHERE
|
||||
$where
|
||||
!;
|
||||
|
||||
$self->debug( "Phrase get for '$word': " . $query ) if ($self->{_debug});
|
||||
my $sth = $db->prepare( $query );
|
||||
$sth->execute();
|
||||
|
||||
%matches = ();
|
||||
|
||||
while (my $hit = $sth->fetchrow_arrayref) {
|
||||
push @{$matches{$hit->[0]}}, [ $hit->[2], $hit->[1] ];
|
||||
}
|
||||
|
||||
# If there are no values stored in %matches, it means that for
|
||||
# this keyword, there have been no hits based upon position.
|
||||
# In that case, terminate and return a null result
|
||||
keys %matches or last;
|
||||
|
||||
# where were we in the string?
|
||||
$index = $word_order{$word};
|
||||
}
|
||||
|
||||
# now tally up all the scores and merge the new records in
|
||||
foreach my $itemid ( keys %matches ) {
|
||||
my $score = 0;
|
||||
foreach my $sub_total ( @{$matches{$itemid}} ) {
|
||||
$score += $sub_total->[1];
|
||||
}
|
||||
$results->{$itemid} += $score;
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub get_wordids {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Get a list of words
|
||||
#
|
||||
my $self = shift;
|
||||
my $elements = shift or return;
|
||||
my $mode = lc shift || 'keywords';
|
||||
|
||||
if ( $mode eq 'keywords' ) {
|
||||
$elements = $self->_get_wordid($elements);
|
||||
}
|
||||
else {
|
||||
foreach my $phrase ( keys %$elements ) {
|
||||
my $results = $self->_get_wordid({
|
||||
map { ($_ => { substring => 0 }) } @{$elements->{$phrase}->{phrase}}
|
||||
});
|
||||
|
||||
$elements->{$phrase}->{word_info} = $results;
|
||||
}
|
||||
}
|
||||
|
||||
return $elements;
|
||||
}
|
||||
|
||||
sub _get_wordid {
|
||||
# ------------------------------------------------------------------------------
|
||||
# Get a list of words
|
||||
#
|
||||
my $self = shift;
|
||||
my $words = shift;
|
||||
my $tbl = $self->{table};
|
||||
|
||||
my ( $table, $wtable, $stable) = $self->_table_names();
|
||||
|
||||
foreach my $word ( keys %$words ) {
|
||||
my $query =
|
||||
qq!SELECT Word_ID, Frequency FROM $wtable WHERE Word LIKE '! .
|
||||
quotemeta($word) .
|
||||
( $words->{$word}->{substring} ? '%' : '' ) .
|
||||
"'";
|
||||
my $sth = $tbl->do_query($query) or next;
|
||||
my $tmp = { $sth->fetchall_list };
|
||||
|
||||
$words->{$word}->{word_info} = $tmp;
|
||||
}
|
||||
|
||||
return $words;
|
||||
}
|
||||
|
||||
##
|
||||
# Internal Use
|
||||
# $self->_cgi_to_hash ($in);
|
||||
# --------------------------
|
||||
# Creates a hash ref from a cgi object.
|
||||
##
|
||||
sub _cgi_to_hash {
|
||||
my ($self, $cgi) = @_;
|
||||
$cgi and UNIVERSAL::can($cgi, 'param') or return $self->error(NODRIVER => 'FATAL');
|
||||
my @keys = $cgi->param;
|
||||
my $result = {};
|
||||
foreach my $key (@keys) {
|
||||
my @values = $cgi->param($key);
|
||||
if (@values == 1) { $result->{$key} = $values[0] }
|
||||
else { $result->{$key} = \@values }
|
||||
}
|
||||
return $result;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,239 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::LUCENE::Indexer
|
||||
# Author: Scott Beck
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Indexer.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
package GT::SQL::Search::LUCENE::Indexer;
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
|
||||
use Lucene;
|
||||
use GT::SQL::Search::Base::Indexer;
|
||||
use GT::TempFile;
|
||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
|
||||
$ERRORS = {
|
||||
INDEX_CORRUPT => 'Could not create an Indexer, this probably means your index is corrupted and you should rebuild it. The error was: %s',
|
||||
DELETE_FAILED => 'Could not delete some records: %s'
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
|
||||
sub load {
|
||||
my $class = shift;
|
||||
return $class->new(@_)
|
||||
}
|
||||
|
||||
sub _get_path {
|
||||
my $self = shift;
|
||||
my $name = $self->{table}->name;
|
||||
my $tmpdir = GT::TempFile::find_tmpdir();
|
||||
my $path = $tmpdir . '/' . $name;
|
||||
$path = $1 if $path =~ /(.*)/; # XXX untaint
|
||||
return $path;
|
||||
}
|
||||
|
||||
sub _get_store {
|
||||
my ($self, $create) = @_;
|
||||
my $path = $self->_get_path;
|
||||
return Lucene::Store::FSDirectory->getDirectory($path, $create);
|
||||
}
|
||||
|
||||
sub _get_indexer {
|
||||
my ($self, $create) = @_;
|
||||
my %weights = $self->{table}->_weight_cols() or return $self->error(NOWEIGHTS => 'WARN');
|
||||
|
||||
my ($pk) = $self->{table}->pk;
|
||||
if (!$pk) {
|
||||
return $self->error('NOPRIMARYKEY','WARN');
|
||||
}
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
|
||||
my $store = $self->_get_store($create);
|
||||
|
||||
my $iw;
|
||||
eval { $iw = new Lucene::Index::IndexWriter($store, $analyzer, $create); };
|
||||
if ($@) {
|
||||
return $self->error('INDEX_CORRUPT', 'WARN', "$@");
|
||||
}
|
||||
return $iw;
|
||||
}
|
||||
|
||||
sub drop_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $path = $self->_get_path;
|
||||
require File::Tools;
|
||||
File::Tools::deldir($path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub add_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
$self->_get_indexer(1) or return;
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub post_create_table {
|
||||
# ------------------------------------------------------------------------------
|
||||
# creates the index tables..
|
||||
#
|
||||
return $_[0]->add_search_driver(@_);
|
||||
}
|
||||
|
||||
sub post_drop_table {
|
||||
# -------------------------------------------------------
|
||||
# Remove the index tables.
|
||||
#
|
||||
return $_[0]->drop_search_driver(@_);
|
||||
}
|
||||
|
||||
|
||||
sub post_add_record {
|
||||
# -------------------------------------------------------
|
||||
# indexes a single record
|
||||
my ($self, $rec, $insert_sth, $no_optimize) = @_;
|
||||
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my %weights = $tbl->_weight_cols() or return;
|
||||
|
||||
my $indexer = $self->_get_indexer(0) or return $self->{_debug} ? () : 1;
|
||||
my $doc = new Lucene::Document;
|
||||
my ($pk) = $self->{table}->pk;
|
||||
delete $weights{$pk};
|
||||
for my $column_name (keys %weights) {
|
||||
my $field = Lucene::Document::Field->UnStored($column_name, $rec->{$column_name});
|
||||
$field->setBoost($weights{$column_name});
|
||||
$doc->add($field);
|
||||
}
|
||||
$doc->add(Lucene::Document::Field->Keyword($pk, ($tbl->ai && $insert_sth ? $insert_sth->insert_id : $rec->{$pk})));
|
||||
$indexer->addDocument($doc);
|
||||
$indexer->optimize if !$no_optimize;
|
||||
$indexer->close;
|
||||
undef $indexer;
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub reindex_all {
|
||||
# -------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $table = shift;
|
||||
my $opts = shift;
|
||||
my $tick = $opts->{tick} || 0;
|
||||
my $max = $opts->{max} || 5000;
|
||||
|
||||
my $indexer = $self->_get_indexer(1) or return $self->{_debug} ? () : 1; # clobbers the old one
|
||||
$indexer->close;
|
||||
undef $indexer;
|
||||
|
||||
my %weights = $self->{table}->_weight_cols() or return;
|
||||
my @weight_list = keys %weights;
|
||||
my ($pk) = $self->{table}->pk();
|
||||
|
||||
# Go through the table and index each field.
|
||||
my $iterations = 1;
|
||||
my $count = 0;
|
||||
|
||||
while (1) {
|
||||
if ($max) {
|
||||
my $offset = ($iterations-1) * $max;
|
||||
$table->select_options("LIMIT $offset,$max");
|
||||
}
|
||||
my $cond = $opts->{cond} || {};
|
||||
my $sth = $table->select($cond, [$pk, @weight_list]);
|
||||
my $done = 1;
|
||||
|
||||
while (my $rec = $sth->fetchrow_hashref() ) {
|
||||
$self->post_add_record($rec, undef, 1);
|
||||
$done = 0;
|
||||
if ($tick) {
|
||||
$count++;
|
||||
$count % $tick or (print "$count ");
|
||||
$count % ($tick*10) or (print "\n");
|
||||
}
|
||||
}
|
||||
last if $done;
|
||||
$iterations++;
|
||||
last if !$max;
|
||||
}
|
||||
$indexer = $self->_get_indexer(0) or return;
|
||||
$indexer->optimize;
|
||||
$indexer->close;
|
||||
undef $indexer;
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub pre_delete_record {
|
||||
# -------------------------------------------------------
|
||||
# Delete a records index values.
|
||||
#
|
||||
my ($self, $where) = @_;
|
||||
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my ($pk) = $tbl->pk();
|
||||
my $q = $tbl->select($where, [$pk]);
|
||||
|
||||
my $reader = eval { Lucene::Index::IndexReader->open($self->_get_store(0)); };
|
||||
if ($@) {
|
||||
return $self->{_debug} ? $self->error('INDEX_CORRUPT', 'WARN', "$@") : 1;
|
||||
}
|
||||
|
||||
my @errors;
|
||||
while (my ($item_id) = $q->fetchrow) {
|
||||
my $t = new Lucene::Index::Term($pk => $item_id);
|
||||
eval { $reader->deleteDocuments($t); };
|
||||
if ($@) {
|
||||
push @errors, "$@";
|
||||
}
|
||||
}
|
||||
$reader->close;
|
||||
undef $reader;
|
||||
if (@errors) {
|
||||
return $self->{_debug} ? $self->error('DELETE_FAILED', 'WARN', join(", ", @errors)) : 1;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub post_update_record {
|
||||
# -------------------------------------------------------
|
||||
my ( $self, $set_cond, $where_cond, $tmp ) = @_;
|
||||
|
||||
# delete the previous record
|
||||
eval {
|
||||
$self->pre_delete_record($where_cond) or return $self->{_debug} ? () : 1;
|
||||
};
|
||||
#
|
||||
# the new record
|
||||
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
|
||||
my ($pk) = $tbl->pk();
|
||||
my %weights = $self->{table}->_weight_cols();
|
||||
my @weight_list = keys %weights;
|
||||
my $q = $tbl->select($where_cond, [$pk, @weight_list]);
|
||||
while (my $href = $q->fetchrow_hashref) {
|
||||
$self->post_add_record($href);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
sub reindex_record {
|
||||
# -------------------------------------------------------
|
||||
# reindexes a record. basically deletes all associated records from current db abnd does an index.
|
||||
# it's safe to use this
|
||||
my ($self, $rec) = @_;
|
||||
|
||||
$self->delete_record($rec);
|
||||
$self->index_record($rec);
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,206 @@
|
||||
NAME
|
||||
Lucene -- API to the C++ port of the Lucene search engine
|
||||
|
||||
SYNOPSIS
|
||||
Initialize/Empty Lucene index
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 1);
|
||||
|
||||
my $tmp_writer = new Lucene::Index::IndexWriter($store, $analyzer, 1);
|
||||
$tmp_writer->close;
|
||||
undef $tmp_writer;
|
||||
|
||||
Choose your Analyzer (string tokenizer)
|
||||
# lowercases text and splits it at non-letter characters
|
||||
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
|
||||
# same as before and removes stop words
|
||||
my $analyzer = Lucene::Analysis::StopAnalyzer();
|
||||
# splits text at whitespace characters
|
||||
my $analyzer = Lucene::Analysis::WhitespaceAnalyzer();
|
||||
# lowercases text, tokenized it based on a grammer that
|
||||
# leaves named authorities intact (e-mails, company names,
|
||||
# web hostnames, IP addresses, etc) and removed stop words
|
||||
my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer();
|
||||
|
||||
Choose your Store (storage engine)
|
||||
# in-memory storage
|
||||
my $store = new Lucene::Store::RAMDirectory();
|
||||
# disk-based storage
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||||
|
||||
Open and configure an IndexWriter
|
||||
my $writer = new Lucene::Index::IndexWriter($store, $analyzer, 0);
|
||||
# optional settings for power users
|
||||
$writer->setMergeFactor(100);
|
||||
$writer->setUseCompoundFile(0);
|
||||
$writer->setMaxFieldLength(255);
|
||||
$writer->setMinMergeDocs(10);
|
||||
$writer->setMaxMergeDocs(100);
|
||||
|
||||
Create Documents and add Fields
|
||||
my $doc = new Lucene::Document;
|
||||
# field gets analyzed, indexed and stored
|
||||
$doc->add(Lucene::Document::Field->Text("content", $content));
|
||||
# field gets indexed and stored
|
||||
$doc->add(Lucene::Document::Field->Keyword("isbn", $isbn));
|
||||
# field gets just stored
|
||||
$doc->add(Lucene::Document::Field->UnIndexed("sales_rank", $sales_rank));
|
||||
# field gets analyzed and indexed
|
||||
$doc->add(Lucene::Document::Field->UnStored("categories", $categories));
|
||||
|
||||
Add Documents to an IndexWriter
|
||||
$writer->addDocument($doc);
|
||||
|
||||
Optimize your index and close the IndexWriter
|
||||
$writer->optimize();
|
||||
$writer->close();
|
||||
undef $writer;
|
||||
|
||||
Delete Documents
|
||||
my $reader = Lucene::Index::IndexReader->open($store);
|
||||
my $term = new Lucene::Index::Term("isbn", $isbn);
|
||||
$reader->deleteDocuments($term);
|
||||
$reader->close();
|
||||
undef $reader;
|
||||
|
||||
Query index
|
||||
# initalize searcher and parser
|
||||
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||||
my $searcher = new Lucene::Search::IndexSearcher($store);
|
||||
my $parser = new Lucene::QueryParser("default_field", $analyzer);
|
||||
|
||||
# build a query on the default field
|
||||
my $query = $parser->parse("perl");
|
||||
|
||||
# build a query on another field
|
||||
my $query = $parser->parse("title:cookbook");
|
||||
|
||||
# define a sort on one field
|
||||
my $sortfield = new Lucene::Search::SortField("unixtime");
|
||||
my $reversed_sortfield = new Lucene::Search::SortField("unixtime", 1);
|
||||
my $sort = new Lucene::Search::Sort($sortfield);
|
||||
|
||||
# define a sort on two fields
|
||||
my $sort = new Lucene::Search::Sort($sortfield1, $sortfield2);
|
||||
|
||||
# use Lucene's INDEXORDER or RELEVANCE sort
|
||||
my $sort = Lucene::Search::Sort->INDEXORDER;
|
||||
my $sort = Lucene::Search::Sort->RELEVANCE;
|
||||
|
||||
# query index and get results
|
||||
my $hits = $searcher->search($query);
|
||||
my $sorted_hits = $searcher->search($query, $sort);
|
||||
|
||||
# get number of results
|
||||
my $num_hits = $hits->length();
|
||||
|
||||
# get fields and ranking score for each hit
|
||||
for (my $i = 0; $i < $num_hits; $i++) {
|
||||
my $doc = $hits->doc($i);
|
||||
my $score = $hits->score($i);
|
||||
my $title = $doc->get("title");
|
||||
my $isbn = $doc->get("isbn");
|
||||
}
|
||||
|
||||
# free memory and close searcher
|
||||
undef $hits;
|
||||
undef $query;
|
||||
undef $parser;
|
||||
undef $analyzer;
|
||||
$searcher->close();
|
||||
undef $fsdir;
|
||||
undef $searcher;
|
||||
}
|
||||
|
||||
Close your Store
|
||||
$store->close;
|
||||
undef $store;
|
||||
|
||||
DESCRIPTION
|
||||
Like it or not Apache Lucene has become the de-facto standard for
|
||||
open-source high-performance search. It has a large user-base, is well
|
||||
documented and has plenty of committers. Unfortunately Apache Lucene is
|
||||
entirely written in Java and therefore of relatively little use for perl
|
||||
programmers. Fortunately in the recent years a group of C++ programmers
|
||||
led by Ben van Klinken decided to port Java Lucene to C++.
|
||||
|
||||
The purpose of the module is to export the C++ Lucene API to perl and at
|
||||
the same time be as close as possible to the original Java API. This has
|
||||
the combined advantage of providing perl programmers with a
|
||||
well-documented API and giving them access to a C++ search engine
|
||||
library that is supposedly faster than the original.
|
||||
|
||||
CHARACTER SUPPORT
|
||||
Currently only ISO 8859-1 (Latin-1) characters are supported. Obviously
|
||||
this included all ASCII characters.
|
||||
|
||||
INDEX COMPATIBLITY
|
||||
For the moment indices produced by this module are not compatible with
|
||||
those from Apache Lucene. The reason for this is that this module uses
|
||||
1-byte character encoding as opposed to 2-byte (widechar) encoding with
|
||||
Apache Lucene.
|
||||
|
||||
INSTALLATION
|
||||
This module requires the clucene library to be installed. The best way
|
||||
to get it is to go to the following page
|
||||
|
||||
http://sourceforge.net/projects/clucene/
|
||||
|
||||
and download the latest STABLE clucene-core version. Currently it is
|
||||
clucene-core-0.9.15. Make sure you compile it in ASCII mode and install
|
||||
it in your standard library path.
|
||||
|
||||
On a Linux platform this goes as follows:
|
||||
|
||||
wget http://kent.dl.sourceforge.net/sourceforge/clucene/clucene-core-0.9.15.tar.gz
|
||||
cd clucene-core-0.9.15
|
||||
./autogen.sh
|
||||
./configure --disable-debug --prefix=/usr --exec-prefix=/usr --enable-ascii
|
||||
make
|
||||
make check
|
||||
(as root) make install
|
||||
|
||||
To install the perl module itself, run the following commands:
|
||||
|
||||
perl Makefile.PL
|
||||
make
|
||||
make test
|
||||
(as root) make install
|
||||
|
||||
AUTHOR
|
||||
Thomas Busch <tbusch at cpan dot org>
|
||||
|
||||
COPYRIGHT AND LICENSE
|
||||
Copyright (c) 2006 Thomas Busch
|
||||
|
||||
This library is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
SEE ALSO
|
||||
Plucene - a pure-Perl implementation of Lucene
|
||||
|
||||
KinoSearch - a search engine library inspired by Lucene
|
||||
|
||||
DISCLAIMER OF WARRANTY
|
||||
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
|
||||
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
|
||||
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
|
||||
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
|
||||
NECESSARY SERVICING, REPAIR, OR CORRECTION.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
|
||||
TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
@ -0,0 +1,115 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::STH
|
||||
# Author: Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: STH.pm,v 1.1 2006/12/07 07:04:51 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
package GT::SQL::Search::LUCENE::STH;
|
||||
#--------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE /;
|
||||
require GT::SQL::Search::Base::STH;
|
||||
|
||||
@ISA = ('GT::SQL::Search::STH');
|
||||
$ATTRIBS = {
|
||||
'db_sort' => 1,
|
||||
'hits' => undef
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL::Search::STH';
|
||||
|
||||
|
||||
|
||||
sub init {
|
||||
#--------------------------------------------------------------------------------
|
||||
# GT::SQL::Search::STH expects a full set of results in $self->{results}. For
|
||||
# Lucene the only time a full set of results is there is when we are sorting
|
||||
# on a field that is not weighted, otherwise the results in $self->{results} is
|
||||
# the proper page and number of results.
|
||||
my $self = shift;
|
||||
|
||||
$self->set(@_);
|
||||
|
||||
--$self->{nh} if $self->{nh};
|
||||
|
||||
# Here we allow hits to override our concept of rows. This is only useful
|
||||
# when !$self->{db_sort}
|
||||
$self->{rows} = $self->{hits}
|
||||
? $self->{hits}
|
||||
: $self->{results}
|
||||
? scalar(keys %{$self->{results}})
|
||||
: 0;
|
||||
|
||||
if ($self->{db_sort}) {
|
||||
$self->get_db_sorted_results;
|
||||
}
|
||||
else {
|
||||
$self->get_sorted_results;
|
||||
}
|
||||
}
|
||||
|
||||
sub get_sorted_results {
|
||||
# Just sorts the results out of $self->{results} which should have been setup
|
||||
# by a search driver
|
||||
my ($self) = @_;
|
||||
my $results = $self->{results};
|
||||
$self->{index} = 0;
|
||||
$self->{max_index} = $self->{mh} - 1;
|
||||
$self->{'order'} = [ sort {
|
||||
( $results->{$b} || return 0 ) <=> ( $results->{$a} || 0 )
|
||||
} keys %{$results} ];
|
||||
$self->debug_dumper( "Results will be presented in the following order: ", $self->{'order'} ) if ($self->{_debug});
|
||||
}
|
||||
|
||||
sub get_db_sorted_results {
|
||||
# This assumes $self->{results} has a full result set, i.e. without any LIMIT
|
||||
# It then selects the result set using the SQL driver to do the sorting. This
|
||||
# is for Search modules which can not handle their own sorting
|
||||
my ($self) = @_;
|
||||
|
||||
my $sb = GT::SQL::Search::Base::Search->clean_sb($self->{sb}, $self->{so});
|
||||
$self->{index} = $self->{nh} * $self->{mh} || 0;
|
||||
$self->{max_index} = $self->{index} + $self->{mh} - 1; # index + mh is the first record which should not be returned.
|
||||
if ($self->{max_index} > $self->{rows}) {
|
||||
$self->{max_index} = $self->{rows};
|
||||
$self->{rows} = $self->{rows} - $self->{index};
|
||||
$self->{rows} < 0 ? $self->{rows} = 0 : 0;
|
||||
}
|
||||
|
||||
else {
|
||||
$self->{rows} = $self->{mh};
|
||||
}
|
||||
my ( $table, $pk ) = $self->_table_info();
|
||||
my ( $query, $where, $st, $limit );
|
||||
|
||||
$where = qq!$pk in(! . join( ",", keys %{$self->{results}}) . ')';
|
||||
$self->{mh} and $limit = qq!LIMIT $self->{index}, $self->{rows}!;
|
||||
$query = qq!
|
||||
SELECT $pk
|
||||
FROM $table
|
||||
WHERE $where
|
||||
$sb
|
||||
$limit
|
||||
!;
|
||||
$self->debug( "Row fetch query: $query" ) if ($self->{_debug});
|
||||
my $sth = $self->{table}->{driver}->prepare( $query );
|
||||
$sth->execute();
|
||||
|
||||
$self->{index} = 0;
|
||||
$self->{max_hits} = $self->{rows};
|
||||
|
||||
# Fetch the results in sorted order
|
||||
my $order = $sth->fetchall_arrayref();
|
||||
$sth->finish();
|
||||
|
||||
$self->{'order'} = [ map { $_->[0] } @{$order} ];
|
||||
}
|
||||
|
||||
1;
|
||||
|
@ -0,0 +1,260 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::LUCENE::Search
|
||||
# Author : Scott Beck
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to search indexed tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::LUCENE::Search;
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS $ERRORS $ERROR_MESSAGE /;
|
||||
use Lucene;
|
||||
use GT::TempFile;
|
||||
use GT::SQL::Search::LUCENE::STH;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
|
||||
$ERRORS = {
|
||||
SEARCH_ERROR => "Error searching: %s",
|
||||
QUERY_ERROR => "Query error: %s"
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::LUCENE::Search->new(@_)
|
||||
}
|
||||
|
||||
sub _get_path {
|
||||
my $self = shift;
|
||||
my $name = $self->{table}->name;
|
||||
my $tmpdir = GT::TempFile::find_tmpdir();
|
||||
my $path = $tmpdir . '/' . $name;
|
||||
$path = $1 if $path =~ /(.*)/; # XXX untaint
|
||||
return $path;
|
||||
}
|
||||
|
||||
sub _get_store {
|
||||
my ($self, $create) = @_;
|
||||
my $path = $self->_get_path;
|
||||
return Lucene::Store::FSDirectory->getDirectory($path, $create);
|
||||
}
|
||||
|
||||
sub query {
|
||||
# --------------------------------------------------
|
||||
# Returns a sth based on a query
|
||||
#
|
||||
# Options:
|
||||
# - paging
|
||||
# mh : max hits
|
||||
# nh : number hit (or page of hits)
|
||||
#
|
||||
# - searching
|
||||
# ww : whole word
|
||||
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
||||
# substring : search for substrings of words
|
||||
# bool : 'and' => and search, 'or' => or search, '' => regular query
|
||||
# query : the string of things to ask for
|
||||
#
|
||||
# - filtering
|
||||
# field_name : value # Find all rows with field_name = value
|
||||
# field_name : ">value" # Find all rows with field_name > value.
|
||||
# field_name : "<value" # Find all rows with field_name < value.
|
||||
# field_name-gt : value # Find all rows with field_name > value.
|
||||
# field_name-lt : value # Find all rows with field_name < value.
|
||||
#
|
||||
# Parameters:
|
||||
# ( $CGI ) : a single cgi object
|
||||
# ( $HASH ) : a hash of the parameters
|
||||
#
|
||||
my $self = shift;
|
||||
|
||||
# create an easily accessible argument hash
|
||||
my $args = $self->common_param(@_);
|
||||
my $tbl = $self->{table};
|
||||
|
||||
# see if we can setup the filtering constraints
|
||||
my $filter = { %$args };
|
||||
my $query = delete $args->{query} || $self->{query} || '';
|
||||
my $ftr_cond;
|
||||
|
||||
# parse query
|
||||
$self->debug( "Search Query: $query" ) if ($self->{_debug});
|
||||
|
||||
my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
|
||||
|
||||
$self->{rejected_keywords} = $rejected;
|
||||
|
||||
# setup the additional input parameters
|
||||
$query_struct = $self->_preset_options( $query_struct, $args );
|
||||
|
||||
# now sort into distinct buckets
|
||||
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
|
||||
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
|
||||
|
||||
# with the buckets, it's now possible to create a query string
|
||||
# that can be passed directly into the Lucene search.
|
||||
my $query_string = '';
|
||||
|
||||
foreach my $search_type ( keys %$buckets ) {
|
||||
my $bucket = $buckets->{$search_type};
|
||||
foreach my $token ( keys %$bucket ) {
|
||||
next unless $token;
|
||||
my $properties = $bucket->{$token} or next;
|
||||
$token =~ s/(["()])/\\$1/g;
|
||||
$token =~ s/\b(or|and)\b/ /g;
|
||||
|
||||
my $e = ' ';
|
||||
|
||||
# handle boolean operations
|
||||
$properties->{mode} ||= '';
|
||||
if ( $properties->{mode} eq 'must' ) {
|
||||
$e .= '+';
|
||||
}
|
||||
elsif ( $properties->{mode} eq 'cannot' ) {
|
||||
$e .= '-';
|
||||
}
|
||||
|
||||
# deal with phrase vs keyword
|
||||
if ( $properties->{phrase} ) {
|
||||
$e .= '"' . $token . '"' unless $token =~ /^"|"$/;
|
||||
}
|
||||
else {
|
||||
$e .= $token;
|
||||
|
||||
# substring match
|
||||
if ($properties->{mode} ne 'substring') {
|
||||
$e .= '*' if $properties->{substring};
|
||||
}
|
||||
}
|
||||
|
||||
$query_string .= $e;
|
||||
}
|
||||
}
|
||||
|
||||
# calculate the cursor constraints
|
||||
foreach my $k (qw( nh mh so sb )) {
|
||||
next if defined $args->{$k};
|
||||
$args->{$k} = $self->{$k} || '';
|
||||
}
|
||||
$args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
|
||||
$args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
|
||||
$args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score';
|
||||
|
||||
# Score is the default
|
||||
$args->{so} = (defined $args->{so} and $args->{so} =~ /^(asc|desc)(?:end)?$/i) ? lc($1) : 'asc';
|
||||
|
||||
my %weights = $tbl->_weight_cols();
|
||||
my @sortfields;
|
||||
my $do_mysql_sort = 0;
|
||||
for (ref($args->{sb}) eq 'ARRAY' ? @{$args->{sb}} : $args->{sb}) {
|
||||
if (!exists $weights{$_}) {
|
||||
$do_mysql_sort = 1 if $_ ne 'score';
|
||||
next;
|
||||
}
|
||||
push @sortfields, new Lucene::Search::SortField($_, $args->{so} ne 'asc');
|
||||
}
|
||||
my $sort = @sortfields ? new Lucene::Search::Sort(@sortfields) : Lucene::Search::Sort->RELEVANCE;
|
||||
my $store = $self->_get_store(0);
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
|
||||
my $searcher = eval { new Lucene::Search::IndexSearcher($store); };
|
||||
if ($@) {
|
||||
$self->{_debug} and $self->error('SEARCH_ERROR', 'WARN', "$@");
|
||||
return $self->sth({}, 0); # no hits
|
||||
}
|
||||
# Random default field, it's not used
|
||||
my $parser = new Lucene::MultiFieldQueryParser((keys %weights)[0], $analyzer);
|
||||
my $pquery = eval { $parser->parse($query_string, [keys %weights], $analyzer); };
|
||||
if ($@) {
|
||||
$self->{_debug} and $self->error('QUERY_ERROR', 'WARN', "$@");
|
||||
return $self->sth({}, 0); # no hits
|
||||
}
|
||||
my $hits = $searcher->search($pquery, $sort);
|
||||
my $num_hits = $hits->length;
|
||||
|
||||
## Setup a limit only if there is no callback. The callback argument requires a full results list
|
||||
my ($offset, $max_hits) = (0, $num_hits);
|
||||
unless ($self->{callback} or $do_mysql_sort) {
|
||||
$offset = ( $args->{nh} - 1 ) * $args->{mh};
|
||||
$max_hits = $offset + $args->{mh};
|
||||
}
|
||||
$max_hits = $num_hits if $max_hits > $num_hits;
|
||||
my ($pk) = $self->{table}->pk;
|
||||
my @indexes;
|
||||
my $results = {};
|
||||
for (my $i= $offset; $i < $max_hits; ++$i) {
|
||||
my $doc = $hits->doc($i);
|
||||
my $value = $doc->get($pk);
|
||||
my $score = $hits->score($i);
|
||||
$results->{$value} = $score;
|
||||
}
|
||||
|
||||
# now handle filters
|
||||
my $cols = $self->{'table'}->cols();
|
||||
my %filters = map {
|
||||
(my $tmp = $_) =~ s/-[lg]t$//;
|
||||
$cols->{$tmp} ? ($_ => $args->{$_}) : ()
|
||||
} keys %{$args};
|
||||
|
||||
if (keys %filters) {
|
||||
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
|
||||
$results = $self->filter(\%filters, $results);
|
||||
}
|
||||
elsif ($self->{filter}) {
|
||||
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
|
||||
$results = $self->_filter_query( $self->{filter}, $results );
|
||||
}
|
||||
else {
|
||||
$self->debug( "No filters being used.") if ($self->{_debug});
|
||||
}
|
||||
|
||||
# now this query should probably clear the filters once it's been used, so i'll dothat here
|
||||
$self->{filter} = undef;
|
||||
|
||||
# now run through a callback function if needed.
|
||||
if ($self->{callback}) {
|
||||
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
|
||||
$self->{_debug} and $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
|
||||
return $self->sth({}, 0); # no hits
|
||||
}
|
||||
$self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
$results = $self->{callback}->($self, $results);
|
||||
$self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
}
|
||||
|
||||
$self->{rows} = $num_hits;
|
||||
|
||||
return $self->sth($results, $do_mysql_sort);
|
||||
}
|
||||
|
||||
sub sth {
|
||||
#--------------------------------------------------------------------------------
|
||||
my ($self, $results, $db_sort) = @_;
|
||||
|
||||
my $sth = GT::SQL::Search::LUCENE::STH->new(
|
||||
'results' => $results,
|
||||
'hits' => $self->{rows},
|
||||
'db' => $self->{table}->{driver},
|
||||
'db_sort' => $db_sort,
|
||||
# pass the following attributes down to the STH handler
|
||||
map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
|
||||
);
|
||||
|
||||
return $sth;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,98 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::MSSQL::Indexer
|
||||
# Author: Alex Krohn
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Indexer.pm,v 1.6 2004/08/28 03:53:48 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Supports MS SQL full text indexer on MS SQL 2000 only.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::MSSQL::Indexer;
|
||||
#--------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
|
||||
use GT::SQL::Search::Base::Indexer;
|
||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.6 $ =~ /(\d+)\.(\d+)/;
|
||||
|
||||
$ERRORS = {
|
||||
NOTFROMWEB => 'There are far too many records in table %s for create/destroy of this indexing scheme from the web. Please use alternative method.',
|
||||
MSSQLNONSUPPORT => 'You must be using MS SQL 2000 in order to use full text indexing. Current Database: %s',
|
||||
CREATEINDEX => 'Problem Creating Full Text Index: %s'
|
||||
};
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
|
||||
sub load {
|
||||
my $class = shift;
|
||||
return $class->new(@_);
|
||||
}
|
||||
|
||||
sub ok {
|
||||
#--------------------------------------------------------------------------------
|
||||
my ($class, $tbl) = @_;
|
||||
unless (uc $tbl->{connect}->{driver} eq 'ODBC') {
|
||||
return $class->error ('MSSQLNONSUPPORT', 'WARN', $tbl->{connect}->{driver});
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub drop_search_driver {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $table = $self->{table};
|
||||
my $name = $table->name;
|
||||
my $cat = $name . '_ctlg';
|
||||
|
||||
my $res = eval {
|
||||
$table->do_query(" sp_fulltext_table '$name', 'drop' ");
|
||||
$table->do_query(" sp_fulltext_catalog '$cat', 'drop' ");
|
||||
1;
|
||||
};
|
||||
$res ? return 1 : return;
|
||||
}
|
||||
|
||||
sub add_search_driver {
|
||||
#--------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $table = $self->{table};
|
||||
my $name = $table->name;
|
||||
my $cat = $name . '_ctlg';
|
||||
my %weights = $table->weight;
|
||||
my ($pk) = $table->pk;
|
||||
|
||||
# Enable a database for full text indexing
|
||||
$table->do_query(" sp_fulltext_database 'enable' ") or $self->error('CREATEINDEX', 'FATAL', $GT::SQL::error);
|
||||
# Create a full text catalog to store the data.
|
||||
$table->do_query(" sp_fulltext_catalog '$cat', 'create' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
|
||||
# Make a unique index on primary key (not sure why it isn't by default.
|
||||
$table->do_query(" create unique index PK_$name on $name ($pk) ");
|
||||
# Mark this table as using the full text catalog created
|
||||
$table->do_query(" sp_fulltext_table '$name', 'create', '$cat', 'PK_$name' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
|
||||
# Specify which columns are to be indexed
|
||||
foreach my $col (keys %weights) {
|
||||
if ($weights{$col}) {
|
||||
$table->do_query(" sp_fulltext_column '$name', '$col', 'add' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
|
||||
}
|
||||
}
|
||||
# Must have a timestamp field.
|
||||
$table->do_query(" alter table $name add timestamp ");
|
||||
# Build the index.
|
||||
$table->do_query(" sp_fulltext_table '$name', 'start_change_tracking' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
|
||||
$table->do_query(" sp_fulltext_table '$name', 'start_background_updateindex' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub post_create_table {
|
||||
#--------------------------------------------------------------------------------
|
||||
shift->add_search_driver(@_);
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,179 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::MSSQL::Search
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.9 2004/08/28 03:53:48 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to search indexed tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::MSSQL::Search;
|
||||
#--------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
#--------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.9 $ =~ /(\d+)\.(\d+)/;
|
||||
$ATTRIBS = {
|
||||
min_word_size => 2,
|
||||
};
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::MSSQL::Search->new(@_)
|
||||
}
|
||||
|
||||
sub query {
|
||||
#--------------------------------------------------------------------------------
|
||||
# overruns the usual query system with the mssql version
|
||||
#
|
||||
my $self = shift;
|
||||
|
||||
# Find out what sort of a parameter we're dealing with
|
||||
my $input = $self->common_param(@_);
|
||||
|
||||
# Add additional parameters if required
|
||||
foreach my $parameter ( keys %{$ATTRIBS} ) {
|
||||
if ( not exists $input->{$parameter} ) {
|
||||
$input->{$parameter} = $self->{$parameter};
|
||||
}
|
||||
}
|
||||
|
||||
# Parse query...,
|
||||
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
|
||||
$self->{'rejected_keywords'} = $rejected;
|
||||
|
||||
# Setup the additional input parameters
|
||||
$query = $self->_preset_options( $query, $input );
|
||||
|
||||
# Now sort into distinct buckets
|
||||
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query );
|
||||
my $tbl = $self->{table} or $self->error( 'BADARGS', 'FATAL', 'Must have table object defined' );
|
||||
my $string = $self->_string ($buckets);
|
||||
|
||||
return $self->sth({}) unless ($string =~ /\w/);
|
||||
|
||||
my $table_name = $tbl->name();
|
||||
my ($pk) = $tbl->pk;
|
||||
|
||||
# create the filter
|
||||
my $filter_sql = $self->{filter} ? "WHERE ( " . $self->{filter}->sql . ' )' : '';
|
||||
|
||||
# If we have a callback, we need all results.
|
||||
if ($self->{callback}) {
|
||||
$query = qq!
|
||||
SELECT $pk, K.RANK
|
||||
FROM $table_name AS T INNER JOIN
|
||||
CONTAINSTABLE ( $table_name, *,
|
||||
'$string'
|
||||
) AS K
|
||||
ON T.$pk = K.[KEY]
|
||||
$filter_sql
|
||||
!;
|
||||
my %results = $tbl->do_query($query)->fetchall_list;
|
||||
my $results = $self->{callback}->($self, \%results);
|
||||
$self->{rows} = $results ? scalar keys %$results : 0;
|
||||
return $self->sth($results);
|
||||
}
|
||||
else {
|
||||
my $mh = (defined $input->{nh} and $input->{nh} =~ /^(\d+)$/) ? $1 : 1;
|
||||
my $nh = (defined $input->{mh} and $input->{mh} =~ /^(\d+)$/) ? $1 : 25;
|
||||
# First get the total.
|
||||
$query = qq!
|
||||
SELECT COUNT(*)
|
||||
FROM $table_name AS T INNER JOIN
|
||||
CONTAINSTABLE ( $table_name, *,
|
||||
'$string'
|
||||
) AS K
|
||||
ON T.$pk = K.[KEY]
|
||||
$filter_sql
|
||||
!;
|
||||
my ($count) = $tbl->do_query($query)->fetchrow;
|
||||
|
||||
# Now get results.
|
||||
$query = qq!
|
||||
SELECT $pk, K.RANK
|
||||
FROM $table_name AS T INNER JOIN
|
||||
CONTAINSTABLE ( $table_name, *,
|
||||
'$string'
|
||||
) AS K
|
||||
ON T.$pk = K.[KEY]
|
||||
$filter_sql
|
||||
ORDER BY K.RANK DESC
|
||||
!;
|
||||
my %results = $tbl->do_query($query)->fetchall_list;
|
||||
$self->{rows} = $count;
|
||||
return $self->sth(\%results);
|
||||
}
|
||||
}
|
||||
|
||||
sub _string {
|
||||
# -------------------------------------------------------------------
|
||||
# Returns the string to use for containstable.
|
||||
#
|
||||
my ($self, $buckets) = @_;
|
||||
|
||||
# union
|
||||
my $tmp_bucket = $buckets->{keywords};
|
||||
my $union_request_str = join(
|
||||
" or ",
|
||||
map(
|
||||
qq!"$_"!,
|
||||
keys %{$buckets->{phrases}}
|
||||
),
|
||||
map(
|
||||
($tmp_bucket->{$_}->{substring} ? "$_*" : $_),
|
||||
keys %$tmp_bucket
|
||||
)
|
||||
);
|
||||
|
||||
# intersect
|
||||
$tmp_bucket = $buckets->{keywords_must};
|
||||
my $intersect_request_str = join(
|
||||
" and ",
|
||||
map(
|
||||
qq!"$_"!,
|
||||
keys %{$buckets->{phrases_must}}
|
||||
),
|
||||
map(
|
||||
($tmp_bucket->{$_}->{substring} ? "$_*" : $_),
|
||||
keys %$tmp_bucket
|
||||
)
|
||||
);
|
||||
|
||||
# disjoin
|
||||
$tmp_bucket = $buckets->{keywords_cannot};
|
||||
my $disjoin_request_str = join(
|
||||
" and ",
|
||||
map(
|
||||
qq!"$_"!,
|
||||
keys %{$buckets->{phrases_cannot}}
|
||||
),
|
||||
map(
|
||||
($tmp_bucket->{$_}->{substring} ? "$_*" : $_),
|
||||
keys %$tmp_bucket
|
||||
)
|
||||
);
|
||||
|
||||
# now build the query
|
||||
my $tmp_request_str = join(
|
||||
" and ",
|
||||
($union_request_str ? "( $union_request_str )" : ()),
|
||||
($intersect_request_str ? "( $intersect_request_str )" : ()),
|
||||
($disjoin_request_str ? "NOT ( $disjoin_request_str )" : ())
|
||||
);
|
||||
return $tmp_request_str;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,187 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::MYSQL::Indexer
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Indexer.pm,v 1.17 2004/08/28 03:53:49 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to search indexed tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::MYSQL::Indexer;
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
|
||||
use GT::SQL::Search::Base::Indexer;
|
||||
@ISA = qw/GT::SQL::Search::Base::Indexer/;
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.17 $ =~ /(\d+)\.(\d+)/;
|
||||
|
||||
$ERRORS = {
|
||||
NOTFROMWEB => 'There are far too many records in table %s for create/destroy of this indexing scheme from the web. Please use alternative method.',
|
||||
MYSQLNONSUPPORT => 'Driver MYSQL requires MySQL version 3.23.23 or greater. Currently MySQL version: %s'
|
||||
};
|
||||
|
||||
@$GT::SQL::ERRORS{ keys %$ERRORS } = values %$ERRORS;
|
||||
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
|
||||
sub load {
|
||||
my $class = shift;
|
||||
return $class->new(@_);
|
||||
}
|
||||
|
||||
sub ok {
|
||||
# ------------------------------------------------------------------------------
|
||||
my ($class, $tbl) = @_;
|
||||
unless (uc $tbl->{connect}->{driver} eq 'MYSQL') {
|
||||
return $class->error ('MYSQLNONSUPPORT', 'WARN', $tbl->{connect}->{driver});
|
||||
}
|
||||
my $sth = $tbl->do_query(qq!SELECT VERSION()!);
|
||||
my $version = $sth->fetchrow;
|
||||
my ($maj, $min) = split (/\./, $version);
|
||||
unless ($maj > 3 or ($maj == 3 and $min >= 23)) {
|
||||
return $class->error(MYSQLNONSUPPORT => WARN => $version);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub drop_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
|
||||
$self->too_much() and return;
|
||||
|
||||
my $tbl = $self->{table} or return;
|
||||
$tbl->connect();
|
||||
|
||||
my %weights = $tbl->weight() or return;
|
||||
my $tblname = $tbl->name();
|
||||
|
||||
# Group the fulltext columns by value of the weight
|
||||
my %cols_grouped;
|
||||
foreach ( keys %weights ) {
|
||||
my $val = $weights{$_} or next;
|
||||
push @{$cols_grouped{$val}}, $_;
|
||||
}
|
||||
|
||||
# Drop unified fulltext columns if required
|
||||
if ( keys %cols_grouped > 1 ) {
|
||||
$cols_grouped{-1} = [ grep { $weights{$_} } keys %weights ];
|
||||
}
|
||||
|
||||
# For each value grouped column set create a full text
|
||||
# column
|
||||
foreach my $v ( keys %cols_grouped ) {
|
||||
|
||||
my $ft_name = 'ft_'.join("_", sort @{$cols_grouped{$v}});
|
||||
|
||||
my $res = eval {
|
||||
$tbl->do_query(qq!
|
||||
ALTER TABLE $tblname
|
||||
DROP INDEX $ft_name
|
||||
!);
|
||||
};
|
||||
|
||||
# Break on errors that can't be handled
|
||||
if ( $@ ) {
|
||||
next if $@ !~ /exist/i;
|
||||
$self->warn( "$@" );
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub add_search_driver {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
|
||||
$self->too_much() and return;
|
||||
|
||||
my $tbl = $self->{table} or return $self->error(BADARGS => FATAL => "table must be passed into add_search_driver.");
|
||||
my %weights = $tbl->weight() or return $self->error(NOWEIGHTS => 'WARN');
|
||||
my $tblname = $tbl->name() or return $self->error(BADARGS => FATAL => "table does not have a name?");
|
||||
|
||||
# group the fulltext columns by value of the weight
|
||||
my %cols_grouped;
|
||||
foreach ( keys %weights ) {
|
||||
my $val = $weights{$_} or next;
|
||||
push @{$cols_grouped{$val}}, $_;
|
||||
}
|
||||
|
||||
# Create unified fulltext columns if required
|
||||
if ( keys %cols_grouped > 1 ) {
|
||||
$cols_grouped{-1} = [ grep { $weights{$_} } keys %weights ];
|
||||
}
|
||||
|
||||
# for each value grouped column set create a full text
|
||||
# column
|
||||
foreach my $v ( keys %cols_grouped ) {
|
||||
|
||||
my $cols = join(",", sort @{$cols_grouped{$v}});
|
||||
my $ft_name = 'ft_'.join("_", sort @{$cols_grouped{$v}});
|
||||
|
||||
my $res = eval {
|
||||
$tbl->do_query(qq!
|
||||
ALTER TABLE $tblname
|
||||
ADD FULLTEXT $ft_name ( $cols )
|
||||
!);
|
||||
};
|
||||
|
||||
# break on errors that can't be handled
|
||||
if ( $@ ) {
|
||||
next if $@ =~ /duplicate/i;
|
||||
$self->warn( "$@" );
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
sub too_much {
|
||||
# ------------------------------------------------------------------------------
|
||||
# returns true if there are too many records to be used on the Web
|
||||
#
|
||||
if ( $ENV{REQUEST_METHOD} ) {
|
||||
my $self = shift;
|
||||
my $tbl = $self->{table};
|
||||
if ( $tbl->count() > 5000 ) {
|
||||
$self->error( 'NOTFROMWEB', 'WARN', $tbl->name() );
|
||||
return 1
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
sub post_create_table {
|
||||
# ------------------------------------------------------------------------------
|
||||
shift->add_search_driver(@_);
|
||||
}
|
||||
|
||||
sub reindex_all {
|
||||
# ------------------------------------------------------------------------------
|
||||
# this will drop all the fulltext columns and reindex all of them. This should
|
||||
# not be required unless the user changes the weights on one of their columns.
|
||||
# Unfortunately, this method is not particularly smart and risks not dropping
|
||||
# certain index columns and reindexes even when it's not required. It must be
|
||||
# recoded at a future date, but as this action won't happen frequently and will
|
||||
# rarely affect the user, it is not a priority.
|
||||
#
|
||||
my $self = shift;
|
||||
|
||||
$self->drop_search_driver;
|
||||
$self->add_search_driver;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,51 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::MYSQL::Search
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.14 2004/01/13 01:35:19 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to search indexed tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::MYSQL::Search;
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.14 $ =~ /(\d+)\.(\d+)/;
|
||||
$ATTRIBS = {
|
||||
min_word_size => 4
|
||||
};
|
||||
|
||||
sub load {
|
||||
# --------------------------------------------------
|
||||
my $self = shift;
|
||||
my $opts = $self->common_param( @_ );
|
||||
|
||||
# determine which mysql search variant to use.
|
||||
my $tbl = $opts->{table};
|
||||
my $ver_sth = $tbl->do_query( 'SELECT VERSION()' );
|
||||
my $version = $ver_sth->fetchrow_array();
|
||||
|
||||
my ( $maj, $min ) = split /\./, $version;
|
||||
|
||||
my $pkg = 'GT::SQL::Search::MYSQL::';
|
||||
$pkg .= $maj > 3 ? 'VER4' : 'VER3';
|
||||
|
||||
eval "require $pkg";
|
||||
return $pkg->new(@_)
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,178 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::MYSQL::VER3
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: VER3.pm,v 1.3 2004/08/28 03:53:49 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to search indexed tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::MYSQL::VER3;
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.3 $ =~ /(\d+)\.(\d+)/;
|
||||
$ATTRIBS = {
|
||||
min_word_size => 4
|
||||
};
|
||||
|
||||
sub _phrase_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $phrases = shift or return $_[0];
|
||||
my $results = shift || {};
|
||||
|
||||
foreach my $phrase ( values %{$phrases} ) {
|
||||
$self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug});
|
||||
|
||||
my $tmp = {};
|
||||
foreach my $keyword ( @{ $phrase->{phrase}|| [] } ) {
|
||||
$tmp = $self->_get_words ( [ $keyword ], $tmp, 'intersect' );
|
||||
keys %$tmp or return {};
|
||||
}
|
||||
foreach my $key ( keys %$tmp ) { $results->{$key} += $tmp->{$key} }
|
||||
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _get_phrase {
|
||||
# ------------------------------------------------------------------------------
|
||||
# one day change this so it does words properly
|
||||
return _get_words(@_);
|
||||
}
|
||||
|
||||
sub _union_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
return _get_words(@_);
|
||||
}
|
||||
|
||||
sub _intersect_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
my ( $self, $keywords, $results ) = @_;
|
||||
$keywords or return $results;
|
||||
|
||||
foreach my $keyword ( keys %{ $keywords || {} } ) {
|
||||
$results = $self->_get_words ( [ $keyword ], $results, 'intersect' );
|
||||
keys %$results or return {};
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _phrase_intersect_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $phrases = shift or return $_[0];
|
||||
my $results = shift || {};
|
||||
|
||||
my $tmp = $self->_phrase_query ( $phrases, $results );
|
||||
keys %$results or return $tmp;
|
||||
foreach my $key ( keys %$results ) {
|
||||
if ( $tmp->{$key} ) {
|
||||
$results->{$key} += $tmp->{$key};
|
||||
}
|
||||
else {
|
||||
delete $results->{$key}
|
||||
}
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _disjoin_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $words = shift or return shift;
|
||||
my $results = shift || {};
|
||||
|
||||
$results = $self->_get_words([ keys %{$words || {}} ], $results, 'disjoin' );
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
sub _phrase_disjoin_query {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $phrases = shift or return shift;
|
||||
my $results = shift || {};
|
||||
|
||||
my $tmp = $self->_phrase_query ( $phrases, $results );
|
||||
keys %$results or return $tmp;
|
||||
foreach my $key ( keys %$results ) {
|
||||
$tmp->{$key} and delete $results->{$key};
|
||||
}
|
||||
}
|
||||
|
||||
sub _get_words {
|
||||
# ------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $words = shift or return $_[0] || {};
|
||||
my $results = shift || {};
|
||||
my $mode = lc shift;
|
||||
|
||||
my $tbl = $self->{table} or $self->error( 'BADARGS', 'FATAL', 'Must have table object defined' );
|
||||
my $tname = $tbl->name();
|
||||
my $wordlist = join " ", ( ref $words ? ( ref $words eq 'HASH' ? keys %$words : @$words ) : $words ) ;
|
||||
my ($pk) = $tbl->pk;
|
||||
|
||||
my %weights = $tbl->_weight_cols();
|
||||
my $cols = join(",", keys %weights);
|
||||
my $qwrds = quotemeta( $wordlist );
|
||||
my $where = ( $results and keys %$results )
|
||||
? ("AND $pk IN(" . join(',', keys %$results) . ")")
|
||||
: '';
|
||||
my $query = qq!
|
||||
SELECT $pk, MATCH($cols) AGAINST ('$qwrds') AS SCORE
|
||||
FROM $tname
|
||||
WHERE MATCH($cols) AGAINST ('$qwrds')
|
||||
$where
|
||||
!;
|
||||
my $sth = $tbl->do_query( $query ) or return;
|
||||
|
||||
if ( $mode eq 'disjoin' ) {
|
||||
while ( my $result = $sth->fetchrow ) {
|
||||
delete $results->{$result};
|
||||
}
|
||||
}
|
||||
elsif ( $mode eq 'intersect' ) {
|
||||
my $tmp = {};
|
||||
while ( my $aref = $sth->fetchrow_arrayref ) {
|
||||
$tmp->{$aref->[0]} = $aref->[1];
|
||||
}
|
||||
if ( $results and keys %$results ) {
|
||||
while (my ($id, $score) = each %$results) {
|
||||
if (not defined $tmp->{$id}) {
|
||||
delete $results->{$id};
|
||||
next;
|
||||
}
|
||||
$results->{$id} += $score;
|
||||
}
|
||||
}
|
||||
else {
|
||||
$results = $tmp;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while ( my $aref = $sth->fetchrow_arrayref ) {
|
||||
$results->{$aref->[0]} += $aref->[1];
|
||||
}
|
||||
}
|
||||
return $results;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,355 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::Search::MYSQL::VER4
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: VER4.pm,v 1.9 2004/01/13 01:35:19 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Class used to search indexed tables.
|
||||
#
|
||||
|
||||
package GT::SQL::Search::MYSQL::VER4;
|
||||
# ------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS /;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Preamble information related to the object
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.9 $ =~ /(\d+)\.(\d+)/;
|
||||
$STOPWORDS = { map { $_ => 1 } qw/
|
||||
|
||||
a's able about above according accordingly across actually after
|
||||
afterwards again against ain't all allow allows almost alone
|
||||
along already also although always am among amongst an and another
|
||||
any anybody anyhow anyone anything anyway anyways anywhere apart
|
||||
appear appreciate appropriate are aren't around as aside ask asking
|
||||
associated at available away awfully be became because become becomes
|
||||
becoming been before beforehand behind being believe below beside
|
||||
besides best better between beyond both brief but by c'mon c's came
|
||||
can can't cannot cant cause causes certain certainly changes clearly
|
||||
co com come comes concerning consequently consider considering
|
||||
contain containing contains corresponding could couldn't course currently
|
||||
definitely described despite did didn't different do does doesn't
|
||||
doing don't done down downwards during each edu eg eight either else
|
||||
elsewhere enough entirely especially et etc even ever every everybody
|
||||
everyone everything everywhere ex exactly example except far few
|
||||
fifth first five followed following follows for former formerly
|
||||
forth four from further furthermore get gets getting given gives
|
||||
go goes going gone got gotten greetings had hadn't happens hardly
|
||||
has hasn't have haven't having he he's hello help hence her here
|
||||
here's hereafter hereby herein hereupon hers herself hi him himself
|
||||
his hither hopefully how howbeit however i'd i'll i'm i've ie if ignored
|
||||
immediate in inasmuch inc indeed indicate indicated indicates inner
|
||||
insofar instead into inward is isn't it it'd it'll it's its itself
|
||||
just keep keeps kept know knows known last lately later latter latterly
|
||||
least less lest let let's like liked likely little look looking looks
|
||||
ltd mainly many may maybe me mean meanwhile merely might more
|
||||
moreover most mostly much must my myself name namely nd near nearly
|
||||
necessary need needs neither never nevertheless new next nine no
|
||||
nobody non none noone nor normally not nothing novel now nowhere
|
||||
obviously of off often oh ok okay old on once one ones only onto
|
||||
or other others otherwise ought our ours ourselves out outside over
|
||||
overall own particular particularly per perhaps placed please plus
|
||||
possible presumably probably provides que quite qv rather rd re
|
||||
really reasonably regarding regardless regards relatively respectively
|
||||
right said same saw say saying says second secondly see seeing seem
|
||||
seemed seeming seems seen self selves sensible sent serious seriously
|
||||
seven several shall she should shouldn't since six so some somebody
|
||||
somehow someone something sometime sometimes somewhat somewhere
|
||||
soon sorry specified specify specifying still sub such sup sure
|
||||
t's take taken tell tends th than thank thanks thanx that that's
|
||||
thats the their theirs them themselves then thence there there's
|
||||
thereafter thereby therefore therein theres thereupon these they
|
||||
they'd they'll they're they've think third this thorough thoroughly
|
||||
those though three through throughout thru thus to together too
|
||||
took toward towards tried tries truly try trying twice two un
|
||||
under unfortunately unless unlikely until unto up upon us use used
|
||||
useful uses using usually value various very via viz vs want wants
|
||||
was wasn't way we we'd we'll we're we've welcome well went were
|
||||
weren't what what's whatever when whence whenever where where's
|
||||
whereafter whereas whereby wherein whereupon wherever whether
|
||||
which while whither who who's whoever whole whom whose why will
|
||||
willing wish with within without won't wonder would would wouldn't
|
||||
yes yet you you'd you'll you're you've your yours yourself
|
||||
yourselves zero
|
||||
|
||||
/ };
|
||||
|
||||
$ATTRIBS = {
|
||||
min_word_size => 4,
|
||||
stopwords => $STOPWORDS,
|
||||
};
|
||||
|
||||
sub query {
|
||||
# --------------------------------------------------
|
||||
# Returns a sth based on a query
|
||||
#
|
||||
# Options:
|
||||
# - paging
|
||||
# mh : max hits
|
||||
# nh : number hit (or page of hits)
|
||||
#
|
||||
# - searching
|
||||
# ww : whole word
|
||||
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
||||
# substring : search for substrings of words
|
||||
# bool : 'and' => and search, 'or' => or search, '' => regular query
|
||||
# query : the string of things to ask for
|
||||
#
|
||||
# - filtering
|
||||
# field_name : value # Find all rows with field_name = value
|
||||
# field_name : ">value" # Find all rows with field_name > value.
|
||||
# field_name : "<value" # Find all rows with field_name < value.
|
||||
# field_name-gt : value # Find all rows with field_name > value.
|
||||
# field_name-lt : value # Find all rows with field_name < value.
|
||||
#
|
||||
# Parameters:
|
||||
# ( $CGI ) : a single cgi object
|
||||
# ( $HASH ) : a hash of the parameters
|
||||
#
|
||||
my $self = shift;
|
||||
|
||||
# create an easily accessible argument hash
|
||||
my $args = $self->common_param(@_);
|
||||
|
||||
# see if we can setup the filtering constraints
|
||||
my $filter = { %$args };
|
||||
my $query = delete $args->{query} || $self->{query} || '';
|
||||
my $ftr_cond;
|
||||
|
||||
# parse query
|
||||
$self->debug( "Search Query: $query" ) if ($self->{_debug});
|
||||
my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
|
||||
|
||||
$self->{rejected_keywords} = $rejected;
|
||||
|
||||
# setup the additional input parameters
|
||||
$query_struct = $self->_preset_options( $query_struct, $args );
|
||||
|
||||
# now sort into distinct buckets
|
||||
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
|
||||
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
|
||||
|
||||
# with the buckets, it's now possible to create a query string
|
||||
# that can be passed directly into the FULLTEXT search.
|
||||
my $query_string = '';
|
||||
|
||||
foreach my $search_type ( keys %$buckets ) {
|
||||
my $bucket = $buckets->{$search_type};
|
||||
foreach my $token ( keys %$bucket ) {
|
||||
next unless $token;
|
||||
my $properties = $bucket->{$token} or next;
|
||||
|
||||
my $e = ' ';
|
||||
|
||||
# handle boolean operations
|
||||
$properties->{mode} ||= '';
|
||||
if ( $properties->{mode} eq 'must' ) {
|
||||
$e .= '+';
|
||||
}
|
||||
elsif ( $properties->{mode} eq 'cannot' ) {
|
||||
$e .= '-';
|
||||
}
|
||||
|
||||
# deal with phrase vs keyword
|
||||
if ( $properties->{phrase} ) {
|
||||
$e .= '"' . quotemeta( $token ) . '"';
|
||||
}
|
||||
else {
|
||||
$e .= quotemeta $token;
|
||||
|
||||
# substring match
|
||||
$e .= '*' if $properties->{substring};
|
||||
}
|
||||
|
||||
$query_string .= $e;
|
||||
}
|
||||
}
|
||||
|
||||
# start building the GT::SQL::COndition object that will allow us to
|
||||
# to retreive the data
|
||||
|
||||
require GT::SQL::Condition;
|
||||
my $tbl = $self->{table};
|
||||
my $constraints = GT::SQL::Condition->new;
|
||||
|
||||
# create the GT::SQL::Condition object that will become the filtering
|
||||
# constraints
|
||||
my $filt = $self->{filter};
|
||||
|
||||
if ( $filt and ref $filt eq 'HASH' ) {
|
||||
foreach my $fkey ( keys %$filt ) {
|
||||
next if exists $args->{$fkey};
|
||||
$args->{$fkey} = $filt->{$fkey};
|
||||
}
|
||||
}
|
||||
|
||||
if ( my $filter_cond = $tbl->build_query_cond( $args ) ) {
|
||||
$constraints->add( $filter_cond );
|
||||
}
|
||||
|
||||
# if the cached filter object is a Condition object, append
|
||||
# it to the filter set
|
||||
if ( $filt and UNIVERSAL::isa( $filt, 'GT::SQL::Condition' ) ) {
|
||||
$constraints->add( $filt );
|
||||
}
|
||||
|
||||
# create our fulltext query condition
|
||||
my %weights = $tbl->_weight_cols();
|
||||
my $cols = join(",", keys %weights);
|
||||
if ( $query_string ) {
|
||||
$constraints->add( GT::SQL::Condition->new(
|
||||
"MATCH( $cols )",
|
||||
"AGAINST",
|
||||
\"('$query_string' IN BOOLEAN MODE)" ) );
|
||||
}
|
||||
|
||||
# calculate the cursor constraints
|
||||
foreach my $k (qw( nh mh so sb )) {
|
||||
next if defined $args->{$k};
|
||||
$args->{$k} = $self->{$k} || '';
|
||||
}
|
||||
$args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
|
||||
$args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
|
||||
$args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score';
|
||||
|
||||
# if the sorting method is "score" the order is forced to "descend" (as there
|
||||
# is almost no reason to order by worst matches)
|
||||
# if the storing key is not "score", the default order will be "ascend"
|
||||
$args->{so} =
|
||||
$args->{sb} eq 'score' ? 'desc' : # comment out this entire line to prevent "descend" order forcing
|
||||
( (defined $args->{so} and $args->{so} =~ /^(asc(?:end)?|desc(?:end)?)$/i) ? $1 : 'asc' );
|
||||
|
||||
# check that sb is not dangerous
|
||||
my $sb = $self->clean_sb($args->{sb}, $args->{so});
|
||||
|
||||
$self->debug_dumper( "About to query. Constraint object: ", $constraints) if ($self->{_debug});
|
||||
|
||||
# Setup a limit only if there is no callback. The callback argument requires a full results list
|
||||
unless ( $self->{callback} ) {
|
||||
my $offset = ( $args->{nh} - 1 ) * $args->{mh};
|
||||
$tbl->select_options($sb) if ($sb);
|
||||
$tbl->select_options("LIMIT $offset, $args->{mh}");
|
||||
}
|
||||
|
||||
my $sth;
|
||||
|
||||
# if the weights are all the same value, the query can be optimized
|
||||
# to use just one MATCH AGAINST argument. However, if the weights
|
||||
# are different, each element must be sectioned and queried separately
|
||||
# with the weight value multipler
|
||||
|
||||
# check to see if all the weight values are the same.
|
||||
my $base_weight;
|
||||
my $weights_same = 1;
|
||||
foreach ( values %weights ) {
|
||||
$base_weight ||= $_ or next; # init and skip 0s
|
||||
next if $base_weight == $_;
|
||||
$weights_same = 0;
|
||||
last;
|
||||
}
|
||||
|
||||
# multiplex the action
|
||||
my $result_cols = $self->{callback} ? ($tbl->pk)[0] : '*';
|
||||
|
||||
unless ( $query_string ) {
|
||||
$sth = $tbl->select( [ $result_cols ], $constraints ) or return;
|
||||
}
|
||||
elsif ( $weights_same ) {
|
||||
$sth = $tbl->select( [ $result_cols, "MATCH($cols) AGAINST ('$query_string' IN BOOLEAN MODE) AS score" ], $constraints )
|
||||
or return;
|
||||
}
|
||||
else {
|
||||
|
||||
# group the multiplier counts
|
||||
my %column_multiplier;
|
||||
foreach ( keys %weights ) {
|
||||
push @{$column_multiplier{$weights{$_}}}, $_;
|
||||
}
|
||||
|
||||
my @search_parameters;
|
||||
foreach my $val ( keys %column_multiplier ) {
|
||||
next unless $val;
|
||||
|
||||
my $cols_ar = $column_multiplier{ $val } or next;
|
||||
my $search_cols = join ",", @$cols_ar;
|
||||
|
||||
if ( $val > 1 ) {
|
||||
push @search_parameters, "( ( MATCH($search_cols) AGAINST ('$query_string' IN BOOLEAN MODE) ) * $val )";
|
||||
}
|
||||
else {
|
||||
push @search_parameters, "( MATCH($search_cols) AGAINST ('$query_string' IN BOOLEAN MODE) )";
|
||||
}
|
||||
}
|
||||
|
||||
my $search_sql = "( " . join( " + ", @search_parameters ) . " ) AS score";
|
||||
|
||||
$sth = $tbl->select( [ $result_cols, $search_sql ], $constraints )
|
||||
or return;
|
||||
}
|
||||
|
||||
# If we have a callback, we fetch the primary key => score and pass that hash into
|
||||
# the filter.
|
||||
if ($self->{callback}) {
|
||||
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
|
||||
return $self->error('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
|
||||
}
|
||||
my %results = map { $_->[0] => $_->[1] } @{$sth->fetchall_arrayref};
|
||||
|
||||
$self->debug_dumper("Running results through callback. Had: " . scalar (keys %results) . " results.", \%results) if ($self->{_debug});
|
||||
my $filtered = $self->{callback}->($self, \%results) || {};
|
||||
$self->debug_dumper("New result set: " . scalar (keys %$filtered) . " results.", $filtered) if ($self->{_debug});
|
||||
|
||||
$self->{rows} = scalar keys %$filtered;
|
||||
return $self->sth($filtered);
|
||||
}
|
||||
|
||||
# count the number of hits. create a query for this purpose only if we are required to.
|
||||
$self->{rows} = $sth->rows();
|
||||
if (($args->{nh} > 1) or ($self->{rows} == $args->{mh})) {
|
||||
$self->{rows} = $tbl->count($constraints);
|
||||
}
|
||||
return $sth;
|
||||
}
|
||||
|
||||
sub clean_sb {
|
||||
# -------------------------------------------------------------------------------
|
||||
# Convert the sort by, sort order into an sql string.
|
||||
#
|
||||
my ($class, $sb, $so) = @_;
|
||||
my $output = '';
|
||||
|
||||
return $output unless ($sb);
|
||||
|
||||
if ($sb and not ref $sb) {
|
||||
if ($sb =~ /^[\w\s,]+$/) {
|
||||
if ($sb =~ /\s(?:asc|desc)/i) {
|
||||
$output = 'ORDER BY ' . $sb;
|
||||
}
|
||||
else {
|
||||
$output = 'ORDER BY ' . $sb . ' ' . $so;
|
||||
}
|
||||
}
|
||||
else {
|
||||
$class->error('BADSB', 'WARN', $sb);
|
||||
}
|
||||
}
|
||||
elsif (ref $sb eq 'ARRAY') {
|
||||
foreach ( @$sb ) {
|
||||
/^[\w\s,]+$/ or $class->error( 'BADSB', 'WARN', $sb ), next;
|
||||
}
|
||||
$output = 'ORDER BY ' . join(',', @$sb);
|
||||
}
|
||||
return $output;
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,25 @@
|
||||
# ====================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::NONINDEXED::Indexer
|
||||
# Author: Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Indexer.pm,v 1.3 2004/01/13 01:35:20 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ====================================================================
|
||||
#
|
||||
|
||||
package GT::SQL::Search::NONINDEXED::Indexer;
|
||||
#--------------------------------------------------------------------------------
|
||||
use strict;
|
||||
use vars qw/@ISA $DEBUG/;
|
||||
use GT::SQL::Search::Base::Indexer;
|
||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::NONINDEXED::Indexer->new(@_)
|
||||
}
|
||||
|
||||
1;
|
@ -0,0 +1,257 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search::NONINDEXED::Search
|
||||
# Author : Alex Krohn
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.30 2006/08/09 06:58:39 brewt Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Nonindex search system
|
||||
#
|
||||
|
||||
package GT::SQL::Search::NONINDEXED::Search;
|
||||
# ==================================================================
|
||||
use strict;
|
||||
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG/;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
use GT::SQL::Condition;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
$DEBUG = 0;
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.30 $ =~ /(\d+)\.(\d+)/;
|
||||
$ATTRIBS = {
|
||||
# parse based on latin characters
|
||||
latin_query_parse => 0
|
||||
};
|
||||
|
||||
sub load {
|
||||
shift;
|
||||
return GT::SQL::Search::NONINDEXED::Search->new(@_)
|
||||
}
|
||||
|
||||
sub query {
|
||||
#--------------------------------------------------------------------------------
|
||||
# Returns a sth based on a query
|
||||
#
|
||||
# Options:
|
||||
# - paging
|
||||
# mh : max hits
|
||||
# nh : number hit (or page of hits)
|
||||
#
|
||||
# - searching
|
||||
# ww : whole word
|
||||
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
||||
# substring : search for substrings of words
|
||||
# bool : 'and' => and search, 'or' => or search, '' => regular query
|
||||
# query : the string of things to ask for
|
||||
#
|
||||
# - filtering
|
||||
# field_name : value # Find all rows with field_name = value
|
||||
# field_name : ">value" # Find all rows with field_name > value.
|
||||
# field_name : "<value" # Find all rows with field_name < value.
|
||||
# field_name-gt : value # Find all rows with field_name > value.
|
||||
# field_name-lt : value # Find all rows with field_name < value.
|
||||
#
|
||||
# Parameters:
|
||||
# ( $CGI ) : a single cgi object
|
||||
# ( $HASH ) : a hash of the parameters
|
||||
#
|
||||
my $self = shift;
|
||||
|
||||
# find out what sort of a parameter we're dealing with
|
||||
my $input = $self->common_param(@_);
|
||||
|
||||
# add additional parameters if required
|
||||
foreach my $parameter ( keys %{$ATTRIBS} ) {
|
||||
if ( not exists $input->{$parameter} ) {
|
||||
$input->{$parameter} = $self->{$parameter};
|
||||
}
|
||||
}
|
||||
|
||||
# parse query
|
||||
$self->debug( "Search Query: $$input{query}", 1 ) if ($self->{_debug});
|
||||
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
|
||||
|
||||
$self->{rejected_keywords} = $rejected;
|
||||
|
||||
# setup the additional input parameters
|
||||
$query = $self->_preset_options( $query, $input );
|
||||
|
||||
$self->debug( "Set the pre-options: ", $query ) if ($self->{_debug});
|
||||
|
||||
# now sort into distinct buckets
|
||||
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query );
|
||||
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
|
||||
|
||||
|
||||
require GT::SQL::Condition;
|
||||
my $query_condition = new GT::SQL::Condition;
|
||||
|
||||
# now handle the separate possibilities
|
||||
# the union
|
||||
my $union_cond = $self->_get_condition( $buckets->{keywords}, $buckets->{phrases} );
|
||||
$query_condition->add(GT::SQL::Condition->new(@$union_cond, 'OR')) if $union_cond;
|
||||
# the intersect
|
||||
my $intersect_cond = $self->_get_condition( $buckets->{keywords_must}, $buckets->{phrases_must} );
|
||||
$query_condition->add(GT::SQL::Condition->new(@$intersect_cond)) if $intersect_cond;
|
||||
|
||||
# the disjoin
|
||||
my $disjoin_cond = $self->_get_condition( $buckets->{keywords_cannot}, $buckets->{phrases_cannot} );
|
||||
$query_condition->add(GT::SQL::Condition->new(@$disjoin_cond, 'OR')->not) if $disjoin_cond;
|
||||
|
||||
# now handle filters
|
||||
my $cols = $self->{'table'}->cols();
|
||||
my %filters = map {
|
||||
(my $column = $_) =~ s/-[lg]t$//;
|
||||
exists $cols->{$column}
|
||||
? ($_ => $input->{$_})
|
||||
: ()
|
||||
} keys %{$input};
|
||||
|
||||
# if there was no query nor filter return nothing.
|
||||
keys %$query or keys %filters or return $self->sth({});
|
||||
|
||||
if (keys %filters) {
|
||||
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
|
||||
$self->_add_filters( \%filters );
|
||||
$query_condition = GT::SQL::Condition->new( keys %$query ? $query_condition : (), $self->{filter} );
|
||||
}
|
||||
elsif ($self->{filter} and keys %{$self->{filter}} ) {
|
||||
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
|
||||
$query_condition = GT::SQL::Condition->new( keys %$query ? $query_condition : (), $self->{filter} );
|
||||
}
|
||||
else {
|
||||
$self->debug( "No filters being used.") if ($self->{_debug});
|
||||
}
|
||||
|
||||
# now this query should probably clear the filters once it's been used, so i'll do that here
|
||||
$self->{filter} = undef;
|
||||
|
||||
my $tbl = $self->{table};
|
||||
my ($pk) = $tbl->pk;
|
||||
|
||||
# now run through a callback function if needed.
|
||||
if ($self->{callback}) {
|
||||
|
||||
# Warning: this slows things a heck of a lot.
|
||||
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
|
||||
return $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
|
||||
}
|
||||
|
||||
my $sth = $tbl->select( [ $pk ], $query_condition );
|
||||
my $results = {};
|
||||
while (my $result = $sth->fetchrow) {
|
||||
$results->{$result} = undef;
|
||||
}
|
||||
$self->debug_dumper("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
$results = $self->{callback}->($self, $results);
|
||||
$self->debug_dumper("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
|
||||
$self->{rows} = scalar($results ? keys %{$results} : ());
|
||||
|
||||
return $self->sth( $results );
|
||||
}
|
||||
|
||||
# and now create a search sth object to handle all this
|
||||
$input->{nh} = (defined $input->{nh} and $input->{nh} =~ /^(\d+)$/) ? $1 : 1;
|
||||
$input->{mh} = (defined $input->{mh} and $input->{mh} =~ /^(\d+)$/) ? $1 : 25;
|
||||
$input->{so} = (defined $input->{so} and $input->{so} =~ /^(asc(?:end)?|desc(?:end)?)$/i) ? $1 : '';
|
||||
|
||||
# check that sb is not dangerous
|
||||
my $sb = $self->clean_sb($input->{sb}, $input->{so});
|
||||
|
||||
my $offset = ( $input->{nh} - 1 ) * $input->{mh};
|
||||
$tbl->select_options($sb) if ($sb);
|
||||
$tbl->select_options("LIMIT $offset, $input->{mh}");
|
||||
my $sth = $tbl->select( $query_condition ) or return;
|
||||
|
||||
# so how many hits did we get?
|
||||
$self->{rows} = $sth->rows();
|
||||
if (($input->{nh} > 1) or ($self->{rows} == $input->{mh})) {
|
||||
$self->{rows} = $tbl->count($query_condition);
|
||||
}
|
||||
return $sth;
|
||||
}
|
||||
|
||||
sub _get_condition {
|
||||
#-------------------------------------------------------------------------------
|
||||
my ( $self, $keywords, $phrases ) = @_;
|
||||
|
||||
my @list = ( keys %$keywords, keys %$phrases );
|
||||
|
||||
my $tbl = $self->{table} or return $self->error( 'NODRIVER', 'FATAL' );
|
||||
my @cond = ();
|
||||
my %tmp = $tbl->weight();
|
||||
my @weights = keys %tmp or return;
|
||||
foreach my $element ( @list ) {
|
||||
my @where = ();
|
||||
foreach my $cols ( @weights ) {
|
||||
push @where, [$cols, 'LIKE', "%$element%"]; # Condition does quoting by default.
|
||||
}
|
||||
push @cond, GT::SQL::Condition->new(@where, 'OR');
|
||||
}
|
||||
@cond or return;
|
||||
|
||||
return \@cond;
|
||||
}
|
||||
|
||||
sub _parse_query_string {
|
||||
#------------------------------------------------------------
|
||||
# Parses a query string '+foo -"bar this" alpha' into a hash of
|
||||
# words and modes.
|
||||
#
|
||||
my ($self, $text) = @_;
|
||||
my %modes = (
|
||||
'+' => 'must',
|
||||
'-' => 'cannot',
|
||||
'<' => 'greater',
|
||||
'>' => 'less'
|
||||
);
|
||||
|
||||
# Latin will break up on actual words and punctuation.
|
||||
if ($self->{latin_query_parse}) {
|
||||
return $self->SUPER::_parse_query_string( $text );
|
||||
}
|
||||
else {
|
||||
my $words = {};
|
||||
my @terms;
|
||||
my $i = 0;
|
||||
foreach my $term (split /"/, $text) {
|
||||
push @terms, ($i++ % 2 ? $term : split ' ', $term);
|
||||
}
|
||||
for (my $i = 0; $i < @terms; $i++) {
|
||||
my $word = $terms[$i];
|
||||
$word =~ s/^\s*|\s*$//g;
|
||||
next if ($word eq '');
|
||||
if ($i < $#terms) {
|
||||
($word eq '-') and ($word = '-' . $terms[++$i]);
|
||||
($word eq '+') and ($word = '+' . $terms[++$i]);
|
||||
}
|
||||
$word =~ s/^([<>+-])//;
|
||||
my $mode = ($1 and $modes{$1} or 'can');
|
||||
my $substring = ($word =~ s/\*$//) || 0;
|
||||
if ($word =~ /\s/) {
|
||||
$words->{$word} = {
|
||||
mode => $mode,
|
||||
phrase => 1,
|
||||
substring => $substring,
|
||||
keyword => 0,
|
||||
};
|
||||
}
|
||||
elsif ($word) {
|
||||
$words->{$word} = {
|
||||
mode => $mode,
|
||||
phrase => 0,
|
||||
substring => $substring,
|
||||
keyword => 1,
|
||||
};
|
||||
}
|
||||
}
|
||||
return $words;
|
||||
}
|
||||
}
|
||||
|
||||
1;
|
Reference in New Issue
Block a user