First pass at adding key files

This commit is contained in:
dsainty
2024-06-17 21:49:12 +10:00
commit aa25e9347f
1274 changed files with 392549 additions and 0 deletions

View File

@ -0,0 +1,82 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::Base::Common
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Common.pm,v 1.8 2004/10/13 21:45:02 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Base classes upon which all search drivers are based
#
package GT::SQL::Search::Base::Common;
use strict;
use Exporter;
use vars qw/ @ISA @EXPORT $STOPWORDS /;
@ISA = qw( Exporter );
@EXPORT = qw( &_tokenize &_check_word $STOPWORDS );
$STOPWORDS = { map { $_ => 1 } qw/
of about or all several also she among since an some and such are than
as that at the be them because there been these between they both this
but those by to do toward during towards each upon either for from was
had were has what have when he where her which his while however with if
within in would into you your is it its many more most must on re it
test not above add am pm jan january feb february mar march apr april
may jun june jul july aug august sep sept september oct october nov
november dec december find &amp &gt &lt we http com www inc other
including
/ };
sub _tokenize {
#--------------------------------------------------------------------------------
# takes a strings and chops it up into little bits
my $self = shift;
my $text = shift;
my ( @words, $i, %rejected, $word, $code );
# split on any non-word (includes accents) characters
@words = split /[^\w\x80-\xFF\-]+/, lc $text;
$self->debug_dumper( "Words: ", \@words ) if ($self->{_debug});
# drop all words that are too small, etc.
$i = 0;
while ( $i <= $#words ) {
$word = $words[ $i ];
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
splice( @words, $i, 1 );
$rejected{$word} = $self->{'rejections'}->{$code};
}
else {
$i++; # Words ok.
}
}
$self->debug_dumper( "Accepted Words: ", \@words ) if ($self->{_debug});
$self->debug_dumper( "Rejected Words: ", \%rejected ) if ($self->{_debug});
return ( \@words, \%rejected );
}
sub _check_word {
#--------------------------------------------------------------------------------
# Returns an error code if it is an invalid word, otherwise returns nothing.
#
my $self = shift;
my $word = shift;
my $code;
if ((exists $self->{stopwords}{$word} and ($code = 'STOPWORD')) or
(length($word) < $self->{min_word_size} and $code = 'TOOSMALL' ) or
(length($word) > $self->{max_word_size} and $code = 'TOOBIG')) {
return $code;
}
return;
}
1;

View File

@ -0,0 +1,78 @@
# ====================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::Base::Indexer
# Author: Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Indexer.pm,v 1.4 2004/01/13 01:35:19 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ====================================================================
#
# Description:
#
#
package GT::SQL::Search::Base::Indexer;
use strict;
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
use GT::Base;
use GT::SQL::Search::Base::Common;
#--------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.4 $ =~ /(\d+)\.(\d+)/;
@ISA = qw/GT::Base GT::SQL::Search::Base::Common/;
$ATTRIBS = {
driver => undef,
stopwords => $STOPWORDS,
rejections => {
STOPWORD => "is a stopword",
TOOSMALL => "is too small a word",
TOOBIG => "is too big a word"
},
table => '',
init => 0,
debug => 0,
min_word_size => 3,
max_word_size => 50,
};
sub drop_search_driver { 1 }
sub add_search_driver { 1 }
# found in GT::SQL::Creator
sub pre_create_table { 1 }
sub post_create_table { 1 }
# GT::SQL::Editor
sub pre_add_column { 1 }
sub post_add_column { 1 }
sub pre_delete_column { 1 }
sub post_delete_column { 1 }
sub pre_drop_table { 1 }
sub post_drop_table { 1 }
# GT::SQL::Table
sub pre_add_record { 1 }
sub post_add_record { 1 }
sub pre_update_record { 1 }
sub post_update_record { 1 }
sub pre_delete_record { 1 }
sub post_delete_record { 1 }
sub pre_delete_all_records { 1 }
sub post_delete_all_records { 1 }
sub driver_ok { 1 }
sub reindex_all { 1 }
1;

View File

@ -0,0 +1,287 @@
# ====================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::STH
# Author: Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: STH.pm,v 1.25 2004/08/28 03:53:46 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ====================================================================
#
package GT::SQL::Search::STH;
#--------------------------------------------------------------------------------
use strict;
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE /;
use GT::Base;
@ISA = ('GT::Base');
$ATTRIBS = {
'_debug' => 0,
'sth' => undef,
'results' => {},
'db' => undef,
'table' => undef,
'index' => 0,
'order' => [],
'sb' => 'score',
'so' => '',
'score_col' => 'SCORE',
'score_sort'=> 0,
'nh' => 0,
'mh' => 0
};
$ERROR_MESSAGE = 'GT::SQL';
$ERRORS = {
BADSB => 'Invalid character found in so: "%s"',
};
sub init {
#--------------------------------------------------------------------------------
my $self = shift;
# setup the options
$self->set(@_);
# correct a few of the values
--$self->{nh} if $self->{nh};
my $sth;
my $results = $self->{results};
$self->{rows} = scalar( $results ? keys %{$results} : 0 );
# if we have asked to have sorting by another column (non score), create the part of the query that handles taht
$self->debug( "Setting up sorting for GT::SQL::Search::STH" ) if ($self->{_debug});
my $sb;
# clean up the sort by columns.
unless ($self->{'score_sort'}) {
$sb = GT::SQL::Search::Base::Search->clean_sb($self->{sb}, $self->{so});
}
# setup the max hits and the offsets
$self->{index} = $self->{nh} * $self->{mh} || 0;
$self->{max_index} = $self->{index} + $self->{mh} - 1; # index + mh is the first record which should not be returned.
if ( $self->{max_index} > $self->{rows} ) {
$self->{max_index} = $self->{rows};
$self->{rows} = $self->{rows} - $self->{index};
$self->{rows} < 0 ? $self->{rows} = 0 : 0;
}
else {
$self->{rows} = $self->{mh};
}
# if we are sorting by another column, handle that
if ( $sb and (keys %{$self->{results}})) {
my ( $table, $pk ) = $self->_table_info();
my ( $query, $where, $st, $limit );
$where = qq!$pk in(! . join( ",", keys %{$self->{results}}) . ')';
$self->{mh} and $limit = qq!LIMIT $self->{index}, $self->{rows}!;
$query = qq!
SELECT $pk
FROM $table
WHERE $where
$sb
$limit
!;
$self->debug( "Row fetch query: $query" ) if ($self->{_debug});
$sth = $self->{table}->{driver}->prepare( $query );
$sth->execute();
# fix the counts
$self->{index} = 0;
$self->{max_hits} = $self->{rows};
# now return them
my $order = $sth->fetchall_arrayref();
$sth->finish();
$self->{'order'} = [ map { $_->[0] } @{$order} ];
}
else {
$self->{'order'} = [ sort {
( $results->{$b} || return 0 ) <=> ( $results->{$a} || 0 )
} keys %{$results} ];
$self->debug_dumper( "Results will be presented in the following order: ", $self->{'order'} ) if ($self->{_debug});
}
}
sub cache_results {
#--------------------------------------------------------------------------------
my $self = shift;
my $results = $self->{'results'};
my ($sth, @records, $i, %horder, @order, $in_list);
my $table = $self->{table};
my $tname = $table->name();
my ($pk) = $self->{table}->pk;
use GT::SQL::Condition;
# we know what we're doing here so shut off warns (complains about uninit'd values in range
# if thee aren't enough elements in the order array)
my $w = $^W; $^W = 0;
@order = grep $_, @{$self->{'order'} || []}[$self->{index}..$self->{max_index}] or return [];
$^W = $w;
$i = 0; %horder = ( map { ( $_ => $i++) } @order );
$in_list = join ( ",", @order );
my $query = qq|
SELECT *
FROM
$tname
WHERE
$pk IN($in_list)
|;
# the following is left commented out as...
# if $tbl->select is used $table->hits() will not
# return an accurate count of the number of all the hits. instead, will return
# a value up to mh. $tbl->hits() is important because the value is used
# in toolbar calculations
#
# $sth = $table->select( GT::SQL::Condition->new( $pk, 'IN', \"($in_list)" ) );
$sth = $table->do_query( $query );
while ( my $href = $sth->fetchrow_hashref() ) {
$records[$horder{$href->{$pk}}] = \%$href
}
return \@records;
}
sub fetchrow_array {
#--------------------------------------------------------------------------------
return @{ $_[0]->fetchrow_arrayref() || [] };
}
sub fetchrow_arrayref {
#--------------------------------------------------------------------------------
my $self = shift;
my $records = $self->{cache} ||= $self->cache_results;
my $href = shift @$records or return;
return $self->_hash_to_array($href);
}
sub fetchrow_hashref {
#--------------------------------------------------------------------------------
my $self = shift;
my $results = $self->{'results'};
my $records = $self->{cache} ||= $self->cache_results;
my $table = $self->{table};
my ($pk) = $self->{table}->pk;
my $href = shift @$records or return;
$href->{$self->{'score_col'}} = ( $self->{score} = $results->{$href->{$pk}} );
return $href;
}
sub fetchall_hashref {
#--------------------------------------------------------------------------------
my $self = shift;
my @results;
while (my $res = $self->fetchrow_hashref) {
push @results, $res;
}
return \@results;
}
sub fetchall_list {
#--------------------------------------------------------------------------------
return { map { @$_ } @{shift->fetchall_arrayref} }
}
sub fetchall_arrayref {
#--------------------------------------------------------------------------------
my $self = shift;
$self->{order} or return [];
my $results = $self->{results};
my ($pk) = $self->{table}->pk;
my $scol = $self->{score_col};
if (!$self->{allref_cache}) {
$self->{allref_cache} ||= $self->cache_results;
for my $i ( 0 .. $#{$self->{allref_cache}} ) {
my $element = $self->{allref_cache}->[$i];
if ( $_[0] eq 'HASH' ) {
$element->{$scol} = $results->{$element->{$pk}};
}
else {
$element->{$scol} = $self->_hash_to_array( $element->{$scol} );
}
};
}
my $records = $self->{allref_cache};
return $records;
}
sub score {
#--------------------------------------------------------------------------------
my $self = shift;
return $self->{score};
}
sub _hash_to_array {
#--------------------------------------------------------------------------------
my $self = shift;
my $href = shift or return;
my $results = $self->{'results'};
my $table = $self->{table};
my $cols = $table->cols();
my $ordercols = ( $self->{ordercols} ||= [ sort { $cols->{$a}->{pos} <=> $cols->{$b}->{pos} } keys %$cols ] );
my ($pk) = $self->{table}->pk;
my $aref = [ map { $href->{$_} } @$ordercols, ( $self->{score} = $results->{$href->{$pk}} ) ];
return $aref;
}
sub rows {
#--------------------------------------------------------------------------------
my $self = shift;
return $self->{rows};
}
sub _table_info {
#--------------------------------------------------------------------------------
my $self = shift;
my $table = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
my ($pk) = $self->{table}->pk;
return ( $table, $pk );
}
sub DESTROY {
#--------------------------------------------------------------------------------
my $self = shift;
$self->{'sth'} and $self->{'sth'}->finish();
}
sub debug_dumper {
#--------------------------------------------------------------------------------
# calls debug but also dumps all the messages
my $self = shift;
my $message = shift;
my $level = ref $_[0] ? 1 : shift;
if ( $self->{_debug} >= $level ) {
require GT::Dumper;
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )) if ($self->{_debug});
}
}
1;

View File

@ -0,0 +1,572 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::Base
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.25 2004/08/28 03:53:46 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Base classes upon which all search drivers are based
#
package GT::SQL::Search::Base::Search;
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
use GT::Base;
use GT::SQL::Search::Base::Common;
@ISA = qw( GT::Base GT::SQL::Search::Base::Common);
#--------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.25 $ =~ /(\d+)\.(\d+)/;
@ISA = qw/ GT::Base /;
$ATTRIBS = {
'stopwords' => $STOPWORDS,
'mh' => 25,
'nh' => 1,
'ww' => undef,
'ma' => undef,
'bool' => undef,
'substring' => 0,
'query' => '',
'sb' => 'score',
'so' => '',
'score_col' => 'SCORE',
'score_sort'=> 0,
'debug' => 0,
'_debug' => 0,
# query related
'db' => undef,
'table' => undef,
'filter' => undef,
'callback' => undef,
# strict matching of indexed words, accents on words do count
'sm' => 0,
'min_word_size' => 3,
'max_word_size' => 50,
};
sub init {
#--------------------------------------------------------------------------------
# Initialises the Search object
#
my $self = shift;
my $input = $self->common_param(@_);
$self->set($input);
# now handle filters...,
my $tbl = $self->{table};
my $cols = $tbl->cols();
my %filters = map {
(my $tmp = $_) =~ s/-[lg]t$//;
exists $cols->{$tmp} ? ($_ => $input->{$_}) : ()
} keys %{$input};
if ( keys %filters ) {
$self->debug_dumper( "INIT: Creating Filters: ", \%filters ) if ( $self->{_debug} );
$self->filter(\%filters);
}
$self->{table}->connect;
}
sub query {
#--------------------------------------------------------------------------------
# Returns a sth based on a query
#
# Options:
# - paging
# mh : max hits
# nh : number hit (or page of hits)
#
# - searching
# ww : whole word
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
# substring : search for substrings of words
# bool : 'and' => and search, 'or' => or search, '' => regular query
# query : the string of things to ask for
#
# - filtering
# field_name : value # Find all rows with field_name = value
# field_name : ">value" # Find all rows with field_name > value.
# field_name : "<value" # Find all rows with field_name < value.
# field_name-gt : value # Find all rows with field_name > value.
# field_name-lt : value # Find all rows with field_name < value.
#
# Parameters:
# ( $CGI ) : a single cgi object
# ( $HASH ) : a hash of the parameters
#
my $self = shift;
# find out what sort of a parameter we're dealing with
my $input = $self->common_param(@_);
# add additional parameters if required
foreach my $parameter ( keys %{$ATTRIBS} ) {
if ( not exists $input->{$parameter} ) {
$input->{$parameter} = $self->{$parameter};
}
}
# parse query...,
$self->debug( "Search Query: $$input{query}", 1 ) if ($self->{_debug});
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
$self->{'rejected_keywords'} = $rejected;
# setup the additional input parameters
$query = $self->_preset_options( $query, $input );
$self->debug_dumper( "Set the pre-options: ", $query ) if ($self->{_debug});
# now sort into distinct buckets
my $buckets = &_create_buckets( $query );
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
return $self->_query($input, $buckets);
}
sub _query {
#--------------------------------------------------------------------------------
my ( $self, $input, $buckets ) = @_;
# now handle the separate possibilities
my $results = {};
# query can have phrases
$results = $self->_phrase_query( $buckets->{phrases}, $results );
$self->debug_dumper("Did phrase union query. Current result set and scores: ", $results ) if ($self->{_debug});
# query have keywords
$results = $self->_union_query( $buckets->{keywords}, $results );
$self->debug_dumper("Did keyword union query. Current result set and scores: ", $results ) if ($self->{_debug});
# query must have phrases
$results = $self->_phrase_intersect_query( $buckets->{phrases_must}, $results );
$self->debug_dumper("Did phrase intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
# query must have keywords
$results = $self->_intersect_query( $buckets->{keywords_must}, $results );
$self->debug_dumper("Did keyword intersect query. Current result set and scores: ", $results ) if ($self->{_debug});
# query cannot have keywords
$results = $self->_disjoin_query( $buckets->{keywords_cannot}, $results );
$self->debug_dumper("Did keyword disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
# query cannot have phrases
$results = $self->_phrase_disjoin_query( $buckets->{phrases_cannot}, $results);
$self->debug_dumper("Did phrase disjoin query. Current result set and scores: ", $results ) if ($self->{_debug});
# now handle filters
my $cols = $self->{'table'}->cols();
my %filters = map {
(my $tmp = $_) =~ s/-[lg]t$//;
$cols->{$tmp} ? ($_ => $input->{$_}) : ()
} keys %{$input};
if (keys %filters) {
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
$results = $self->filter(\%filters, $results);
}
elsif ($self->{filter}) {
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
$results = $self->_filter_query( $self->{filter}, $results );
}
else {
$self->debug( "No filters being used.") if ($self->{_debug});
}
# now this query should probably clear the filters once it's been used, so i'll dothat here
$self->{filter} = undef;
# now run through a callback function if needed.
if ($self->{callback}) {
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
return $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
}
$self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
$results = $self->{callback}->($self, $results);
$self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
}
# so how many hits did we get?
$self->{table}->{last_hits} = ( $self->{rows} = scalar($results ? keys %{$results} : ()) );
# and now create a search sth object to handle all this
return $self->sth( $results );
}
sub sth {
#--------------------------------------------------------------------------------
my $self = shift;
my $results = shift;
require GT::SQL::Search::Base::STH;
my $sth = GT::SQL::Search::STH->new(
'results' => $results,
'db' => $self->{table}->{driver},
# pass the following attributes down to the STH handler
map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
);
return $sth;
}
sub rows {
#--------------------------------------------------------------------------------
# after a query is run, returns the number of rows
my $self = shift;
return $self->{rows} || 0;
}
sub _add_filters {
#--------------------------------------------------------------------------------
# creates the filter object
my $self = shift;
my $filter;
# find out how we're calling the parameters
if ( ref $_[0] eq 'GT::SQL::Condition' ) {
$filter = shift;
}
elsif ( ref $_[0] eq 'HASH' ) {
# setup the query condition using the build_query condition method
# build the condition object
my %opts = %{ shift() || {} };
delete $opts{query};
$filter = $self->{table}->build_query_cond( \%opts, $self->{table}{schema}{cols} );
}
else {
return $self->error ('BADARGS', 'FATAL', "Invalid argument: $_[0] passed to _add_filter");
}
# Use ref, as someone can pass in filter => 1 and mess things up.
ref $self->{filter} ? $self->{filter}->add ($filter) : ($self->{filter} = $filter);
$self->debug_dumper( "Filters: ", $self->{filter} ) if ($self->{_debug});
return $self->{filter};
}
sub _preset_options {
#--------------------------------------------------------------------------------
# sets up word parameters
my $self = shift;
my $query = shift or return;
my $input = shift or return $query;
# whole word searching
if ( defined $input->{'ww'} or defined $self->{'ww'}) {
if ( defined $input->{'ww'} ? $input->{'ww'} : $self->{'ww'} ) {
for ( keys %{$query} ) { $query->{$_}->{mode} = 'substring'; }
}
}
# substring searching
if ( defined $input->{'substring'} or defined $self->{'substring'}) {
if ( defined $input->{'substring'} ? $input->{'substring'} : $self->{'substring'} ) {
for ( keys %{$query} ) { $query->{$_}->{'substring'} = 1; }
}
}
if ( defined $input->{'ma'} or defined $self->{'ma'} ) {
# each keyword must be included
if ( defined $input->{'ma'} ? $input->{'ma'} : $self->{'ma'} ) {
for ( keys %{$query} ) {
next if $query->{$_}->{mode} eq 'cannot';
$query->{$_}->{mode} = 'must';
}
}
# each word can be included but is not necessary
else {
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
}
}
# some more and or searches, only if user hasn't put +word -word
if ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'and' ) {
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
for ( keys %{$query} ) {
next if $query->{$_}->{mode} eq 'cannot';
$query->{$_}->{mode} = 'must';
}
}
}
elsif ( lc( $input->{'bool'} || $self->{'bool'} ) eq 'or' ) {
unless ($input->{query} =~ /(?:^|\s)[+-]\w/) {
for ( keys %{$query} ) { $query->{$_}->{mode} = 'can'; }
}
}
return $query;
}
sub _phrase_query { $_[1] }
sub _union_query { $_[1] }
sub _phrase_intersect_query { $_[1] }
sub _intersect_query { $_[1] }
sub _disjoin_query { $_[1] }
sub _phrase_disjoin_query { $_[1] }
sub filter {
#--------------------------------------------------------------------------------
# adds a filter
#
my $self = shift;
# add filters..,
my $filters = $self->_add_filters( shift );
my $results = shift;
# see if we need to execute a search, otherwise just return the current filterset
defined $results or return $results;
# start doing the filter stuff
return $self->_filter_query( $filters, $results );
}
sub _parse_query_string {
#------------------------------------------------------------
# from Mastering Regular Expressions altered a fair bit
# takes a space delimited string and breaks it up.
#
my $self = shift;
my $text = shift;
my %words = ();
my %reject = ();
my %mode = (
'+' => 'must',
'-' => 'cannot',
'<' => 'greater',
'>' => 'less'
);
# work on the individual elements
my @new = ();
while ( $text =~ m{
# the first part groups the phrase inside the quotes.
# see explanation of this pattern in MRE
([+-]?"[^\"\\]*(?:\\.[^\"\\]*)*)" ?
| (\+?[\w\x80-\xFF\-\*]+),?
| ' '
}gx ) {
my $match = lc $+;
# strip out buffering spaces
$match =~ s/^\s+//; $match =~ s/\s+$//;
# don't bother trying if there is nothing there
next unless $match;
# find out the searching mode
my ($mode, $substring, $phrase);
if (my $m = $mode{substr($match,0,1)}) {
$match = substr($match,1);
$mode = $m;
}
# do we need to substring match?
if ( substr( $match, -1, 1 ) eq "*" ) {
$match = substr($match,0,length($match)-1);
$substring = 1;
}
# find out if we're dealing with a phrase
if ( substr($match,0,1) eq '"' ) {
$self->debug( "Dealing with a phrase: $match" ) if ($self->{_debug});
$match = substr($match,1);
# however, we want to make sure it's a phrase and not something else
my ( $word_list, $rejected ) = $self->_tokenize( $match );
$self->debug_dumper( "Phrase reduced to: ", $word_list ) if ($self->{_debug});
$self->debug_dumper( "Phrase words rejected: ", $rejected ) if ($self->{_debug});
my $word_count = @$word_list;
if ( $word_count > 1 ) { $phrase = $word_list } # ok, standard phrase
elsif ($word_count == 1) { $match = $word_list->[0] } # just a standard word, don't worry about searching by phrase
}
# make sure we can use this word
if ( not $phrase and my $code = $self->_check_word( $match, $self->{stopwords} ) ) {
$reject{ $match } = $code;
next;
}
# now, see if we should toss this word
$words{$match} = {
mode => $mode,
phrase => $phrase,
substring => $substring,
keyword => not $phrase,
};
}
# words is a hashref of:
# {
# word => {
# paramaters => 'values'
# },
# word1 => {
# ...
# },
# ...
# }
#
return( \%words, \%reject );
}
sub _filter_query {
#--------------------------------------------------------------------------------
# get the results from the filter
#
my $self = shift;
my $filters = shift;
my $results = shift or return {};
keys %{$results} or return $results;
my $table = $self->{table};
my $tname = $table->name();
# setup the where clause
my $where = $filters->sql() or return $results;
my ($pk) = $table->pk;
$where .= qq! AND $pk IN (! . join(',', keys %$results) . ')';
# now do the filter
my $query = qq!
SELECT $pk
FROM
$tname
WHERE
$where
!;
$self->debug( "Filter Query: $query" ) if ($self->{_debug});
my $sth = $self->{table}->{driver}->prepare($query);
$sth->execute();
# get all the results
my $aref = $sth->fetchall_arrayref;
return {
map {
$_->[0] => $results->{$_->[0]}
} @$aref
};
}
sub _create_buckets {
#------------------------------------------------------------
# takes the output from _parse_query_string and creates a
# bucket hash of all the different types of searching
# possible
my $query = shift or return;
my %buckets;
# put each word in the appropriate hash bucket
foreach my $parameter ( keys %{$query} ) {
my $word_data = $query->{$parameter};
# the following is slower, however, done that way to be syntatically legible
if ( $word_data->{'phrase'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'} =~ /(must|cannot)/ ))) {
$buckets{"phrases_$1"}->{$parameter} = $word_data;
}
elsif ( $word_data->{'phrase'} ) {
$buckets{'phrases'}->{$parameter} = $word_data;
}
elsif ( $word_data->{'keyword'} and ( defined $word_data->{'mode'} and ($word_data->{'mode'}=~ /(must|cannot|greater|less)/) )) {
$buckets{"keywords_$1"}->{$parameter} = $word_data;
}
else {
$buckets{'keywords'}->{$parameter} = $word_data;
}
}
return \%buckets;
}
sub alternate_driver_query {
#--------------------------------------------------------------------------------
my ( $self, $drivername, $input ) = @_;
$drivername = uc $drivername;
require GT::SQL::Search;
my $driver = GT::SQL::Search->load_search({ %$input, table => $self->{table}, driver => $drivername });
my $sth = $driver->query( $input );
foreach ( qw( rows rejected_keywords ) ) { $self->{$_} = $driver->{$_}; }
return $sth;
}
sub clean_sb {
# -------------------------------------------------------------------------------
# Convert the sort by, sort order into an sql string.
#
my ($class, $sb, $so) = @_;
my $output = '';
return $output unless ($sb);
# Remove score attribute, used only for internal indexes.
$sb =~ s/^\s*score\b//;
$sb =~ s/,?\s*\bscore\b//;
if ($sb and not ref $sb) {
if ($sb =~ /^[\w\s,]+$/) {
if ($sb =~ /\s(?:asc|desc)/i) {
$output = 'ORDER BY ' . $sb;
}
else {
$output = 'ORDER BY ' . $sb . ' ' . $so;
}
}
else {
$class->error('BADSB', 'WARN', $sb);
}
}
elsif (ref $sb eq 'ARRAY') {
foreach ( @$sb ) {
/^[\w\s,]+$/ or $class->error( 'BADSB', 'WARN', $sb ), next;
}
$output = 'ORDER BY ' . join(',', @$sb);
}
return $output;
}
sub debug_dumper {
#--------------------------------------------------------------------------------
# calls debug but also dumps all the messages
my $self = shift;
my $message = shift;
my $level = ref $_[0] ? 1 : defined $_[0] ? shift : 0;
if ( $self->{_debug} >= $level ) {
require GT::Dumper;
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ )) if ($self->{_debug});
}
}
1;

View File

@ -0,0 +1,411 @@
# ====================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::INTERNAL::Indexer
# Author: Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Indexer.pm,v 1.11 2004/01/13 01:35:19 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ====================================================================
#
package GT::SQL::Search::INTERNAL::Indexer;
# ------------------------------------------------------------------------------
# Preamble information related to the object
use strict;
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG /;
use GT::SQL::Search::Base::Indexer;
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.11 $ =~ /(\d+)\.(\d+)/;
sub load {
shift;
return GT::SQL::Search::INTERNAL::Indexer->new(@_)
}
sub drop_search_driver {
# ------------------------------------------------------------------------------
my $self = shift;
my $table = $self->{table}->name;
my $rc1 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Word_List");
my $rc2 = $self->{table}->do_query(qq!DROP TABLE $table! ."_Score_List");
return 1;
}
sub add_search_driver {
# ------------------------------------------------------------------------------
my $self = shift;
my $name = $self->{table}->name;
# first create the table that handles the words.
my $creator = $self->{table}->creator ( $name . "_Word_List" );
$creator->cols(
Word_ID => {
pos => 1,
type => 'int',
not_null => 1,
unsigned => 1
},
Word => {
pos => 2,
type => 'varchar',
not_null=> 1,
size => '50'
},
Frequency => {
pos => 3,
type => 'int',
not_null=> 1
}
);
$creator->pk('Word_ID');
$creator->ai('Word_ID');
$creator->unique({ $name . "_wordndx" => ['Word'] });
$creator->create('force') or return;
# now create the handler for scores
$creator = $self->{table}->creator( $name . '_Score_List' );
$creator->cols(
Word_ID => {
pos => 1,
type => 'int',
not_null => 1,
unsigned => 1
},
Item_ID => {
pos => 2,
type => 'int',
not_null => 1,
unsigned => 1
},
Score => {
pos => 3,
type => 'int',
not_null => 1
},
Word_Pos => {
pos => 4,
type => 'int',
not_null => 1
}
);
$creator->index({ 'wndx' => ['Word_ID', 'Item_ID', 'Score'], 'itndx' => ['Item_ID'] });
$creator->create('force') or return;
return 1;
}
sub post_create_table {
# ------------------------------------------------------------------------------
# creates the index tables..
#
return $_[0]->add_search_driver(@_);
}
sub post_drop_table {
# -------------------------------------------------------
# Remove the index tables.
#
return $_[0]->drop_search_driver(@_);
}
sub init_queries {
# -------------------------------------------------------
# Pre-load all our queries.
#
my $self = shift;
my $queries = shift;
my $driver = $self->{table}->{driver} or return $self->error ('NODRIVER', 'FATAL');
my $table_name = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
my $wtable = $table_name . '_Word_List';
my $seq = $wtable . '_seq';
my $stable = $table_name . '_Score_List';
my %ai_queries = (
ins_word_ORACLE => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES ($seq.NEXTVAL, ?, ?)",
ins_word_PG => "INSERT INTO $wtable (Word_ID, Word, Frequency) VALUES (NEXTVAL('$seq'), ?, ?)",
ins_word => "INSERT INTO $wtable (Word, Frequency) VALUES (?, ?)"
);
my %queries = (
upd_word => "UPDATE $wtable SET Frequency = ? WHERE Word_ID = ?",
sel_word => "SELECT Word_ID,Word,Frequency FROM $wtable WHERE Word = ?",
sel_freq => "SELECT Frequency FROM $wtable WHERE Word_ID = ?",
del_word => "DELETE FROM $wtable WHERE Word_ID = ?",
mod_word => "UPDATE $wtable SET Frequency = Frequency - ? WHERE Word_ID = ?",
ins_scor => "INSERT INTO $stable (Word_ID, Item_ID, Score, Word_Pos) VALUES (?, ?, ?, ?)",
item_cnt => "SELECT Word_ID, COUNT(*) FROM $stable WHERE Item_ID = ? GROUP BY Word_ID",
scr_del => "DELETE FROM $stable WHERE Item_ID = ?",
dump_word => "DELETE FROM $wtable",
dump_scor => "DELETE FROM $stable"
);
my $type = uc $self->{table}->{connect}->{driver};
$self->{ins_word} = $driver->prepare($ai_queries{"ins_word_$type"} || $ai_queries{"ins_word"});
# check to see if the table exist
$self->{table}->new_table( $wtable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
$self->{table}->new_table( $stable ) or return $self->error('CANTPREPARE','WARN', 'Loading of table', $GT::SQL::error);
if ($type eq 'MYSQL') {
foreach my $query (keys %queries) {
$self->{$query} = $driver->prepare_raw ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
}
}
else {
foreach my $query (keys %queries) {
$self->{$query} = $driver->prepare ($queries{$query}) or return $self->error ('CANTPREPARE', 'WARN', $query, $GT::SQL::error);
}
}
}
sub post_add_record {
# -------------------------------------------------------
# indexes a single record
my ($self, $rec, $insert_sth ) = @_;
# Only continue if we have weights and a primary key.
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
my %weights = $tbl->_weight_cols() or return;
my ($pk) = $tbl->pk();
my $item_id = ( $tbl->ai() and $insert_sth ) ? $insert_sth->insert_id() : $rec->{$pk};
my $index = 0;
$self->{init} or $self->init_queries;
# Go through each column and index it.
foreach my $column ( keys %weights ) {
my ($word_list, $rejected) = $self->_tokenize( $rec->{$column} );
$word_list or next;
# Build a hash of word => frequency.
my %words;
foreach my $word (@{$word_list}) {
$words{$word}++;
}
# Add the words in, or update frequency.
my %word_ids = ();
while (my ($word, $freq) = each %words) {
$self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
my $word_r = $self->{sel_word}->fetchrow_arrayref; # Word_ID, Word, Frequency
if ($word_r) {
$word_r->[2] += $freq;
$word_ids{$word} = $word_r->[0];
$self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
}
else {
$self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
$word_ids{$word} = $self->{ins_word}->insert_id();
}
}
# now that we have the word ids, insert each of the word-points
my $weight = $weights{$column};
foreach my $word ( @{$word_list} ) {
$self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
}
$index++;
}
return 1;
}
sub reindex_all {
# -------------------------------------------------------
my $self = shift;
my $table = shift;
my $opts = shift;
my $tick = $opts->{tick} || 0;
my $max = $opts->{max} || 5000;
my %weights = $self->{table}->_weight_cols() or return;
my @weight_list = keys %weights;
my @weight_arr = map { $weights{$_} } @weight_list;
my ($pk) = $self->{table}->pk();
my $index = 0;
my $word_id = 1;
$self->{init} or $self->init_queries;
# first nuke the current index
$self->dump_index();
# Go through the table and index each field.
my $iterations = 1;
my $count = 0;
while (1) {
if ($max) {
my $offset = ($iterations-1) * $max;
$table->select_options ( "LIMIT $offset,$max");
}
my $cond = $opts->{cond} || {};
my $sth = $table->select($cond, [ $pk, @weight_list] );
my $done = 1;
while ( my $arrayref = $sth->fetchrow_arrayref() ) {
# the primary key value
my $i = 0;
my $item_id = $arrayref->[($i++)];
$index = 0;
$done = 0;
# start going through the record data
foreach my $weight ( @weight_arr ) {
my ($word_list, $junk) = $self->_tokenize( $arrayref->[$i++] );
$word_list or next;
# Build a hash of word => frequency.
my %words;
foreach my $word (@{$word_list}) {
$words{$word}++;
}
# Add the words in, or update frequency.
my %word_ids = ();
while (my ($word, $freq) = each %words) {
$self->{sel_word}->execute($word) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
my $word_r = $self->{sel_word}->fetchrow_arrayref; # WordID,Word,Freq
if ($word_r) {
$word_r->[2] += $freq;
$word_ids{$word} = $word_r->[0];
$self->{upd_word}->execute ($word_r->[2], $word_r->[0]) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
}
else {
$self->{ins_word}->execute ($word, $words{$word}) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
$word_ids{$word} = $self->{ins_word}->insert_id();
}
}
# now that we have the word ids, insert each of the word-points
foreach my $word ( @{$word_list} ) {
$self->{ins_scor}->execute ($word_ids{$word}, $item_id, $weight, $index++) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
}
$index++;
}
if ($tick) {
$count++;
$count % $tick or (print "$count ");
$count % ($tick*10) or (print "\n");
}
}
return if ($done);
$iterations++;
return if (! $max);
}
}
sub pre_delete_record {
# -------------------------------------------------------
# Delete a records index values.
#
my $self = shift;
my $where = shift;
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
my %weights = $tbl->_weight_cols() or return;
my ($pk) = $tbl->pk();
my $q = $tbl->select( $where, [ $pk ] );
while ( my $aref = $q->fetchrow_arrayref() ) {
my $item_id = $aref->[0] or next;
my @weight_list = keys %weights;
my $index = 0;
$self->{init} or $self->init_queries;
# Get a frequency count for each word
$self->{item_cnt}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
# Now go through and either decrement the freq, or remove the entry.
while ( my ($word_id, $frequency) = $self->{item_cnt}->fetchrow_array() ) {
$self->{sel_freq}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
$self->debug( "Deleting frequencies for $word_id. decreasing by $frequency" ) if ($self->{_debug});
if (my $freq = $self->{sel_freq}->fetchrow_arrayref) {
if ($freq->[0] == $frequency) {
$self->{del_word}->execute($word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
}
else {
$self->{mod_word}->execute($frequency, $word_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
}
}
}
# Remove the listings from the scores table.
$self->{scr_del}->execute($item_id) or return $self->error ('CANTEXECUTE', 'WARN', $DBI::errstr);
}
return 1;
}
sub post_update_record {
# -------------------------------------------------------
my ( $self, $set_cond, $where_cond, $tmp ) = @_;
# delete the previous record
$self->pre_delete_record( $where_cond ) or return;
#
# the new record
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
my $q = $tbl->select( $where_cond );
while ( my $href = $q->fetchrow_hashref() ) {
$self->post_add_record( $href );
}
return 1;
}
sub reindex_record {
# -------------------------------------------------------
# reindexes a record. basically deletes all associated records from current db abnd does an index.
# it's safe to use this
my $self = shift;
my $rec = shift;
$self->delete_record($rec);
$self->index_record($rec);
}
sub dump_index {
# -------------------------------------------------------
my $self = shift;
$self->{init} or $self->init_queries;
$self->{dump_word}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
$self->{dump_scor}->execute() or $self->error('CANTEXECUTE', 'WARN', $DBI::errstr);
}
sub debug_dumper {
# ------------------------------------------------------------------------------
# calls debug but also dumps all the messages
my $self = shift;
my $message = shift;
my $level = ref $_[0] ? 1 : shift;
if ( $self->{_debug} >= $level ) {
require GT::Dumper;
$self->debug( $message . join( "", map { GT::Dumper::Dumper($_) } @_ ));
}
}
sub DESTROY {
# ------------------------------------------------------------------------------
# Calls finish on init queries.
#
my $self = shift;
return unless ($self->{init});
$self->{upd_word}->finish;
# $self->{ins_word}->finish; will get finished automatically
$self->{sel_word}->finish;
$self->{sel_freq}->finish;
$self->{del_word}->finish;
$self->{mod_word}->finish;
$self->{ins_scor}->finish;
$self->{item_cnt}->finish;
$self->{scr_del}->finish;
$self->{dump_word}->finish;
$self->{dump_scor}->finish;
$self->{init} = 0;
}
1;

View File

@ -0,0 +1,604 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Indexer
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.18 2004/08/28 03:53:47 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to make changes to tables and create tables.
#
package GT::SQL::Search::INTERNAL::Search;
# ------------------------------------------------------------------------------
use strict;
use vars qw/@ISA $VERSION $DEBUG $ATTRIBS /;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );
# ------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/;
$ATTRIBS = {
# the max number of links that can be handled by UNION before it should simply
# shunt the searching pipe to NONINDEXED system
'union_shunt_threshold' => '5000',
'phrase_shunt_threshold' => '1000',
};
################################################################################
# Internal functions
################################################################################
sub load {
shift;
return GT::SQL::Search::INTERNAL::Search->new(@_)
}
sub _query {
# ------------------------------------------------------------------------------
# this just checks to ensure that the words are not all search keywords
#
my ( $self, $input, $buckets ) = @_;
# calculate wordids and frequencies
foreach ( keys %$buckets ) {
$buckets->{$_} = $self->get_wordids( $buckets->{$_}, ( /phrase/ ? "phrases" : "keywords" ) );
}
# the following is a bit tricky and will be replaced however, if the number
# of results from a union is more than the maximum shunt value, it will
# simply do a nonindexed query
if ( $buckets->{keywords} ) {
my $rec = _count_frequencies( $buckets->{keywords} );
my $count = 0;
foreach ( values %$rec ) { $count += $_; }
if ($count > $self->{union_shunt_threshold}) {
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
return $self->alternate_driver_query( 'NONINDEXED', $input );
}
}
# Now test the phrases. Just due to how the phrase searching works, the queries
# can grow in size extremely rapidly, and slowdown the search. So the limit for
# phrase searching is separate as it requires a different cutoff value than
# the keyword search which is usually much lower!
if ($buckets->{phrases}) {
foreach my $phrase ( keys %{$buckets->{phrases} || {} } ) {
my $rec = _count_frequencies( $buckets->{phrases}->{$phrase}->{word_info} );
my ( $count ) = sort values %$rec; # Get smallest frequency.
if ( $count > $self->{phrase_shunt_threshold} ) {
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
return $self->alternate_driver_query( 'NONINDEXED', $input );
}
}
}
if ($buckets->{phrases_must}) {
foreach my $phrase ( keys %{$buckets->{phrases_must} || {} } ) {
my $rec = _count_frequencies( $buckets->{phrases_must}->{$phrase}->{word_info} );
my ( $count ) = sort values %$rec; # Get smallest frequency.
if ( $count > $self->{phrase_shunt_threshold} ) {
$self->debug_dumper("Too many results using internal search, falling back to nonindexed. Counts are: ", $rec) if ($self->{_debug});
return $self->alternate_driver_query( 'NONINDEXED', $input );
}
}
}
return $self->SUPER::_query( $input, $buckets );
}
sub _count_frequencies {
# ------------------------------------------------------------------------------
my $word_info = shift;
my $rec = {};
foreach my $word ( keys %$word_info ) {
my $freq = 0;
foreach ( values %{$word_info->{$word}->{word_info}} ) {
$freq += $_;
}
$rec->{$word} = $freq;
}
return $rec;
}
sub _table_names {
# ------------------------------------------------------------------------------
# return the table names
#
my $self = shift;
my $table = $self->{table}->name() or return $self->error('NOSCHEMA', 'FATAL');
my $wtable = $table . '_Word_List';
my $stable = $table . '_Score_List';
return ( $table, $wtable, $stable);
}
sub _word_infos {
# ------------------------------------------------------------------------------
# get the word ids and frequencies
#
my $self = shift;
my $word_infos = shift;
my $rec = {};
foreach my $word ( keys %$word_infos ) {
my $wi = $word_infos->{$word}->{word_info};
$rec->{$word} = [ map { [ $_, $wi->{$_} ] } keys %$wi ];
}
return $rec;
}
sub _union_query {
# ------------------------------------------------------------------------------
# Takes a list of words and gets all words that match
# returns { itemid -> score } of hits that match
#
my $self = shift;
my $words = shift;
my $results = shift || {};
my ( $query, $where, $db, $word_infos );
my ( $table, $wtable, $stable) = $self->_table_names();
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
$word_infos = $self->_word_infos( $words ) or return $results;
return $results unless (keys %{$word_infos});
$self->debug_dumper( "Getting words: ", $words) if ($self->{_debug});
# build the where clause
my @word_ids;
foreach my $word_synonym_list ( values %$word_infos ) {
next unless ( $word_synonym_list );
foreach my $word_id ( @{$word_synonym_list }) {
next unless ( ref $word_id eq 'ARRAY' ); # ensure it's a reference
push @word_ids, $word_id->[0]; # we need to shed the word quantities
}
}
return $results unless ( @word_ids );
$where = 'Word_ID IN(' . join(",", @word_ids) . ")";
# build the query
$query = qq!
SELECT Item_ID, SUM(Score)
FROM $stable
WHERE
$where
GROUP BY Item_ID
!;
$self->debug( "Union Query: $query" ) if ($self->{_debug});
# prepare the query
my $sth = $db->prepare( $query ) or return;
$sth->execute() or return;
# get the results
my %word_infos = $sth->fetchall_list;
# merge the current result set into found
foreach my $item ( keys %{$results} ) {
$word_infos{$item} += $results->{$item};
};
return \%word_infos;
}
sub _intersect_query {
# ------------------------------------------------------------------------------
# Takes a list of words and gets all words that match all the keywords
# returns { itemid -> score } of hits that match
#
my $self = shift;
my $words = shift;
my $results = shift || {};
$words or return $results;
keys %{$words} or return $results;
my ( $query, $where, $db, $word_infos, $word_hits );
my ( $table, $wtable, $stable) = $self->_table_names();
# have we left any of our words out?
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
$word_infos = $self->_word_infos( $words ) or return {};
if ( keys %{$word_infos} < keys %{$words} ) {
return {};
}
$self->debug_dumper( "Keyword Intersect words: ", $word_infos ) if ($self->{_debug});
# take the words and get a hash of the word scores
foreach my $word ( keys %{$word_infos} ) {
my $total_freq = 0;
foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
$total_freq += $word_synonyms->[1];
}
$word_hits->{$word} = $total_freq or return;
}
# so now, sort out the words from lowest frequency to highest frequency
my @search_order = sort { $word_hits->{$a} <=> $word_hits->{$b} } keys %{$word_hits};
$self->debug_dumper( "Searching words in this order: ", \@search_order) if ($self->{_debug});
# find out how we're going to handle the searching, if the first elements
################################################################################
### The following part is for smaller intersect subsets
################################################################################
my $intersect = $results;
foreach my $word ( @search_order ) {
# setup the where clause to get all the words associated
my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
# setup the intersect for the previous if required. for iterative intersecting
if ( keys %{$intersect} ) {
$where .= " AND Item_ID in(" . join(",",keys %{$intersect}) . ")";
}
# make the database engine work a little bit
$query = qq!
SELECT Item_ID, SUM(Score) AS Score
FROM $stable
WHERE
$where
GROUP BY Item_ID
!;
$self->debug( "Intersect Query: $query" ) if ($self->{_debug});
my $intersect_sth = $db->prepare( $query );
$intersect_sth->execute();
# get a list of all the matches
my $matches = $intersect_sth->fetchall_arrayref();
$self->debug_dumper( "Matches found for $word: ", $matches ) if ($self->{_debug});
# go through all the matches and intersect them
my %tmp = ();
foreach my $row ( @{$matches} ) {
my ( $itemid, $score ) = @{$row};
$intersect->{$itemid} ||= 0;
$tmp{ $itemid } = $intersect->{$itemid} + $score;
}
# inform the system of that development
%tmp or return;
$intersect = \%tmp;
}
return $intersect;
}
sub _disjoin_query {
#------------------------------------------------------------
my $self = shift;
my $words = shift;
my $results = shift || {};
$words or return $results;
my ( $query, $where, $db, $word_infos, $word_hits );
my ( $table, $wtable, $stable) = $self->_table_names();
$db = $self->{table}->{driver} or return $results;
# have we left any of our words out?
$word_infos = $self->_word_infos( $words ) or return $results;
# if ( keys %{$word_infos} < keys %{$words} ) {
# return $results;
# }
# take the words and get a hash of the word scores
foreach my $word ( keys %{$word_infos} ) {
my $total_freq = 0;
foreach my $word_synonyms ( $word_infos->{$word} ) {
$total_freq += ( $word_synonyms->[0] || 0 );
}
# if the value is null this mean there is actually no results, whoops!
$total_freq and $word_hits->{$word} = $total_freq;
}
# so now, sort out the words from lowest frequency to highest frequency
my @search_order = sort { $word_hits->{$b} <=> $word_hits->{$b} } keys %{$word_hits};
$self->debug_dumper( "Disjoining words in the following order: ", \@search_order) if ($self->{_debug});
################################################################################
### This following part is for smaller disjoin presets
################################################################################
foreach my $word ( @search_order ) {
# setup the where clause to get all the words associated
my $where = "Word_ID in(" . join(",", map( { $_->[0] } @{$word_infos->{$word}} )) . ")";
# setup the intersect for the previous if required. for iterative intersecting
if ( keys %{$results} ) {
$where .= " AND Item_ID in(" . join(",", keys %{$results}) . ")";
}
# make the database engine work a little bit
$query = qq!
SELECT Item_ID
FROM $stable
WHERE
$where
GROUP BY Item_ID
!;
$self->debug($query) if ($self->{_debug});
my $intersect_sth = $db->prepare( $query );
$intersect_sth->execute();
# get a list of all the matches
my $matches = $intersect_sth->fetchall_arrayref();
# strip the matches from the current result set
foreach my $word ( map { $_->[0] } @{$matches}) {
delete $results->{$word};
}
}
return $results;
}
sub _phrase_disjoin_query {
#------------------------------------------------------------
# subtracts the found phrases from the list
my $self = shift;
my $phrases = shift;
my $results = shift || {};
$phrases or return $results;
foreach my $phrase ( values %{$phrases} ) {
my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
# perform disjoin
foreach my $itemid ( keys %{$temp} ) {
$self->debug( "Deleting $itemid from list" ) if ($self->{_debug});
delete $results->{$itemid};
}
}
return $results;
}
sub _phrase_intersect_query {
#------------------------------------------------------------
# intersects phrases together
my $self = shift;
my $phrases = shift;
my $results = shift || {};
$phrases or return $results;
foreach my $phrase ( values %{$phrases} ) {
my $temp = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info} );
# perform intersect
foreach my $itemid ( keys %{$temp} ) {
$temp->{$itemid} += $results->{$itemid} || 0;
}
$results = $temp;
}
return $results;
}
sub _phrase_query {
#------------------------------------------------------------
# this is a phrase union query
my $self = shift;
my $phrases = shift or return;
my $results = shift || {};
foreach my $phrase ( values %{$phrases} ) {
$self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug});
$results = $self->_get_phrase( $phrase->{'phrase'}, $phrase->{word_info}, $results );
}
return $results;
}
sub _get_phrase {
#------------------------------------------------------------
my $self = shift;
my $wordlist= shift;
my $word_info = shift;
my $results = shift || {};
$wordlist or return $results;
my ( $query, $where, $db, $word_infos, %word_hits );
my ( $table, $wtable, $stable) = $self->_table_names();
my ($pk) = $self->{table}->pk;
$self->debug_dumper( "Getting words: ", $wordlist ) if ($self->{_debug});
# get all the word ids that we want to handle
$db = $self->{table}->{driver} or return $self->error( 'NODRIVER', 'FATAL' );
$word_infos = $self->_word_infos( $word_info ) or return;
$self->debug_dumper( "Word infos: ", $word_infos ) if ($self->{_debug});
# take the words and get a hash of the word scores
foreach my $word ( keys %{$word_infos} ) {
@{$word_infos->{$word} || []} or return;
my $total_freq = 0;
foreach my $word_synonyms ( @{$word_infos->{$word}} ) {
$total_freq += $word_synonyms->[1];
}
# if the value is null this mean there is actually no results, whoops!
$word_hits{$word} = $total_freq;
}
$self->debug_dumper( "With synonyms tallied: ", \%word_hits ) if ($self->{_debug});
# so now, setup the order of search
my $i = 0;
my %word_order = map { $_ => $i++ } @{$wordlist};
my @search_order = sort { $word_hits{$a} <=> $word_hits{$b} } keys %word_hits;
$self->debug_dumper( "Word search order: ", \@search_order ) if ($self->{_debug});
################################################################################
### This following part is for smaller phrases
################################################################################
# start getting words in order of their frequency
my %matches = ();
my $index = 0;
foreach my $word ( @search_order ) {
# setup the where clause for the individual words, firstly
if ( keys %matches ) {
my $vector = $word_order{$word} - $index;
$where = '(';
$where =
'(' .
join(
" OR ",
map(
"Item_ID = $_ AND Word_Pos IN(" . join(",", map $_->[0] + $vector, @{$matches{$_}}) . ')',
keys %matches
)
) .
") AND ";
}
else {
$where = '';
}
$where .= "Word_ID IN(" . ( join ",", map { $_->[0] || () } @{$word_infos->{$word}} or return $results ) . ')';
$query = qq!
SELECT
Item_ID, Score, Word_Pos
FROM
$stable
WHERE
$where
!;
$self->debug( "Phrase get for '$word': " . $query ) if ($self->{_debug});
my $sth = $db->prepare( $query );
$sth->execute();
%matches = ();
while (my $hit = $sth->fetchrow_arrayref) {
push @{$matches{$hit->[0]}}, [ $hit->[2], $hit->[1] ];
}
# If there are no values stored in %matches, it means that for
# this keyword, there have been no hits based upon position.
# In that case, terminate and return a null result
keys %matches or last;
# where were we in the string?
$index = $word_order{$word};
}
# now tally up all the scores and merge the new records in
foreach my $itemid ( keys %matches ) {
my $score = 0;
foreach my $sub_total ( @{$matches{$itemid}} ) {
$score += $sub_total->[1];
}
$results->{$itemid} += $score;
}
return $results;
}
sub get_wordids {
# ------------------------------------------------------------------------------
# Get a list of words
#
my $self = shift;
my $elements = shift or return;
my $mode = lc shift || 'keywords';
if ( $mode eq 'keywords' ) {
$elements = $self->_get_wordid($elements);
}
else {
foreach my $phrase ( keys %$elements ) {
my $results = $self->_get_wordid({
map { ($_ => { substring => 0 }) } @{$elements->{$phrase}->{phrase}}
});
$elements->{$phrase}->{word_info} = $results;
}
}
return $elements;
}
sub _get_wordid {
# ------------------------------------------------------------------------------
# Get a list of words
#
my $self = shift;
my $words = shift;
my $tbl = $self->{table};
my ( $table, $wtable, $stable) = $self->_table_names();
foreach my $word ( keys %$words ) {
my $query =
qq!SELECT Word_ID, Frequency FROM $wtable WHERE Word LIKE '! .
quotemeta($word) .
( $words->{$word}->{substring} ? '%' : '' ) .
"'";
my $sth = $tbl->do_query($query) or next;
my $tmp = { $sth->fetchall_list };
$words->{$word}->{word_info} = $tmp;
}
return $words;
}
##
# Internal Use
# $self->_cgi_to_hash ($in);
# --------------------------
# Creates a hash ref from a cgi object.
##
sub _cgi_to_hash {
my ($self, $cgi) = @_;
$cgi and UNIVERSAL::can($cgi, 'param') or return $self->error(NODRIVER => 'FATAL');
my @keys = $cgi->param;
my $result = {};
foreach my $key (@keys) {
my @values = $cgi->param($key);
if (@values == 1) { $result->{$key} = $values[0] }
else { $result->{$key} = \@values }
}
return $result;
}
1;

View File

@ -0,0 +1,239 @@
# ====================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::LUCENE::Indexer
# Author: Scott Beck
# CVS Info : 087,071,086,086,085
# $Id: Indexer.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ====================================================================
#
package GT::SQL::Search::LUCENE::Indexer;
# ------------------------------------------------------------------------------
# Preamble information related to the object
use strict;
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
use Lucene;
use GT::SQL::Search::Base::Indexer;
use GT::TempFile;
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
$ERRORS = {
INDEX_CORRUPT => 'Could not create an Indexer, this probably means your index is corrupted and you should rebuild it. The error was: %s',
DELETE_FAILED => 'Could not delete some records: %s'
};
$ERROR_MESSAGE = 'GT::SQL';
sub load {
my $class = shift;
return $class->new(@_)
}
sub _get_path {
my $self = shift;
my $name = $self->{table}->name;
my $tmpdir = GT::TempFile::find_tmpdir();
my $path = $tmpdir . '/' . $name;
$path = $1 if $path =~ /(.*)/; # XXX untaint
return $path;
}
sub _get_store {
my ($self, $create) = @_;
my $path = $self->_get_path;
return Lucene::Store::FSDirectory->getDirectory($path, $create);
}
sub _get_indexer {
my ($self, $create) = @_;
my %weights = $self->{table}->_weight_cols() or return $self->error(NOWEIGHTS => 'WARN');
my ($pk) = $self->{table}->pk;
if (!$pk) {
return $self->error('NOPRIMARYKEY','WARN');
}
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
my $store = $self->_get_store($create);
my $iw;
eval { $iw = new Lucene::Index::IndexWriter($store, $analyzer, $create); };
if ($@) {
return $self->error('INDEX_CORRUPT', 'WARN', "$@");
}
return $iw;
}
sub drop_search_driver {
# ------------------------------------------------------------------------------
my $self = shift;
my $path = $self->_get_path;
require File::Tools;
File::Tools::deldir($path);
return 1;
}
sub add_search_driver {
# ------------------------------------------------------------------------------
my $self = shift;
$self->_get_indexer(1) or return;
return 1;
}
sub post_create_table {
# ------------------------------------------------------------------------------
# creates the index tables..
#
return $_[0]->add_search_driver(@_);
}
sub post_drop_table {
# -------------------------------------------------------
# Remove the index tables.
#
return $_[0]->drop_search_driver(@_);
}
sub post_add_record {
# -------------------------------------------------------
# indexes a single record
my ($self, $rec, $insert_sth, $no_optimize) = @_;
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
my %weights = $tbl->_weight_cols() or return;
my $indexer = $self->_get_indexer(0) or return $self->{_debug} ? () : 1;
my $doc = new Lucene::Document;
my ($pk) = $self->{table}->pk;
delete $weights{$pk};
for my $column_name (keys %weights) {
my $field = Lucene::Document::Field->UnStored($column_name, $rec->{$column_name});
$field->setBoost($weights{$column_name});
$doc->add($field);
}
$doc->add(Lucene::Document::Field->Keyword($pk, ($tbl->ai && $insert_sth ? $insert_sth->insert_id : $rec->{$pk})));
$indexer->addDocument($doc);
$indexer->optimize if !$no_optimize;
$indexer->close;
undef $indexer;
return 1;
}
sub reindex_all {
# -------------------------------------------------------
my $self = shift;
my $table = shift;
my $opts = shift;
my $tick = $opts->{tick} || 0;
my $max = $opts->{max} || 5000;
my $indexer = $self->_get_indexer(1) or return $self->{_debug} ? () : 1; # clobbers the old one
$indexer->close;
undef $indexer;
my %weights = $self->{table}->_weight_cols() or return;
my @weight_list = keys %weights;
my ($pk) = $self->{table}->pk();
# Go through the table and index each field.
my $iterations = 1;
my $count = 0;
while (1) {
if ($max) {
my $offset = ($iterations-1) * $max;
$table->select_options("LIMIT $offset,$max");
}
my $cond = $opts->{cond} || {};
my $sth = $table->select($cond, [$pk, @weight_list]);
my $done = 1;
while (my $rec = $sth->fetchrow_hashref() ) {
$self->post_add_record($rec, undef, 1);
$done = 0;
if ($tick) {
$count++;
$count % $tick or (print "$count ");
$count % ($tick*10) or (print "\n");
}
}
last if $done;
$iterations++;
last if !$max;
}
$indexer = $self->_get_indexer(0) or return;
$indexer->optimize;
$indexer->close;
undef $indexer;
return 1;
}
sub pre_delete_record {
# -------------------------------------------------------
# Delete a records index values.
#
my ($self, $where) = @_;
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
my ($pk) = $tbl->pk();
my $q = $tbl->select($where, [$pk]);
my $reader = eval { Lucene::Index::IndexReader->open($self->_get_store(0)); };
if ($@) {
return $self->{_debug} ? $self->error('INDEX_CORRUPT', 'WARN', "$@") : 1;
}
my @errors;
while (my ($item_id) = $q->fetchrow) {
my $t = new Lucene::Index::Term($pk => $item_id);
eval { $reader->deleteDocuments($t); };
if ($@) {
push @errors, "$@";
}
}
$reader->close;
undef $reader;
if (@errors) {
return $self->{_debug} ? $self->error('DELETE_FAILED', 'WARN', join(", ", @errors)) : 1;
}
return 1;
}
sub post_update_record {
# -------------------------------------------------------
my ( $self, $set_cond, $where_cond, $tmp ) = @_;
# delete the previous record
eval {
$self->pre_delete_record($where_cond) or return $self->{_debug} ? () : 1;
};
#
# the new record
my $tbl = $self->{table} or $self->error( 'NODRIVER', 'FATAL' );
my ($pk) = $tbl->pk();
my %weights = $self->{table}->_weight_cols();
my @weight_list = keys %weights;
my $q = $tbl->select($where_cond, [$pk, @weight_list]);
while (my $href = $q->fetchrow_hashref) {
$self->post_add_record($href);
}
return 1;
}
sub reindex_record {
# -------------------------------------------------------
# reindexes a record. basically deletes all associated records from current db abnd does an index.
# it's safe to use this
my ($self, $rec) = @_;
$self->delete_record($rec);
$self->index_record($rec);
}
1;

View File

@ -0,0 +1,206 @@
NAME
Lucene -- API to the C++ port of the Lucene search engine
SYNOPSIS
Initialize/Empty Lucene index
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 1);
my $tmp_writer = new Lucene::Index::IndexWriter($store, $analyzer, 1);
$tmp_writer->close;
undef $tmp_writer;
Choose your Analyzer (string tokenizer)
# lowercases text and splits it at non-letter characters
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
# same as before and removes stop words
my $analyzer = Lucene::Analysis::StopAnalyzer();
# splits text at whitespace characters
my $analyzer = Lucene::Analysis::WhitespaceAnalyzer();
# lowercases text, tokenized it based on a grammer that
# leaves named authorities intact (e-mails, company names,
# web hostnames, IP addresses, etc) and removed stop words
my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer();
Choose your Store (storage engine)
# in-memory storage
my $store = new Lucene::Store::RAMDirectory();
# disk-based storage
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
Open and configure an IndexWriter
my $writer = new Lucene::Index::IndexWriter($store, $analyzer, 0);
# optional settings for power users
$writer->setMergeFactor(100);
$writer->setUseCompoundFile(0);
$writer->setMaxFieldLength(255);
$writer->setMinMergeDocs(10);
$writer->setMaxMergeDocs(100);
Create Documents and add Fields
my $doc = new Lucene::Document;
# field gets analyzed, indexed and stored
$doc->add(Lucene::Document::Field->Text("content", $content));
# field gets indexed and stored
$doc->add(Lucene::Document::Field->Keyword("isbn", $isbn));
# field gets just stored
$doc->add(Lucene::Document::Field->UnIndexed("sales_rank", $sales_rank));
# field gets analyzed and indexed
$doc->add(Lucene::Document::Field->UnStored("categories", $categories));
Add Documents to an IndexWriter
$writer->addDocument($doc);
Optimize your index and close the IndexWriter
$writer->optimize();
$writer->close();
undef $writer;
Delete Documents
my $reader = Lucene::Index::IndexReader->open($store);
my $term = new Lucene::Index::Term("isbn", $isbn);
$reader->deleteDocuments($term);
$reader->close();
undef $reader;
Query index
# initalize searcher and parser
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
my $searcher = new Lucene::Search::IndexSearcher($store);
my $parser = new Lucene::QueryParser("default_field", $analyzer);
# build a query on the default field
my $query = $parser->parse("perl");
# build a query on another field
my $query = $parser->parse("title:cookbook");
# define a sort on one field
my $sortfield = new Lucene::Search::SortField("unixtime");
my $reversed_sortfield = new Lucene::Search::SortField("unixtime", 1);
my $sort = new Lucene::Search::Sort($sortfield);
# define a sort on two fields
my $sort = new Lucene::Search::Sort($sortfield1, $sortfield2);
# use Lucene's INDEXORDER or RELEVANCE sort
my $sort = Lucene::Search::Sort->INDEXORDER;
my $sort = Lucene::Search::Sort->RELEVANCE;
# query index and get results
my $hits = $searcher->search($query);
my $sorted_hits = $searcher->search($query, $sort);
# get number of results
my $num_hits = $hits->length();
# get fields and ranking score for each hit
for (my $i = 0; $i < $num_hits; $i++) {
my $doc = $hits->doc($i);
my $score = $hits->score($i);
my $title = $doc->get("title");
my $isbn = $doc->get("isbn");
}
# free memory and close searcher
undef $hits;
undef $query;
undef $parser;
undef $analyzer;
$searcher->close();
undef $fsdir;
undef $searcher;
}
Close your Store
$store->close;
undef $store;
DESCRIPTION
Like it or not Apache Lucene has become the de-facto standard for
open-source high-performance search. It has a large user-base, is well
documented and has plenty of committers. Unfortunately Apache Lucene is
entirely written in Java and therefore of relatively little use for perl
programmers. Fortunately in the recent years a group of C++ programmers
led by Ben van Klinken decided to port Java Lucene to C++.
The purpose of the module is to export the C++ Lucene API to perl and at
the same time be as close as possible to the original Java API. This has
the combined advantage of providing perl programmers with a
well-documented API and giving them access to a C++ search engine
library that is supposedly faster than the original.
CHARACTER SUPPORT
Currently only ISO 8859-1 (Latin-1) characters are supported. Obviously
this included all ASCII characters.
INDEX COMPATIBLITY
For the moment indices produced by this module are not compatible with
those from Apache Lucene. The reason for this is that this module uses
1-byte character encoding as opposed to 2-byte (widechar) encoding with
Apache Lucene.
INSTALLATION
This module requires the clucene library to be installed. The best way
to get it is to go to the following page
http://sourceforge.net/projects/clucene/
and download the latest STABLE clucene-core version. Currently it is
clucene-core-0.9.15. Make sure you compile it in ASCII mode and install
it in your standard library path.
On a Linux platform this goes as follows:
wget http://kent.dl.sourceforge.net/sourceforge/clucene/clucene-core-0.9.15.tar.gz
cd clucene-core-0.9.15
./autogen.sh
./configure --disable-debug --prefix=/usr --exec-prefix=/usr --enable-ascii
make
make check
(as root) make install
To install the perl module itself, run the following commands:
perl Makefile.PL
make
make test
(as root) make install
AUTHOR
Thomas Busch <tbusch at cpan dot org>
COPYRIGHT AND LICENSE
Copyright (c) 2006 Thomas Busch
This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
SEE ALSO
Plucene - a pure-Perl implementation of Lucene
KinoSearch - a search engine library inspired by Lucene
DISCLAIMER OF WARRANTY
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
NECESSARY SERVICING, REPAIR, OR CORRECTION.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.

View File

@ -0,0 +1,115 @@
# ====================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::STH
# Author: Aki Mimoto
# CVS Info : 087,071,086,086,085
# CVS Info : 087,071,086,086,085
# $Id: STH.pm,v 1.1 2006/12/07 07:04:51 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ====================================================================
#
package GT::SQL::Search::LUCENE::STH;
#--------------------------------------------------------------------------------
use strict;
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG $ERRORS $ERROR_MESSAGE /;
require GT::SQL::Search::Base::STH;
@ISA = ('GT::SQL::Search::STH');
$ATTRIBS = {
'db_sort' => 1,
'hits' => undef
};
$ERROR_MESSAGE = 'GT::SQL::Search::STH';
sub init {
#--------------------------------------------------------------------------------
# GT::SQL::Search::STH expects a full set of results in $self->{results}. For
# Lucene the only time a full set of results is there is when we are sorting
# on a field that is not weighted, otherwise the results in $self->{results} is
# the proper page and number of results.
my $self = shift;
$self->set(@_);
--$self->{nh} if $self->{nh};
# Here we allow hits to override our concept of rows. This is only useful
# when !$self->{db_sort}
$self->{rows} = $self->{hits}
? $self->{hits}
: $self->{results}
? scalar(keys %{$self->{results}})
: 0;
if ($self->{db_sort}) {
$self->get_db_sorted_results;
}
else {
$self->get_sorted_results;
}
}
sub get_sorted_results {
# Just sorts the results out of $self->{results} which should have been setup
# by a search driver
my ($self) = @_;
my $results = $self->{results};
$self->{index} = 0;
$self->{max_index} = $self->{mh} - 1;
$self->{'order'} = [ sort {
( $results->{$b} || return 0 ) <=> ( $results->{$a} || 0 )
} keys %{$results} ];
$self->debug_dumper( "Results will be presented in the following order: ", $self->{'order'} ) if ($self->{_debug});
}
sub get_db_sorted_results {
# This assumes $self->{results} has a full result set, i.e. without any LIMIT
# It then selects the result set using the SQL driver to do the sorting. This
# is for Search modules which can not handle their own sorting
my ($self) = @_;
my $sb = GT::SQL::Search::Base::Search->clean_sb($self->{sb}, $self->{so});
$self->{index} = $self->{nh} * $self->{mh} || 0;
$self->{max_index} = $self->{index} + $self->{mh} - 1; # index + mh is the first record which should not be returned.
if ($self->{max_index} > $self->{rows}) {
$self->{max_index} = $self->{rows};
$self->{rows} = $self->{rows} - $self->{index};
$self->{rows} < 0 ? $self->{rows} = 0 : 0;
}
else {
$self->{rows} = $self->{mh};
}
my ( $table, $pk ) = $self->_table_info();
my ( $query, $where, $st, $limit );
$where = qq!$pk in(! . join( ",", keys %{$self->{results}}) . ')';
$self->{mh} and $limit = qq!LIMIT $self->{index}, $self->{rows}!;
$query = qq!
SELECT $pk
FROM $table
WHERE $where
$sb
$limit
!;
$self->debug( "Row fetch query: $query" ) if ($self->{_debug});
my $sth = $self->{table}->{driver}->prepare( $query );
$sth->execute();
$self->{index} = 0;
$self->{max_hits} = $self->{rows};
# Fetch the results in sorted order
my $order = $sth->fetchall_arrayref();
$sth->finish();
$self->{'order'} = [ map { $_->[0] } @{$order} ];
}
1;

View File

@ -0,0 +1,260 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::LUCENE::Search
# Author : Scott Beck
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.2 2006/12/07 22:42:16 aki Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to search indexed tables.
#
package GT::SQL::Search::LUCENE::Search;
# ------------------------------------------------------------------------------
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS $ERRORS $ERROR_MESSAGE /;
use Lucene;
use GT::TempFile;
use GT::SQL::Search::LUCENE::STH;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );
# ------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;
$ERRORS = {
SEARCH_ERROR => "Error searching: %s",
QUERY_ERROR => "Query error: %s"
};
$ERROR_MESSAGE = 'GT::SQL';
sub load {
shift;
return GT::SQL::Search::LUCENE::Search->new(@_)
}
sub _get_path {
my $self = shift;
my $name = $self->{table}->name;
my $tmpdir = GT::TempFile::find_tmpdir();
my $path = $tmpdir . '/' . $name;
$path = $1 if $path =~ /(.*)/; # XXX untaint
return $path;
}
sub _get_store {
my ($self, $create) = @_;
my $path = $self->_get_path;
return Lucene::Store::FSDirectory->getDirectory($path, $create);
}
sub query {
# --------------------------------------------------
# Returns a sth based on a query
#
# Options:
# - paging
# mh : max hits
# nh : number hit (or page of hits)
#
# - searching
# ww : whole word
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
# substring : search for substrings of words
# bool : 'and' => and search, 'or' => or search, '' => regular query
# query : the string of things to ask for
#
# - filtering
# field_name : value # Find all rows with field_name = value
# field_name : ">value" # Find all rows with field_name > value.
# field_name : "<value" # Find all rows with field_name < value.
# field_name-gt : value # Find all rows with field_name > value.
# field_name-lt : value # Find all rows with field_name < value.
#
# Parameters:
# ( $CGI ) : a single cgi object
# ( $HASH ) : a hash of the parameters
#
my $self = shift;
# create an easily accessible argument hash
my $args = $self->common_param(@_);
my $tbl = $self->{table};
# see if we can setup the filtering constraints
my $filter = { %$args };
my $query = delete $args->{query} || $self->{query} || '';
my $ftr_cond;
# parse query
$self->debug( "Search Query: $query" ) if ($self->{_debug});
my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
$self->{rejected_keywords} = $rejected;
# setup the additional input parameters
$query_struct = $self->_preset_options( $query_struct, $args );
# now sort into distinct buckets
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
# with the buckets, it's now possible to create a query string
# that can be passed directly into the Lucene search.
my $query_string = '';
foreach my $search_type ( keys %$buckets ) {
my $bucket = $buckets->{$search_type};
foreach my $token ( keys %$bucket ) {
next unless $token;
my $properties = $bucket->{$token} or next;
$token =~ s/(["()])/\\$1/g;
$token =~ s/\b(or|and)\b/ /g;
my $e = ' ';
# handle boolean operations
$properties->{mode} ||= '';
if ( $properties->{mode} eq 'must' ) {
$e .= '+';
}
elsif ( $properties->{mode} eq 'cannot' ) {
$e .= '-';
}
# deal with phrase vs keyword
if ( $properties->{phrase} ) {
$e .= '"' . $token . '"' unless $token =~ /^"|"$/;
}
else {
$e .= $token;
# substring match
if ($properties->{mode} ne 'substring') {
$e .= '*' if $properties->{substring};
}
}
$query_string .= $e;
}
}
# calculate the cursor constraints
foreach my $k (qw( nh mh so sb )) {
next if defined $args->{$k};
$args->{$k} = $self->{$k} || '';
}
$args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
$args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
$args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score';
# Score is the default
$args->{so} = (defined $args->{so} and $args->{so} =~ /^(asc|desc)(?:end)?$/i) ? lc($1) : 'asc';
my %weights = $tbl->_weight_cols();
my @sortfields;
my $do_mysql_sort = 0;
for (ref($args->{sb}) eq 'ARRAY' ? @{$args->{sb}} : $args->{sb}) {
if (!exists $weights{$_}) {
$do_mysql_sort = 1 if $_ ne 'score';
next;
}
push @sortfields, new Lucene::Search::SortField($_, $args->{so} ne 'asc');
}
my $sort = @sortfields ? new Lucene::Search::Sort(@sortfields) : Lucene::Search::Sort->RELEVANCE;
my $store = $self->_get_store(0);
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer;
my $searcher = eval { new Lucene::Search::IndexSearcher($store); };
if ($@) {
$self->{_debug} and $self->error('SEARCH_ERROR', 'WARN', "$@");
return $self->sth({}, 0); # no hits
}
# Random default field, it's not used
my $parser = new Lucene::MultiFieldQueryParser((keys %weights)[0], $analyzer);
my $pquery = eval { $parser->parse($query_string, [keys %weights], $analyzer); };
if ($@) {
$self->{_debug} and $self->error('QUERY_ERROR', 'WARN', "$@");
return $self->sth({}, 0); # no hits
}
my $hits = $searcher->search($pquery, $sort);
my $num_hits = $hits->length;
## Setup a limit only if there is no callback. The callback argument requires a full results list
my ($offset, $max_hits) = (0, $num_hits);
unless ($self->{callback} or $do_mysql_sort) {
$offset = ( $args->{nh} - 1 ) * $args->{mh};
$max_hits = $offset + $args->{mh};
}
$max_hits = $num_hits if $max_hits > $num_hits;
my ($pk) = $self->{table}->pk;
my @indexes;
my $results = {};
for (my $i= $offset; $i < $max_hits; ++$i) {
my $doc = $hits->doc($i);
my $value = $doc->get($pk);
my $score = $hits->score($i);
$results->{$value} = $score;
}
# now handle filters
my $cols = $self->{'table'}->cols();
my %filters = map {
(my $tmp = $_) =~ s/-[lg]t$//;
$cols->{$tmp} ? ($_ => $args->{$_}) : ()
} keys %{$args};
if (keys %filters) {
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
$results = $self->filter(\%filters, $results);
}
elsif ($self->{filter}) {
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
$results = $self->_filter_query( $self->{filter}, $results );
}
else {
$self->debug( "No filters being used.") if ($self->{_debug});
}
# now this query should probably clear the filters once it's been used, so i'll dothat here
$self->{filter} = undef;
# now run through a callback function if needed.
if ($self->{callback}) {
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
$self->{_debug} and $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
return $self->sth({}, 0); # no hits
}
$self->debug_dumper ("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
$results = $self->{callback}->($self, $results);
$self->debug_dumper ("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
}
$self->{rows} = $num_hits;
return $self->sth($results, $do_mysql_sort);
}
sub sth {
#--------------------------------------------------------------------------------
my ($self, $results, $db_sort) = @_;
my $sth = GT::SQL::Search::LUCENE::STH->new(
'results' => $results,
'hits' => $self->{rows},
'db' => $self->{table}->{driver},
'db_sort' => $db_sort,
# pass the following attributes down to the STH handler
map({ ($_ => $self->{$_}) } qw/ table sb so score_col score_sort nh mh rows _debug /)
);
return $sth;
}
1;

View File

@ -0,0 +1,98 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::MSSQL::Indexer
# Author: Alex Krohn
# CVS Info : 087,071,086,086,085
# $Id: Indexer.pm,v 1.6 2004/08/28 03:53:48 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Supports MS SQL full text indexer on MS SQL 2000 only.
#
package GT::SQL::Search::MSSQL::Indexer;
#--------------------------------------------------------------------------------
use strict;
use vars qw/@ISA $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
use GT::SQL::Search::Base::Indexer;
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.6 $ =~ /(\d+)\.(\d+)/;
$ERRORS = {
NOTFROMWEB => 'There are far too many records in table %s for create/destroy of this indexing scheme from the web. Please use alternative method.',
MSSQLNONSUPPORT => 'You must be using MS SQL 2000 in order to use full text indexing. Current Database: %s',
CREATEINDEX => 'Problem Creating Full Text Index: %s'
};
$ERROR_MESSAGE = 'GT::SQL';
sub load {
my $class = shift;
return $class->new(@_);
}
sub ok {
#--------------------------------------------------------------------------------
my ($class, $tbl) = @_;
unless (uc $tbl->{connect}->{driver} eq 'ODBC') {
return $class->error ('MSSQLNONSUPPORT', 'WARN', $tbl->{connect}->{driver});
}
return 1;
}
sub drop_search_driver {
#--------------------------------------------------------------------------------
my $self = shift;
my $table = $self->{table};
my $name = $table->name;
my $cat = $name . '_ctlg';
my $res = eval {
$table->do_query(" sp_fulltext_table '$name', 'drop' ");
$table->do_query(" sp_fulltext_catalog '$cat', 'drop' ");
1;
};
$res ? return 1 : return;
}
sub add_search_driver {
#--------------------------------------------------------------------------------
my $self = shift;
my $table = $self->{table};
my $name = $table->name;
my $cat = $name . '_ctlg';
my %weights = $table->weight;
my ($pk) = $table->pk;
# Enable a database for full text indexing
$table->do_query(" sp_fulltext_database 'enable' ") or $self->error('CREATEINDEX', 'FATAL', $GT::SQL::error);
# Create a full text catalog to store the data.
$table->do_query(" sp_fulltext_catalog '$cat', 'create' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
# Make a unique index on primary key (not sure why it isn't by default.
$table->do_query(" create unique index PK_$name on $name ($pk) ");
# Mark this table as using the full text catalog created
$table->do_query(" sp_fulltext_table '$name', 'create', '$cat', 'PK_$name' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
# Specify which columns are to be indexed
foreach my $col (keys %weights) {
if ($weights{$col}) {
$table->do_query(" sp_fulltext_column '$name', '$col', 'add' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
}
}
# Must have a timestamp field.
$table->do_query(" alter table $name add timestamp ");
# Build the index.
$table->do_query(" sp_fulltext_table '$name', 'start_change_tracking' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
$table->do_query(" sp_fulltext_table '$name', 'start_background_updateindex' ") or $self->error('CREATEINDEX', 'WARN', $GT::SQL::error);
return 1;
}
sub post_create_table {
#--------------------------------------------------------------------------------
shift->add_search_driver(@_);
}
1;

View File

@ -0,0 +1,179 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::MSSQL::Search
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.9 2004/08/28 03:53:48 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to search indexed tables.
#
package GT::SQL::Search::MSSQL::Search;
#--------------------------------------------------------------------------------
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );
#--------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.9 $ =~ /(\d+)\.(\d+)/;
$ATTRIBS = {
min_word_size => 2,
};
sub load {
shift;
return GT::SQL::Search::MSSQL::Search->new(@_)
}
sub query {
#--------------------------------------------------------------------------------
# overruns the usual query system with the mssql version
#
my $self = shift;
# Find out what sort of a parameter we're dealing with
my $input = $self->common_param(@_);
# Add additional parameters if required
foreach my $parameter ( keys %{$ATTRIBS} ) {
if ( not exists $input->{$parameter} ) {
$input->{$parameter} = $self->{$parameter};
}
}
# Parse query...,
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
$self->{'rejected_keywords'} = $rejected;
# Setup the additional input parameters
$query = $self->_preset_options( $query, $input );
# Now sort into distinct buckets
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query );
my $tbl = $self->{table} or $self->error( 'BADARGS', 'FATAL', 'Must have table object defined' );
my $string = $self->_string ($buckets);
return $self->sth({}) unless ($string =~ /\w/);
my $table_name = $tbl->name();
my ($pk) = $tbl->pk;
# create the filter
my $filter_sql = $self->{filter} ? "WHERE ( " . $self->{filter}->sql . ' )' : '';
# If we have a callback, we need all results.
if ($self->{callback}) {
$query = qq!
SELECT $pk, K.RANK
FROM $table_name AS T INNER JOIN
CONTAINSTABLE ( $table_name, *,
'$string'
) AS K
ON T.$pk = K.[KEY]
$filter_sql
!;
my %results = $tbl->do_query($query)->fetchall_list;
my $results = $self->{callback}->($self, \%results);
$self->{rows} = $results ? scalar keys %$results : 0;
return $self->sth($results);
}
else {
my $mh = (defined $input->{nh} and $input->{nh} =~ /^(\d+)$/) ? $1 : 1;
my $nh = (defined $input->{mh} and $input->{mh} =~ /^(\d+)$/) ? $1 : 25;
# First get the total.
$query = qq!
SELECT COUNT(*)
FROM $table_name AS T INNER JOIN
CONTAINSTABLE ( $table_name, *,
'$string'
) AS K
ON T.$pk = K.[KEY]
$filter_sql
!;
my ($count) = $tbl->do_query($query)->fetchrow;
# Now get results.
$query = qq!
SELECT $pk, K.RANK
FROM $table_name AS T INNER JOIN
CONTAINSTABLE ( $table_name, *,
'$string'
) AS K
ON T.$pk = K.[KEY]
$filter_sql
ORDER BY K.RANK DESC
!;
my %results = $tbl->do_query($query)->fetchall_list;
$self->{rows} = $count;
return $self->sth(\%results);
}
}
sub _string {
# -------------------------------------------------------------------
# Returns the string to use for containstable.
#
my ($self, $buckets) = @_;
# union
my $tmp_bucket = $buckets->{keywords};
my $union_request_str = join(
" or ",
map(
qq!"$_"!,
keys %{$buckets->{phrases}}
),
map(
($tmp_bucket->{$_}->{substring} ? "$_*" : $_),
keys %$tmp_bucket
)
);
# intersect
$tmp_bucket = $buckets->{keywords_must};
my $intersect_request_str = join(
" and ",
map(
qq!"$_"!,
keys %{$buckets->{phrases_must}}
),
map(
($tmp_bucket->{$_}->{substring} ? "$_*" : $_),
keys %$tmp_bucket
)
);
# disjoin
$tmp_bucket = $buckets->{keywords_cannot};
my $disjoin_request_str = join(
" and ",
map(
qq!"$_"!,
keys %{$buckets->{phrases_cannot}}
),
map(
($tmp_bucket->{$_}->{substring} ? "$_*" : $_),
keys %$tmp_bucket
)
);
# now build the query
my $tmp_request_str = join(
" and ",
($union_request_str ? "( $union_request_str )" : ()),
($intersect_request_str ? "( $intersect_request_str )" : ()),
($disjoin_request_str ? "NOT ( $disjoin_request_str )" : ())
);
return $tmp_request_str;
}
1;

View File

@ -0,0 +1,187 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::MYSQL::Indexer
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Indexer.pm,v 1.17 2004/08/28 03:53:49 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to search indexed tables.
#
package GT::SQL::Search::MYSQL::Indexer;
# ------------------------------------------------------------------------------
use strict;
use vars qw/@ISA $VERSION $DEBUG $ERRORS $ERROR_MESSAGE/;
use GT::SQL::Search::Base::Indexer;
@ISA = qw/GT::SQL::Search::Base::Indexer/;
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.17 $ =~ /(\d+)\.(\d+)/;
$ERRORS = {
NOTFROMWEB => 'There are far too many records in table %s for create/destroy of this indexing scheme from the web. Please use alternative method.',
MYSQLNONSUPPORT => 'Driver MYSQL requires MySQL version 3.23.23 or greater. Currently MySQL version: %s'
};
@$GT::SQL::ERRORS{ keys %$ERRORS } = values %$ERRORS;
$ERROR_MESSAGE = 'GT::SQL';
sub load {
my $class = shift;
return $class->new(@_);
}
sub ok {
# ------------------------------------------------------------------------------
my ($class, $tbl) = @_;
unless (uc $tbl->{connect}->{driver} eq 'MYSQL') {
return $class->error ('MYSQLNONSUPPORT', 'WARN', $tbl->{connect}->{driver});
}
my $sth = $tbl->do_query(qq!SELECT VERSION()!);
my $version = $sth->fetchrow;
my ($maj, $min) = split (/\./, $version);
unless ($maj > 3 or ($maj == 3 and $min >= 23)) {
return $class->error(MYSQLNONSUPPORT => WARN => $version);
}
return 1;
}
sub drop_search_driver {
# ------------------------------------------------------------------------------
my $self = shift;
$self->too_much() and return;
my $tbl = $self->{table} or return;
$tbl->connect();
my %weights = $tbl->weight() or return;
my $tblname = $tbl->name();
# Group the fulltext columns by value of the weight
my %cols_grouped;
foreach ( keys %weights ) {
my $val = $weights{$_} or next;
push @{$cols_grouped{$val}}, $_;
}
# Drop unified fulltext columns if required
if ( keys %cols_grouped > 1 ) {
$cols_grouped{-1} = [ grep { $weights{$_} } keys %weights ];
}
# For each value grouped column set create a full text
# column
foreach my $v ( keys %cols_grouped ) {
my $ft_name = 'ft_'.join("_", sort @{$cols_grouped{$v}});
my $res = eval {
$tbl->do_query(qq!
ALTER TABLE $tblname
DROP INDEX $ft_name
!);
};
# Break on errors that can't be handled
if ( $@ ) {
next if $@ !~ /exist/i;
$self->warn( "$@" );
return;
}
}
return 1;
}
sub add_search_driver {
# ------------------------------------------------------------------------------
my $self = shift;
$self->too_much() and return;
my $tbl = $self->{table} or return $self->error(BADARGS => FATAL => "table must be passed into add_search_driver.");
my %weights = $tbl->weight() or return $self->error(NOWEIGHTS => 'WARN');
my $tblname = $tbl->name() or return $self->error(BADARGS => FATAL => "table does not have a name?");
# group the fulltext columns by value of the weight
my %cols_grouped;
foreach ( keys %weights ) {
my $val = $weights{$_} or next;
push @{$cols_grouped{$val}}, $_;
}
# Create unified fulltext columns if required
if ( keys %cols_grouped > 1 ) {
$cols_grouped{-1} = [ grep { $weights{$_} } keys %weights ];
}
# for each value grouped column set create a full text
# column
foreach my $v ( keys %cols_grouped ) {
my $cols = join(",", sort @{$cols_grouped{$v}});
my $ft_name = 'ft_'.join("_", sort @{$cols_grouped{$v}});
my $res = eval {
$tbl->do_query(qq!
ALTER TABLE $tblname
ADD FULLTEXT $ft_name ( $cols )
!);
};
# break on errors that can't be handled
if ( $@ ) {
next if $@ =~ /duplicate/i;
$self->warn( "$@" );
return;
}
}
return 1;
}
sub too_much {
# ------------------------------------------------------------------------------
# returns true if there are too many records to be used on the Web
#
if ( $ENV{REQUEST_METHOD} ) {
my $self = shift;
my $tbl = $self->{table};
if ( $tbl->count() > 5000 ) {
$self->error( 'NOTFROMWEB', 'WARN', $tbl->name() );
return 1
}
}
return;
}
sub post_create_table {
# ------------------------------------------------------------------------------
shift->add_search_driver(@_);
}
sub reindex_all {
# ------------------------------------------------------------------------------
# this will drop all the fulltext columns and reindex all of them. This should
# not be required unless the user changes the weights on one of their columns.
# Unfortunately, this method is not particularly smart and risks not dropping
# certain index columns and reindexes even when it's not required. It must be
# recoded at a future date, but as this action won't happen frequently and will
# rarely affect the user, it is not a priority.
#
my $self = shift;
$self->drop_search_driver;
$self->add_search_driver;
}
1;

View File

@ -0,0 +1,51 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::MYSQL::Search
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.14 2004/01/13 01:35:19 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to search indexed tables.
#
package GT::SQL::Search::MYSQL::Search;
# ------------------------------------------------------------------------------
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );
# ------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.14 $ =~ /(\d+)\.(\d+)/;
$ATTRIBS = {
min_word_size => 4
};
sub load {
# --------------------------------------------------
my $self = shift;
my $opts = $self->common_param( @_ );
# determine which mysql search variant to use.
my $tbl = $opts->{table};
my $ver_sth = $tbl->do_query( 'SELECT VERSION()' );
my $version = $ver_sth->fetchrow_array();
my ( $maj, $min ) = split /\./, $version;
my $pkg = 'GT::SQL::Search::MYSQL::';
$pkg .= $maj > 3 ? 'VER4' : 'VER3';
eval "require $pkg";
return $pkg->new(@_)
}
1;

View File

@ -0,0 +1,178 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::MYSQL::VER3
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: VER3.pm,v 1.3 2004/08/28 03:53:49 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to search indexed tables.
#
package GT::SQL::Search::MYSQL::VER3;
# ------------------------------------------------------------------------------
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD /;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );
# ------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.3 $ =~ /(\d+)\.(\d+)/;
$ATTRIBS = {
min_word_size => 4
};
sub _phrase_query {
# ------------------------------------------------------------------------------
my $self = shift;
my $phrases = shift or return $_[0];
my $results = shift || {};
foreach my $phrase ( values %{$phrases} ) {
$self->debug_dumper( "Unioning: ", $phrase ) if ($self->{_debug});
my $tmp = {};
foreach my $keyword ( @{ $phrase->{phrase}|| [] } ) {
$tmp = $self->_get_words ( [ $keyword ], $tmp, 'intersect' );
keys %$tmp or return {};
}
foreach my $key ( keys %$tmp ) { $results->{$key} += $tmp->{$key} }
}
return $results;
}
sub _get_phrase {
# ------------------------------------------------------------------------------
# one day change this so it does words properly
return _get_words(@_);
}
sub _union_query {
# ------------------------------------------------------------------------------
return _get_words(@_);
}
sub _intersect_query {
# ------------------------------------------------------------------------------
my ( $self, $keywords, $results ) = @_;
$keywords or return $results;
foreach my $keyword ( keys %{ $keywords || {} } ) {
$results = $self->_get_words ( [ $keyword ], $results, 'intersect' );
keys %$results or return {};
}
return $results;
}
sub _phrase_intersect_query {
# ------------------------------------------------------------------------------
my $self = shift;
my $phrases = shift or return $_[0];
my $results = shift || {};
my $tmp = $self->_phrase_query ( $phrases, $results );
keys %$results or return $tmp;
foreach my $key ( keys %$results ) {
if ( $tmp->{$key} ) {
$results->{$key} += $tmp->{$key};
}
else {
delete $results->{$key}
}
}
return $results;
}
sub _disjoin_query {
# ------------------------------------------------------------------------------
my $self = shift;
my $words = shift or return shift;
my $results = shift || {};
$results = $self->_get_words([ keys %{$words || {}} ], $results, 'disjoin' );
return $results;
}
sub _phrase_disjoin_query {
# ------------------------------------------------------------------------------
my $self = shift;
my $phrases = shift or return shift;
my $results = shift || {};
my $tmp = $self->_phrase_query ( $phrases, $results );
keys %$results or return $tmp;
foreach my $key ( keys %$results ) {
$tmp->{$key} and delete $results->{$key};
}
}
sub _get_words {
# ------------------------------------------------------------------------------
my $self = shift;
my $words = shift or return $_[0] || {};
my $results = shift || {};
my $mode = lc shift;
my $tbl = $self->{table} or $self->error( 'BADARGS', 'FATAL', 'Must have table object defined' );
my $tname = $tbl->name();
my $wordlist = join " ", ( ref $words ? ( ref $words eq 'HASH' ? keys %$words : @$words ) : $words ) ;
my ($pk) = $tbl->pk;
my %weights = $tbl->_weight_cols();
my $cols = join(",", keys %weights);
my $qwrds = quotemeta( $wordlist );
my $where = ( $results and keys %$results )
? ("AND $pk IN(" . join(',', keys %$results) . ")")
: '';
my $query = qq!
SELECT $pk, MATCH($cols) AGAINST ('$qwrds') AS SCORE
FROM $tname
WHERE MATCH($cols) AGAINST ('$qwrds')
$where
!;
my $sth = $tbl->do_query( $query ) or return;
if ( $mode eq 'disjoin' ) {
while ( my $result = $sth->fetchrow ) {
delete $results->{$result};
}
}
elsif ( $mode eq 'intersect' ) {
my $tmp = {};
while ( my $aref = $sth->fetchrow_arrayref ) {
$tmp->{$aref->[0]} = $aref->[1];
}
if ( $results and keys %$results ) {
while (my ($id, $score) = each %$results) {
if (not defined $tmp->{$id}) {
delete $results->{$id};
next;
}
$results->{$id} += $score;
}
}
else {
$results = $tmp;
}
}
else {
while ( my $aref = $sth->fetchrow_arrayref ) {
$results->{$aref->[0]} += $aref->[1];
}
}
return $results;
}
1;

View File

@ -0,0 +1,355 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Search::MYSQL::VER4
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: VER4.pm,v 1.9 2004/01/13 01:35:19 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Class used to search indexed tables.
#
package GT::SQL::Search::MYSQL::VER4;
# ------------------------------------------------------------------------------
use strict;
use vars qw/ @ISA $ATTRIBS $VERSION $DEBUG $AUTOLOAD $STOPWORDS /;
use GT::SQL::Search::Base::Search;
@ISA = qw( GT::SQL::Search::Base::Search );
# ------------------------------------------------------------------------------
# Preamble information related to the object
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.9 $ =~ /(\d+)\.(\d+)/;
$STOPWORDS = { map { $_ => 1 } qw/
a's able about above according accordingly across actually after
afterwards again against ain't all allow allows almost alone
along already also although always am among amongst an and another
any anybody anyhow anyone anything anyway anyways anywhere apart
appear appreciate appropriate are aren't around as aside ask asking
associated at available away awfully be became because become becomes
becoming been before beforehand behind being believe below beside
besides best better between beyond both brief but by c'mon c's came
can can't cannot cant cause causes certain certainly changes clearly
co com come comes concerning consequently consider considering
contain containing contains corresponding could couldn't course currently
definitely described despite did didn't different do does doesn't
doing don't done down downwards during each edu eg eight either else
elsewhere enough entirely especially et etc even ever every everybody
everyone everything everywhere ex exactly example except far few
fifth first five followed following follows for former formerly
forth four from further furthermore get gets getting given gives
go goes going gone got gotten greetings had hadn't happens hardly
has hasn't have haven't having he he's hello help hence her here
here's hereafter hereby herein hereupon hers herself hi him himself
his hither hopefully how howbeit however i'd i'll i'm i've ie if ignored
immediate in inasmuch inc indeed indicate indicated indicates inner
insofar instead into inward is isn't it it'd it'll it's its itself
just keep keeps kept know knows known last lately later latter latterly
least less lest let let's like liked likely little look looking looks
ltd mainly many may maybe me mean meanwhile merely might more
moreover most mostly much must my myself name namely nd near nearly
necessary need needs neither never nevertheless new next nine no
nobody non none noone nor normally not nothing novel now nowhere
obviously of off often oh ok okay old on once one ones only onto
or other others otherwise ought our ours ourselves out outside over
overall own particular particularly per perhaps placed please plus
possible presumably probably provides que quite qv rather rd re
really reasonably regarding regardless regards relatively respectively
right said same saw say saying says second secondly see seeing seem
seemed seeming seems seen self selves sensible sent serious seriously
seven several shall she should shouldn't since six so some somebody
somehow someone something sometime sometimes somewhat somewhere
soon sorry specified specify specifying still sub such sup sure
t's take taken tell tends th than thank thanks thanx that that's
thats the their theirs them themselves then thence there there's
thereafter thereby therefore therein theres thereupon these they
they'd they'll they're they've think third this thorough thoroughly
those though three through throughout thru thus to together too
took toward towards tried tries truly try trying twice two un
under unfortunately unless unlikely until unto up upon us use used
useful uses using usually value various very via viz vs want wants
was wasn't way we we'd we'll we're we've welcome well went were
weren't what what's whatever when whence whenever where where's
whereafter whereas whereby wherein whereupon wherever whether
which while whither who who's whoever whole whom whose why will
willing wish with within without won't wonder would would wouldn't
yes yet you you'd you'll you're you've your yours yourself
yourselves zero
/ };
$ATTRIBS = {
min_word_size => 4,
stopwords => $STOPWORDS,
};
sub query {
# --------------------------------------------------
# Returns a sth based on a query
#
# Options:
# - paging
# mh : max hits
# nh : number hit (or page of hits)
#
# - searching
# ww : whole word
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
# substring : search for substrings of words
# bool : 'and' => and search, 'or' => or search, '' => regular query
# query : the string of things to ask for
#
# - filtering
# field_name : value # Find all rows with field_name = value
# field_name : ">value" # Find all rows with field_name > value.
# field_name : "<value" # Find all rows with field_name < value.
# field_name-gt : value # Find all rows with field_name > value.
# field_name-lt : value # Find all rows with field_name < value.
#
# Parameters:
# ( $CGI ) : a single cgi object
# ( $HASH ) : a hash of the parameters
#
my $self = shift;
# create an easily accessible argument hash
my $args = $self->common_param(@_);
# see if we can setup the filtering constraints
my $filter = { %$args };
my $query = delete $args->{query} || $self->{query} || '';
my $ftr_cond;
# parse query
$self->debug( "Search Query: $query" ) if ($self->{_debug});
my ( $query_struct, $rejected ) = $self->_parse_query_string( $query );
$self->{rejected_keywords} = $rejected;
# setup the additional input parameters
$query_struct = $self->_preset_options( $query_struct, $args );
# now sort into distinct buckets
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query_struct );
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
# with the buckets, it's now possible to create a query string
# that can be passed directly into the FULLTEXT search.
my $query_string = '';
foreach my $search_type ( keys %$buckets ) {
my $bucket = $buckets->{$search_type};
foreach my $token ( keys %$bucket ) {
next unless $token;
my $properties = $bucket->{$token} or next;
my $e = ' ';
# handle boolean operations
$properties->{mode} ||= '';
if ( $properties->{mode} eq 'must' ) {
$e .= '+';
}
elsif ( $properties->{mode} eq 'cannot' ) {
$e .= '-';
}
# deal with phrase vs keyword
if ( $properties->{phrase} ) {
$e .= '"' . quotemeta( $token ) . '"';
}
else {
$e .= quotemeta $token;
# substring match
$e .= '*' if $properties->{substring};
}
$query_string .= $e;
}
}
# start building the GT::SQL::COndition object that will allow us to
# to retreive the data
require GT::SQL::Condition;
my $tbl = $self->{table};
my $constraints = GT::SQL::Condition->new;
# create the GT::SQL::Condition object that will become the filtering
# constraints
my $filt = $self->{filter};
if ( $filt and ref $filt eq 'HASH' ) {
foreach my $fkey ( keys %$filt ) {
next if exists $args->{$fkey};
$args->{$fkey} = $filt->{$fkey};
}
}
if ( my $filter_cond = $tbl->build_query_cond( $args ) ) {
$constraints->add( $filter_cond );
}
# if the cached filter object is a Condition object, append
# it to the filter set
if ( $filt and UNIVERSAL::isa( $filt, 'GT::SQL::Condition' ) ) {
$constraints->add( $filt );
}
# create our fulltext query condition
my %weights = $tbl->_weight_cols();
my $cols = join(",", keys %weights);
if ( $query_string ) {
$constraints->add( GT::SQL::Condition->new(
"MATCH( $cols )",
"AGAINST",
\"('$query_string' IN BOOLEAN MODE)" ) );
}
# calculate the cursor constraints
foreach my $k (qw( nh mh so sb )) {
next if defined $args->{$k};
$args->{$k} = $self->{$k} || '';
}
$args->{nh} = (defined $args->{nh} and $args->{nh} =~ /^(\d+)$/) ? $1 : 1;
$args->{mh} = (defined $args->{mh} and $args->{mh} =~ /^(\d+)$/) ? $1 : 25;
$args->{sb} = (defined $args->{sb} and $args->{sb} =~ /^([\w ]+)$/ ) ? $1 : 'score';
# if the sorting method is "score" the order is forced to "descend" (as there
# is almost no reason to order by worst matches)
# if the storing key is not "score", the default order will be "ascend"
$args->{so} =
$args->{sb} eq 'score' ? 'desc' : # comment out this entire line to prevent "descend" order forcing
( (defined $args->{so} and $args->{so} =~ /^(asc(?:end)?|desc(?:end)?)$/i) ? $1 : 'asc' );
# check that sb is not dangerous
my $sb = $self->clean_sb($args->{sb}, $args->{so});
$self->debug_dumper( "About to query. Constraint object: ", $constraints) if ($self->{_debug});
# Setup a limit only if there is no callback. The callback argument requires a full results list
unless ( $self->{callback} ) {
my $offset = ( $args->{nh} - 1 ) * $args->{mh};
$tbl->select_options($sb) if ($sb);
$tbl->select_options("LIMIT $offset, $args->{mh}");
}
my $sth;
# if the weights are all the same value, the query can be optimized
# to use just one MATCH AGAINST argument. However, if the weights
# are different, each element must be sectioned and queried separately
# with the weight value multipler
# check to see if all the weight values are the same.
my $base_weight;
my $weights_same = 1;
foreach ( values %weights ) {
$base_weight ||= $_ or next; # init and skip 0s
next if $base_weight == $_;
$weights_same = 0;
last;
}
# multiplex the action
my $result_cols = $self->{callback} ? ($tbl->pk)[0] : '*';
unless ( $query_string ) {
$sth = $tbl->select( [ $result_cols ], $constraints ) or return;
}
elsif ( $weights_same ) {
$sth = $tbl->select( [ $result_cols, "MATCH($cols) AGAINST ('$query_string' IN BOOLEAN MODE) AS score" ], $constraints )
or return;
}
else {
# group the multiplier counts
my %column_multiplier;
foreach ( keys %weights ) {
push @{$column_multiplier{$weights{$_}}}, $_;
}
my @search_parameters;
foreach my $val ( keys %column_multiplier ) {
next unless $val;
my $cols_ar = $column_multiplier{ $val } or next;
my $search_cols = join ",", @$cols_ar;
if ( $val > 1 ) {
push @search_parameters, "( ( MATCH($search_cols) AGAINST ('$query_string' IN BOOLEAN MODE) ) * $val )";
}
else {
push @search_parameters, "( MATCH($search_cols) AGAINST ('$query_string' IN BOOLEAN MODE) )";
}
}
my $search_sql = "( " . join( " + ", @search_parameters ) . " ) AS score";
$sth = $tbl->select( [ $result_cols, $search_sql ], $constraints )
or return;
}
# If we have a callback, we fetch the primary key => score and pass that hash into
# the filter.
if ($self->{callback}) {
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
return $self->error('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
}
my %results = map { $_->[0] => $_->[1] } @{$sth->fetchall_arrayref};
$self->debug_dumper("Running results through callback. Had: " . scalar (keys %results) . " results.", \%results) if ($self->{_debug});
my $filtered = $self->{callback}->($self, \%results) || {};
$self->debug_dumper("New result set: " . scalar (keys %$filtered) . " results.", $filtered) if ($self->{_debug});
$self->{rows} = scalar keys %$filtered;
return $self->sth($filtered);
}
# count the number of hits. create a query for this purpose only if we are required to.
$self->{rows} = $sth->rows();
if (($args->{nh} > 1) or ($self->{rows} == $args->{mh})) {
$self->{rows} = $tbl->count($constraints);
}
return $sth;
}
sub clean_sb {
# -------------------------------------------------------------------------------
# Convert the sort by, sort order into an sql string.
#
my ($class, $sb, $so) = @_;
my $output = '';
return $output unless ($sb);
if ($sb and not ref $sb) {
if ($sb =~ /^[\w\s,]+$/) {
if ($sb =~ /\s(?:asc|desc)/i) {
$output = 'ORDER BY ' . $sb;
}
else {
$output = 'ORDER BY ' . $sb . ' ' . $so;
}
}
else {
$class->error('BADSB', 'WARN', $sb);
}
}
elsif (ref $sb eq 'ARRAY') {
foreach ( @$sb ) {
/^[\w\s,]+$/ or $class->error( 'BADSB', 'WARN', $sb ), next;
}
$output = 'ORDER BY ' . join(',', @$sb);
}
return $output;
}
1;

View File

@ -0,0 +1,25 @@
# ====================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::NONINDEXED::Indexer
# Author: Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: Indexer.pm,v 1.3 2004/01/13 01:35:20 jagerman Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ====================================================================
#
package GT::SQL::Search::NONINDEXED::Indexer;
#--------------------------------------------------------------------------------
use strict;
use vars qw/@ISA $DEBUG/;
use GT::SQL::Search::Base::Indexer;
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
sub load {
shift;
return GT::SQL::Search::NONINDEXED::Indexer->new(@_)
}
1;

View File

@ -0,0 +1,257 @@
# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::SQL::Search::NONINDEXED::Search
# Author : Alex Krohn
# CVS Info : 087,071,086,086,085
# $Id: Search.pm,v 1.30 2006/08/09 06:58:39 brewt Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Nonindex search system
#
package GT::SQL::Search::NONINDEXED::Search;
# ==================================================================
use strict;
use vars qw/@ISA $ATTRIBS $VERSION $DEBUG/;
use GT::SQL::Search::Base::Search;
use GT::SQL::Condition;
@ISA = qw( GT::SQL::Search::Base::Search );
$DEBUG = 0;
$VERSION = sprintf "%d.%03d", q$Revision: 1.30 $ =~ /(\d+)\.(\d+)/;
$ATTRIBS = {
# parse based on latin characters
latin_query_parse => 0
};
sub load {
shift;
return GT::SQL::Search::NONINDEXED::Search->new(@_)
}
sub query {
#--------------------------------------------------------------------------------
# Returns a sth based on a query
#
# Options:
# - paging
# mh : max hits
# nh : number hit (or page of hits)
#
# - searching
# ww : whole word
# ma : 1 => OR match, 0 => AND match, undefined => QUERY
# substring : search for substrings of words
# bool : 'and' => and search, 'or' => or search, '' => regular query
# query : the string of things to ask for
#
# - filtering
# field_name : value # Find all rows with field_name = value
# field_name : ">value" # Find all rows with field_name > value.
# field_name : "<value" # Find all rows with field_name < value.
# field_name-gt : value # Find all rows with field_name > value.
# field_name-lt : value # Find all rows with field_name < value.
#
# Parameters:
# ( $CGI ) : a single cgi object
# ( $HASH ) : a hash of the parameters
#
my $self = shift;
# find out what sort of a parameter we're dealing with
my $input = $self->common_param(@_);
# add additional parameters if required
foreach my $parameter ( keys %{$ATTRIBS} ) {
if ( not exists $input->{$parameter} ) {
$input->{$parameter} = $self->{$parameter};
}
}
# parse query
$self->debug( "Search Query: $$input{query}", 1 ) if ($self->{_debug});
my ( $query, $rejected ) = $self->_parse_query_string( $input->{'query'} );
$self->{rejected_keywords} = $rejected;
# setup the additional input parameters
$query = $self->_preset_options( $query, $input );
$self->debug( "Set the pre-options: ", $query ) if ($self->{_debug});
# now sort into distinct buckets
my $buckets = GT::SQL::Search::Base::Search::_create_buckets( $query );
$self->debug_dumper( "Created Buckets for querying: ", $buckets ) if ($self->{_debug});
require GT::SQL::Condition;
my $query_condition = new GT::SQL::Condition;
# now handle the separate possibilities
# the union
my $union_cond = $self->_get_condition( $buckets->{keywords}, $buckets->{phrases} );
$query_condition->add(GT::SQL::Condition->new(@$union_cond, 'OR')) if $union_cond;
# the intersect
my $intersect_cond = $self->_get_condition( $buckets->{keywords_must}, $buckets->{phrases_must} );
$query_condition->add(GT::SQL::Condition->new(@$intersect_cond)) if $intersect_cond;
# the disjoin
my $disjoin_cond = $self->_get_condition( $buckets->{keywords_cannot}, $buckets->{phrases_cannot} );
$query_condition->add(GT::SQL::Condition->new(@$disjoin_cond, 'OR')->not) if $disjoin_cond;
# now handle filters
my $cols = $self->{'table'}->cols();
my %filters = map {
(my $column = $_) =~ s/-[lg]t$//;
exists $cols->{$column}
? ($_ => $input->{$_})
: ()
} keys %{$input};
# if there was no query nor filter return nothing.
keys %$query or keys %filters or return $self->sth({});
if (keys %filters) {
$self->debug( "Creating Filters: ", \%filters ) if ($self->{_debug});
$self->_add_filters( \%filters );
$query_condition = GT::SQL::Condition->new( keys %$query ? $query_condition : (), $self->{filter} );
}
elsif ($self->{filter} and keys %{$self->{filter}} ) {
$self->debug( "Filtering results", $self->{filter} ) if ($self->{_debug});
$query_condition = GT::SQL::Condition->new( keys %$query ? $query_condition : (), $self->{filter} );
}
else {
$self->debug( "No filters being used.") if ($self->{_debug});
}
# now this query should probably clear the filters once it's been used, so i'll do that here
$self->{filter} = undef;
my $tbl = $self->{table};
my ($pk) = $tbl->pk;
# now run through a callback function if needed.
if ($self->{callback}) {
# Warning: this slows things a heck of a lot.
unless (ref $self->{callback} and ref $self->{callback} eq 'CODE') {
return $self->error ('BADARGS', 'FATAL', "callback '$self->{callback}' must be a code ref!");
}
my $sth = $tbl->select( [ $pk ], $query_condition );
my $results = {};
while (my $result = $sth->fetchrow) {
$results->{$result} = undef;
}
$self->debug_dumper("Running results through callback. Had: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
$results = $self->{callback}->($self, $results);
$self->debug_dumper("New result set: " . scalar (keys %$results) . " results.", $results) if ($self->{_debug});
$self->{rows} = scalar($results ? keys %{$results} : ());
return $self->sth( $results );
}
# and now create a search sth object to handle all this
$input->{nh} = (defined $input->{nh} and $input->{nh} =~ /^(\d+)$/) ? $1 : 1;
$input->{mh} = (defined $input->{mh} and $input->{mh} =~ /^(\d+)$/) ? $1 : 25;
$input->{so} = (defined $input->{so} and $input->{so} =~ /^(asc(?:end)?|desc(?:end)?)$/i) ? $1 : '';
# check that sb is not dangerous
my $sb = $self->clean_sb($input->{sb}, $input->{so});
my $offset = ( $input->{nh} - 1 ) * $input->{mh};
$tbl->select_options($sb) if ($sb);
$tbl->select_options("LIMIT $offset, $input->{mh}");
my $sth = $tbl->select( $query_condition ) or return;
# so how many hits did we get?
$self->{rows} = $sth->rows();
if (($input->{nh} > 1) or ($self->{rows} == $input->{mh})) {
$self->{rows} = $tbl->count($query_condition);
}
return $sth;
}
sub _get_condition {
#-------------------------------------------------------------------------------
my ( $self, $keywords, $phrases ) = @_;
my @list = ( keys %$keywords, keys %$phrases );
my $tbl = $self->{table} or return $self->error( 'NODRIVER', 'FATAL' );
my @cond = ();
my %tmp = $tbl->weight();
my @weights = keys %tmp or return;
foreach my $element ( @list ) {
my @where = ();
foreach my $cols ( @weights ) {
push @where, [$cols, 'LIKE', "%$element%"]; # Condition does quoting by default.
}
push @cond, GT::SQL::Condition->new(@where, 'OR');
}
@cond or return;
return \@cond;
}
sub _parse_query_string {
#------------------------------------------------------------
# Parses a query string '+foo -"bar this" alpha' into a hash of
# words and modes.
#
my ($self, $text) = @_;
my %modes = (
'+' => 'must',
'-' => 'cannot',
'<' => 'greater',
'>' => 'less'
);
# Latin will break up on actual words and punctuation.
if ($self->{latin_query_parse}) {
return $self->SUPER::_parse_query_string( $text );
}
else {
my $words = {};
my @terms;
my $i = 0;
foreach my $term (split /"/, $text) {
push @terms, ($i++ % 2 ? $term : split ' ', $term);
}
for (my $i = 0; $i < @terms; $i++) {
my $word = $terms[$i];
$word =~ s/^\s*|\s*$//g;
next if ($word eq '');
if ($i < $#terms) {
($word eq '-') and ($word = '-' . $terms[++$i]);
($word eq '+') and ($word = '+' . $terms[++$i]);
}
$word =~ s/^([<>+-])//;
my $mode = ($1 and $modes{$1} or 'can');
my $substring = ($word =~ s/\*$//) || 0;
if ($word =~ /\s/) {
$words->{$word} = {
mode => $mode,
phrase => 1,
substring => $substring,
keyword => 0,
};
}
elsif ($word) {
$words->{$word} = {
mode => $mode,
phrase => 0,
substring => $substring,
keyword => 1,
};
}
}
return $words;
}
}
1;