586 lines
21 KiB
Perl
586 lines
21 KiB
Perl
# ==================================================================
|
|
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
|
#
|
|
# GT::SQL::Search
|
|
# Author : Aki Mimoto
|
|
# CVS Info : 087,071,086,086,085
|
|
# $Id: Search.pm,v 1.62 2008/09/23 23:55:26 brewt Exp $
|
|
#
|
|
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
|
# ==================================================================
|
|
#
|
|
# Description:
|
|
# highlevel class for searching, works with GT::SQL::Indexer
|
|
#
|
|
|
|
package GT::SQL::Search;
|
|
#--------------------------------------------------------------------------------
|
|
|
|
# pragmas
|
|
use strict;
|
|
use vars qw/@ISA $ERRORS $ERROR_MESSAGE $VERSION/;
|
|
|
|
# includes
|
|
use GT::Base;
|
|
use GT::AutoLoader;
|
|
|
|
# variables
|
|
$VERSION = sprintf "%d.%03d", q$Revision: 1.62 $ =~ /(\d+)\.(\d+)/;
|
|
@ISA = qw(GT::Base);
|
|
$ERROR_MESSAGE = 'GT::SQL';
|
|
$ERRORS = {
|
|
UNKNOWNDRIVER => 'Unknown driver requested: %s',
|
|
NOTABLE => 'Cannot find reference to table object'
|
|
};
|
|
|
|
sub load_search {
|
|
#--------------------------------------------------------------------------------
|
|
# checks if there is driver for this current database and if so, loads that
|
|
# instead (since it would be faster)
|
|
#
|
|
my $class = shift;
|
|
my $opts = ref $_[0] ? $_[0] : {@_};
|
|
$opts->{mode} = 'Search';
|
|
my $driver = $class->load_driver( $opts ) or return;
|
|
my $pkg = "GT::SQL::Search::${driver}::Search";
|
|
return $pkg->load(@_);
|
|
}
|
|
|
|
sub load_indexer {
|
|
#--------------------------------------------------------------------------------
|
|
# checks if there is driver for this current database and if so, loads that
|
|
# instead (since it would be faster)
|
|
#
|
|
my $class = shift;
|
|
my $opts = ref $_[0] ? $_[0] : {@_};
|
|
$opts->{mode} = 'Indexer';
|
|
my $driver = $class->load_driver( $opts ) or return;
|
|
my $pkg = "GT::SQL::Search::${driver}::Indexer";
|
|
|
|
return $pkg->load(@_);
|
|
}
|
|
|
|
sub driver_ok {
|
|
#--------------------------------------------------------------------------------
|
|
# checks to see if a particular driver is allowed on this system
|
|
#
|
|
my $class = shift;
|
|
my $driver = uc shift or return;
|
|
my $opts = ref $_[0] ? $_[0] : {@_};
|
|
my $mode = $opts->{mode} || 'Indexer';
|
|
my $tbl = $opts->{table} or return GT::SQL::Search->error( 'NOTABLE', 'FATAL' );
|
|
my $pkg = 'GT::SQL::Search::' . $driver . '::' . $mode;
|
|
|
|
eval { require "GT/SQL/Search/$driver/$mode.pm" };
|
|
$@ and return GT::SQL::Search->error('UNKNOWNDRIVER', 'WARN', $driver);
|
|
return $pkg->can('ok') ? $pkg->ok($tbl) : 1;
|
|
}
|
|
|
|
sub load_driver {
|
|
#--------------------------------------------------------------------------------
|
|
# Loads a driver into memory.
|
|
#
|
|
my $class = shift;
|
|
my $opts = ref $_[0] ? $_[0] : {@_};
|
|
my $tbl = $opts->{table};
|
|
my $mode = $opts->{mode} || 'Indexer';
|
|
my $driver = uc($opts->{driver} || $tbl->{schema}->{search_driver} || 'NONINDEXED');
|
|
|
|
require "GT/SQL/Search/$driver/$mode.pm";
|
|
return $driver;
|
|
}
|
|
|
|
sub available_drivers {
|
|
#--------------------------------------------------------------------------------
|
|
# Returns a list of available drivers.
|
|
#
|
|
my $class = shift;
|
|
|
|
(my $path = $INC{'GT/SQL/Search.pm'}) =~ s/\.pm$//;
|
|
opendir DHANDLE, $path or return $class->fatal(CANTOPENDIR => $path, "$!");
|
|
my @arr;
|
|
for my $driver_name (readdir DHANDLE) {
|
|
next if $driver_name =~ y/a-z//;
|
|
next if $driver_name eq 'LUCENE';
|
|
-f "$path/$driver_name/Search.pm" and -r _ or next;
|
|
-f "$path/$driver_name/Indexer.pm" and -r _ or next;
|
|
my $loaded = eval {
|
|
require "GT/SQL/Search/$driver_name/Search.pm";
|
|
require "GT/SQL/Search/$driver_name/Indexer.pm";
|
|
};
|
|
push @arr, $driver_name if $loaded;
|
|
}
|
|
closedir DHANDLE;
|
|
return wantarray ? @arr : \@arr;
|
|
}
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
GT::SQL::Search - internal driver for searching
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
This implements the query string based searching scheme for GT::SQL. Driver
|
|
based, it is designed to take advantage of the different indexing schemes
|
|
available on different database engines.
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
Instead of describing how Search.pm is interfaced* this will describe how a
|
|
driver should be structured and how a new driver can be implemented.
|
|
|
|
* as it is never accessed directly by the programmer as it was designed to be
|
|
called through the functions GT::SQL::Table::query and GT::SQL::Table::query_sth
|
|
|
|
=head2 Drivers
|
|
|
|
A driver has two parts. The Indexer and the Search packages are the most
|
|
important. Howserver, for any driver in the search, there must exist a directory
|
|
with the name of the driver in ALL CAPS. For exampel, MYSQL for MySQL, POSTGRES
|
|
for Postgres. Within each driver directory, The Indexer and Search portions of
|
|
the driver contains all the information required for initializing the database
|
|
table and searching the database.
|
|
|
|
The Indexing package of the driver handles all the data that is manipulated in
|
|
the database and also the initializes and the database for indexing.
|
|
|
|
The Search package handles the queries and retrieves results for the eventual
|
|
consumption by the calling program.
|
|
|
|
Drivers are simply subclasses of the base driver module, GT::SQL::Search::Base
|
|
and operate by overriding certain key functions.
|
|
|
|
The next few sections will cover how to create a search driver, and assumes a
|
|
fair bit of familiarity with GT::SQL.
|
|
|
|
=head2 Structure of an Indexing Driver
|
|
|
|
The following is an absolutely simple skeleton driver that does nothing and but
|
|
called "CUSTOM". Found in the CUSTOM directory, this is the search package, and
|
|
would be call Search.pm in the GT/SQL/Search/CUSTOM library directory.
|
|
|
|
package GT::SQL::Search::CUSTOM::Search;
|
|
#------------------------------------------
|
|
use strict;
|
|
use vars qw/ @ISA /;
|
|
use GT::SQL::Search::Base::Search;
|
|
@ISA = qw( GT::SQL::Search::Base::Search );
|
|
|
|
sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Search->new(@_) };
|
|
|
|
# overrides would go here
|
|
|
|
1;
|
|
|
|
For the indexer, another file, Indexer.pm would be found in the
|
|
GT/SQL/Search/CUSTOM directory.
|
|
|
|
package GT::SQL::Search::CUSTOM::Indexer;
|
|
#------------------------------------------
|
|
|
|
use strict;
|
|
use vars qw/ @ISA /;
|
|
use GT::SQL::Search::Base;
|
|
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
|
|
|
sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Indexer->new(@_) };
|
|
|
|
# overrides would go here
|
|
|
|
1;
|
|
|
|
The almost empty subs that immediately return with a value are functions that
|
|
can be overridden to do special tasks. More will be detailed later.
|
|
|
|
The Driver has been split into two packages. The original package name,
|
|
GT::SQL::Search::Nothing, houses the Search package.
|
|
GT::SQL::Search::Nothing::Indexer is the Indexing portion of the seach system.
|
|
"::Indexer" must be appended to the orginial search name for the indexer.
|
|
|
|
Each of the override functions are triggered at points just before and after a
|
|
major event occurs in GT::SQL. Depending on the type of actions you require, you
|
|
pick and chose which events you'd like your driver to attach to.
|
|
|
|
=head2 Structure of Indexing Driver
|
|
|
|
The Indexer is responsible for creating all the indexes, maintaining them and
|
|
when the table is dropped, removing all the associated indexes.
|
|
|
|
The following header must be defined for the Indexer.
|
|
GT::SQL::Search::Base::Indexer is the superclass that our driver inherits from.
|
|
|
|
package GT::SQL::Search::CUSTOM::Indexer;
|
|
#------------------------------------------
|
|
|
|
use strict;
|
|
use vars qw/ @ISA /;
|
|
use GT::Base;
|
|
use GT::SQL::Search::Base::Indexer;
|
|
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
|
|
|
In addition to the header, the following function must be defined.
|
|
GT::SQL::Search::Driver::Indexer::load creates the new object and allows for
|
|
special preinitialization that must occur. You can also create another driver
|
|
silently (such as defaulting to INTERNAL after a version check fails).
|
|
|
|
sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Indexer->new(@_) };
|
|
|
|
Finally, there are the overrides. None of the override functions need be defined
|
|
in your driver. Any calls made to undefined methods will silently fallback to
|
|
the superclass driver's methods. When a method has been overridden, the function
|
|
must return a true value when it is successful, otherwise the action will fail
|
|
and an error generated.
|
|
|
|
Whenever a object is created it will receive one property $self->{table} which
|
|
is the table that is being worked upon. This property is available in all the
|
|
method calls and is required for methods such as _create_table and
|
|
_drop_search_driver methods.
|
|
|
|
When a table is first created or when a table is destroyed the following two
|
|
functions are called. They are not passed any special values, however, these are
|
|
all class methods and $self->{table} will be a reference to the current table in
|
|
use.
|
|
|
|
This set of overrides are used by GT::SQL::Creator when the ::create method is
|
|
called. They are called just prior and then after the create table sql query has
|
|
been executed.
|
|
|
|
=over 2
|
|
|
|
=item pre_create_table
|
|
|
|
=item post_create_table
|
|
|
|
These functions receive no special parameters. They will receive the data to the
|
|
table in the $self->{table} property.
|
|
|
|
=back
|
|
|
|
This next set of functions take place in GT::SQL::Editor.
|
|
|
|
=over 2
|
|
|
|
=item drop_search_driver
|
|
|
|
This method receives no special parameters but is responsible for removing all
|
|
indexes and "things" associated with the indexing schema.
|
|
|
|
=item add_search_driver
|
|
|
|
Receives no extra parameters. Creates all indexes and does all actions required
|
|
to initialize indexing scheme.
|
|
|
|
=item pre_add_column
|
|
|
|
=item post_add_column
|
|
|
|
The previous two functions are called just before and after a new column is
|
|
added.
|
|
|
|
pre_add_column accepts $name (of column), $col (hashref of column attributes).
|
|
The method will only be called if the column has a weight associated with it.
|
|
The function must return a non-zero value if successful. Note that the returned
|
|
value will be passed into the post_add_column so temporary values can be passed
|
|
through if required.
|
|
|
|
post_add_column accepts $name (of column), $col (hashref of column attributes),
|
|
$results (of pre_add_column). This method is called just after the column has
|
|
been inserted into the database.
|
|
|
|
=item pre_delete_column
|
|
|
|
=item post_delete_column
|
|
|
|
These previous functions are called just before and after the sql for a old
|
|
column is deleted. They must remove all objects and "things" associated with a
|
|
particular column's index.
|
|
|
|
pre_delete_column accepts $name (of column), $col (hashref of column
|
|
attributes). The method will only be called if the column has a weight
|
|
associated with it. The function must return a non-zero value if successful.
|
|
Note that the returned value will be passed into the post_delete_column so
|
|
temporary values can be passed through if required.
|
|
|
|
post_delete_column accepts $name (of column), $col (hashref of column
|
|
attributes), $results (of pre_add_column). This method is called just after the
|
|
column has been dropped from the database.
|
|
|
|
=item pre_drop_table
|
|
|
|
=item post_drop_table
|
|
|
|
The two previous methods are used before and after the table is dropped. The
|
|
methods must remove any tables or "things" related to indexing from the table.
|
|
|
|
pre_drop_table receives no arguments. It can find a copy of the current table
|
|
and columns associated in $self->{table}.
|
|
|
|
post_drop_table receives one argument, which is the result of the
|
|
pre_drop_table.
|
|
|
|
=back
|
|
|
|
The following set of functions take place in GT::SQL::Table
|
|
|
|
=over 2
|
|
|
|
=item pre_add_record
|
|
|
|
=item post_add_record
|
|
|
|
Called just before and after an insert occurs. These functions take the record
|
|
and indexes them as required.
|
|
|
|
pre_add_record will receive one argument, $rec, hashref, which is the record
|
|
that will be inserted into the database. Table information can be found by
|
|
accessing $self->{table} Much like the other functions, on success the result
|
|
will be cached and fed into the post_add_record function.
|
|
|
|
post_add_record receives $rec, a hashref to describing the new result, the $sth
|
|
of the insert query, and the result of the pre_add_record method. The result
|
|
from $sth->insert_id if there is a ai field will be the new unique primary key.
|
|
|
|
=item pre_update_record
|
|
|
|
=item post_update_record
|
|
|
|
Intercepts the update request before and just after the sql query is executed.
|
|
This override has the potential of being rather messy. More than one record can
|
|
be modified in this action and the indexer must work a lot to ensure the
|
|
database is up to snuff.
|
|
|
|
pre_update_record receives two parameters, $set_cond, $where_cond. $set_cond is
|
|
a hashref containing the new values that must be set, and $where_cond is a
|
|
GT::SQL::Condition object selecting records to update. The result once again, is
|
|
cached and if undef is considered an error.
|
|
|
|
post_update_record takes the same parameters as pre_update_record, except one
|
|
extra paremeter, the result of pre_update_record.
|
|
|
|
=item pre_delete_record
|
|
|
|
=item post_delete_record
|
|
|
|
Called just before and after the deletion request for records are called.
|
|
|
|
pre_delete_record, has only one parameter, $where, a GT::SQL::Condition object
|
|
telling which records to delete. The results of this method are passed to
|
|
post_delete_record.
|
|
|
|
post_delete_record, has one addition parameter to pre_delete_record and like
|
|
most post_ methods, is the result of the pre_delete_record method.
|
|
|
|
=item pre_delete_all_records
|
|
|
|
=item post_delete_all_records
|
|
|
|
These two functions are quite simple, but they are different from drop search
|
|
driver in that though the records are all dropped, the framework for all the
|
|
indexing is not dropped as well.
|
|
|
|
Neither function is passed any special data, except for post_delete_all_records
|
|
which receives the rsults of the pre_delete_all_records method.
|
|
|
|
=item reindex_all
|
|
|
|
This function is sometimes called by the user to refresh the index. The
|
|
motivation for this, in the case of the INTERNAL driver, is sometimes due to
|
|
outside manipulation of the database tables, the index can become
|
|
non-representative of the data in the tables. This method is to force the
|
|
indexing system to fix errors that have passed.
|
|
|
|
=item ok
|
|
|
|
This function is called by GT::SQL::Search as a package method,
|
|
GT::SQL::Search::EXAMPLE::Indexer->ok( $table ); and is passed a table object
|
|
reference. What this function must do is to return a true or false value that
|
|
tells the search system if this driver can be used. The MYSQL driver has a good
|
|
example for this, it tests to ensure that the mysql database system version is
|
|
at least 3.23.23.
|
|
|
|
=back
|
|
|
|
=head2 Structure of a Search Driver
|
|
|
|
The Searcher is responsible for only one thing, to return results from a query
|
|
search. You can override the parser, however, subclassing the following methods
|
|
will have full parsing for all things such as +/-, string parsing and substring
|
|
matching.
|
|
|
|
The structures passed into the methods get a little complicated so beware!
|
|
|
|
ALL the following functions receive two parameters, the first is a search
|
|
parameters detailing the words/phrases to search for, the second parameter is
|
|
the current result set of IDs => scores.
|
|
|
|
There are two types of search parameters, one for words and the other for
|
|
phrases. The structure is a little messy so I'll detail them here.
|
|
|
|
For words, the structure is like the following:
|
|
|
|
$word_search = {
|
|
'word' => {
|
|
substring => '1', # set to 1 if this is substring match
|
|
phrase => 0, # not a phrase
|
|
keyword => 1, # is a keyword
|
|
mode => '', # can also be must, cannot to mean +/-
|
|
},
|
|
'word2' => ...
|
|
}
|
|
|
|
For phrases the structure will become:
|
|
|
|
$phrase_search => {
|
|
'phrase' => {
|
|
substring => undef # never required
|
|
phrase => [
|
|
'word1',
|
|
'word2',
|
|
'word3',
|
|
...
|
|
], # for searching by indiv word if required
|
|
keyword => 0, # not a keyword
|
|
mode => '' # can also be must, cannot
|
|
},
|
|
'phrase2' => ...
|
|
}
|
|
|
|
Based on these structures, hopefully it will be easy enough to build whatever is
|
|
required to grab the appropriate records.
|
|
|
|
Finally, the second item passed in will be a hash filled with ID => score values
|
|
of search results. They look something like this:
|
|
|
|
$results = {
|
|
1 => 56,
|
|
2 => 31,
|
|
4 => 6
|
|
}
|
|
|
|
It is important for all the methods to take the results and return the results,
|
|
as the result set will be daisychained down like a set to be operated on by
|
|
various searching schemes.
|
|
|
|
At the end of the query, the results in this set will be sorted and returned to
|
|
the user as an sth.
|
|
|
|
Operations on this set are preformed by the following five methods.
|
|
|
|
=over 2
|
|
|
|
=item _query
|
|
|
|
This method is called just after all the query string has been parsed and put
|
|
into their proper buckets. This method is overridden by the INTERNAL driver to
|
|
decide it wants to switch to the NONINDEX driver for better performance.
|
|
|
|
Two parameters are passed in, ( $input, $buckets ). $input is a hash that
|
|
contains all the form/cgi parameters passed to the $tbl->query function and
|
|
$buckets is s the structure that is created after the query string is parsed.
|
|
You may also call $self->SUPER::_query( $input, $buckets ) to pass the request
|
|
along normally.
|
|
|
|
You must return undef or an STH from this function.
|
|
|
|
=item _union_query
|
|
|
|
This method takes a $word_search and does a simple match query. If it finds
|
|
records with any of the words included, it will append the results to the list.
|
|
Passed in is the $results and it must return the altered results set.
|
|
|
|
This method must also implement substring searching.
|
|
|
|
=item _phrase_query
|
|
|
|
Just like the union_query, however it searches based on phrases.
|
|
|
|
=item _phrase_intersect_query
|
|
|
|
This takes a $phrase_search and a $result as parameters. This method must look
|
|
to find results that are found within the current result set that have the
|
|
passed phrases as well. However, if there are no results found, this method can
|
|
look for more results.
|
|
|
|
=item _intersect_query
|
|
|
|
Takes two parameters, a $word_search, and $results. Just like the
|
|
_phrase_intersect query, if there are results already, tries to whittle away the
|
|
result set. If there are no results, tries to look for results that have all the
|
|
keywords in a record.
|
|
|
|
This method must also implement substring searching.
|
|
|
|
=item _disjoin_query
|
|
|
|
Takes two parameters, a $word_search, and $results. This will look through the
|
|
result set and remove all matches to any of the keywords.
|
|
|
|
This method must also implement substring searching.
|
|
|
|
=item _phrase_disjoin_query
|
|
|
|
Two parameters, $phrase_search and $results are passed to this method. This does
|
|
the exact same thing as _disjoin_query but it looks for phrases.
|
|
|
|
=item query
|
|
|
|
If you choose to override this method, you will have full control of the query.
|
|
|
|
This method accepts a $CGI or a $HASH object and performs the following
|
|
|
|
Options:
|
|
- paging
|
|
mh : max hits
|
|
nh : number hit (or page of hits)
|
|
sb : column to sort by (default is by score)
|
|
|
|
- searching
|
|
ww : whole word
|
|
ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
|
substring : search for substrings of words
|
|
bool : 'and' => and search, 'or' => or search, '' => regular query
|
|
query : the string of things to ask for
|
|
|
|
- filtering
|
|
field_name : value # Find all rows with field_name = value
|
|
field_name : ">value" # Find all rows with field_name > value.
|
|
field_name : "<value" # Find all rows with field_name < value.
|
|
field_name-gt : value # Find all rows with field_name > value.
|
|
field_name-lt : value # Find all rows with field_name < value.
|
|
|
|
The function must return a STH object. However, you may find useful the
|
|
GT::SQL::Search::STH object, which will automatically handle mh, nh, and
|
|
alternative sorting requests. All you will have to do is
|
|
|
|
sub query { ... your code ... return $self->sth( $results ); }
|
|
|
|
Where results is a hashref containing primarykeyvalue => scorevalues.
|
|
|
|
=item alternate_driver_query
|
|
|
|
There is no reason to override this method, however, if you would like to use
|
|
another driver's search instead of the current, this method will let you do so.
|
|
|
|
Accepting 2 parameters, ( $drivername, $input ), where $drivername is the name
|
|
of the driver you'd like to use and $input is the parameters passed to the
|
|
method. Returned is an $sth value (undef if an error has occurred). This method
|
|
was used in the INTERNAL driver to shunt to NONINDEXED if it found the search
|
|
would take too long.
|
|
|
|
=back
|
|
|
|
=head1 COPYRIGHT
|
|
|
|
Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
|
http://www.gossamer-threads.com/
|
|
|
|
=head1 VERSION
|
|
|
|
Revision: $Id: Search.pm,v 1.62 2008/09/23 23:55:26 brewt Exp $
|
|
|
|
=cut
|