First pass at adding key files
This commit is contained in:
585
site/slowtwitch.com/cgi-bin/articles/admin/GT/SQL/Search.pm
Normal file
585
site/slowtwitch.com/cgi-bin/articles/admin/GT/SQL/Search.pm
Normal file
@ -0,0 +1,585 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||||
#
|
||||
# GT::SQL::Search
|
||||
# Author : Aki Mimoto
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: Search.pm,v 1.62 2008/09/23 23:55:26 brewt Exp $
|
||||
#
|
||||
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# highlevel class for searching, works with GT::SQL::Indexer
|
||||
#
|
||||
|
||||
package GT::SQL::Search;
|
||||
#--------------------------------------------------------------------------------
|
||||
|
||||
# pragmas
|
||||
use strict;
|
||||
use vars qw/@ISA $ERRORS $ERROR_MESSAGE $VERSION/;
|
||||
|
||||
# includes
|
||||
use GT::Base;
|
||||
use GT::AutoLoader;
|
||||
|
||||
# variables
|
||||
$VERSION = sprintf "%d.%03d", q$Revision: 1.62 $ =~ /(\d+)\.(\d+)/;
|
||||
@ISA = qw(GT::Base);
|
||||
$ERROR_MESSAGE = 'GT::SQL';
|
||||
$ERRORS = {
|
||||
UNKNOWNDRIVER => 'Unknown driver requested: %s',
|
||||
NOTABLE => 'Cannot find reference to table object'
|
||||
};
|
||||
|
||||
sub load_search {
|
||||
#--------------------------------------------------------------------------------
|
||||
# checks if there is driver for this current database and if so, loads that
|
||||
# instead (since it would be faster)
|
||||
#
|
||||
my $class = shift;
|
||||
my $opts = ref $_[0] ? $_[0] : {@_};
|
||||
$opts->{mode} = 'Search';
|
||||
my $driver = $class->load_driver( $opts ) or return;
|
||||
my $pkg = "GT::SQL::Search::${driver}::Search";
|
||||
return $pkg->load(@_);
|
||||
}
|
||||
|
||||
sub load_indexer {
|
||||
#--------------------------------------------------------------------------------
|
||||
# checks if there is driver for this current database and if so, loads that
|
||||
# instead (since it would be faster)
|
||||
#
|
||||
my $class = shift;
|
||||
my $opts = ref $_[0] ? $_[0] : {@_};
|
||||
$opts->{mode} = 'Indexer';
|
||||
my $driver = $class->load_driver( $opts ) or return;
|
||||
my $pkg = "GT::SQL::Search::${driver}::Indexer";
|
||||
|
||||
return $pkg->load(@_);
|
||||
}
|
||||
|
||||
sub driver_ok {
|
||||
#--------------------------------------------------------------------------------
|
||||
# checks to see if a particular driver is allowed on this system
|
||||
#
|
||||
my $class = shift;
|
||||
my $driver = uc shift or return;
|
||||
my $opts = ref $_[0] ? $_[0] : {@_};
|
||||
my $mode = $opts->{mode} || 'Indexer';
|
||||
my $tbl = $opts->{table} or return GT::SQL::Search->error( 'NOTABLE', 'FATAL' );
|
||||
my $pkg = 'GT::SQL::Search::' . $driver . '::' . $mode;
|
||||
|
||||
eval { require "GT/SQL/Search/$driver/$mode.pm" };
|
||||
$@ and return GT::SQL::Search->error('UNKNOWNDRIVER', 'WARN', $driver);
|
||||
return $pkg->can('ok') ? $pkg->ok($tbl) : 1;
|
||||
}
|
||||
|
||||
sub load_driver {
|
||||
#--------------------------------------------------------------------------------
|
||||
# Loads a driver into memory.
|
||||
#
|
||||
my $class = shift;
|
||||
my $opts = ref $_[0] ? $_[0] : {@_};
|
||||
my $tbl = $opts->{table};
|
||||
my $mode = $opts->{mode} || 'Indexer';
|
||||
my $driver = uc($opts->{driver} || $tbl->{schema}->{search_driver} || 'NONINDEXED');
|
||||
|
||||
require "GT/SQL/Search/$driver/$mode.pm";
|
||||
return $driver;
|
||||
}
|
||||
|
||||
sub available_drivers {
|
||||
#--------------------------------------------------------------------------------
|
||||
# Returns a list of available drivers.
|
||||
#
|
||||
my $class = shift;
|
||||
|
||||
(my $path = $INC{'GT/SQL/Search.pm'}) =~ s/\.pm$//;
|
||||
opendir DHANDLE, $path or return $class->fatal(CANTOPENDIR => $path, "$!");
|
||||
my @arr;
|
||||
for my $driver_name (readdir DHANDLE) {
|
||||
next if $driver_name =~ y/a-z//;
|
||||
next if $driver_name eq 'LUCENE';
|
||||
-f "$path/$driver_name/Search.pm" and -r _ or next;
|
||||
-f "$path/$driver_name/Indexer.pm" and -r _ or next;
|
||||
my $loaded = eval {
|
||||
require "GT/SQL/Search/$driver_name/Search.pm";
|
||||
require "GT/SQL/Search/$driver_name/Indexer.pm";
|
||||
};
|
||||
push @arr, $driver_name if $loaded;
|
||||
}
|
||||
closedir DHANDLE;
|
||||
return wantarray ? @arr : \@arr;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
GT::SQL::Search - internal driver for searching
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
This implements the query string based searching scheme for GT::SQL. Driver
|
||||
based, it is designed to take advantage of the different indexing schemes
|
||||
available on different database engines.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Instead of describing how Search.pm is interfaced* this will describe how a
|
||||
driver should be structured and how a new driver can be implemented.
|
||||
|
||||
* as it is never accessed directly by the programmer as it was designed to be
|
||||
called through the functions GT::SQL::Table::query and GT::SQL::Table::query_sth
|
||||
|
||||
=head2 Drivers
|
||||
|
||||
A driver has two parts. The Indexer and the Search packages are the most
|
||||
important. Howserver, for any driver in the search, there must exist a directory
|
||||
with the name of the driver in ALL CAPS. For exampel, MYSQL for MySQL, POSTGRES
|
||||
for Postgres. Within each driver directory, The Indexer and Search portions of
|
||||
the driver contains all the information required for initializing the database
|
||||
table and searching the database.
|
||||
|
||||
The Indexing package of the driver handles all the data that is manipulated in
|
||||
the database and also the initializes and the database for indexing.
|
||||
|
||||
The Search package handles the queries and retrieves results for the eventual
|
||||
consumption by the calling program.
|
||||
|
||||
Drivers are simply subclasses of the base driver module, GT::SQL::Search::Base
|
||||
and operate by overriding certain key functions.
|
||||
|
||||
The next few sections will cover how to create a search driver, and assumes a
|
||||
fair bit of familiarity with GT::SQL.
|
||||
|
||||
=head2 Structure of an Indexing Driver
|
||||
|
||||
The following is an absolutely simple skeleton driver that does nothing and but
|
||||
called "CUSTOM". Found in the CUSTOM directory, this is the search package, and
|
||||
would be call Search.pm in the GT/SQL/Search/CUSTOM library directory.
|
||||
|
||||
package GT::SQL::Search::CUSTOM::Search;
|
||||
#------------------------------------------
|
||||
use strict;
|
||||
use vars qw/ @ISA /;
|
||||
use GT::SQL::Search::Base::Search;
|
||||
@ISA = qw( GT::SQL::Search::Base::Search );
|
||||
|
||||
sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Search->new(@_) };
|
||||
|
||||
# overrides would go here
|
||||
|
||||
1;
|
||||
|
||||
For the indexer, another file, Indexer.pm would be found in the
|
||||
GT/SQL/Search/CUSTOM directory.
|
||||
|
||||
package GT::SQL::Search::CUSTOM::Indexer;
|
||||
#------------------------------------------
|
||||
|
||||
use strict;
|
||||
use vars qw/ @ISA /;
|
||||
use GT::SQL::Search::Base;
|
||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||||
|
||||
sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Indexer->new(@_) };
|
||||
|
||||
# overrides would go here
|
||||
|
||||
1;
|
||||
|
||||
The almost empty subs that immediately return with a value are functions that
|
||||
can be overridden to do special tasks. More will be detailed later.
|
||||
|
||||
The Driver has been split into two packages. The original package name,
|
||||
GT::SQL::Search::Nothing, houses the Search package.
|
||||
GT::SQL::Search::Nothing::Indexer is the Indexing portion of the seach system.
|
||||
"::Indexer" must be appended to the orginial search name for the indexer.
|
||||
|
||||
Each of the override functions are triggered at points just before and after a
|
||||
major event occurs in GT::SQL. Depending on the type of actions you require, you
|
||||
pick and chose which events you'd like your driver to attach to.
|
||||
|
||||
=head2 Structure of Indexing Driver
|
||||
|
||||
The Indexer is responsible for creating all the indexes, maintaining them and
|
||||
when the table is dropped, removing all the associated indexes.
|
||||
|
||||
The following header must be defined for the Indexer.
|
||||
GT::SQL::Search::Base::Indexer is the superclass that our driver inherits from.
|
||||
|
||||
package GT::SQL::Search::CUSTOM::Indexer;
|
||||
#------------------------------------------
|
||||
|
||||
use strict;
|
||||
use vars qw/ @ISA /;
|
||||
use GT::Base;
|
||||
use GT::SQL::Search::Base::Indexer;
|
||||
@ISA = qw/ GT::SQL::Search::Base::Indexer /;
|
||||
|
||||
In addition to the header, the following function must be defined.
|
||||
GT::SQL::Search::Driver::Indexer::load creates the new object and allows for
|
||||
special preinitialization that must occur. You can also create another driver
|
||||
silently (such as defaulting to INTERNAL after a version check fails).
|
||||
|
||||
sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Indexer->new(@_) };
|
||||
|
||||
Finally, there are the overrides. None of the override functions need be defined
|
||||
in your driver. Any calls made to undefined methods will silently fallback to
|
||||
the superclass driver's methods. When a method has been overridden, the function
|
||||
must return a true value when it is successful, otherwise the action will fail
|
||||
and an error generated.
|
||||
|
||||
Whenever a object is created it will receive one property $self->{table} which
|
||||
is the table that is being worked upon. This property is available in all the
|
||||
method calls and is required for methods such as _create_table and
|
||||
_drop_search_driver methods.
|
||||
|
||||
When a table is first created or when a table is destroyed the following two
|
||||
functions are called. They are not passed any special values, however, these are
|
||||
all class methods and $self->{table} will be a reference to the current table in
|
||||
use.
|
||||
|
||||
This set of overrides are used by GT::SQL::Creator when the ::create method is
|
||||
called. They are called just prior and then after the create table sql query has
|
||||
been executed.
|
||||
|
||||
=over 2
|
||||
|
||||
=item pre_create_table
|
||||
|
||||
=item post_create_table
|
||||
|
||||
These functions receive no special parameters. They will receive the data to the
|
||||
table in the $self->{table} property.
|
||||
|
||||
=back
|
||||
|
||||
This next set of functions take place in GT::SQL::Editor.
|
||||
|
||||
=over 2
|
||||
|
||||
=item drop_search_driver
|
||||
|
||||
This method receives no special parameters but is responsible for removing all
|
||||
indexes and "things" associated with the indexing schema.
|
||||
|
||||
=item add_search_driver
|
||||
|
||||
Receives no extra parameters. Creates all indexes and does all actions required
|
||||
to initialize indexing scheme.
|
||||
|
||||
=item pre_add_column
|
||||
|
||||
=item post_add_column
|
||||
|
||||
The previous two functions are called just before and after a new column is
|
||||
added.
|
||||
|
||||
pre_add_column accepts $name (of column), $col (hashref of column attributes).
|
||||
The method will only be called if the column has a weight associated with it.
|
||||
The function must return a non-zero value if successful. Note that the returned
|
||||
value will be passed into the post_add_column so temporary values can be passed
|
||||
through if required.
|
||||
|
||||
post_add_column accepts $name (of column), $col (hashref of column attributes),
|
||||
$results (of pre_add_column). This method is called just after the column has
|
||||
been inserted into the database.
|
||||
|
||||
=item pre_delete_column
|
||||
|
||||
=item post_delete_column
|
||||
|
||||
These previous functions are called just before and after the sql for a old
|
||||
column is deleted. They must remove all objects and "things" associated with a
|
||||
particular column's index.
|
||||
|
||||
pre_delete_column accepts $name (of column), $col (hashref of column
|
||||
attributes). The method will only be called if the column has a weight
|
||||
associated with it. The function must return a non-zero value if successful.
|
||||
Note that the returned value will be passed into the post_delete_column so
|
||||
temporary values can be passed through if required.
|
||||
|
||||
post_delete_column accepts $name (of column), $col (hashref of column
|
||||
attributes), $results (of pre_add_column). This method is called just after the
|
||||
column has been dropped from the database.
|
||||
|
||||
=item pre_drop_table
|
||||
|
||||
=item post_drop_table
|
||||
|
||||
The two previous methods are used before and after the table is dropped. The
|
||||
methods must remove any tables or "things" related to indexing from the table.
|
||||
|
||||
pre_drop_table receives no arguments. It can find a copy of the current table
|
||||
and columns associated in $self->{table}.
|
||||
|
||||
post_drop_table receives one argument, which is the result of the
|
||||
pre_drop_table.
|
||||
|
||||
=back
|
||||
|
||||
The following set of functions take place in GT::SQL::Table
|
||||
|
||||
=over 2
|
||||
|
||||
=item pre_add_record
|
||||
|
||||
=item post_add_record
|
||||
|
||||
Called just before and after an insert occurs. These functions take the record
|
||||
and indexes them as required.
|
||||
|
||||
pre_add_record will receive one argument, $rec, hashref, which is the record
|
||||
that will be inserted into the database. Table information can be found by
|
||||
accessing $self->{table} Much like the other functions, on success the result
|
||||
will be cached and fed into the post_add_record function.
|
||||
|
||||
post_add_record receives $rec, a hashref to describing the new result, the $sth
|
||||
of the insert query, and the result of the pre_add_record method. The result
|
||||
from $sth->insert_id if there is a ai field will be the new unique primary key.
|
||||
|
||||
=item pre_update_record
|
||||
|
||||
=item post_update_record
|
||||
|
||||
Intercepts the update request before and just after the sql query is executed.
|
||||
This override has the potential of being rather messy. More than one record can
|
||||
be modified in this action and the indexer must work a lot to ensure the
|
||||
database is up to snuff.
|
||||
|
||||
pre_update_record receives two parameters, $set_cond, $where_cond. $set_cond is
|
||||
a hashref containing the new values that must be set, and $where_cond is a
|
||||
GT::SQL::Condition object selecting records to update. The result once again, is
|
||||
cached and if undef is considered an error.
|
||||
|
||||
post_update_record takes the same parameters as pre_update_record, except one
|
||||
extra paremeter, the result of pre_update_record.
|
||||
|
||||
=item pre_delete_record
|
||||
|
||||
=item post_delete_record
|
||||
|
||||
Called just before and after the deletion request for records are called.
|
||||
|
||||
pre_delete_record, has only one parameter, $where, a GT::SQL::Condition object
|
||||
telling which records to delete. The results of this method are passed to
|
||||
post_delete_record.
|
||||
|
||||
post_delete_record, has one addition parameter to pre_delete_record and like
|
||||
most post_ methods, is the result of the pre_delete_record method.
|
||||
|
||||
=item pre_delete_all_records
|
||||
|
||||
=item post_delete_all_records
|
||||
|
||||
These two functions are quite simple, but they are different from drop search
|
||||
driver in that though the records are all dropped, the framework for all the
|
||||
indexing is not dropped as well.
|
||||
|
||||
Neither function is passed any special data, except for post_delete_all_records
|
||||
which receives the rsults of the pre_delete_all_records method.
|
||||
|
||||
=item reindex_all
|
||||
|
||||
This function is sometimes called by the user to refresh the index. The
|
||||
motivation for this, in the case of the INTERNAL driver, is sometimes due to
|
||||
outside manipulation of the database tables, the index can become
|
||||
non-representative of the data in the tables. This method is to force the
|
||||
indexing system to fix errors that have passed.
|
||||
|
||||
=item ok
|
||||
|
||||
This function is called by GT::SQL::Search as a package method,
|
||||
GT::SQL::Search::EXAMPLE::Indexer->ok( $table ); and is passed a table object
|
||||
reference. What this function must do is to return a true or false value that
|
||||
tells the search system if this driver can be used. The MYSQL driver has a good
|
||||
example for this, it tests to ensure that the mysql database system version is
|
||||
at least 3.23.23.
|
||||
|
||||
=back
|
||||
|
||||
=head2 Structure of a Search Driver
|
||||
|
||||
The Searcher is responsible for only one thing, to return results from a query
|
||||
search. You can override the parser, however, subclassing the following methods
|
||||
will have full parsing for all things such as +/-, string parsing and substring
|
||||
matching.
|
||||
|
||||
The structures passed into the methods get a little complicated so beware!
|
||||
|
||||
ALL the following functions receive two parameters, the first is a search
|
||||
parameters detailing the words/phrases to search for, the second parameter is
|
||||
the current result set of IDs => scores.
|
||||
|
||||
There are two types of search parameters, one for words and the other for
|
||||
phrases. The structure is a little messy so I'll detail them here.
|
||||
|
||||
For words, the structure is like the following:
|
||||
|
||||
$word_search = {
|
||||
'word' => {
|
||||
substring => '1', # set to 1 if this is substring match
|
||||
phrase => 0, # not a phrase
|
||||
keyword => 1, # is a keyword
|
||||
mode => '', # can also be must, cannot to mean +/-
|
||||
},
|
||||
'word2' => ...
|
||||
}
|
||||
|
||||
For phrases the structure will become:
|
||||
|
||||
$phrase_search => {
|
||||
'phrase' => {
|
||||
substring => undef # never required
|
||||
phrase => [
|
||||
'word1',
|
||||
'word2',
|
||||
'word3',
|
||||
...
|
||||
], # for searching by indiv word if required
|
||||
keyword => 0, # not a keyword
|
||||
mode => '' # can also be must, cannot
|
||||
},
|
||||
'phrase2' => ...
|
||||
}
|
||||
|
||||
Based on these structures, hopefully it will be easy enough to build whatever is
|
||||
required to grab the appropriate records.
|
||||
|
||||
Finally, the second item passed in will be a hash filled with ID => score values
|
||||
of search results. They look something like this:
|
||||
|
||||
$results = {
|
||||
1 => 56,
|
||||
2 => 31,
|
||||
4 => 6
|
||||
}
|
||||
|
||||
It is important for all the methods to take the results and return the results,
|
||||
as the result set will be daisychained down like a set to be operated on by
|
||||
various searching schemes.
|
||||
|
||||
At the end of the query, the results in this set will be sorted and returned to
|
||||
the user as an sth.
|
||||
|
||||
Operations on this set are preformed by the following five methods.
|
||||
|
||||
=over 2
|
||||
|
||||
=item _query
|
||||
|
||||
This method is called just after all the query string has been parsed and put
|
||||
into their proper buckets. This method is overridden by the INTERNAL driver to
|
||||
decide it wants to switch to the NONINDEX driver for better performance.
|
||||
|
||||
Two parameters are passed in, ( $input, $buckets ). $input is a hash that
|
||||
contains all the form/cgi parameters passed to the $tbl->query function and
|
||||
$buckets is s the structure that is created after the query string is parsed.
|
||||
You may also call $self->SUPER::_query( $input, $buckets ) to pass the request
|
||||
along normally.
|
||||
|
||||
You must return undef or an STH from this function.
|
||||
|
||||
=item _union_query
|
||||
|
||||
This method takes a $word_search and does a simple match query. If it finds
|
||||
records with any of the words included, it will append the results to the list.
|
||||
Passed in is the $results and it must return the altered results set.
|
||||
|
||||
This method must also implement substring searching.
|
||||
|
||||
=item _phrase_query
|
||||
|
||||
Just like the union_query, however it searches based on phrases.
|
||||
|
||||
=item _phrase_intersect_query
|
||||
|
||||
This takes a $phrase_search and a $result as parameters. This method must look
|
||||
to find results that are found within the current result set that have the
|
||||
passed phrases as well. However, if there are no results found, this method can
|
||||
look for more results.
|
||||
|
||||
=item _intersect_query
|
||||
|
||||
Takes two parameters, a $word_search, and $results. Just like the
|
||||
_phrase_intersect query, if there are results already, tries to whittle away the
|
||||
result set. If there are no results, tries to look for results that have all the
|
||||
keywords in a record.
|
||||
|
||||
This method must also implement substring searching.
|
||||
|
||||
=item _disjoin_query
|
||||
|
||||
Takes two parameters, a $word_search, and $results. This will look through the
|
||||
result set and remove all matches to any of the keywords.
|
||||
|
||||
This method must also implement substring searching.
|
||||
|
||||
=item _phrase_disjoin_query
|
||||
|
||||
Two parameters, $phrase_search and $results are passed to this method. This does
|
||||
the exact same thing as _disjoin_query but it looks for phrases.
|
||||
|
||||
=item query
|
||||
|
||||
If you choose to override this method, you will have full control of the query.
|
||||
|
||||
This method accepts a $CGI or a $HASH object and performs the following
|
||||
|
||||
Options:
|
||||
- paging
|
||||
mh : max hits
|
||||
nh : number hit (or page of hits)
|
||||
sb : column to sort by (default is by score)
|
||||
|
||||
- searching
|
||||
ww : whole word
|
||||
ma : 1 => OR match, 0 => AND match, undefined => QUERY
|
||||
substring : search for substrings of words
|
||||
bool : 'and' => and search, 'or' => or search, '' => regular query
|
||||
query : the string of things to ask for
|
||||
|
||||
- filtering
|
||||
field_name : value # Find all rows with field_name = value
|
||||
field_name : ">value" # Find all rows with field_name > value.
|
||||
field_name : "<value" # Find all rows with field_name < value.
|
||||
field_name-gt : value # Find all rows with field_name > value.
|
||||
field_name-lt : value # Find all rows with field_name < value.
|
||||
|
||||
The function must return a STH object. However, you may find useful the
|
||||
GT::SQL::Search::STH object, which will automatically handle mh, nh, and
|
||||
alternative sorting requests. All you will have to do is
|
||||
|
||||
sub query { ... your code ... return $self->sth( $results ); }
|
||||
|
||||
Where results is a hashref containing primarykeyvalue => scorevalues.
|
||||
|
||||
=item alternate_driver_query
|
||||
|
||||
There is no reason to override this method, however, if you would like to use
|
||||
another driver's search instead of the current, this method will let you do so.
|
||||
|
||||
Accepting 2 parameters, ( $drivername, $input ), where $drivername is the name
|
||||
of the driver you'd like to use and $input is the parameters passed to the
|
||||
method. Returned is an $sth value (undef if an error has occurred). This method
|
||||
was used in the INTERNAL driver to shunt to NONINDEXED if it found the search
|
||||
would take too long.
|
||||
|
||||
=back
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||||
http://www.gossamer-threads.com/
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
Revision: $Id: Search.pm,v 1.62 2008/09/23 23:55:26 brewt Exp $
|
||||
|
||||
=cut
|
Reference in New Issue
Block a user