# ================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::SQL::Search # Author : Aki Mimoto # CVS Info : 087,071,086,086,085 # $Id: Search.pm,v 1.62 2008/09/23 23:55:26 brewt Exp $ # # Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. # ================================================================== # # Description: # highlevel class for searching, works with GT::SQL::Indexer # package GT::SQL::Search; #-------------------------------------------------------------------------------- # pragmas use strict; use vars qw/@ISA $ERRORS $ERROR_MESSAGE $VERSION/; # includes use GT::Base; use GT::AutoLoader; # variables $VERSION = sprintf "%d.%03d", q$Revision: 1.62 $ =~ /(\d+)\.(\d+)/; @ISA = qw(GT::Base); $ERROR_MESSAGE = 'GT::SQL'; $ERRORS = { UNKNOWNDRIVER => 'Unknown driver requested: %s', NOTABLE => 'Cannot find reference to table object' }; sub load_search { #-------------------------------------------------------------------------------- # checks if there is driver for this current database and if so, loads that # instead (since it would be faster) # my $class = shift; my $opts = ref $_[0] ? $_[0] : {@_}; $opts->{mode} = 'Search'; my $driver = $class->load_driver( $opts ) or return; my $pkg = "GT::SQL::Search::${driver}::Search"; return $pkg->load(@_); } sub load_indexer { #-------------------------------------------------------------------------------- # checks if there is driver for this current database and if so, loads that # instead (since it would be faster) # my $class = shift; my $opts = ref $_[0] ? $_[0] : {@_}; $opts->{mode} = 'Indexer'; my $driver = $class->load_driver( $opts ) or return; my $pkg = "GT::SQL::Search::${driver}::Indexer"; return $pkg->load(@_); } sub driver_ok { #-------------------------------------------------------------------------------- # checks to see if a particular driver is allowed on this system # my $class = shift; my $driver = uc shift or return; my $opts = ref $_[0] ? $_[0] : {@_}; my $mode = $opts->{mode} || 'Indexer'; my $tbl = $opts->{table} or return GT::SQL::Search->error( 'NOTABLE', 'FATAL' ); my $pkg = 'GT::SQL::Search::' . $driver . '::' . $mode; eval { require "GT/SQL/Search/$driver/$mode.pm" }; $@ and return GT::SQL::Search->error('UNKNOWNDRIVER', 'WARN', $driver); return $pkg->can('ok') ? $pkg->ok($tbl) : 1; } sub load_driver { #-------------------------------------------------------------------------------- # Loads a driver into memory. # my $class = shift; my $opts = ref $_[0] ? $_[0] : {@_}; my $tbl = $opts->{table}; my $mode = $opts->{mode} || 'Indexer'; my $driver = uc($opts->{driver} || $tbl->{schema}->{search_driver} || 'NONINDEXED'); require "GT/SQL/Search/$driver/$mode.pm"; return $driver; } sub available_drivers { #-------------------------------------------------------------------------------- # Returns a list of available drivers. # my $class = shift; (my $path = $INC{'GT/SQL/Search.pm'}) =~ s/\.pm$//; opendir DHANDLE, $path or return $class->fatal(CANTOPENDIR => $path, "$!"); my @arr; for my $driver_name (readdir DHANDLE) { next if $driver_name =~ y/a-z//; next if $driver_name eq 'LUCENE'; -f "$path/$driver_name/Search.pm" and -r _ or next; -f "$path/$driver_name/Indexer.pm" and -r _ or next; my $loaded = eval { require "GT/SQL/Search/$driver_name/Search.pm"; require "GT/SQL/Search/$driver_name/Indexer.pm"; }; push @arr, $driver_name if $loaded; } closedir DHANDLE; return wantarray ? @arr : \@arr; } 1; __END__ =head1 NAME GT::SQL::Search - internal driver for searching =head1 SYNOPSIS This implements the query string based searching scheme for GT::SQL. Driver based, it is designed to take advantage of the different indexing schemes available on different database engines. =head1 DESCRIPTION Instead of describing how Search.pm is interfaced* this will describe how a driver should be structured and how a new driver can be implemented. * as it is never accessed directly by the programmer as it was designed to be called through the functions GT::SQL::Table::query and GT::SQL::Table::query_sth =head2 Drivers A driver has two parts. The Indexer and the Search packages are the most important. Howserver, for any driver in the search, there must exist a directory with the name of the driver in ALL CAPS. For exampel, MYSQL for MySQL, POSTGRES for Postgres. Within each driver directory, The Indexer and Search portions of the driver contains all the information required for initializing the database table and searching the database. The Indexing package of the driver handles all the data that is manipulated in the database and also the initializes and the database for indexing. The Search package handles the queries and retrieves results for the eventual consumption by the calling program. Drivers are simply subclasses of the base driver module, GT::SQL::Search::Base and operate by overriding certain key functions. The next few sections will cover how to create a search driver, and assumes a fair bit of familiarity with GT::SQL. =head2 Structure of an Indexing Driver The following is an absolutely simple skeleton driver that does nothing and but called "CUSTOM". Found in the CUSTOM directory, this is the search package, and would be call Search.pm in the GT/SQL/Search/CUSTOM library directory. package GT::SQL::Search::CUSTOM::Search; #------------------------------------------ use strict; use vars qw/ @ISA /; use GT::SQL::Search::Base::Search; @ISA = qw( GT::SQL::Search::Base::Search ); sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Search->new(@_) }; # overrides would go here 1; For the indexer, another file, Indexer.pm would be found in the GT/SQL/Search/CUSTOM directory. package GT::SQL::Search::CUSTOM::Indexer; #------------------------------------------ use strict; use vars qw/ @ISA /; use GT::SQL::Search::Base; @ISA = qw/ GT::SQL::Search::Base::Indexer /; sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Indexer->new(@_) }; # overrides would go here 1; The almost empty subs that immediately return with a value are functions that can be overridden to do special tasks. More will be detailed later. The Driver has been split into two packages. The original package name, GT::SQL::Search::Nothing, houses the Search package. GT::SQL::Search::Nothing::Indexer is the Indexing portion of the seach system. "::Indexer" must be appended to the orginial search name for the indexer. Each of the override functions are triggered at points just before and after a major event occurs in GT::SQL. Depending on the type of actions you require, you pick and chose which events you'd like your driver to attach to. =head2 Structure of Indexing Driver The Indexer is responsible for creating all the indexes, maintaining them and when the table is dropped, removing all the associated indexes. The following header must be defined for the Indexer. GT::SQL::Search::Base::Indexer is the superclass that our driver inherits from. package GT::SQL::Search::CUSTOM::Indexer; #------------------------------------------ use strict; use vars qw/ @ISA /; use GT::Base; use GT::SQL::Search::Base::Indexer; @ISA = qw/ GT::SQL::Search::Base::Indexer /; In addition to the header, the following function must be defined. GT::SQL::Search::Driver::Indexer::load creates the new object and allows for special preinitialization that must occur. You can also create another driver silently (such as defaulting to INTERNAL after a version check fails). sub load { my $package_name = shift; return GT::SQL::Search::CUSTOM::Indexer->new(@_) }; Finally, there are the overrides. None of the override functions need be defined in your driver. Any calls made to undefined methods will silently fallback to the superclass driver's methods. When a method has been overridden, the function must return a true value when it is successful, otherwise the action will fail and an error generated. Whenever a object is created it will receive one property $self->{table} which is the table that is being worked upon. This property is available in all the method calls and is required for methods such as _create_table and _drop_search_driver methods. When a table is first created or when a table is destroyed the following two functions are called. They are not passed any special values, however, these are all class methods and $self->{table} will be a reference to the current table in use. This set of overrides are used by GT::SQL::Creator when the ::create method is called. They are called just prior and then after the create table sql query has been executed. =over 2 =item pre_create_table =item post_create_table These functions receive no special parameters. They will receive the data to the table in the $self->{table} property. =back This next set of functions take place in GT::SQL::Editor. =over 2 =item drop_search_driver This method receives no special parameters but is responsible for removing all indexes and "things" associated with the indexing schema. =item add_search_driver Receives no extra parameters. Creates all indexes and does all actions required to initialize indexing scheme. =item pre_add_column =item post_add_column The previous two functions are called just before and after a new column is added. pre_add_column accepts $name (of column), $col (hashref of column attributes). The method will only be called if the column has a weight associated with it. The function must return a non-zero value if successful. Note that the returned value will be passed into the post_add_column so temporary values can be passed through if required. post_add_column accepts $name (of column), $col (hashref of column attributes), $results (of pre_add_column). This method is called just after the column has been inserted into the database. =item pre_delete_column =item post_delete_column These previous functions are called just before and after the sql for a old column is deleted. They must remove all objects and "things" associated with a particular column's index. pre_delete_column accepts $name (of column), $col (hashref of column attributes). The method will only be called if the column has a weight associated with it. The function must return a non-zero value if successful. Note that the returned value will be passed into the post_delete_column so temporary values can be passed through if required. post_delete_column accepts $name (of column), $col (hashref of column attributes), $results (of pre_add_column). This method is called just after the column has been dropped from the database. =item pre_drop_table =item post_drop_table The two previous methods are used before and after the table is dropped. The methods must remove any tables or "things" related to indexing from the table. pre_drop_table receives no arguments. It can find a copy of the current table and columns associated in $self->{table}. post_drop_table receives one argument, which is the result of the pre_drop_table. =back The following set of functions take place in GT::SQL::Table =over 2 =item pre_add_record =item post_add_record Called just before and after an insert occurs. These functions take the record and indexes them as required. pre_add_record will receive one argument, $rec, hashref, which is the record that will be inserted into the database. Table information can be found by accessing $self->{table} Much like the other functions, on success the result will be cached and fed into the post_add_record function. post_add_record receives $rec, a hashref to describing the new result, the $sth of the insert query, and the result of the pre_add_record method. The result from $sth->insert_id if there is a ai field will be the new unique primary key. =item pre_update_record =item post_update_record Intercepts the update request before and just after the sql query is executed. This override has the potential of being rather messy. More than one record can be modified in this action and the indexer must work a lot to ensure the database is up to snuff. pre_update_record receives two parameters, $set_cond, $where_cond. $set_cond is a hashref containing the new values that must be set, and $where_cond is a GT::SQL::Condition object selecting records to update. The result once again, is cached and if undef is considered an error. post_update_record takes the same parameters as pre_update_record, except one extra paremeter, the result of pre_update_record. =item pre_delete_record =item post_delete_record Called just before and after the deletion request for records are called. pre_delete_record, has only one parameter, $where, a GT::SQL::Condition object telling which records to delete. The results of this method are passed to post_delete_record. post_delete_record, has one addition parameter to pre_delete_record and like most post_ methods, is the result of the pre_delete_record method. =item pre_delete_all_records =item post_delete_all_records These two functions are quite simple, but they are different from drop search driver in that though the records are all dropped, the framework for all the indexing is not dropped as well. Neither function is passed any special data, except for post_delete_all_records which receives the rsults of the pre_delete_all_records method. =item reindex_all This function is sometimes called by the user to refresh the index. The motivation for this, in the case of the INTERNAL driver, is sometimes due to outside manipulation of the database tables, the index can become non-representative of the data in the tables. This method is to force the indexing system to fix errors that have passed. =item ok This function is called by GT::SQL::Search as a package method, GT::SQL::Search::EXAMPLE::Indexer->ok( $table ); and is passed a table object reference. What this function must do is to return a true or false value that tells the search system if this driver can be used. The MYSQL driver has a good example for this, it tests to ensure that the mysql database system version is at least 3.23.23. =back =head2 Structure of a Search Driver The Searcher is responsible for only one thing, to return results from a query search. You can override the parser, however, subclassing the following methods will have full parsing for all things such as +/-, string parsing and substring matching. The structures passed into the methods get a little complicated so beware! ALL the following functions receive two parameters, the first is a search parameters detailing the words/phrases to search for, the second parameter is the current result set of IDs => scores. There are two types of search parameters, one for words and the other for phrases. The structure is a little messy so I'll detail them here. For words, the structure is like the following: $word_search = { 'word' => { substring => '1', # set to 1 if this is substring match phrase => 0, # not a phrase keyword => 1, # is a keyword mode => '', # can also be must, cannot to mean +/- }, 'word2' => ... } For phrases the structure will become: $phrase_search => { 'phrase' => { substring => undef # never required phrase => [ 'word1', 'word2', 'word3', ... ], # for searching by indiv word if required keyword => 0, # not a keyword mode => '' # can also be must, cannot }, 'phrase2' => ... } Based on these structures, hopefully it will be easy enough to build whatever is required to grab the appropriate records. Finally, the second item passed in will be a hash filled with ID => score values of search results. They look something like this: $results = { 1 => 56, 2 => 31, 4 => 6 } It is important for all the methods to take the results and return the results, as the result set will be daisychained down like a set to be operated on by various searching schemes. At the end of the query, the results in this set will be sorted and returned to the user as an sth. Operations on this set are preformed by the following five methods. =over 2 =item _query This method is called just after all the query string has been parsed and put into their proper buckets. This method is overridden by the INTERNAL driver to decide it wants to switch to the NONINDEX driver for better performance. Two parameters are passed in, ( $input, $buckets ). $input is a hash that contains all the form/cgi parameters passed to the $tbl->query function and $buckets is s the structure that is created after the query string is parsed. You may also call $self->SUPER::_query( $input, $buckets ) to pass the request along normally. You must return undef or an STH from this function. =item _union_query This method takes a $word_search and does a simple match query. If it finds records with any of the words included, it will append the results to the list. Passed in is the $results and it must return the altered results set. This method must also implement substring searching. =item _phrase_query Just like the union_query, however it searches based on phrases. =item _phrase_intersect_query This takes a $phrase_search and a $result as parameters. This method must look to find results that are found within the current result set that have the passed phrases as well. However, if there are no results found, this method can look for more results. =item _intersect_query Takes two parameters, a $word_search, and $results. Just like the _phrase_intersect query, if there are results already, tries to whittle away the result set. If there are no results, tries to look for results that have all the keywords in a record. This method must also implement substring searching. =item _disjoin_query Takes two parameters, a $word_search, and $results. This will look through the result set and remove all matches to any of the keywords. This method must also implement substring searching. =item _phrase_disjoin_query Two parameters, $phrase_search and $results are passed to this method. This does the exact same thing as _disjoin_query but it looks for phrases. =item query If you choose to override this method, you will have full control of the query. This method accepts a $CGI or a $HASH object and performs the following Options: - paging mh : max hits nh : number hit (or page of hits) sb : column to sort by (default is by score) - searching ww : whole word ma : 1 => OR match, 0 => AND match, undefined => QUERY substring : search for substrings of words bool : 'and' => and search, 'or' => or search, '' => regular query query : the string of things to ask for - filtering field_name : value # Find all rows with field_name = value field_name : ">value" # Find all rows with field_name > value. field_name : " value. field_name-lt : value # Find all rows with field_name < value. The function must return a STH object. However, you may find useful the GT::SQL::Search::STH object, which will automatically handle mh, nh, and alternative sorting requests. All you will have to do is sub query { ... your code ... return $self->sth( $results ); } Where results is a hashref containing primarykeyvalue => scorevalues. =item alternate_driver_query There is no reason to override this method, however, if you would like to use another driver's search instead of the current, this method will let you do so. Accepting 2 parameters, ( $drivername, $input ), where $drivername is the name of the driver you'd like to use and $input is the parameters passed to the method. Returned is an $sth value (undef if an error has occurred). This method was used in the INTERNAL driver to shunt to NONINDEXED if it found the search would take too long. =back =head1 COPYRIGHT Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. http://www.gossamer-threads.com/ =head1 VERSION Revision: $Id: Search.pm,v 1.62 2008/09/23 23:55:26 brewt Exp $ =cut