207 lines
7.8 KiB
Plaintext
207 lines
7.8 KiB
Plaintext
|
NAME
|
||
|
Lucene -- API to the C++ port of the Lucene search engine
|
||
|
|
||
|
SYNOPSIS
|
||
|
Initialize/Empty Lucene index
|
||
|
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
|
||
|
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 1);
|
||
|
|
||
|
my $tmp_writer = new Lucene::Index::IndexWriter($store, $analyzer, 1);
|
||
|
$tmp_writer->close;
|
||
|
undef $tmp_writer;
|
||
|
|
||
|
Choose your Analyzer (string tokenizer)
|
||
|
# lowercases text and splits it at non-letter characters
|
||
|
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
|
||
|
# same as before and removes stop words
|
||
|
my $analyzer = Lucene::Analysis::StopAnalyzer();
|
||
|
# splits text at whitespace characters
|
||
|
my $analyzer = Lucene::Analysis::WhitespaceAnalyzer();
|
||
|
# lowercases text, tokenized it based on a grammer that
|
||
|
# leaves named authorities intact (e-mails, company names,
|
||
|
# web hostnames, IP addresses, etc) and removed stop words
|
||
|
my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer();
|
||
|
|
||
|
Choose your Store (storage engine)
|
||
|
# in-memory storage
|
||
|
my $store = new Lucene::Store::RAMDirectory();
|
||
|
# disk-based storage
|
||
|
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||
|
|
||
|
Open and configure an IndexWriter
|
||
|
my $writer = new Lucene::Index::IndexWriter($store, $analyzer, 0);
|
||
|
# optional settings for power users
|
||
|
$writer->setMergeFactor(100);
|
||
|
$writer->setUseCompoundFile(0);
|
||
|
$writer->setMaxFieldLength(255);
|
||
|
$writer->setMinMergeDocs(10);
|
||
|
$writer->setMaxMergeDocs(100);
|
||
|
|
||
|
Create Documents and add Fields
|
||
|
my $doc = new Lucene::Document;
|
||
|
# field gets analyzed, indexed and stored
|
||
|
$doc->add(Lucene::Document::Field->Text("content", $content));
|
||
|
# field gets indexed and stored
|
||
|
$doc->add(Lucene::Document::Field->Keyword("isbn", $isbn));
|
||
|
# field gets just stored
|
||
|
$doc->add(Lucene::Document::Field->UnIndexed("sales_rank", $sales_rank));
|
||
|
# field gets analyzed and indexed
|
||
|
$doc->add(Lucene::Document::Field->UnStored("categories", $categories));
|
||
|
|
||
|
Add Documents to an IndexWriter
|
||
|
$writer->addDocument($doc);
|
||
|
|
||
|
Optimize your index and close the IndexWriter
|
||
|
$writer->optimize();
|
||
|
$writer->close();
|
||
|
undef $writer;
|
||
|
|
||
|
Delete Documents
|
||
|
my $reader = Lucene::Index::IndexReader->open($store);
|
||
|
my $term = new Lucene::Index::Term("isbn", $isbn);
|
||
|
$reader->deleteDocuments($term);
|
||
|
$reader->close();
|
||
|
undef $reader;
|
||
|
|
||
|
Query index
|
||
|
# initalize searcher and parser
|
||
|
my $analyzer = Lucene::Analysis::SimpleAnalyzer();
|
||
|
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||
|
my $searcher = new Lucene::Search::IndexSearcher($store);
|
||
|
my $parser = new Lucene::QueryParser("default_field", $analyzer);
|
||
|
|
||
|
# build a query on the default field
|
||
|
my $query = $parser->parse("perl");
|
||
|
|
||
|
# build a query on another field
|
||
|
my $query = $parser->parse("title:cookbook");
|
||
|
|
||
|
# define a sort on one field
|
||
|
my $sortfield = new Lucene::Search::SortField("unixtime");
|
||
|
my $reversed_sortfield = new Lucene::Search::SortField("unixtime", 1);
|
||
|
my $sort = new Lucene::Search::Sort($sortfield);
|
||
|
|
||
|
# define a sort on two fields
|
||
|
my $sort = new Lucene::Search::Sort($sortfield1, $sortfield2);
|
||
|
|
||
|
# use Lucene's INDEXORDER or RELEVANCE sort
|
||
|
my $sort = Lucene::Search::Sort->INDEXORDER;
|
||
|
my $sort = Lucene::Search::Sort->RELEVANCE;
|
||
|
|
||
|
# query index and get results
|
||
|
my $hits = $searcher->search($query);
|
||
|
my $sorted_hits = $searcher->search($query, $sort);
|
||
|
|
||
|
# get number of results
|
||
|
my $num_hits = $hits->length();
|
||
|
|
||
|
# get fields and ranking score for each hit
|
||
|
for (my $i = 0; $i < $num_hits; $i++) {
|
||
|
my $doc = $hits->doc($i);
|
||
|
my $score = $hits->score($i);
|
||
|
my $title = $doc->get("title");
|
||
|
my $isbn = $doc->get("isbn");
|
||
|
}
|
||
|
|
||
|
# free memory and close searcher
|
||
|
undef $hits;
|
||
|
undef $query;
|
||
|
undef $parser;
|
||
|
undef $analyzer;
|
||
|
$searcher->close();
|
||
|
undef $fsdir;
|
||
|
undef $searcher;
|
||
|
}
|
||
|
|
||
|
Close your Store
|
||
|
$store->close;
|
||
|
undef $store;
|
||
|
|
||
|
DESCRIPTION
|
||
|
Like it or not Apache Lucene has become the de-facto standard for
|
||
|
open-source high-performance search. It has a large user-base, is well
|
||
|
documented and has plenty of committers. Unfortunately Apache Lucene is
|
||
|
entirely written in Java and therefore of relatively little use for perl
|
||
|
programmers. Fortunately in the recent years a group of C++ programmers
|
||
|
led by Ben van Klinken decided to port Java Lucene to C++.
|
||
|
|
||
|
The purpose of the module is to export the C++ Lucene API to perl and at
|
||
|
the same time be as close as possible to the original Java API. This has
|
||
|
the combined advantage of providing perl programmers with a
|
||
|
well-documented API and giving them access to a C++ search engine
|
||
|
library that is supposedly faster than the original.
|
||
|
|
||
|
CHARACTER SUPPORT
|
||
|
Currently only ISO 8859-1 (Latin-1) characters are supported. Obviously
|
||
|
this included all ASCII characters.
|
||
|
|
||
|
INDEX COMPATIBLITY
|
||
|
For the moment indices produced by this module are not compatible with
|
||
|
those from Apache Lucene. The reason for this is that this module uses
|
||
|
1-byte character encoding as opposed to 2-byte (widechar) encoding with
|
||
|
Apache Lucene.
|
||
|
|
||
|
INSTALLATION
|
||
|
This module requires the clucene library to be installed. The best way
|
||
|
to get it is to go to the following page
|
||
|
|
||
|
http://sourceforge.net/projects/clucene/
|
||
|
|
||
|
and download the latest STABLE clucene-core version. Currently it is
|
||
|
clucene-core-0.9.15. Make sure you compile it in ASCII mode and install
|
||
|
it in your standard library path.
|
||
|
|
||
|
On a Linux platform this goes as follows:
|
||
|
|
||
|
wget http://kent.dl.sourceforge.net/sourceforge/clucene/clucene-core-0.9.15.tar.gz
|
||
|
cd clucene-core-0.9.15
|
||
|
./autogen.sh
|
||
|
./configure --disable-debug --prefix=/usr --exec-prefix=/usr --enable-ascii
|
||
|
make
|
||
|
make check
|
||
|
(as root) make install
|
||
|
|
||
|
To install the perl module itself, run the following commands:
|
||
|
|
||
|
perl Makefile.PL
|
||
|
make
|
||
|
make test
|
||
|
(as root) make install
|
||
|
|
||
|
AUTHOR
|
||
|
Thomas Busch <tbusch at cpan dot org>
|
||
|
|
||
|
COPYRIGHT AND LICENSE
|
||
|
Copyright (c) 2006 Thomas Busch
|
||
|
|
||
|
This library is free software; you can redistribute it and/or modify it
|
||
|
under the same terms as Perl itself.
|
||
|
|
||
|
SEE ALSO
|
||
|
Plucene - a pure-Perl implementation of Lucene
|
||
|
|
||
|
KinoSearch - a search engine library inspired by Lucene
|
||
|
|
||
|
DISCLAIMER OF WARRANTY
|
||
|
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||
|
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||
|
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||
|
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
|
||
|
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
|
||
|
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
|
||
|
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
|
||
|
NECESSARY SERVICING, REPAIR, OR CORRECTION.
|
||
|
|
||
|
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||
|
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
|
||
|
TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
|
||
|
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||
|
SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||
|
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||
|
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||
|
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||
|
DAMAGES.
|
||
|
|