First pass at adding key files
This commit is contained in:
@ -0,0 +1,206 @@
|
||||
NAME
|
||||
Lucene -- API to the C++ port of the Lucene search engine
|
||||
|
||||
SYNOPSIS
|
||||
Initialize/Empty Lucene index
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 1);
|
||||
|
||||
my $tmp_writer = new Lucene::Index::IndexWriter($store, $analyzer, 1);
|
||||
$tmp_writer->close;
|
||||
undef $tmp_writer;
|
||||
|
||||
Choose your Analyzer (string tokenizer)
|
||||
# lowercases text and splits it at non-letter characters
|
||||
my $analyzer = new Lucene::Analysis::SimpleAnalyzer();
|
||||
# same as before and removes stop words
|
||||
my $analyzer = new Lucene::Analysis::StopAnalyzer();
|
||||
# splits text at whitespace characters
|
||||
my $analyzer = new Lucene::Analysis::WhitespaceAnalyzer();
|
||||
# lowercases text, tokenizes it based on a grammar that
|
||||
# leaves named authorities intact (e-mails, company names,
|
||||
# web hostnames, IP addresses, etc) and removes stop words
|
||||
my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
|
||||
|
||||
Choose your Store (storage engine)
|
||||
# in-memory storage
|
||||
my $store = new Lucene::Store::RAMDirectory();
|
||||
# disk-based storage
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||||
|
||||
Open and configure an IndexWriter
|
||||
my $writer = new Lucene::Index::IndexWriter($store, $analyzer, 0);
|
||||
# optional settings for power users
|
||||
$writer->setMergeFactor(100);
|
||||
$writer->setUseCompoundFile(0);
|
||||
$writer->setMaxFieldLength(255);
|
||||
$writer->setMinMergeDocs(10);
|
||||
$writer->setMaxMergeDocs(100);
|
||||
|
||||
Create Documents and add Fields
|
||||
my $doc = new Lucene::Document;
|
||||
# field gets analyzed, indexed and stored
|
||||
$doc->add(Lucene::Document::Field->Text("content", $content));
|
||||
# field gets indexed and stored
|
||||
$doc->add(Lucene::Document::Field->Keyword("isbn", $isbn));
|
||||
# field gets just stored
|
||||
$doc->add(Lucene::Document::Field->UnIndexed("sales_rank", $sales_rank));
|
||||
# field gets analyzed and indexed
|
||||
$doc->add(Lucene::Document::Field->UnStored("categories", $categories));
|
||||
|
||||
Add Documents to an IndexWriter
|
||||
$writer->addDocument($doc);
|
||||
|
||||
Optimize your index and close the IndexWriter
|
||||
$writer->optimize();
|
||||
$writer->close();
|
||||
undef $writer;
|
||||
|
||||
Delete Documents
|
||||
my $reader = Lucene::Index::IndexReader->open($store);
|
||||
my $term = new Lucene::Index::Term("isbn", $isbn);
|
||||
$reader->deleteDocuments($term);
|
||||
$reader->close();
|
||||
undef $reader;
|
||||
|
||||
Query index
|
||||
# initialize searcher and parser
|
||||
my $analyzer = new Lucene::Analysis::SimpleAnalyzer();
|
||||
my $store = Lucene::Store::FSDirectory->getDirectory("/home/lucene", 0);
|
||||
my $searcher = new Lucene::Search::IndexSearcher($store);
|
||||
my $parser = new Lucene::QueryParser("default_field", $analyzer);
|
||||
|
||||
# build a query on the default field
|
||||
my $query = $parser->parse("perl");
|
||||
|
||||
# build a query on another field
|
||||
my $query = $parser->parse("title:cookbook");
|
||||
|
||||
# define a sort on one field
|
||||
my $sortfield = new Lucene::Search::SortField("unixtime");
|
||||
my $reversed_sortfield = new Lucene::Search::SortField("unixtime", 1);
|
||||
my $sort = new Lucene::Search::Sort($sortfield);
|
||||
|
||||
# define a sort on two fields
|
||||
my $sort = new Lucene::Search::Sort($sortfield1, $sortfield2);
|
||||
|
||||
# use Lucene's INDEXORDER or RELEVANCE sort
|
||||
my $sort = Lucene::Search::Sort->INDEXORDER;
|
||||
my $sort = Lucene::Search::Sort->RELEVANCE;
|
||||
|
||||
# query index and get results
|
||||
my $hits = $searcher->search($query);
|
||||
my $sorted_hits = $searcher->search($query, $sort);
|
||||
|
||||
# get number of results
|
||||
my $num_hits = $hits->length();
|
||||
|
||||
# get fields and ranking score for each hit
|
||||
for (my $i = 0; $i < $num_hits; $i++) {
|
||||
my $doc = $hits->doc($i);
|
||||
my $score = $hits->score($i);
|
||||
my $title = $doc->get("title");
|
||||
my $isbn = $doc->get("isbn");
|
||||
}
|
||||
|
||||
# free memory and close searcher
|
||||
undef $hits;
|
||||
undef $query;
|
||||
undef $parser;
|
||||
undef $analyzer;
|
||||
$searcher->close();
|
||||
undef $fsdir;
|
||||
undef $searcher;
|
||||
}
|
||||
|
||||
Close your Store
|
||||
$store->close;
|
||||
undef $store;
|
||||
|
||||
DESCRIPTION
|
||||
Like it or not Apache Lucene has become the de-facto standard for
|
||||
open-source high-performance search. It has a large user-base, is well
|
||||
documented and has plenty of committers. Unfortunately Apache Lucene is
|
||||
entirely written in Java and therefore of relatively little use for perl
|
||||
programmers. Fortunately, in recent years a group of C++ programmers
|
||||
led by Ben van Klinken decided to port Java Lucene to C++.
|
||||
|
||||
The purpose of the module is to export the C++ Lucene API to perl and at
|
||||
the same time be as close as possible to the original Java API. This has
|
||||
the combined advantage of providing perl programmers with a
|
||||
well-documented API and giving them access to a C++ search engine
|
||||
library that is supposedly faster than the original.
|
||||
|
||||
CHARACTER SUPPORT
|
||||
Currently only ISO 8859-1 (Latin-1) characters are supported. Obviously
|
||||
this includes all ASCII characters.
|
||||
|
||||
INDEX COMPATIBILITY
|
||||
For the moment indices produced by this module are not compatible with
|
||||
those from Apache Lucene. The reason for this is that this module uses
|
||||
1-byte character encoding as opposed to 2-byte (widechar) encoding with
|
||||
Apache Lucene.
|
||||
|
||||
INSTALLATION
|
||||
This module requires the clucene library to be installed. The best way
|
||||
to get it is to go to the following page
|
||||
|
||||
http://sourceforge.net/projects/clucene/
|
||||
|
||||
and download the latest STABLE clucene-core version. Currently it is
|
||||
clucene-core-0.9.15. Make sure you compile it in ASCII mode and install
|
||||
it in your standard library path.
|
||||
|
||||
On a Linux platform this goes as follows:
|
||||
|
||||
wget http://kent.dl.sourceforge.net/sourceforge/clucene/clucene-core-0.9.15.tar.gz
|
||||
tar xzf clucene-core-0.9.15.tar.gz
cd clucene-core-0.9.15
|
||||
./autogen.sh
|
||||
./configure --disable-debug --prefix=/usr --exec-prefix=/usr --enable-ascii
|
||||
make
|
||||
make check
|
||||
(as root) make install
|
||||
|
||||
To install the perl module itself, run the following commands:
|
||||
|
||||
perl Makefile.PL
|
||||
make
|
||||
make test
|
||||
(as root) make install
|
||||
|
||||
AUTHOR
|
||||
Thomas Busch <tbusch at cpan dot org>
|
||||
|
||||
COPYRIGHT AND LICENSE
|
||||
Copyright (c) 2006 Thomas Busch
|
||||
|
||||
This library is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
SEE ALSO
|
||||
Plucene - a pure-Perl implementation of Lucene
|
||||
|
||||
KinoSearch - a search engine library inspired by Lucene
|
||||
|
||||
DISCLAIMER OF WARRANTY
|
||||
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
|
||||
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
|
||||
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
|
||||
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
|
||||
NECESSARY SERVICING, REPAIR, OR CORRECTION.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
|
||||
TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
Reference in New Issue
Block a user