555 lines
18 KiB
Perl
555 lines
18 KiB
Perl
# ==================================================================
|
|
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
|
#
|
|
# GT::URI
|
|
# Author : Aki Mimoto
|
|
# CVS Info : 087,071,086,086,085
|
|
# $Id: URI.pm,v 1.24 2002/04/07 03:35:35 jagerman Exp $
|
|
#
|
|
# Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
|
# ==================================================================
|
|
#
|
|
|
|
package GT::URI;
|
|
# ===============================================================
|
|
# handles requests to URI type locations
|
|
|
|
use GT::Base;
|
|
use strict;
|
|
use vars qw/ $ATTRIBS @ISA $DEBUG /;
|
|
|
|
$DEBUG = 0;
|
|
|
|
@ISA = ( 'GT::Base' );
|
|
|
|
$ATTRIBS = {
|
|
'racked_uri' => [
|
|
# 'url1', 'url2'
|
|
],
|
|
'max_down' => 5000000, # Set max download to 5 MB.
|
|
'max_simultaneous' => 10,
|
|
'downloading' => {
|
|
# 'uri' => {
|
|
# 'request' => GT::URI::HTTP,
|
|
# 'buffer' => ''
|
|
# }
|
|
},
|
|
'completed' => {
|
|
# 'uri' => {
|
|
# 'buffer' => ''
|
|
# ...
|
|
# }
|
|
},
|
|
|
|
'protocol_opts' => {
|
|
# 'protocol' => {
|
|
# setting => value,
|
|
# ...
|
|
# }
|
|
}
|
|
};
|
|
|
|
sub requests {
|
|
#--------------------------------------------------------------------------------
|
|
# returns number of active requests
|
|
# input will wait n seconds for input pending on it's queue
|
|
#
|
|
my $self = shift;
|
|
my $tics = shift;
|
|
|
|
my $requests= scalar(keys %{$self->{downloading}}); # + scalar(@{$self->{racked_uri}});
|
|
|
|
if ( $requests and $tics ) {
|
|
my $bits;
|
|
$bits = $self->vec() and select( $bits, undef, undef, ( $tics > 0 ) ? $tics : undef );
|
|
}
|
|
|
|
return $requests;
|
|
|
|
}
|
|
|
|
sub rack_uri {
|
|
#--------------------------------------------------------------------------------
|
|
# loads a uri to download
|
|
my $self = shift;
|
|
|
|
push @{$self->{racked_uri}}, grep $_, @_;
|
|
}
|
|
|
|
sub do_iteration {
|
|
#--------------------------------------------------------------------------------
|
|
# try to download pages
|
|
# and returns a hash of completed requests
|
|
my $self = shift;
|
|
|
|
# if we haven't reached the maximum number of simultaneous connections add more to
|
|
# the requesting pool
|
|
while ( @{$self->{racked_uri}} and ( scalar(keys %{$self->{downloading}}) < $self->{max_simultaneous} ) ) {
|
|
$self->debug("Racking a new Request") if ($self->{_debug});
|
|
$self->_queue_connect( shift @{$self->{racked_uri}} );
|
|
}
|
|
|
|
# go through and check if any of the racked objects can be download
|
|
while ( $self->pending() ) {
|
|
$self->debug("Someone has data that can be read, checking up") if ($self->{_debug});
|
|
$self->_buffered_read();
|
|
}
|
|
|
|
# handle all the aquired stuff
|
|
return $self->_completed_requests();
|
|
|
|
}
|
|
|
|
sub pending {
|
|
#--------------------------------------------------------------------------------
|
|
my $self = shift;
|
|
my $downloading = $self->{downloading};
|
|
my $pending = 0;
|
|
|
|
# go through each downloading uri hand
|
|
foreach my $uri ( keys %{$downloading} ) {
|
|
|
|
# ... now check up on the url
|
|
my $URI= $downloading->{$uri};
|
|
if ( $URI->{request} ) {
|
|
$URI->{request}->pending() and $pending++
|
|
}
|
|
|
|
# ... wierd
|
|
else {
|
|
$self->debug( "$uri is peculiar because it does no have a request object associated" ) if ($self->{_debug});
|
|
}
|
|
|
|
}
|
|
|
|
return $pending;
|
|
}
|
|
|
|
sub vec {
|
|
#--------------------------------------------------------------------------------
|
|
# sets file bits suitable for using in a select call
|
|
#
|
|
my $self = shift;
|
|
my $bits = shift;
|
|
my $downloading = $self->{downloading};
|
|
|
|
# go through each downloading uri hand
|
|
foreach my $URI ( values %{$downloading} ) {
|
|
my $request = $URI->{'request'};
|
|
if ( $request and $request->can('vec') ) {
|
|
$bits = $request->vec($bits);
|
|
}
|
|
}
|
|
|
|
return $bits;
|
|
}
|
|
|
|
sub _completed_requests {
|
|
#--------------------------------------------------------------------------------
|
|
# check if any requests are finished
|
|
my $self = shift;
|
|
my $downloading = $self->{downloading};
|
|
my $completed = $self->{completed};
|
|
|
|
# go through each downloading uri hand
|
|
foreach my $uri ( keys %{$downloading} ) {
|
|
|
|
$self->debug( "Checking $uri for download completion" ) if ($self->{_debug});
|
|
|
|
# ... now check up on the url
|
|
my $URI = $downloading->{$uri};
|
|
if ( my $req = $URI->{request} ) {
|
|
|
|
# .... if the request has finished, get the extra resource stuff that we may want
|
|
# then, move the request into the finished queue
|
|
if ($req->EOF()) {
|
|
$self->debug( "$uri has been completed, finishing up." ) if ($self->{_debug});
|
|
$URI->{resource_attribs} = $req->resource_attrib();
|
|
$completed->{$uri} = $URI;
|
|
delete $downloading->{$uri}->{request};
|
|
delete $downloading->{$uri};
|
|
}
|
|
}
|
|
|
|
# ... wierd
|
|
else {
|
|
$self->debug( "$uri is peculiar because it does not have a request object associated" ) if ($self->{_debug});
|
|
}
|
|
}
|
|
|
|
return $completed;
|
|
|
|
}
|
|
|
|
sub clear_completed {
|
|
#--------------------------------------------------------------------------------
|
|
# clear the completed requests
|
|
#
|
|
my $self = shift;
|
|
my $tmp = $self->{completed};
|
|
$self->{completed} = {};
|
|
return $tmp;
|
|
}
|
|
|
|
sub completed {
|
|
#--------------------------------------------------------------------------------
|
|
# returns a hash of all the completed downloads
|
|
#
|
|
my $self = shift;
|
|
|
|
if ( $self->{completed} ) {
|
|
return keys %{$self->{completed}} ? $self->{completed} : undef;
|
|
}
|
|
|
|
else { return };
|
|
}
|
|
|
|
sub completed_requests {
|
|
#--------------------------------------------------------------------------------
|
|
my $self = shift;
|
|
return scalar( keys %{$self->{completed} || {}} )
|
|
}
|
|
|
|
sub _queue_connect {
|
|
#--------------------------------------------------------------------------------
|
|
# connects to a URI
|
|
my $self = shift;
|
|
my $uri = shift;
|
|
|
|
$self->debug("Preparing a new Connection from $uri") if ($self->{_debug});
|
|
|
|
# trim free of spaces
|
|
$uri =~ s/^\s*|\s*$//g;
|
|
|
|
# extract protocol name from the URI
|
|
my $protocol = uc(substr($uri, 0, index($uri, ':')));
|
|
|
|
$self->debug("Protocol to be used with $uri: $protocol") if ($self->{_debug});
|
|
|
|
# load the corresponding protocol object
|
|
my $file = 'GT/URI/' . $protocol . '.pm';
|
|
my $pkg = 'GT::URI::' . $protocol;
|
|
{
|
|
local $SIG{__DIE__};
|
|
my $ret = eval { require "$file"; 1; };
|
|
if ( ! $ret ) {
|
|
$self->{completed}->{$uri}->{error} = "Could not load $protocol protocol handler : " . $@;
|
|
return;
|
|
}
|
|
}
|
|
|
|
# so, instantiate, init and issue the request
|
|
my $request = $pkg->new ( { max_down => $self->{max_down} } );
|
|
unless ($request) {
|
|
$self->{completed}->{$uri}->{error} = "Could not load $protocol protocol handler : " . $@;
|
|
return;
|
|
}
|
|
|
|
# set protocol-only parameters
|
|
my $protocol_requests = ( $self->{protocol_opts} || {} );
|
|
$protocol_requests = ( $protocol_requests->{ $protocol } || {} );
|
|
if ( $protocol_requests ) {
|
|
$request->set( $protocol_requests );
|
|
}
|
|
|
|
# finally, send out the initial request for the resource
|
|
if ( $request->fetch($uri) ) {
|
|
$self->debug( "Queued Request $uri" ) if ($self->{_debug});
|
|
$self->{downloading}->{$uri} = {
|
|
request => $request,
|
|
buffer => ''
|
|
};
|
|
}
|
|
|
|
# uh oh, there was an error
|
|
else {
|
|
$self->debug( "Error in Request $uri" ) if ($self->{_debug});
|
|
my $completed = $self->{completed};
|
|
$completed->{$uri} = {
|
|
resource_attribs => $request->{resource_attribs},
|
|
error => 'Connection problems'
|
|
};
|
|
}
|
|
|
|
# even through
|
|
return $request;
|
|
}
|
|
|
|
sub _buffered_read {
|
|
#--------------------------------------------------------------------------------
|
|
# reads all pending requests into the buffer
|
|
my $self = shift;
|
|
my $downloading = $self->{downloading};
|
|
|
|
# Go through each downloading uri hand
|
|
foreach my $uri ( keys %{$downloading} ) {
|
|
|
|
# ... now check up on the url
|
|
my $URI = $downloading->{$uri};
|
|
if ( $URI->{request} and $URI->{request}->pending() ) {
|
|
$URI->{buffer} .= $URI->{request}->gulp_read();
|
|
}
|
|
|
|
# ... wierd...,
|
|
else {
|
|
$self->debug( "$uri is peculiar because it does not have a request object associated" ) if ( $self->{_debug} );
|
|
}
|
|
}
|
|
}
|
|
|
|
sub get {
|
|
#--------------------------------------------------------------------------------
|
|
# gets a bunch of documents in a blocking fashion
|
|
my ( @uris, $options );
|
|
my $class = shift;
|
|
|
|
# parse out the parameters
|
|
foreach my $param ( @_ ) {
|
|
if ( ref $param eq 'HASH' ) {
|
|
$options = $param;
|
|
}
|
|
|
|
elsif ( not ref $param ) {
|
|
push @uris, $param;
|
|
}
|
|
}
|
|
|
|
# create and rack the URIs that we want to download
|
|
my $URI = new GT::URI($options);
|
|
|
|
foreach my $uri ( @uris ) { $URI->rack_uri( $uri ); };
|
|
|
|
# download the URIs
|
|
my ( $completed, $tmp );
|
|
while (
|
|
( %{$tmp = $URI->do_iteration()} ) or
|
|
( $URI->requests( -1 ) )
|
|
) {
|
|
|
|
if ( $tmp ) {
|
|
$URI->clear_completed();
|
|
foreach my $uri ( keys %{$tmp} ) {
|
|
$completed->{$uri} = $tmp->{$uri};
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
return $completed;
|
|
}
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
GT::URI - Internet resource request broker
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use GT::URI;
|
|
my $doc = GT::URI->get( 'http://www.gossamer-threads.com' );
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
GT::URI Makes requests and retrieves resources from internet servers.
|
|
|
|
=head1 BASICS
|
|
|
|
=head2 Getting a resource, the simple way
|
|
|
|
Just want just a few items? Call GT::URI::HTTP->get and all the magic will be done for you.
|
|
|
|
use GT::URI;
|
|
my $docs = GT::URI->get( "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
|
|
|
|
If options need to be set, include a hashref that has the appropriate setting you'd like set
|
|
|
|
use GT::URI;
|
|
my $conf = { max_down => 2000 };
|
|
my $docs = GT::URI::HTTP->get( $conf, "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
|
|
|
|
=head2 When you've got better things to do than wait
|
|
|
|
The simple method blocks when acquiring the data, meaning until all the data is downloaded, your script is frozen. GT::URI has the capability to do handle things in a non-blocking fashion, so while you wait for the documents to download, you can do something else.
|
|
|
|
A very simple example follows.
|
|
|
|
use GT::URI;
|
|
use GT::Dumper;
|
|
|
|
$uri = new GT::URI();
|
|
|
|
# queue up the URIs wanted
|
|
$uri->rack_uri( "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
|
|
|
|
# loop through until there are no more requests left to finish
|
|
while ( $uri->requests() ) {
|
|
$docs = $uri->do_iteration();
|
|
|
|
# do something here
|
|
print '.';
|
|
}
|
|
|
|
# output all the data
|
|
print Dumper($docs);
|
|
|
|
But this can quickly get much more complex. Since the downloads are asynchronous, the code can be changed to handle each request as it comes in.
|
|
|
|
use GT::URI;
|
|
use GT::Dumper;
|
|
|
|
$uri = new GT::URI();
|
|
|
|
# queue up the URIs wanted
|
|
$uri->rack_uri( "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
|
|
|
|
# loop through until there are no more requests left to finish
|
|
while ( $uri->requests() ) {
|
|
|
|
$uri->do_iteration();
|
|
|
|
# if there are any completed requests, handle them
|
|
if ( my $number_completed = $uri->completed_requests() ) {
|
|
|
|
print "Completed $number_completed request(s):\n";
|
|
my $completed = $uri->completed();
|
|
print Dumper( $completed );
|
|
|
|
# IMPORTANT: the object caches downloaded requests, once the
|
|
# data wanted has been pulled out of the object, clear the object's
|
|
# cache. Otherwise, the resource will appear again in the next
|
|
# $uri->completed() call
|
|
$uri->clear_completed();
|
|
}
|
|
|
|
# do something here
|
|
print '.';
|
|
}
|
|
|
|
# output all the data
|
|
print Dumper($docs);
|
|
|
|
It is possible to queue more links with the $uri->rack_uri() within the loop safely though a separate accounting system must be designed to prevent infinite loops.
|
|
|
|
=head2 Options to configure GT::URI
|
|
|
|
GT::URI has only a few options to control it's behaviour: There's not much it does that can be configured!
|
|
|
|
$opts = {
|
|
|
|
# maximum number of bytes to download for a single resource
|
|
'max_down' => 0,
|
|
|
|
# maximum number of simultaneous downloads
|
|
'max_simultaneous' => 10,
|
|
|
|
# configuration settings for individual protocols, look in
|
|
# any GT::URI::xxxx protocol module to find out related
|
|
# configuration options
|
|
'protocol_opts' => {
|
|
'protocol_name' => {
|
|
setting => value,
|
|
...
|
|
},
|
|
# eg
|
|
'HTTP' => {
|
|
'agent_name' => 'example agent name option value'
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
=head2 The main data structure GT::URI creates
|
|
|
|
The data structure that GT::URI produces to house all the resource infomation is mildly complex.
|
|
|
|
$docs = {
|
|
'uri requested' => {
|
|
'buffer' => 'resource data',
|
|
'resource_attribs' => {
|
|
'resource_key' => 'value'
|
|
},
|
|
'extra info' => ....
|
|
}
|
|
}
|
|
|
|
The 'buffer' will contain the raw http data, 'resource_attribs' will contain extra information related to the resource.
|
|
|
|
Depending on the service requested, there could be more information added. Currently no protocol requires the need for an extra key.
|
|
|
|
=head1 METHOD LIST
|
|
|
|
Socket Handling
|
|
|
|
sub do_iteration() Basic looping function that downloads resources in the background
|
|
sub pending() Returns true if data awaiting
|
|
|
|
Acquisition
|
|
|
|
sub completed() Returns a hash of all the completed requests
|
|
sub completed_requests() The number of requests completed
|
|
sub clear_completed() Cleans the completed request cache
|
|
sub get() Simple resource aquisition function
|
|
sub rack_uri() Add a URI to be downloaded
|
|
sub requests() Returns number of active requests
|
|
sub vec() Sets file bits suitable for a select call
|
|
|
|
=head2 completed () : completed_requests HASHREF
|
|
|
|
Returns a datastructure with the cached completed documents.
|
|
|
|
=head2 completed_requests () : num_requests INTEGER
|
|
|
|
Returns the current number of completed requests in the cache.
|
|
|
|
=head2 clear_completed ()
|
|
|
|
Clears the completed document cache.
|
|
|
|
=head2 do_iteration () : completed_requests HASHREF
|
|
|
|
The major bulk of the non-blocking work is handled within this function.
|
|
|
|
=head2 GT::URI->get ( [ conf HASHREF, ] url STRING, url STRING, url STRING..., ) : completed_requests HASHREF
|
|
|
|
The simplest way of acquiring a number of pages. Call and it will return a the GT::URI data structure.
|
|
|
|
The configuration hashref, can be found anywhere in the list. The function will iterate through the get parameters and assume any hashref is an option parameter and any scalar an URI.
|
|
|
|
=head2 pending () : status BOOLEAN
|
|
|
|
Returns '1' or '0' if there is data pending to be downloaded for any of the requests
|
|
|
|
=head2 rack_uri ( url1 STRING, [ url2 STRING ... ] )
|
|
|
|
Takes a list of URLs and queues them for download.
|
|
|
|
=head2 requests ( tics INTEGER ) :
|
|
|
|
Will return the number of requests pending action in the downloading queue. Usually this would be followed up with a call to $uri->do_iteration();
|
|
|
|
=head2 vec ( [ bits STRING ] ) : bits STRING
|
|
|
|
Returns a bit mask that can be used in a call to select. If you want to use an already existing bit mask, pass it into the function and the appropriate bits from requets will be additionally set.
|
|
|
|
=head1 BUILDING PROTOCOL HANDLERS
|
|
|
|
forthcoming
|
|
|
|
=head1 COPYRIGHT
|
|
|
|
Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
|
http://www.gossamer-threads.com/
|
|
|
|
=head1 VERSION
|
|
|
|
Revision: $Id: URI.pm,v 1.24 2002/04/07 03:35:35 jagerman Exp $
|
|
|
|
=cut
|
|
|
|
|
|
|
|
|