discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/GT/URI.pm
2024-06-17 21:49:12 +10:00

555 lines
18 KiB
Perl

# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::URI
# Author : Aki Mimoto
# CVS Info : 087,071,086,086,085
# $Id: URI.pm,v 1.24 2002/04/07 03:35:35 jagerman Exp $
#
# Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
package GT::URI;
# ===============================================================
# handles requests to URI type locations
use GT::Base;
use strict;
use vars qw/ $ATTRIBS @ISA $DEBUG /;
$DEBUG = 0;
@ISA = ( 'GT::Base' );
$ATTRIBS = {
'racked_uri' => [
# 'url1', 'url2'
],
'max_down' => 5000000, # Set max download to 5 MB.
'max_simultaneous' => 10,
'downloading' => {
# 'uri' => {
# 'request' => GT::URI::HTTP,
# 'buffer' => ''
# }
},
'completed' => {
# 'uri' => {
# 'buffer' => ''
# ...
# }
},
'protocol_opts' => {
# 'protocol' => {
# setting => value,
# ...
# }
}
};
sub requests {
#--------------------------------------------------------------------------------
# returns number of active requests
# input will wait n seconds for input pending on it's queue
#
my $self = shift;
my $tics = shift;
my $requests= scalar(keys %{$self->{downloading}}); # + scalar(@{$self->{racked_uri}});
if ( $requests and $tics ) {
my $bits;
$bits = $self->vec() and select( $bits, undef, undef, ( $tics > 0 ) ? $tics : undef );
}
return $requests;
}
sub rack_uri {
#--------------------------------------------------------------------------------
# loads a uri to download
my $self = shift;
push @{$self->{racked_uri}}, grep $_, @_;
}
sub do_iteration {
#--------------------------------------------------------------------------------
# try to download pages
# and returns a hash of completed requests
my $self = shift;
# if we haven't reached the maximum number of simultaneous connections add more to
# the requesting pool
while ( @{$self->{racked_uri}} and ( scalar(keys %{$self->{downloading}}) < $self->{max_simultaneous} ) ) {
$self->debug("Racking a new Request") if ($self->{_debug});
$self->_queue_connect( shift @{$self->{racked_uri}} );
}
# go through and check if any of the racked objects can be download
while ( $self->pending() ) {
$self->debug("Someone has data that can be read, checking up") if ($self->{_debug});
$self->_buffered_read();
}
# handle all the aquired stuff
return $self->_completed_requests();
}
sub pending {
#--------------------------------------------------------------------------------
my $self = shift;
my $downloading = $self->{downloading};
my $pending = 0;
# go through each downloading uri hand
foreach my $uri ( keys %{$downloading} ) {
# ... now check up on the url
my $URI= $downloading->{$uri};
if ( $URI->{request} ) {
$URI->{request}->pending() and $pending++
}
# ... wierd
else {
$self->debug( "$uri is peculiar because it does no have a request object associated" ) if ($self->{_debug});
}
}
return $pending;
}
sub vec {
#--------------------------------------------------------------------------------
# sets file bits suitable for using in a select call
#
my $self = shift;
my $bits = shift;
my $downloading = $self->{downloading};
# go through each downloading uri hand
foreach my $URI ( values %{$downloading} ) {
my $request = $URI->{'request'};
if ( $request and $request->can('vec') ) {
$bits = $request->vec($bits);
}
}
return $bits;
}
sub _completed_requests {
#--------------------------------------------------------------------------------
# check if any requests are finished
my $self = shift;
my $downloading = $self->{downloading};
my $completed = $self->{completed};
# go through each downloading uri hand
foreach my $uri ( keys %{$downloading} ) {
$self->debug( "Checking $uri for download completion" ) if ($self->{_debug});
# ... now check up on the url
my $URI = $downloading->{$uri};
if ( my $req = $URI->{request} ) {
# .... if the request has finished, get the extra resource stuff that we may want
# then, move the request into the finished queue
if ($req->EOF()) {
$self->debug( "$uri has been completed, finishing up." ) if ($self->{_debug});
$URI->{resource_attribs} = $req->resource_attrib();
$completed->{$uri} = $URI;
delete $downloading->{$uri}->{request};
delete $downloading->{$uri};
}
}
# ... wierd
else {
$self->debug( "$uri is peculiar because it does not have a request object associated" ) if ($self->{_debug});
}
}
return $completed;
}
sub clear_completed {
#--------------------------------------------------------------------------------
# clear the completed requests
#
my $self = shift;
my $tmp = $self->{completed};
$self->{completed} = {};
return $tmp;
}
sub completed {
#--------------------------------------------------------------------------------
# returns a hash of all the completed downloads
#
my $self = shift;
if ( $self->{completed} ) {
return keys %{$self->{completed}} ? $self->{completed} : undef;
}
else { return };
}
sub completed_requests {
#--------------------------------------------------------------------------------
my $self = shift;
return scalar( keys %{$self->{completed} || {}} )
}
sub _queue_connect {
#--------------------------------------------------------------------------------
# connects to a URI
my $self = shift;
my $uri = shift;
$self->debug("Preparing a new Connection from $uri") if ($self->{_debug});
# trim free of spaces
$uri =~ s/^\s*|\s*$//g;
# extract protocol name from the URI
my $protocol = uc(substr($uri, 0, index($uri, ':')));
$self->debug("Protocol to be used with $uri: $protocol") if ($self->{_debug});
# load the corresponding protocol object
my $file = 'GT/URI/' . $protocol . '.pm';
my $pkg = 'GT::URI::' . $protocol;
{
local $SIG{__DIE__};
my $ret = eval { require "$file"; 1; };
if ( ! $ret ) {
$self->{completed}->{$uri}->{error} = "Could not load $protocol protocol handler : " . $@;
return;
}
}
# so, instantiate, init and issue the request
my $request = $pkg->new ( { max_down => $self->{max_down} } );
unless ($request) {
$self->{completed}->{$uri}->{error} = "Could not load $protocol protocol handler : " . $@;
return;
}
# set protocol-only parameters
my $protocol_requests = ( $self->{protocol_opts} || {} );
$protocol_requests = ( $protocol_requests->{ $protocol } || {} );
if ( $protocol_requests ) {
$request->set( $protocol_requests );
}
# finally, send out the initial request for the resource
if ( $request->fetch($uri) ) {
$self->debug( "Queued Request $uri" ) if ($self->{_debug});
$self->{downloading}->{$uri} = {
request => $request,
buffer => ''
};
}
# uh oh, there was an error
else {
$self->debug( "Error in Request $uri" ) if ($self->{_debug});
my $completed = $self->{completed};
$completed->{$uri} = {
resource_attribs => $request->{resource_attribs},
error => 'Connection problems'
};
}
# even through
return $request;
}
sub _buffered_read {
#--------------------------------------------------------------------------------
# reads all pending requests into the buffer
my $self = shift;
my $downloading = $self->{downloading};
# Go through each downloading uri hand
foreach my $uri ( keys %{$downloading} ) {
# ... now check up on the url
my $URI = $downloading->{$uri};
if ( $URI->{request} and $URI->{request}->pending() ) {
$URI->{buffer} .= $URI->{request}->gulp_read();
}
# ... wierd...,
else {
$self->debug( "$uri is peculiar because it does not have a request object associated" ) if ( $self->{_debug} );
}
}
}
sub get {
#--------------------------------------------------------------------------------
# gets a bunch of documents in a blocking fashion
my ( @uris, $options );
my $class = shift;
# parse out the parameters
foreach my $param ( @_ ) {
if ( ref $param eq 'HASH' ) {
$options = $param;
}
elsif ( not ref $param ) {
push @uris, $param;
}
}
# create and rack the URIs that we want to download
my $URI = new GT::URI($options);
foreach my $uri ( @uris ) { $URI->rack_uri( $uri ); };
# download the URIs
my ( $completed, $tmp );
while (
( %{$tmp = $URI->do_iteration()} ) or
( $URI->requests( -1 ) )
) {
if ( $tmp ) {
$URI->clear_completed();
foreach my $uri ( keys %{$tmp} ) {
$completed->{$uri} = $tmp->{$uri};
}
}
}
return $completed;
}
1;
__END__
=head1 NAME
GT::URI - Internet resource request broker
=head1 SYNOPSIS
use GT::URI;
my $doc = GT::URI->get( 'http://www.gossamer-threads.com' );
=head1 DESCRIPTION
GT::URI Makes requests and retrieves resources from internet servers.
=head1 BASICS
=head2 Getting a resource, the simple way
Just want just a few items? Call GT::URI::HTTP->get and all the magic will be done for you.
use GT::URI;
my $docs = GT::URI->get( "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
If options need to be set, include a hashref that has the appropriate setting you'd like set
use GT::URI;
my $conf = { max_down => 2000 };
my $docs = GT::URI::HTTP->get( $conf, "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
=head2 When you've got better things to do than wait
The simple method blocks when acquiring the data, meaning until all the data is downloaded, your script is frozen. GT::URI has the capability to do handle things in a non-blocking fashion, so while you wait for the documents to download, you can do something else.
A very simple example follows.
use GT::URI;
use GT::Dumper;
$uri = new GT::URI();
# queue up the URIs wanted
$uri->rack_uri( "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
# loop through until there are no more requests left to finish
while ( $uri->requests() ) {
$docs = $uri->do_iteration();
# do something here
print '.';
}
# output all the data
print Dumper($docs);
But this can quickly get much more complex. Since the downloads are asynchronous, the code can be changed to handle each request as it comes in.
use GT::URI;
use GT::Dumper;
$uri = new GT::URI();
# queue up the URIs wanted
$uri->rack_uri( "http://www.gossamer-threads.com/", "http://www.google.com/", "http://www.somethingelse.com" );
# loop through until there are no more requests left to finish
while ( $uri->requests() ) {
$uri->do_iteration();
# if there are any completed requests, handle them
if ( my $number_completed = $uri->completed_requests() ) {
print "Completed $number_completed request(s):\n";
my $completed = $uri->completed();
print Dumper( $completed );
# IMPORTANT: the object caches downloaded requests, once the
# data wanted has been pulled out of the object, clear the object's
# cache. Otherwise, the resource will appear again in the next
# $uri->completed() call
$uri->clear_completed();
}
# do something here
print '.';
}
# output all the data
print Dumper($docs);
It is possible to queue more links with the $uri->rack_uri() within the loop safely though a separate accounting system must be designed to prevent infinite loops.
=head2 Options to configure GT::URI
GT::URI has only a few options to control it's behaviour: There's not much it does that can be configured!
$opts = {
# maximum number of bytes to download for a single resource
'max_down' => 0,
# maximum number of simultaneous downloads
'max_simultaneous' => 10,
# configuration settings for individual protocols, look in
# any GT::URI::xxxx protocol module to find out related
# configuration options
'protocol_opts' => {
'protocol_name' => {
setting => value,
...
},
# eg
'HTTP' => {
'agent_name' => 'example agent name option value'
}
}
}
=head2 The main data structure GT::URI creates
The data structure that GT::URI produces to house all the resource infomation is mildly complex.
$docs = {
'uri requested' => {
'buffer' => 'resource data',
'resource_attribs' => {
'resource_key' => 'value'
},
'extra info' => ....
}
}
The 'buffer' will contain the raw http data, 'resource_attribs' will contain extra information related to the resource.
Depending on the service requested, there could be more information added. Currently no protocol requires the need for an extra key.
=head1 METHOD LIST
Socket Handling
sub do_iteration() Basic looping function that downloads resources in the background
sub pending() Returns true if data awaiting
Acquisition
sub completed() Returns a hash of all the completed requests
sub completed_requests() The number of requests completed
sub clear_completed() Cleans the completed request cache
sub get() Simple resource aquisition function
sub rack_uri() Add a URI to be downloaded
sub requests() Returns number of active requests
sub vec() Sets file bits suitable for a select call
=head2 completed () : completed_requests HASHREF
Returns a datastructure with the cached completed documents.
=head2 completed_requests () : num_requests INTEGER
Returns the current number of completed requests in the cache.
=head2 clear_completed ()
Clears the completed document cache.
=head2 do_iteration () : completed_requests HASHREF
The major bulk of the non-blocking work is handled within this function.
=head2 GT::URI->get ( [ conf HASHREF, ] url STRING, url STRING, url STRING..., ) : completed_requests HASHREF
The simplest way of acquiring a number of pages. Call and it will return a the GT::URI data structure.
The configuration hashref, can be found anywhere in the list. The function will iterate through the get parameters and assume any hashref is an option parameter and any scalar an URI.
=head2 pending () : status BOOLEAN
Returns '1' or '0' if there is data pending to be downloaded for any of the requests
=head2 rack_uri ( url1 STRING, [ url2 STRING ... ] )
Takes a list of URLs and queues them for download.
=head2 requests ( tics INTEGER ) :
Will return the number of requests pending action in the downloading queue. Usually this would be followed up with a call to $uri->do_iteration();
=head2 vec ( [ bits STRING ] ) : bits STRING
Returns a bit mask that can be used in a call to select. If you want to use an already existing bit mask, pass it into the function and the appropriate bits from requets will be additionally set.
=head1 BUILDING PROTOCOL HANDLERS
forthcoming
=head1 COPYRIGHT
Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
http://www.gossamer-threads.com/
=head1 VERSION
Revision: $Id: URI.pm,v 1.24 2002/04/07 03:35:35 jagerman Exp $
=cut