956 lines
28 KiB
Perl
956 lines
28 KiB
Perl
|
# ==================================================================
|
||
|
# Gossamer Threads Module Library
|
||
|
#
|
||
|
# GT::URI::HTTP
|
||
|
# Author : Aki Mimoto (support@gossamer-threads.com)
|
||
|
# CVS Info : 087,071,086,086,085
|
||
|
# $Id: HTTP.pm,v 1.30 2002/06/27 18:36:02 aki Exp $
|
||
|
#
|
||
|
# Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
||
|
# ==================================================================
|
||
|
#
|
||
|
# Description:
|
||
|
# Gets HTTP data
|
||
|
#
|
||
|
|
||
|
package GT::URI::HTTP;
|
||
|
# ===============================================================
|
||
|
use strict;
|
||
|
use GT::Socket;
|
||
|
use GT::Base;
|
||
|
use GT::CGI;
|
||
|
use Exporter ();
|
||
|
use vars qw/$ATTRIBS $EOL @ISA $DEBUG @EXPORT_OK $BASE_PORT /;
|
||
|
|
||
|
@ISA = ('GT::Base', 'Exporter');
|
||
|
|
||
|
$EOL = "\015\012";
|
||
|
$DEBUG = 0;
|
||
|
|
||
|
$ATTRIBS = {
|
||
|
URL => '',
|
||
|
request_method => 'GET',
|
||
|
parameters => {},
|
||
|
agent_name => 'Mozilla/4.73 [en]',
|
||
|
agent_host => 'X11; I; Linux 2.2.15-4mdk i586',
|
||
|
headers => {},
|
||
|
cookies => {},
|
||
|
resource_attribs => {},
|
||
|
resource_data => undef,
|
||
|
max_down => 200000,
|
||
|
debug => 0
|
||
|
};
|
||
|
|
||
|
@EXPORT_OK = qw/ parse_url deparse_url build_path build_parameters /;
|
||
|
|
||
|
sub pending {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# return true if there is some data to be picked up
|
||
|
my $self = shift;
|
||
|
my $tics = shift;
|
||
|
my $sock = $self->{'sock'} or return 1;
|
||
|
return $sock->pending();
|
||
|
}
|
||
|
|
||
|
sub EOF {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# return true if the system has finished downloading
|
||
|
my $self = shift;
|
||
|
my $sock = $self->{'sock'};
|
||
|
return ( $sock and $sock->fh() ? $sock->EOF() : 1 );
|
||
|
}
|
||
|
|
||
|
sub gulp_read {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $self = shift;
|
||
|
my $tics = shift;
|
||
|
return $self->do_iteration( $tics );
|
||
|
}
|
||
|
|
||
|
sub resource_attrib {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $self = shift;
|
||
|
my $attrib = shift;
|
||
|
my $attribs = $self->{resource_attribs} || {};
|
||
|
return $attrib ? $attribs->{$attrib} : $attribs;
|
||
|
}
|
||
|
|
||
|
sub do_iteration {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# useful in a loop, this is a generally non blocking way to get a page
|
||
|
# it will return nulls until it gets the page (and returns the page)
|
||
|
#
|
||
|
my ( $self, $fh, $sock, $pause );
|
||
|
$self = shift;
|
||
|
$pause = shift;
|
||
|
$sock = $self->{'sock'};
|
||
|
|
||
|
# if we haven't done the request yet
|
||
|
if ( not( $self->{'resource_downloaded'} or $sock ) and $self->{URL} ) {
|
||
|
$self->fetch();
|
||
|
$sock = $self->{'sock'};
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
# if we can't connect
|
||
|
if ( $self->{'flag_err'} ) {
|
||
|
return '';
|
||
|
}
|
||
|
|
||
|
$sock->fh() or return;
|
||
|
|
||
|
# only if the resource hasn't been downloaded yet..,
|
||
|
if ( !$self->{'resource_downloaded'} ) {
|
||
|
|
||
|
# if there's data pending, download it!
|
||
|
if ( $sock->pending( $pause ) ) {
|
||
|
# get as much data as we can
|
||
|
while ( my $buf = $sock->gulpread() ) {
|
||
|
$self->{'resource_data'} .= $buf;
|
||
|
}
|
||
|
# if we need to parse the server headers
|
||
|
$self->_parse_server_headers();
|
||
|
|
||
|
}
|
||
|
# we've run out of data, let's stop
|
||
|
if ( $sock->EOF() ) {
|
||
|
|
||
|
# make sure that the object know
|
||
|
$self->{'resource_downloaded'} = 1;
|
||
|
|
||
|
return $self->{'resource_data'};
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
else {
|
||
|
return $self->{'resource_data'};
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
sub fetch {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# fetches a page and handles CGI if requested
|
||
|
#
|
||
|
my ( $self, $url, $parameters, $sock, $request_str, $host, $port, $dirs, $file, $params, $fh, $order );
|
||
|
|
||
|
$self = shift;
|
||
|
$url = shift || $self->{'URL'} or return undef;
|
||
|
$parameters = shift || $self->{'parameters'};
|
||
|
|
||
|
# save the headers
|
||
|
$self->{'URL'} = $url;
|
||
|
$self->{'parameters'} = $parameters;
|
||
|
|
||
|
( $host, $port, $dirs, $file, $params, $order ) = parse_url($url);
|
||
|
|
||
|
# see if we can connect to the host
|
||
|
$sock = GT::Socket->open({
|
||
|
'host' => $host,
|
||
|
'port' => $port,
|
||
|
'_debug' => $self->{_debug},
|
||
|
'debug' => $self->{debug},
|
||
|
'max_down' => $self->{max_down}
|
||
|
});
|
||
|
|
||
|
# submit the request to the host
|
||
|
if ( $sock ) {
|
||
|
$fh = $sock->fh();
|
||
|
$self->debug( "Connected to $url" ) if ($self->{_debug});
|
||
|
$request_str = $self->_create_request( $url, $parameters, $host, $port, $dirs, $file, $params, $order );
|
||
|
$self->debug( "Sent request:\n$request_str" ) if ($self->{_debug});
|
||
|
print $fh $request_str;
|
||
|
}
|
||
|
|
||
|
else {
|
||
|
$self->debug( "Couldn't connect to $url. Reason: $GT::Socket::error" ) if ($self->{_debug});
|
||
|
$self->{resource_attribs} = {
|
||
|
ERROR_CODE => 'CANTCONN',
|
||
|
ERROR_MESSAGE => 'Cannot connect to server. Reason: $GT::Socket::error'
|
||
|
};
|
||
|
$self->{resource_downloaded} = 1;
|
||
|
$self->{flag_err} = 1;
|
||
|
}
|
||
|
|
||
|
# store the socket
|
||
|
return( $self->{'sock'} = $sock );
|
||
|
}
|
||
|
|
||
|
sub _parse_server_headers {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# this next part parses the server headers
|
||
|
#
|
||
|
my ( $self, $doc, $fh, $sock, $line, $error_code, $error_message, %resource_attribs, @lines );
|
||
|
$self = shift;
|
||
|
|
||
|
$doc = shift || $self->{'resource_data'};
|
||
|
|
||
|
if ( !$self->{'headers_parsed'} ) {
|
||
|
|
||
|
# check is there is a full header yet
|
||
|
( $doc =~ s/((.*?))(\012\015\012\015|\015\012\015\012|\015\015|\012\012)//so ) or return undef;
|
||
|
$line = $1;
|
||
|
$self->{'resource_data'} = $doc;
|
||
|
$line =~ s/(\012\015|\015\012)/\n/g;
|
||
|
|
||
|
# now get all the lines of the header..,
|
||
|
@lines = grep { not /^(\n|\r)+$/ } split /(\n|\r)/, $line;
|
||
|
|
||
|
# read the first header, which is the status line
|
||
|
$line = shift @lines;
|
||
|
($error_code, $error_message) = $line =~ /[^ ]*[ ]*(\d*)(.*)/;
|
||
|
|
||
|
# loop until we're done the headers
|
||
|
while ( $line = shift @lines ) {
|
||
|
if ( $line =~ /^(.*?): (.*)/ ) {
|
||
|
$resource_attribs{lc($1)} = $2;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
$resource_attribs{ERROR_CODE} = $error_code;
|
||
|
$resource_attribs{ERROR_MESSAGE} = $error_message;
|
||
|
|
||
|
# now parse cookies since we're dealing in http
|
||
|
if ( $resource_attribs{'set-cookie'} ) {
|
||
|
$resource_attribs{'COOKIE'} = _parse_cookie_string( $resource_attribs{'set-cookie'} );
|
||
|
}
|
||
|
|
||
|
# add the headers to the local object
|
||
|
$self->{'resource_attribs'} = \%resource_attribs;
|
||
|
|
||
|
# now set the headers parsed flag
|
||
|
$self->{'headers_parsed'} = 1;
|
||
|
}
|
||
|
|
||
|
return \%resource_attribs;
|
||
|
}
|
||
|
|
||
|
sub _parse_cookie_string {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my $str = shift;
|
||
|
my @cookie = grep $_, map { s/^\s*(.*?)\s*$/$1/; $_ } split /[;]/, $str;
|
||
|
|
||
|
# as first element of the cookie is a hash
|
||
|
my $value = shift @cookie;
|
||
|
|
||
|
my %segments = map { /^([^=]*)=(.*)$/ ? ( lc($1) => $2 ) : ( $_ => undef ) } @cookie;
|
||
|
|
||
|
if ( $value =~ /^([^=]*)=(.*)$/ ) {
|
||
|
$segments{name} = GT::CGI::unescape($1);
|
||
|
$segments{value} = GT::CGI::unescape($2);
|
||
|
}
|
||
|
else {
|
||
|
$segments{name} = GT::CGI::unescape($value);
|
||
|
$segments{value} = undef;
|
||
|
};
|
||
|
|
||
|
return \%segments;
|
||
|
}
|
||
|
|
||
|
sub _hash_to_phash {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# creates a parameter hash from a standard hash
|
||
|
my $hash = shift or return undef;
|
||
|
|
||
|
my $phash = {
|
||
|
map {
|
||
|
$_ => [ $$hash{$_} ]
|
||
|
} keys %{$hash}
|
||
|
};
|
||
|
|
||
|
return $phash;
|
||
|
}
|
||
|
|
||
|
sub _combine_phash {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
my ( $a, $b ) = @_;
|
||
|
foreach my $key ( %$b ) {
|
||
|
my $container = $a->{$key};
|
||
|
if ( ref $container ) {
|
||
|
push @{$container}, @{$b->{$key}};
|
||
|
}
|
||
|
else {
|
||
|
$a->{$key} = [ $container, @{$b->{$key}} ];
|
||
|
}
|
||
|
}
|
||
|
return $a;
|
||
|
}
|
||
|
|
||
|
sub _create_request {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# creates the http request to be submitted to the server
|
||
|
my ( $sock, $fh, $get_str, $req, $headers, $method, $tmp );
|
||
|
|
||
|
my ( $self, $url, $parameters, $host, $port, $dirs, $file, $params, $order ) = @_;
|
||
|
|
||
|
$method = lc($self->{'request_method'});
|
||
|
|
||
|
# now handle the request
|
||
|
$get_str = uc($method) . ' /' . build_path($dirs,$file);
|
||
|
|
||
|
# if this is a get request, if there are any parameters we want to pass, handle that
|
||
|
if ( $method eq 'get' ) {
|
||
|
$tmp = build_parameters( _combine_phash( $params, _hash_to_phash($parameters) ), $order );
|
||
|
$get_str.= $tmp ? "?$tmp" : "";
|
||
|
}
|
||
|
$get_str .= " HTTP/1.0$EOL";
|
||
|
|
||
|
# handle the host field...,
|
||
|
$get_str .= "Host: $host" . ( ( $port == 80 ) ? "" : ":$port" ) . $EOL;
|
||
|
|
||
|
# handle the useragent field
|
||
|
$get_str .= "User-Agent: $self->{agent_name} ($self->{agent_host})" . $EOL;
|
||
|
$get_str .= "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*" . $EOL;
|
||
|
$get_str .= "Accept-Language: en" . $EOL;
|
||
|
$get_str .= "Accept-Charset: iso-8859-1,*,utf-8" . $EOL;
|
||
|
$get_str .= "Connection: close" . $EOL;
|
||
|
|
||
|
# handle cookies
|
||
|
if ( my $cookiestr = $self->_create_cookie_request() ) {
|
||
|
$get_str .= $cookiestr . $EOL;
|
||
|
}
|
||
|
|
||
|
# handle any extra headers
|
||
|
$headers = $self->{'headers'};
|
||
|
$get_str .= join(
|
||
|
"",
|
||
|
map {
|
||
|
$_ = ucfirst;
|
||
|
"$_: $$headers{$_}$EOL";
|
||
|
} keys %{$headers}
|
||
|
);
|
||
|
|
||
|
# handle if the request is a post
|
||
|
if ( $method eq 'post' ) {
|
||
|
my $request = build_parameters( { %{$params}, %{_hash_to_phash($parameters)} } );
|
||
|
$get_str .=
|
||
|
"Content-type: application/x-www-form-urlencoded$EOL" .
|
||
|
"Content-length: " .length($request)."$EOL$EOL".
|
||
|
$request;
|
||
|
}
|
||
|
|
||
|
else {
|
||
|
$get_str .= "$EOL$EOL";
|
||
|
}
|
||
|
|
||
|
return $get_str;
|
||
|
}
|
||
|
|
||
|
sub cookie {
|
||
|
}
|
||
|
|
||
|
sub _create_cookie_request {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# creates the string the client sends to the server for a cookie
|
||
|
my $self = shift;
|
||
|
|
||
|
my $cookies = $self->{'cookies'};
|
||
|
my $str = join "; ", grep $_, map {
|
||
|
|
||
|
my $value = $cookies->{$_};
|
||
|
|
||
|
if ( ref $value eq 'GT::CGI::Cookie' ) {
|
||
|
GT::CGI::escape($value->{-name}) . "=" . GT::CGI::escape( $value->{-value} );
|
||
|
}
|
||
|
|
||
|
elsif ( ref $value eq 'HASH' ) {
|
||
|
GT::CGI::escape($value->{name}) . "=" . GT::CGI::escape( $value->{value} );
|
||
|
}
|
||
|
|
||
|
elsif ( $value and not ref $value ) {
|
||
|
GT::CGI::escape($_) . "=" . GT::CGI::escape($value);
|
||
|
}
|
||
|
|
||
|
} keys %{$cookies};
|
||
|
|
||
|
return $str ? "Cookie: $str" : "";
|
||
|
}
|
||
|
|
||
|
sub method {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# sets the acquisition method for parameter quering
|
||
|
#
|
||
|
my $self = shift;
|
||
|
my $method = shift or return $self->{'form_method'};
|
||
|
|
||
|
if ( lc $method eq 'get' ) {
|
||
|
$self->{'form_method'} = lc $method;
|
||
|
}
|
||
|
|
||
|
elsif ( lc $method eq 'post' ) {
|
||
|
$self->{'form_method'} = lc $method;
|
||
|
}
|
||
|
|
||
|
elsif ( lc $method eq 'head' ) {
|
||
|
$self->{'form_method'} = lc $method;
|
||
|
}
|
||
|
|
||
|
return $self->{'form_method'};
|
||
|
}
|
||
|
|
||
|
sub load_parameter {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# loads the passed parameters into the database
|
||
|
my ( $self, $params, $key, $parameters );
|
||
|
$self = shift;
|
||
|
$params = ( ref $_[0] ? $_[0] : { @_ } ) or return undef;
|
||
|
$parameters = $self->{'parameters'};
|
||
|
|
||
|
# add each entry into the object's attribs
|
||
|
foreach $key ( keys %{$params} ) {
|
||
|
$$parameters{ $key } = $$params{ $key };
|
||
|
}
|
||
|
|
||
|
return $self->{'parameters'};
|
||
|
}
|
||
|
|
||
|
sub delete_parameter {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# deletes the named parameters
|
||
|
my ( $self, $params, $key, $parameters );
|
||
|
$self = shift;
|
||
|
$params = ref $_[0] ? $_[0] : [ @_ ];
|
||
|
$parameters = $self->{'parameters'};
|
||
|
|
||
|
# now delete the parameters
|
||
|
foreach $key ( @{$params} ) {
|
||
|
delete $$parameters{$key};
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub parse_url {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# parses the url and converts it into host
|
||
|
# assume HTTP/1.0 format
|
||
|
#
|
||
|
my ( $url, $host, $port, $path, $file, $page, @dirs, $param, %params, $item, @order );
|
||
|
$url = shift;
|
||
|
|
||
|
# first, do the initial parse
|
||
|
$url =~ s/^[a-zA-Z]+:\/\///i;
|
||
|
$url =~ /([\w.\-]*):?(\d*)\/?([^?]*)\??(.*)/i;
|
||
|
|
||
|
$host = $1;
|
||
|
$port = $2 || 80;
|
||
|
$path = $3;
|
||
|
$param = $4;
|
||
|
|
||
|
# now handle the paths
|
||
|
$path =~ s/^\s*(.*?)\s*$/$1/;
|
||
|
|
||
|
# the paths are not simply directories
|
||
|
if ( $path !~ /\/$/ ) {
|
||
|
$path =~ s/([^\/]*)$//;
|
||
|
$file = $1;
|
||
|
}
|
||
|
|
||
|
# handle the directories
|
||
|
@dirs = grep $_, split /\//, $path;
|
||
|
|
||
|
# and finally parse out the parameters if there are any
|
||
|
foreach $item ( split /\&/, $param ) {
|
||
|
if ( $item =~ /^(.*?)=(.*?)$/ ) {
|
||
|
push @order, $1;
|
||
|
push @{$params{$1}}, $2;
|
||
|
}
|
||
|
else {
|
||
|
push @order, $item;
|
||
|
push @{$params{$item}}, undef;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
# now return the collection
|
||
|
return ( $host, $port, \@dirs, $file, \%params, \@order );
|
||
|
|
||
|
}
|
||
|
|
||
|
sub deparse_url {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# takes a number of parameters and builds the request url
|
||
|
my ( $url, $host, $port, $path, $file, $page, $dirs, $param, $params, $item );
|
||
|
|
||
|
( $host, $port, $dirs, $file, $params ) = @_;
|
||
|
|
||
|
$url = $host;
|
||
|
|
||
|
# build the host portion
|
||
|
if ( $port != 80 ) {
|
||
|
$url .= ":80";
|
||
|
}
|
||
|
|
||
|
# now build the page access
|
||
|
$url .= "/" . build_path( $dirs, $file );
|
||
|
|
||
|
# and finally handle the parameters if any
|
||
|
if ( $params and keys %{$params} ) {
|
||
|
$url .= "?" . build_parameters( $params );
|
||
|
}
|
||
|
|
||
|
return $url;
|
||
|
|
||
|
}
|
||
|
|
||
|
sub build_path {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# takes a dir array and builds a path
|
||
|
my ( $path, $dirs, $page );
|
||
|
|
||
|
$dirs = shift;
|
||
|
$page = shift;
|
||
|
|
||
|
if ( $dirs ) {
|
||
|
$path = join( "/", grep $_, @{$dirs} );
|
||
|
$path &&= "$path/";
|
||
|
}
|
||
|
|
||
|
return $path .= $page;
|
||
|
|
||
|
}
|
||
|
|
||
|
sub build_parameters {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# takes a hash of parameters and builds a string with it
|
||
|
my ( $params, $paramstr, @paramlist, $order );
|
||
|
|
||
|
$params = shift or return undef;
|
||
|
$order = shift;
|
||
|
|
||
|
if ( keys %{$params} ) {
|
||
|
|
||
|
# Deep copy hash and array refs only.
|
||
|
my $tmp;
|
||
|
while (my ($k, $v) = each %$params) {
|
||
|
if (! ref $v) {
|
||
|
$tmp->{$k} = $v;
|
||
|
}
|
||
|
elsif (ref $v eq 'HASH') {
|
||
|
$tmp->{$k} = {};
|
||
|
foreach my $k1 (keys %{$params->{$k}}) { $tmp->{$k}->{$k1} = $params->{$k}->{$k1}; }
|
||
|
}
|
||
|
elsif (ref $v eq 'ARRAY') {
|
||
|
$tmp->{$k} = [];
|
||
|
foreach my $v1 (@{$params->{$k}}) { push @{$tmp->{$k}}, $v1; }
|
||
|
}
|
||
|
else { $tmp->{$k} = $v; }
|
||
|
}
|
||
|
|
||
|
if ( $order ) {
|
||
|
|
||
|
foreach my $key ( @$order ) {
|
||
|
my $value = shift @{$tmp->{$key} || []};
|
||
|
if ( $tmp->{$key} and not @{$tmp->{$key}} ) {
|
||
|
delete $tmp->{$key}
|
||
|
}
|
||
|
$key = GT::CGI::escape( $key );
|
||
|
push @paramlist, ( defined $value ? "$key=$value" : $key );
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
require GT::CGI;
|
||
|
$paramstr = join "&", ( @paramlist,
|
||
|
map {
|
||
|
my $key = GT::CGI::escape($_);
|
||
|
my $list = $tmp->{$_};
|
||
|
|
||
|
( ref $list eq 'ARRAY' ) ?
|
||
|
join "&",
|
||
|
map {
|
||
|
"$key" . ( defined $_ ? "=". GT::CGI::escape($_) : '' );
|
||
|
} @{$list}
|
||
|
:
|
||
|
"$key=$list";
|
||
|
|
||
|
} keys %{$tmp} );
|
||
|
}
|
||
|
|
||
|
return $paramstr;
|
||
|
}
|
||
|
|
||
|
sub get {
|
||
|
#-------------------------------------------------------------------------------
|
||
|
# simple blocking method to get some data
|
||
|
my ( $uri, $options, $HTTP, $doc );
|
||
|
|
||
|
shift; # get rid of the first parameter
|
||
|
$uri = shift or return undef;
|
||
|
$options = ( @_ and ref $_[0] ) ? shift : {@_};
|
||
|
|
||
|
$HTTP = new GT::URI::HTTP($options);
|
||
|
$HTTP->fetch($uri);
|
||
|
|
||
|
while ( not defined( $doc = $HTTP->do_iteration(-1) ) ) {};
|
||
|
|
||
|
return $doc;
|
||
|
}
|
||
|
|
||
|
1;
|
||
|
|
||
|
__END__
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
GT::URI::HTTP - HTTP request broker.
|
||
|
|
||
|
=head1 SYNOPSIS
|
||
|
|
||
|
use GT::URI::HTTP;
|
||
|
|
||
|
print GT::URI::HTTP->get( "http://www.gossamer-threads.com" );
|
||
|
|
||
|
=head1 DESCRIPTION
|
||
|
|
||
|
GT::URI::HTTP makes requests and retrieves resources from http servers (not
|
||
|
limited to text). Can be used stand-alone or through GT::URI
|
||
|
|
||
|
=head1 Method List
|
||
|
|
||
|
Socket Handling
|
||
|
|
||
|
sub pending() Returns true if data awaiting
|
||
|
sub EOF() Returns open/closed status of socket
|
||
|
sub gulp_read() Alias to do_iteration
|
||
|
sub do_iteration() Basic looping function that downloads resources in the background
|
||
|
|
||
|
Acquisition
|
||
|
|
||
|
sub fetch() Tell the object which URL to acquire
|
||
|
sub method() The method of acquisition
|
||
|
sub load_parameter() Add a item for CGI parameters
|
||
|
sub delete_parameter() Delete a CGI parameter
|
||
|
sub resource_attrib() Headers related to resource and server
|
||
|
sub get() Simple resource aquisition function
|
||
|
|
||
|
Support Methods (must be imported)
|
||
|
|
||
|
sub parse_url() Decomposes a URL into constituent parts
|
||
|
sub deparse_url() Takes those parts and builds an URL
|
||
|
sub build_path() Takes a list of directories and builds a path
|
||
|
sub build_parameters() Takes a hash of parameter->values and builds a CGI request string
|
||
|
|
||
|
=head1 Basics
|
||
|
|
||
|
=head2 Getting a resource, the simple way
|
||
|
|
||
|
|
||
|
Just want a single item? Call GT::URI::HTTP->get and all the magic will be done
|
||
|
for you.
|
||
|
|
||
|
use GT::URI::HTTP;
|
||
|
my $buf = GT::URI::HTTP->get( "http://www.gossamer-threads.com/" );
|
||
|
|
||
|
Get based requests are permissable as well:
|
||
|
|
||
|
use GT::URI::HTTP;
|
||
|
my $buf = GT::URI::HTTP->get( "http://search.yahoo.com/bin/search?p=gossamer+threads" );
|
||
|
|
||
|
If extra options need to be set, simply append the options to the parameter
|
||
|
list, like follows.
|
||
|
|
||
|
use GT::URI::HTTP;
|
||
|
my $buf = GT::URI::HTTP->get( "http://search.yahoo.com/bin/search?p=gossamer+threads", { request_method => 'POST' } );
|
||
|
|
||
|
=head2 When just the document is not enough
|
||
|
|
||
|
|
||
|
If a new GT::URI::HTTP object is instantiated, much more control is available,
|
||
|
including facilities for non-blocking downloading of pages.
|
||
|
|
||
|
To create a GT::URI::HTTP object, call new with all the options required:
|
||
|
|
||
|
use GT::URI::HTTP;
|
||
|
|
||
|
my $http = new GT::URI::HTTP(
|
||
|
|
||
|
# URL to acquire (optional)
|
||
|
'URL' => '',
|
||
|
|
||
|
# Can also be set to POST/GET/HEAD (optional)
|
||
|
'request_method' => 'GET',
|
||
|
|
||
|
# a hash of keys pointing to an arrayref of values to be sent to the server
|
||
|
# {
|
||
|
# 'key' => [ 'value1', 'value2'... ],
|
||
|
# }
|
||
|
# (optional)
|
||
|
'parameters' => {},
|
||
|
|
||
|
# Name portion of the User-Agent: string the server acquires (optional)
|
||
|
'agent_name' => 'Mozilla/4.73 [en]',
|
||
|
|
||
|
# Host-from portion of the User-Agent: string the server acquires (optional)
|
||
|
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
|
||
|
|
||
|
# To prevent downloading of 80Tb files, but if you still wanted to, set this to 0 (optional)
|
||
|
'max_down' => 200000
|
||
|
|
||
|
);
|
||
|
|
||
|
If URL has been specified in the options, for interactions with a CGI, you can
|
||
|
set extra parameters with $http->load_parameter(). Finally, loop on
|
||
|
$http->do_iteration() until the value is defined. To replicate the "simple get"
|
||
|
example:
|
||
|
|
||
|
use GT::URI::HTTP;
|
||
|
$|++;
|
||
|
|
||
|
my $http = new GT::URI::HTTP(
|
||
|
|
||
|
URL => 'http://search.yahoo.com/bin/search',
|
||
|
|
||
|
# can also use the following:
|
||
|
parameters => {
|
||
|
'p' => [ 'gossamer threads' ]
|
||
|
}
|
||
|
|
||
|
);
|
||
|
|
||
|
my $doc;
|
||
|
while ( not defined( $doc = $http->do_iteration() ) ) {
|
||
|
# do something here while waiting for the resource to arrive
|
||
|
print "."
|
||
|
}
|
||
|
|
||
|
print $doc, "\n\n";
|
||
|
|
||
|
Beyond the resource, the http server often supplies extra information in a
|
||
|
header. To access this information, use $http->resource_attrib().
|
||
|
|
||
|
Appending this code to the previous example, a list of all the associated server
|
||
|
headers can be seen:
|
||
|
|
||
|
my $attribs = $http->resource_attrib();
|
||
|
foreach my $key ( sort keys %{$attribs} ) {
|
||
|
print "$key => $attribs->{$key}\n";
|
||
|
}
|
||
|
|
||
|
=head2 Support Methods
|
||
|
|
||
|
In addition to the basic fetching abilities, since the module must parse HTTP
|
||
|
URLs, the methods used to do so have been made public.
|
||
|
|
||
|
These methods decompose URLs into datastructures that make URLs easily studied
|
||
|
or modified and then reconstructed.
|
||
|
|
||
|
However, these routines have not been polished for useability so beware! The
|
||
|
following is a very basic example:
|
||
|
|
||
|
use GT::URI::HTTP qw/ parse_url deparse_url build_path build_parameters /;
|
||
|
|
||
|
# fragment the URL
|
||
|
my ( $host, $port, $dirs, $file, $params ) = parse_url( 'http://www.gossamer-threads.com/perl/forum/showflat.pl?Cat=&Board=GosDisc&Number=113355&page=0&view=' );
|
||
|
|
||
|
print "Parsed Data:\n\n";
|
||
|
print "Host: $host\n";
|
||
|
print "Port: $port\n";
|
||
|
print "Dirs:\n";
|
||
|
foreach my $dir ( @{$dirs} ) {
|
||
|
print " $dir/\n";
|
||
|
}
|
||
|
print "Resource Filename: $file\n";
|
||
|
print "Params:\n";
|
||
|
foreach my $key ( sort keys %{$params} ) {
|
||
|
print " $key: ";
|
||
|
my $values = ( $params->{$key} || {} );
|
||
|
foreach my $value ( sort @{$values} ) {
|
||
|
print "'", quotemeta($value), "' ";
|
||
|
}
|
||
|
print "\n";
|
||
|
}
|
||
|
|
||
|
# put the data back together again
|
||
|
my $url = deparse_url( $host, $port, $dirs, $file, $params );
|
||
|
print "\nDeparsed Data:\n\n";
|
||
|
print "URL: http://$url\n";
|
||
|
|
||
|
=head1 Methods List
|
||
|
|
||
|
=head2 build_path ( dir ARRAYREF, [ page STRING ] ) : STRING
|
||
|
|
||
|
Takes an array ref of directory names and an optional filename and returns a
|
||
|
filepath.
|
||
|
|
||
|
use GT::URI::HTTP qw/ build_path /;
|
||
|
print build_path( [ 'topdir', 'middir', 'bottomdir' ], 'file.html' );
|
||
|
|
||
|
This function must be imported.
|
||
|
|
||
|
=head2 build_parameters ( parameter HASHREF ) : STRING
|
||
|
|
||
|
Builds a CGI request string from list of keys and values. The function has the
|
||
|
ability to handle keys with more than one parameter, simply use an arrayref with
|
||
|
multiple values.
|
||
|
|
||
|
use GT::URI::HTTP qw/ build_parameters /;
|
||
|
my $params = {
|
||
|
'simplekey' => 'value'
|
||
|
'onekey' => [ 'one value' ],
|
||
|
'anotherkey' => [ 'another value', 'and yet anotherone!' ],
|
||
|
};
|
||
|
print build_parameters($params);
|
||
|
|
||
|
This function must be imported.
|
||
|
|
||
|
=head2 delete_parameter ( keys ARRAYREF/ARRAY )
|
||
|
|
||
|
When loading the object with parameters before a request, it is possible to
|
||
|
delete an entire set of keys and values.
|
||
|
|
||
|
=head2 deparse_url ( host STRING, [ port STRING, [ dirs ARRAYREF, [ file STRING, [ params HASH ] ] ] ] ) : STRING
|
||
|
|
||
|
This builds an entire URL from basic parameters.
|
||
|
|
||
|
For an example of this function, see the example in "Support Methods".
|
||
|
|
||
|
This function must be imported.
|
||
|
|
||
|
=head2 do_iteration ( tics INTEGER ) : STRING
|
||
|
|
||
|
The basic iteration function. This function will return undef until the resource
|
||
|
is received which, upon receipt will return the resource data.
|
||
|
|
||
|
The function can return an empty string, so it is important to checked
|
||
|
defined'ness. If the return is an empty string, check the ERROR_CODE in
|
||
|
resource_attrib to find out if the script simply can't connect to the host or
|
||
|
the resource is empty.
|
||
|
|
||
|
=head2 EOF () : BOOLEAN
|
||
|
|
||
|
Returns '1' or '0' depending if the object has stopped receiving/sending data to
|
||
|
the remote server.
|
||
|
|
||
|
=head2 fetch ( url STRING, [ parameters HASHREF ] )
|
||
|
|
||
|
Tells the server the URL to retreive the resource of. If CGI parameters are
|
||
|
required pass in a hash of keys and values.
|
||
|
|
||
|
=head2 GT::URI::HTTP->get ( url, [ options HASH/HASHREF ] ) : RESOURCE_DATA
|
||
|
|
||
|
Simplest resource aquision method. Give it the URL and any options and the
|
||
|
function will return after the resource has been downloaded.
|
||
|
|
||
|
=head2 gulp_read ( tic INTEGER ) : RESOURCE_DATA
|
||
|
|
||
|
This is just an alias to the function do_iteration. This method is used by
|
||
|
GT::URI in its mass resource aquisition runs.
|
||
|
|
||
|
Unless you feel like being different, you shouldn't need to use this.
|
||
|
|
||
|
=head2 load_parameter ( params HASH/HASHREF ) : HASHREF
|
||
|
|
||
|
Takes a list of keys and values and loads the values into the list of CGI
|
||
|
parameters to be sent to the remote server.
|
||
|
|
||
|
=head2 method ( method STRING ) : STRING
|
||
|
|
||
|
Sets the acquisition method for the resource. Currently, GET/POST/HEAD are
|
||
|
supported.
|
||
|
|
||
|
If no parameters are supplied the function simply returns the current
|
||
|
acquisition method.
|
||
|
|
||
|
=head2 parse_url ( url STRING ) : host STRING, port INTEGER, dirs ARRAYREF,
|
||
|
file STRING, params HASHREF
|
||
|
|
||
|
Takes an URL and decomposes it into easily manipulated datastructures. The
|
||
|
output can be fed back into deparse_url to reconstruct an URL.
|
||
|
|
||
|
This function must be imported.
|
||
|
|
||
|
=head2 pending ( tics INTEGER ) : BOOLEAN
|
||
|
|
||
|
If there is data available to be downloaded, this function returns '1',
|
||
|
otherwise '0'. This is another function used by GT::URI in it's mass downloads
|
||
|
and unlikely to be of any use to anyone using this module directly. This
|
||
|
function exists because it is lighter than do_iteration which can be quite a
|
||
|
load as opposed to this if there were 100 racked downloads, all being polled
|
||
|
every tenth of a second!
|
||
|
|
||
|
=head2 resource_attrib ( [ key STRING ] ) STRING or HASHREF
|
||
|
|
||
|
If a key is requested, function will return the value associated with the
|
||
|
resource attribute. If not, the function will return a hashref keyed by server
|
||
|
parameter to its corresponding value.
|
||
|
|
||
|
All the server keys have been converted into lower-case. This prevents conflict
|
||
|
with two very important keys, ERROR_CODE, and ERROR_MESSAGE, which carry the
|
||
|
HTTP error code and message associated with the aquisition of this page.
|
||
|
|
||
|
=head1 EXAMPLES
|
||
|
|
||
|
=head2 HTTP get example
|
||
|
|
||
|
#!/usr/bin/perl
|
||
|
|
||
|
use GT::URI::HTTP;
|
||
|
|
||
|
if ( not @ARGV ) {
|
||
|
print qq!
|
||
|
SYNOPSIS
|
||
|
|
||
|
$0 url [-f/-h] [ cgi_parameter1=value1 cgi_parameter2=value2 ... ]
|
||
|
|
||
|
basic HTTP requestor
|
||
|
|
||
|
OPTIONS
|
||
|
|
||
|
-f : full information; headers and resource. Usually only a dump of the resource is provided.
|
||
|
-h : just the headers\n\n!;
|
||
|
exit;
|
||
|
}
|
||
|
|
||
|
# parse out the command line
|
||
|
# first argument, URL
|
||
|
$url = shift @ARGV;
|
||
|
|
||
|
# next arguments, parameters
|
||
|
foreach my $item ( @ARGV ) {
|
||
|
|
||
|
# ... check for special requests
|
||
|
if ( $item =~ /^-([fd])$/ ) {
|
||
|
$mode = $1;
|
||
|
}
|
||
|
|
||
|
# ... is not a special request, but probably a parameter
|
||
|
( $key, $value ) = ( $item =~ /([^=]+)=(.*)/ );
|
||
|
$key ||= $item;
|
||
|
push @{$parameters->{$key}}, $value;
|
||
|
}
|
||
|
|
||
|
# setup and send the request
|
||
|
$http = new GT::URI::HTTP(
|
||
|
|
||
|
# if we're only looking to use the head
|
||
|
request_method => ( $mode eq 'h' ? 'HEAD' : 'GET' )
|
||
|
|
||
|
);
|
||
|
$http->fetch( $url, $parameters );
|
||
|
|
||
|
# get the resource
|
||
|
while ( not defined ( $doc = $http->do_iteration(-1) ) ) {}
|
||
|
|
||
|
# and print out the headers if wanted
|
||
|
if ( $mode ) {
|
||
|
$headers = $http->resource_attrib();
|
||
|
foreach $key ( sort keys %{$headers || {}} ) {
|
||
|
print "$key: $headers->{$key}\n";
|
||
|
}
|
||
|
print "\n";
|
||
|
}
|
||
|
|
||
|
# and output the resource...
|
||
|
print $doc;
|
||
|
|
||
|
=head1 COPYRIGHT
|
||
|
|
||
|
Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
||
|
http://www.gossamer-threads.com/
|
||
|
|
||
|
=head1 VERSION
|
||
|
|
||
|
Revision: $Id: HTTP.pm,v 1.30 2002/06/27 18:36:02 aki Exp $
|
||
|
|
||
|
=cut
|
||
|
|