First pass at adding key files
This commit is contained in:
955
site/slowtwitch.com/cgi-bin/articles/admin/GT/URI/HTTP.pm
Normal file
955
site/slowtwitch.com/cgi-bin/articles/admin/GT/URI/HTTP.pm
Normal file
@ -0,0 +1,955 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library
|
||||
#
|
||||
# GT::URI::HTTP
|
||||
# Author : Aki Mimoto (support@gossamer-threads.com)
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: HTTP.pm,v 1.30 2002/06/27 18:36:02 aki Exp $
|
||||
#
|
||||
# Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Gets HTTP data
|
||||
#
|
||||
|
||||
package GT::URI::HTTP;
|
||||
# ===============================================================
|
||||
use strict;
|
||||
use GT::Socket;
|
||||
use GT::Base;
|
||||
use GT::CGI;
|
||||
use Exporter ();
|
||||
use vars qw/$ATTRIBS $EOL @ISA $DEBUG @EXPORT_OK $BASE_PORT /;
|
||||
|
||||
@ISA = ('GT::Base', 'Exporter');
|
||||
|
||||
$EOL = "\015\012";
|
||||
$DEBUG = 0;
|
||||
|
||||
$ATTRIBS = {
|
||||
URL => '',
|
||||
request_method => 'GET',
|
||||
parameters => {},
|
||||
agent_name => 'Mozilla/4.73 [en]',
|
||||
agent_host => 'X11; I; Linux 2.2.15-4mdk i586',
|
||||
headers => {},
|
||||
cookies => {},
|
||||
resource_attribs => {},
|
||||
resource_data => undef,
|
||||
max_down => 200000,
|
||||
debug => 0
|
||||
};
|
||||
|
||||
@EXPORT_OK = qw/ parse_url deparse_url build_path build_parameters /;
|
||||
|
||||
sub pending {
|
||||
#-------------------------------------------------------------------------------
|
||||
# return true if there is some data to be picked up
|
||||
my $self = shift;
|
||||
my $tics = shift;
|
||||
my $sock = $self->{'sock'} or return 1;
|
||||
return $sock->pending();
|
||||
}
|
||||
|
||||
sub EOF {
|
||||
#-------------------------------------------------------------------------------
|
||||
# return true if the system has finished downloading
|
||||
my $self = shift;
|
||||
my $sock = $self->{'sock'};
|
||||
return ( $sock and $sock->fh() ? $sock->EOF() : 1 );
|
||||
}
|
||||
|
||||
sub gulp_read {
|
||||
#-------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $tics = shift;
|
||||
return $self->do_iteration( $tics );
|
||||
}
|
||||
|
||||
sub resource_attrib {
|
||||
#-------------------------------------------------------------------------------
|
||||
my $self = shift;
|
||||
my $attrib = shift;
|
||||
my $attribs = $self->{resource_attribs} || {};
|
||||
return $attrib ? $attribs->{$attrib} : $attribs;
|
||||
}
|
||||
|
||||
sub do_iteration {
|
||||
#-------------------------------------------------------------------------------
|
||||
# useful in a loop, this is a generally non blocking way to get a page
|
||||
# it will return nulls until it gets the page (and returns the page)
|
||||
#
|
||||
my ( $self, $fh, $sock, $pause );
|
||||
$self = shift;
|
||||
$pause = shift;
|
||||
$sock = $self->{'sock'};
|
||||
|
||||
# if we haven't done the request yet
|
||||
if ( not( $self->{'resource_downloaded'} or $sock ) and $self->{URL} ) {
|
||||
$self->fetch();
|
||||
$sock = $self->{'sock'};
|
||||
return;
|
||||
}
|
||||
|
||||
# if we can't connect
|
||||
if ( $self->{'flag_err'} ) {
|
||||
return '';
|
||||
}
|
||||
|
||||
$sock->fh() or return;
|
||||
|
||||
# only if the resource hasn't been downloaded yet..,
|
||||
if ( !$self->{'resource_downloaded'} ) {
|
||||
|
||||
# if there's data pending, download it!
|
||||
if ( $sock->pending( $pause ) ) {
|
||||
# get as much data as we can
|
||||
while ( my $buf = $sock->gulpread() ) {
|
||||
$self->{'resource_data'} .= $buf;
|
||||
}
|
||||
# if we need to parse the server headers
|
||||
$self->_parse_server_headers();
|
||||
|
||||
}
|
||||
# we've run out of data, let's stop
|
||||
if ( $sock->EOF() ) {
|
||||
|
||||
# make sure that the object know
|
||||
$self->{'resource_downloaded'} = 1;
|
||||
|
||||
return $self->{'resource_data'};
|
||||
}
|
||||
return;
|
||||
}
|
||||
else {
|
||||
return $self->{'resource_data'};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
sub fetch {
|
||||
#-------------------------------------------------------------------------------
|
||||
# fetches a page and handles CGI if requested
|
||||
#
|
||||
my ( $self, $url, $parameters, $sock, $request_str, $host, $port, $dirs, $file, $params, $fh, $order );
|
||||
|
||||
$self = shift;
|
||||
$url = shift || $self->{'URL'} or return undef;
|
||||
$parameters = shift || $self->{'parameters'};
|
||||
|
||||
# save the headers
|
||||
$self->{'URL'} = $url;
|
||||
$self->{'parameters'} = $parameters;
|
||||
|
||||
( $host, $port, $dirs, $file, $params, $order ) = parse_url($url);
|
||||
|
||||
# see if we can connect to the host
|
||||
$sock = GT::Socket->open({
|
||||
'host' => $host,
|
||||
'port' => $port,
|
||||
'_debug' => $self->{_debug},
|
||||
'debug' => $self->{debug},
|
||||
'max_down' => $self->{max_down}
|
||||
});
|
||||
|
||||
# submit the request to the host
|
||||
if ( $sock ) {
|
||||
$fh = $sock->fh();
|
||||
$self->debug( "Connected to $url" ) if ($self->{_debug});
|
||||
$request_str = $self->_create_request( $url, $parameters, $host, $port, $dirs, $file, $params, $order );
|
||||
$self->debug( "Sent request:\n$request_str" ) if ($self->{_debug});
|
||||
print $fh $request_str;
|
||||
}
|
||||
|
||||
else {
|
||||
$self->debug( "Couldn't connect to $url. Reason: $GT::Socket::error" ) if ($self->{_debug});
|
||||
$self->{resource_attribs} = {
|
||||
ERROR_CODE => 'CANTCONN',
|
||||
ERROR_MESSAGE => 'Cannot connect to server. Reason: $GT::Socket::error'
|
||||
};
|
||||
$self->{resource_downloaded} = 1;
|
||||
$self->{flag_err} = 1;
|
||||
}
|
||||
|
||||
# store the socket
|
||||
return( $self->{'sock'} = $sock );
|
||||
}
|
||||
|
||||
sub _parse_server_headers {
|
||||
#-------------------------------------------------------------------------------
|
||||
# this next part parses the server headers
|
||||
#
|
||||
my ( $self, $doc, $fh, $sock, $line, $error_code, $error_message, %resource_attribs, @lines );
|
||||
$self = shift;
|
||||
|
||||
$doc = shift || $self->{'resource_data'};
|
||||
|
||||
if ( !$self->{'headers_parsed'} ) {
|
||||
|
||||
# check is there is a full header yet
|
||||
( $doc =~ s/((.*?))(\012\015\012\015|\015\012\015\012|\015\015|\012\012)//so ) or return undef;
|
||||
$line = $1;
|
||||
$self->{'resource_data'} = $doc;
|
||||
$line =~ s/(\012\015|\015\012)/\n/g;
|
||||
|
||||
# now get all the lines of the header..,
|
||||
@lines = grep { not /^(\n|\r)+$/ } split /(\n|\r)/, $line;
|
||||
|
||||
# read the first header, which is the status line
|
||||
$line = shift @lines;
|
||||
($error_code, $error_message) = $line =~ /[^ ]*[ ]*(\d*)(.*)/;
|
||||
|
||||
# loop until we're done the headers
|
||||
while ( $line = shift @lines ) {
|
||||
if ( $line =~ /^(.*?): (.*)/ ) {
|
||||
$resource_attribs{lc($1)} = $2;
|
||||
}
|
||||
}
|
||||
|
||||
$resource_attribs{ERROR_CODE} = $error_code;
|
||||
$resource_attribs{ERROR_MESSAGE} = $error_message;
|
||||
|
||||
# now parse cookies since we're dealing in http
|
||||
if ( $resource_attribs{'set-cookie'} ) {
|
||||
$resource_attribs{'COOKIE'} = _parse_cookie_string( $resource_attribs{'set-cookie'} );
|
||||
}
|
||||
|
||||
# add the headers to the local object
|
||||
$self->{'resource_attribs'} = \%resource_attribs;
|
||||
|
||||
# now set the headers parsed flag
|
||||
$self->{'headers_parsed'} = 1;
|
||||
}
|
||||
|
||||
return \%resource_attribs;
|
||||
}
|
||||
|
||||
sub _parse_cookie_string {
|
||||
#-------------------------------------------------------------------------------
|
||||
my $str = shift;
|
||||
my @cookie = grep $_, map { s/^\s*(.*?)\s*$/$1/; $_ } split /[;]/, $str;
|
||||
|
||||
# as first element of the cookie is a hash
|
||||
my $value = shift @cookie;
|
||||
|
||||
my %segments = map { /^([^=]*)=(.*)$/ ? ( lc($1) => $2 ) : ( $_ => undef ) } @cookie;
|
||||
|
||||
if ( $value =~ /^([^=]*)=(.*)$/ ) {
|
||||
$segments{name} = GT::CGI::unescape($1);
|
||||
$segments{value} = GT::CGI::unescape($2);
|
||||
}
|
||||
else {
|
||||
$segments{name} = GT::CGI::unescape($value);
|
||||
$segments{value} = undef;
|
||||
};
|
||||
|
||||
return \%segments;
|
||||
}
|
||||
|
||||
sub _hash_to_phash {
|
||||
#-------------------------------------------------------------------------------
|
||||
# creates a parameter hash from a standard hash
|
||||
my $hash = shift or return undef;
|
||||
|
||||
my $phash = {
|
||||
map {
|
||||
$_ => [ $$hash{$_} ]
|
||||
} keys %{$hash}
|
||||
};
|
||||
|
||||
return $phash;
|
||||
}
|
||||
|
||||
sub _combine_phash {
|
||||
#-------------------------------------------------------------------------------
|
||||
my ( $a, $b ) = @_;
|
||||
foreach my $key ( %$b ) {
|
||||
my $container = $a->{$key};
|
||||
if ( ref $container ) {
|
||||
push @{$container}, @{$b->{$key}};
|
||||
}
|
||||
else {
|
||||
$a->{$key} = [ $container, @{$b->{$key}} ];
|
||||
}
|
||||
}
|
||||
return $a;
|
||||
}
|
||||
|
||||
sub _create_request {
|
||||
#-------------------------------------------------------------------------------
|
||||
# creates the http request to be submitted to the server
|
||||
my ( $sock, $fh, $get_str, $req, $headers, $method, $tmp );
|
||||
|
||||
my ( $self, $url, $parameters, $host, $port, $dirs, $file, $params, $order ) = @_;
|
||||
|
||||
$method = lc($self->{'request_method'});
|
||||
|
||||
# now handle the request
|
||||
$get_str = uc($method) . ' /' . build_path($dirs,$file);
|
||||
|
||||
# if this is a get request, if there are any parameters we want to pass, handle that
|
||||
if ( $method eq 'get' ) {
|
||||
$tmp = build_parameters( _combine_phash( $params, _hash_to_phash($parameters) ), $order );
|
||||
$get_str.= $tmp ? "?$tmp" : "";
|
||||
}
|
||||
$get_str .= " HTTP/1.0$EOL";
|
||||
|
||||
# handle the host field...,
|
||||
$get_str .= "Host: $host" . ( ( $port == 80 ) ? "" : ":$port" ) . $EOL;
|
||||
|
||||
# handle the useragent field
|
||||
$get_str .= "User-Agent: $self->{agent_name} ($self->{agent_host})" . $EOL;
|
||||
$get_str .= "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*" . $EOL;
|
||||
$get_str .= "Accept-Language: en" . $EOL;
|
||||
$get_str .= "Accept-Charset: iso-8859-1,*,utf-8" . $EOL;
|
||||
$get_str .= "Connection: close" . $EOL;
|
||||
|
||||
# handle cookies
|
||||
if ( my $cookiestr = $self->_create_cookie_request() ) {
|
||||
$get_str .= $cookiestr . $EOL;
|
||||
}
|
||||
|
||||
# handle any extra headers
|
||||
$headers = $self->{'headers'};
|
||||
$get_str .= join(
|
||||
"",
|
||||
map {
|
||||
$_ = ucfirst;
|
||||
"$_: $$headers{$_}$EOL";
|
||||
} keys %{$headers}
|
||||
);
|
||||
|
||||
# handle if the request is a post
|
||||
if ( $method eq 'post' ) {
|
||||
my $request = build_parameters( { %{$params}, %{_hash_to_phash($parameters)} } );
|
||||
$get_str .=
|
||||
"Content-type: application/x-www-form-urlencoded$EOL" .
|
||||
"Content-length: " .length($request)."$EOL$EOL".
|
||||
$request;
|
||||
}
|
||||
|
||||
else {
|
||||
$get_str .= "$EOL$EOL";
|
||||
}
|
||||
|
||||
return $get_str;
|
||||
}
|
||||
|
||||
sub cookie {
|
||||
}
|
||||
|
||||
sub _create_cookie_request {
|
||||
#-------------------------------------------------------------------------------
|
||||
# creates the string the client sends to the server for a cookie
|
||||
my $self = shift;
|
||||
|
||||
my $cookies = $self->{'cookies'};
|
||||
my $str = join "; ", grep $_, map {
|
||||
|
||||
my $value = $cookies->{$_};
|
||||
|
||||
if ( ref $value eq 'GT::CGI::Cookie' ) {
|
||||
GT::CGI::escape($value->{-name}) . "=" . GT::CGI::escape( $value->{-value} );
|
||||
}
|
||||
|
||||
elsif ( ref $value eq 'HASH' ) {
|
||||
GT::CGI::escape($value->{name}) . "=" . GT::CGI::escape( $value->{value} );
|
||||
}
|
||||
|
||||
elsif ( $value and not ref $value ) {
|
||||
GT::CGI::escape($_) . "=" . GT::CGI::escape($value);
|
||||
}
|
||||
|
||||
} keys %{$cookies};
|
||||
|
||||
return $str ? "Cookie: $str" : "";
|
||||
}
|
||||
|
||||
sub method {
|
||||
#-------------------------------------------------------------------------------
|
||||
# sets the acquisition method for parameter quering
|
||||
#
|
||||
my $self = shift;
|
||||
my $method = shift or return $self->{'form_method'};
|
||||
|
||||
if ( lc $method eq 'get' ) {
|
||||
$self->{'form_method'} = lc $method;
|
||||
}
|
||||
|
||||
elsif ( lc $method eq 'post' ) {
|
||||
$self->{'form_method'} = lc $method;
|
||||
}
|
||||
|
||||
elsif ( lc $method eq 'head' ) {
|
||||
$self->{'form_method'} = lc $method;
|
||||
}
|
||||
|
||||
return $self->{'form_method'};
|
||||
}
|
||||
|
||||
sub load_parameter {
|
||||
#-------------------------------------------------------------------------------
|
||||
# loads the passed parameters into the database
|
||||
my ( $self, $params, $key, $parameters );
|
||||
$self = shift;
|
||||
$params = ( ref $_[0] ? $_[0] : { @_ } ) or return undef;
|
||||
$parameters = $self->{'parameters'};
|
||||
|
||||
# add each entry into the object's attribs
|
||||
foreach $key ( keys %{$params} ) {
|
||||
$$parameters{ $key } = $$params{ $key };
|
||||
}
|
||||
|
||||
return $self->{'parameters'};
|
||||
}
|
||||
|
||||
sub delete_parameter {
|
||||
#-------------------------------------------------------------------------------
|
||||
# deletes the named parameters
|
||||
my ( $self, $params, $key, $parameters );
|
||||
$self = shift;
|
||||
$params = ref $_[0] ? $_[0] : [ @_ ];
|
||||
$parameters = $self->{'parameters'};
|
||||
|
||||
# now delete the parameters
|
||||
foreach $key ( @{$params} ) {
|
||||
delete $$parameters{$key};
|
||||
}
|
||||
}
|
||||
|
||||
sub parse_url {
|
||||
#-------------------------------------------------------------------------------
|
||||
# parses the url and converts it into host
|
||||
# assume HTTP/1.0 format
|
||||
#
|
||||
my ( $url, $host, $port, $path, $file, $page, @dirs, $param, %params, $item, @order );
|
||||
$url = shift;
|
||||
|
||||
# first, do the initial parse
|
||||
$url =~ s/^[a-zA-Z]+:\/\///i;
|
||||
$url =~ /([\w.\-]*):?(\d*)\/?([^?]*)\??(.*)/i;
|
||||
|
||||
$host = $1;
|
||||
$port = $2 || 80;
|
||||
$path = $3;
|
||||
$param = $4;
|
||||
|
||||
# now handle the paths
|
||||
$path =~ s/^\s*(.*?)\s*$/$1/;
|
||||
|
||||
# the paths are not simply directories
|
||||
if ( $path !~ /\/$/ ) {
|
||||
$path =~ s/([^\/]*)$//;
|
||||
$file = $1;
|
||||
}
|
||||
|
||||
# handle the directories
|
||||
@dirs = grep $_, split /\//, $path;
|
||||
|
||||
# and finally parse out the parameters if there are any
|
||||
foreach $item ( split /\&/, $param ) {
|
||||
if ( $item =~ /^(.*?)=(.*?)$/ ) {
|
||||
push @order, $1;
|
||||
push @{$params{$1}}, $2;
|
||||
}
|
||||
else {
|
||||
push @order, $item;
|
||||
push @{$params{$item}}, undef;
|
||||
}
|
||||
}
|
||||
|
||||
# now return the collection
|
||||
return ( $host, $port, \@dirs, $file, \%params, \@order );
|
||||
|
||||
}
|
||||
|
||||
sub deparse_url {
|
||||
#-------------------------------------------------------------------------------
|
||||
# takes a number of parameters and builds the request url
|
||||
my ( $url, $host, $port, $path, $file, $page, $dirs, $param, $params, $item );
|
||||
|
||||
( $host, $port, $dirs, $file, $params ) = @_;
|
||||
|
||||
$url = $host;
|
||||
|
||||
# build the host portion
|
||||
if ( $port != 80 ) {
|
||||
$url .= ":80";
|
||||
}
|
||||
|
||||
# now build the page access
|
||||
$url .= "/" . build_path( $dirs, $file );
|
||||
|
||||
# and finally handle the parameters if any
|
||||
if ( $params and keys %{$params} ) {
|
||||
$url .= "?" . build_parameters( $params );
|
||||
}
|
||||
|
||||
return $url;
|
||||
|
||||
}
|
||||
|
||||
sub build_path {
|
||||
#-------------------------------------------------------------------------------
|
||||
# takes a dir array and builds a path
|
||||
my ( $path, $dirs, $page );
|
||||
|
||||
$dirs = shift;
|
||||
$page = shift;
|
||||
|
||||
if ( $dirs ) {
|
||||
$path = join( "/", grep $_, @{$dirs} );
|
||||
$path &&= "$path/";
|
||||
}
|
||||
|
||||
return $path .= $page;
|
||||
|
||||
}
|
||||
|
||||
sub build_parameters {
|
||||
#-------------------------------------------------------------------------------
|
||||
# takes a hash of parameters and builds a string with it
|
||||
my ( $params, $paramstr, @paramlist, $order );
|
||||
|
||||
$params = shift or return undef;
|
||||
$order = shift;
|
||||
|
||||
if ( keys %{$params} ) {
|
||||
|
||||
# Deep copy hash and array refs only.
|
||||
my $tmp;
|
||||
while (my ($k, $v) = each %$params) {
|
||||
if (! ref $v) {
|
||||
$tmp->{$k} = $v;
|
||||
}
|
||||
elsif (ref $v eq 'HASH') {
|
||||
$tmp->{$k} = {};
|
||||
foreach my $k1 (keys %{$params->{$k}}) { $tmp->{$k}->{$k1} = $params->{$k}->{$k1}; }
|
||||
}
|
||||
elsif (ref $v eq 'ARRAY') {
|
||||
$tmp->{$k} = [];
|
||||
foreach my $v1 (@{$params->{$k}}) { push @{$tmp->{$k}}, $v1; }
|
||||
}
|
||||
else { $tmp->{$k} = $v; }
|
||||
}
|
||||
|
||||
if ( $order ) {
|
||||
|
||||
foreach my $key ( @$order ) {
|
||||
my $value = shift @{$tmp->{$key} || []};
|
||||
if ( $tmp->{$key} and not @{$tmp->{$key}} ) {
|
||||
delete $tmp->{$key}
|
||||
}
|
||||
$key = GT::CGI::escape( $key );
|
||||
push @paramlist, ( defined $value ? "$key=$value" : $key );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
require GT::CGI;
|
||||
$paramstr = join "&", ( @paramlist,
|
||||
map {
|
||||
my $key = GT::CGI::escape($_);
|
||||
my $list = $tmp->{$_};
|
||||
|
||||
( ref $list eq 'ARRAY' ) ?
|
||||
join "&",
|
||||
map {
|
||||
"$key" . ( defined $_ ? "=". GT::CGI::escape($_) : '' );
|
||||
} @{$list}
|
||||
:
|
||||
"$key=$list";
|
||||
|
||||
} keys %{$tmp} );
|
||||
}
|
||||
|
||||
return $paramstr;
|
||||
}
|
||||
|
||||
sub get {
|
||||
#-------------------------------------------------------------------------------
|
||||
# simple blocking method to get some data
|
||||
my ( $uri, $options, $HTTP, $doc );
|
||||
|
||||
shift; # get rid of the first parameter
|
||||
$uri = shift or return undef;
|
||||
$options = ( @_ and ref $_[0] ) ? shift : {@_};
|
||||
|
||||
$HTTP = new GT::URI::HTTP($options);
|
||||
$HTTP->fetch($uri);
|
||||
|
||||
while ( not defined( $doc = $HTTP->do_iteration(-1) ) ) {};
|
||||
|
||||
return $doc;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
GT::URI::HTTP - HTTP request broker.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use GT::URI::HTTP;
|
||||
|
||||
print GT::URI::HTTP->get( "http://www.gossamer-threads.com" );
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
GT::URI::HTTP makes requests and retrieves resources from http servers (not
|
||||
limited to text). Can be used stand-alone or through GT::URI
|
||||
|
||||
=head1 Method List
|
||||
|
||||
Socket Handling
|
||||
|
||||
sub pending() Returns true if data awaiting
|
||||
sub EOF() Returns open/closed status of socket
|
||||
sub gulp_read() Alias to do_iteration
|
||||
sub do_iteration() Basic looping function that downloads resources in the background
|
||||
|
||||
Acquisition
|
||||
|
||||
sub fetch() Tell the object which URL to acquire
|
||||
sub method() The method of acquisition
|
||||
sub load_parameter() Add a item for CGI parameters
|
||||
sub delete_parameter() Delete a CGI parameter
|
||||
sub resource_attrib() Headers related to resource and server
|
||||
sub get() Simple resource aquisition function
|
||||
|
||||
Support Methods (must be imported)
|
||||
|
||||
sub parse_url() Decomposes a URL into constituent parts
|
||||
sub deparse_url() Takes those parts and builds an URL
|
||||
sub build_path() Takes a list of directories and builds a path
|
||||
sub build_parameters() Takes a hash of parameter->values and builds a CGI request string
|
||||
|
||||
=head1 Basics
|
||||
|
||||
=head2 Getting a resource, the simple way
|
||||
|
||||
|
||||
Just want a single item? Call GT::URI::HTTP->get and all the magic will be done
|
||||
for you.
|
||||
|
||||
use GT::URI::HTTP;
|
||||
my $buf = GT::URI::HTTP->get( "http://www.gossamer-threads.com/" );
|
||||
|
||||
Get based requests are permissable as well:
|
||||
|
||||
use GT::URI::HTTP;
|
||||
my $buf = GT::URI::HTTP->get( "http://search.yahoo.com/bin/search?p=gossamer+threads" );
|
||||
|
||||
If extra options need to be set, simply append the options to the parameter
|
||||
list, like follows.
|
||||
|
||||
use GT::URI::HTTP;
|
||||
my $buf = GT::URI::HTTP->get( "http://search.yahoo.com/bin/search?p=gossamer+threads", { request_method => 'POST' } );
|
||||
|
||||
=head2 When just the document is not enough
|
||||
|
||||
|
||||
If a new GT::URI::HTTP object is instantiated, much more control is available,
|
||||
including facilities for non-blocking downloading of pages.
|
||||
|
||||
To create a GT::URI::HTTP object, call new with all the options required:
|
||||
|
||||
use GT::URI::HTTP;
|
||||
|
||||
my $http = new GT::URI::HTTP(
|
||||
|
||||
# URL to acquire (optional)
|
||||
'URL' => '',
|
||||
|
||||
# Can also be set to POST/GET/HEAD (optional)
|
||||
'request_method' => 'GET',
|
||||
|
||||
# a hash of keys pointing to an arrayref of values to be sent to the server
|
||||
# {
|
||||
# 'key' => [ 'value1', 'value2'... ],
|
||||
# }
|
||||
# (optional)
|
||||
'parameters' => {},
|
||||
|
||||
# Name portion of the User-Agent: string the server acquires (optional)
|
||||
'agent_name' => 'Mozilla/4.73 [en]',
|
||||
|
||||
# Host-from portion of the User-Agent: string the server acquires (optional)
|
||||
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
|
||||
|
||||
# To prevent downloading of 80Tb files, but if you still wanted to, set this to 0 (optional)
|
||||
'max_down' => 200000
|
||||
|
||||
);
|
||||
|
||||
If URL has been specified in the options, for interactions with a CGI, you can
|
||||
set extra parameters with $http->load_parameter(). Finally, loop on
|
||||
$http->do_iteration() until the value is defined. To replicate the "simple get"
|
||||
example:
|
||||
|
||||
use GT::URI::HTTP;
|
||||
$|++;
|
||||
|
||||
my $http = new GT::URI::HTTP(
|
||||
|
||||
URL => 'http://search.yahoo.com/bin/search',
|
||||
|
||||
# can also use the following:
|
||||
parameters => {
|
||||
'p' => [ 'gossamer threads' ]
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
my $doc;
|
||||
while ( not defined( $doc = $http->do_iteration() ) ) {
|
||||
# do something here while waiting for the resource to arrive
|
||||
print "."
|
||||
}
|
||||
|
||||
print $doc, "\n\n";
|
||||
|
||||
Beyond the resource, the http server often supplies extra information in a
|
||||
header. To access this information, use $http->resource_attrib().
|
||||
|
||||
Appending this code to the previous example, a list of all the associated server
|
||||
headers can be seen:
|
||||
|
||||
my $attribs = $http->resource_attrib();
|
||||
foreach my $key ( sort keys %{$attribs} ) {
|
||||
print "$key => $attribs->{$key}\n";
|
||||
}
|
||||
|
||||
=head2 Support Methods
|
||||
|
||||
In addition to the basic fetching abilities, since the module must parse HTTP
|
||||
URLs, the methods used to do so have been made public.
|
||||
|
||||
These methods decompose URLs into datastructures that make URLs easily studied
|
||||
or modified and then reconstructed.
|
||||
|
||||
However, these routines have not been polished for useability so beware! The
|
||||
following is a very basic example:
|
||||
|
||||
use GT::URI::HTTP qw/ parse_url deparse_url build_path build_parameters /;
|
||||
|
||||
# fragment the URL
|
||||
my ( $host, $port, $dirs, $file, $params ) = parse_url( 'http://www.gossamer-threads.com/perl/forum/showflat.pl?Cat=&Board=GosDisc&Number=113355&page=0&view=' );
|
||||
|
||||
print "Parsed Data:\n\n";
|
||||
print "Host: $host\n";
|
||||
print "Port: $port\n";
|
||||
print "Dirs:\n";
|
||||
foreach my $dir ( @{$dirs} ) {
|
||||
print " $dir/\n";
|
||||
}
|
||||
print "Resource Filename: $file\n";
|
||||
print "Params:\n";
|
||||
foreach my $key ( sort keys %{$params} ) {
|
||||
print " $key: ";
|
||||
my $values = ( $params->{$key} || {} );
|
||||
foreach my $value ( sort @{$values} ) {
|
||||
print "'", quotemeta($value), "' ";
|
||||
}
|
||||
print "\n";
|
||||
}
|
||||
|
||||
# put the data back together again
|
||||
my $url = deparse_url( $host, $port, $dirs, $file, $params );
|
||||
print "\nDeparsed Data:\n\n";
|
||||
print "URL: http://$url\n";
|
||||
|
||||
=head1 Methods List
|
||||
|
||||
=head2 build_path ( dir ARRAYREF, [ page STRING ] ) : STRING
|
||||
|
||||
Takes an array ref of directory names and an optional filename and returns a
|
||||
filepath.
|
||||
|
||||
use GT::URI::HTTP qw/ build_path /;
|
||||
print build_path( [ 'topdir', 'middir', 'bottomdir' ], 'file.html' );
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 build_parameters ( parameter HASHREF ) : STRING
|
||||
|
||||
Builds a CGI request string from list of keys and values. The function has the
|
||||
ability to handle keys with more than one parameter, simply use an arrayref with
|
||||
multiple values.
|
||||
|
||||
use GT::URI::HTTP qw/ build_parameters /;
|
||||
my $params = {
|
||||
'simplekey' => 'value'
|
||||
'onekey' => [ 'one value' ],
|
||||
'anotherkey' => [ 'another value', 'and yet anotherone!' ],
|
||||
};
|
||||
print build_parameters($params);
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 delete_parameter ( keys ARRAYREF/ARRAY )
|
||||
|
||||
When loading the object with parameters before a request, it is possible to
|
||||
delete an entire set of keys and values.
|
||||
|
||||
=head2 deparse_url ( host STRING, [ port STRING, [ dirs ARRAYREF, [ file STRING, [ params HASH ] ] ] ] ) : STRING
|
||||
|
||||
This builds an entire URL from basic parameters.
|
||||
|
||||
For an example of this function, see the example in "Support Methods".
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 do_iteration ( tics INTEGER ) : STRING
|
||||
|
||||
The basic iteration function. This function will return undef until the resource
|
||||
is received which, upon receipt will return the resource data.
|
||||
|
||||
The function can return an empty string, so it is important to checked
|
||||
defined'ness. If the return is an empty string, check the ERROR_CODE in
|
||||
resource_attrib to find out if the script simply can't connect to the host or
|
||||
the resource is empty.
|
||||
|
||||
=head2 EOF () : BOOLEAN
|
||||
|
||||
Returns '1' or '0' depending if the object has stopped receiving/sending data to
|
||||
the remote server.
|
||||
|
||||
=head2 fetch ( url STRING, [ parameters HASHREF ] )
|
||||
|
||||
Tells the server the URL to retreive the resource of. If CGI parameters are
|
||||
required pass in a hash of keys and values.
|
||||
|
||||
=head2 GT::URI::HTTP->get ( url, [ options HASH/HASHREF ] ) : RESOURCE_DATA
|
||||
|
||||
Simplest resource aquision method. Give it the URL and any options and the
|
||||
function will return after the resource has been downloaded.
|
||||
|
||||
=head2 gulp_read ( tic INTEGER ) : RESOURCE_DATA
|
||||
|
||||
This is just an alias to the function do_iteration. This method is used by
|
||||
GT::URI in its mass resource aquisition runs.
|
||||
|
||||
Unless you feel like being different, you shouldn't need to use this.
|
||||
|
||||
=head2 load_parameter ( params HASH/HASHREF ) : HASHREF
|
||||
|
||||
Takes a list of keys and values and loads the values into the list of CGI
|
||||
parameters to be sent to the remote server.
|
||||
|
||||
=head2 method ( method STRING ) : STRING
|
||||
|
||||
Sets the acquisition method for the resource. Currently, GET/POST/HEAD are
|
||||
supported.
|
||||
|
||||
If no parameters are supplied the function simply returns the current
|
||||
acquisition method.
|
||||
|
||||
=head2 parse_url ( url STRING ) : host STRING, port INTEGER, dirs ARRAYREF,
|
||||
file STRING, params HASHREF
|
||||
|
||||
Takes an URL and decomposes it into easily manipulated datastructures. The
|
||||
output can be fed back into deparse_url to reconstruct an URL.
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 pending ( tics INTEGER ) : BOOLEAN
|
||||
|
||||
If there is data available to be downloaded, this function returns '1',
|
||||
otherwise '0'. This is another function used by GT::URI in it's mass downloads
|
||||
and unlikely to be of any use to anyone using this module directly. This
|
||||
function exists because it is lighter than do_iteration which can be quite a
|
||||
load as opposed to this if there were 100 racked downloads, all being polled
|
||||
every tenth of a second!
|
||||
|
||||
=head2 resource_attrib ( [ key STRING ] ) STRING or HASHREF
|
||||
|
||||
If a key is requested, function will return the value associated with the
|
||||
resource attribute. If not, the function will return a hashref keyed by server
|
||||
parameter to its corresponding value.
|
||||
|
||||
All the server keys have been converted into lower-case. This prevents conflict
|
||||
with two very important keys, ERROR_CODE, and ERROR_MESSAGE, which carry the
|
||||
HTTP error code and message associated with the aquisition of this page.
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 HTTP get example
|
||||
|
||||
#!/usr/bin/perl
|
||||
|
||||
use GT::URI::HTTP;
|
||||
|
||||
if ( not @ARGV ) {
|
||||
print qq!
|
||||
SYNOPSIS
|
||||
|
||||
$0 url [-f/-h] [ cgi_parameter1=value1 cgi_parameter2=value2 ... ]
|
||||
|
||||
basic HTTP requestor
|
||||
|
||||
OPTIONS
|
||||
|
||||
-f : full information; headers and resource. Usually only a dump of the resource is provided.
|
||||
-h : just the headers\n\n!;
|
||||
exit;
|
||||
}
|
||||
|
||||
# parse out the command line
|
||||
# first argument, URL
|
||||
$url = shift @ARGV;
|
||||
|
||||
# next arguments, parameters
|
||||
foreach my $item ( @ARGV ) {
|
||||
|
||||
# ... check for special requests
|
||||
if ( $item =~ /^-([fd])$/ ) {
|
||||
$mode = $1;
|
||||
}
|
||||
|
||||
# ... is not a special request, but probably a parameter
|
||||
( $key, $value ) = ( $item =~ /([^=]+)=(.*)/ );
|
||||
$key ||= $item;
|
||||
push @{$parameters->{$key}}, $value;
|
||||
}
|
||||
|
||||
# setup and send the request
|
||||
$http = new GT::URI::HTTP(
|
||||
|
||||
# if we're only looking to use the head
|
||||
request_method => ( $mode eq 'h' ? 'HEAD' : 'GET' )
|
||||
|
||||
);
|
||||
$http->fetch( $url, $parameters );
|
||||
|
||||
# get the resource
|
||||
while ( not defined ( $doc = $http->do_iteration(-1) ) ) {}
|
||||
|
||||
# and print out the headers if wanted
|
||||
if ( $mode ) {
|
||||
$headers = $http->resource_attrib();
|
||||
foreach $key ( sort keys %{$headers || {}} ) {
|
||||
print "$key: $headers->{$key}\n";
|
||||
}
|
||||
print "\n";
|
||||
}
|
||||
|
||||
# and output the resource...
|
||||
print $doc;
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
||||
http://www.gossamer-threads.com/
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
Revision: $Id: HTTP.pm,v 1.30 2002/06/27 18:36:02 aki Exp $
|
||||
|
||||
=cut
|
||||
|
601
site/slowtwitch.com/cgi-bin/articles/admin/GT/URI/HTTPS.pm
Normal file
601
site/slowtwitch.com/cgi-bin/articles/admin/GT/URI/HTTPS.pm
Normal file
@ -0,0 +1,601 @@
|
||||
# ==================================================================
|
||||
# Gossamer Threads Module Library
|
||||
#
|
||||
# GT::URI::HTTP
|
||||
# Author : Aki Mimoto (support@gossamer-threads.com)
|
||||
# CVS Info : 087,071,086,086,085
|
||||
# $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $
|
||||
#
|
||||
# Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
||||
# ==================================================================
|
||||
#
|
||||
# Description:
|
||||
# Gets HTTP data
|
||||
#
|
||||
|
||||
package GT::URI::HTTPS;
|
||||
# ===============================================================
|
||||
|
||||
|
||||
use strict;
|
||||
use lib '..';
|
||||
use GT::Socket;
|
||||
use GT::Base;
|
||||
use GT::URI::HTTP qw/ build_path build_parameters /;
|
||||
use Net::SSLeay qw(die_now die_if_ssl_error) ;
|
||||
use Exporter ();
|
||||
use vars qw/$ATTRIBS $EOL @ISA $DEBUG @EXPORT_OK $ERRORS/;
|
||||
|
||||
$ERRORS = {
|
||||
'NOSSLCTX' => 'Failed to create SSL_CTX: %s',
|
||||
'NOSSL' => 'Failed to create SSL: %s',
|
||||
'CTXOPTIONS' => 'Failed to set SSL CTX options: %s',
|
||||
'NOSSLCONNECT' => 'Failed to connect ssl',
|
||||
'CANTCONN' => 'Cannot connect to server'
|
||||
};
|
||||
|
||||
@ISA = ('GT::URI::HTTP', 'GT::Base', 'Exporter');
|
||||
|
||||
$EOL = "\015\012";
|
||||
$DEBUG = 0;
|
||||
|
||||
$ATTRIBS = {
|
||||
'URL' => '',
|
||||
'request_method' => 'GET',
|
||||
'parameters' => {},
|
||||
'agent_name' => 'Mozilla/4.73 [en]',
|
||||
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
|
||||
'headers' => {},
|
||||
'cookies' => {},
|
||||
'resource_attribs' => {},
|
||||
'resource_data' => undef,
|
||||
'max_down' => 200000,
|
||||
'debug' => 0,
|
||||
'ctx' => undef,
|
||||
'ssl' => undef,
|
||||
'cipher' => undef,
|
||||
};
|
||||
|
||||
@EXPORT_OK = qw/ parse_url deparse_url build_path build_parameters /;
|
||||
|
||||
Net::SSLeay::load_error_strings();
|
||||
Net::SSLeay::SSLeay_add_ssl_algorithms();
|
||||
Net::SSLeay::randomize();
|
||||
|
||||
sub pending {
|
||||
#-------------------------------------------------------------------------------
|
||||
# return true if there is some data to be picked up
|
||||
my $self = shift;
|
||||
my $tics = shift;
|
||||
return if ( $self->{resource_downloaded} );
|
||||
my $sock = $self->{'sock'} or return 1;
|
||||
return $sock->pending();
|
||||
}
|
||||
|
||||
sub EOF {
|
||||
#-------------------------------------------------------------------------------
|
||||
# return true if the system has finished downloading
|
||||
my $self = shift;
|
||||
my $sock = $self->{'sock'};
|
||||
return ( $sock and $sock->fh() ? $self->{resource_downloaded} : 1 );
|
||||
}
|
||||
|
||||
sub do_iteration {
|
||||
#-------------------------------------------------------------------------------
|
||||
# useful in a loop, this is a generally non blocking way to get a page
|
||||
# it will return nulls until it gets the page (and returns the page)
|
||||
#
|
||||
my ( $self, $fh, $sock, $pause );
|
||||
$self = shift;
|
||||
$pause = shift;
|
||||
$sock = $self->{'sock'};
|
||||
|
||||
# if we haven't done the request yet
|
||||
if ( not( $self->{'resource_downloaded'} or $sock ) and $self->{URL} ) {
|
||||
$self->fetch() or return '';
|
||||
$sock = $self->{'sock'};
|
||||
return;
|
||||
}
|
||||
|
||||
# if we can't connect
|
||||
if ( $self->{'flag_err'} ) {
|
||||
return '';
|
||||
}
|
||||
|
||||
$sock->fh() or return;
|
||||
# only if the resource hasn't been downloaded yet..,
|
||||
if ( !$self->{'resource_downloaded'} ) {
|
||||
# if there's data pending, download it!
|
||||
if ( $sock->pending( $pause ) ) {
|
||||
# get as much data as we can
|
||||
my $ssl = $self->{'ssl'};
|
||||
my $buf = Net::SSLeay::read($ssl);
|
||||
if ( $buf ) {
|
||||
$self->{'resource_data'} .= $buf;
|
||||
# if we need to parse the server headers
|
||||
$self->_parse_server_headers();
|
||||
}
|
||||
# we've run out of data, let's stop
|
||||
else {
|
||||
$self->{resource_downloaded} = 1;
|
||||
return $self->{'resource_data'};
|
||||
Net::SSLeay::free ($ssl);
|
||||
Net::SSLeay::CTX_free ($self->{'ctx'});
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
else {
|
||||
return $self->{'resource_data'};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
sub fetch {
|
||||
#-------------------------------------------------------------------------------
|
||||
# fetches a page and handles CGI if requested
|
||||
#
|
||||
my ( $self, $url, $parameters, $sock, $request_str, $fh );
|
||||
|
||||
$self = shift;
|
||||
$url = shift || $self->{'URL'} or return undef;
|
||||
$parameters = shift || $self->{'parameters'};
|
||||
|
||||
# save the headers
|
||||
$self->{'URL'} = $url;
|
||||
$self->{'parameters'} = $parameters;
|
||||
|
||||
my ($host, $port, $dirs, $file, $params) = parse_url($url);
|
||||
|
||||
# see if we can connect to the host
|
||||
$sock = GT::Socket->open({
|
||||
'host' => $host,
|
||||
'port' => $port,
|
||||
'_debug' => $self->{_debug},
|
||||
'debug' => $self->{debug},
|
||||
'max_down' => $self->{max_down}
|
||||
});
|
||||
|
||||
# submit the request to the host
|
||||
if ( $sock ) {
|
||||
$fh = $sock->fh();
|
||||
|
||||
# The network connection is now open, lets fire up SSL
|
||||
my $ctx = $self->{'ctx'} = Net::SSLeay::CTX_new() or return $self->error("NOSSLCTX", 'WARN', $!);
|
||||
Net::SSLeay::CTX_set_options($ctx, &Net::SSLeay::OP_ALL) and return $self->error("CTXOPTIONS", 'WARN');
|
||||
my $ssl = $self->{'ssl'} = Net::SSLeay::new($ctx) or return $self->error("NOSSL", 'WARN', $!);
|
||||
Net::SSLeay::set_fd($ssl, fileno($fh));
|
||||
my $res = $self->{'res'} = Net::SSLeay::connect($ssl);# and return $self->error( "NOSSLCONNECT", 'WARN' );
|
||||
|
||||
$self->{'cipher'} = Net::SSLeay::get_cipher($ssl);
|
||||
|
||||
$self->debug( "Connected to $url" ) if ($self->{_debug});
|
||||
$request_str = $self->_create_request( $url, $parameters, $host, $port, $dirs, $file, $params );
|
||||
$self->debug( "Sent request:\n$request_str" ) if ($self->{_debug});
|
||||
$res = Net::SSLeay::write($ssl, $request_str);
|
||||
}
|
||||
else {
|
||||
$self->debug( "Couldn't connect to $url" ) if ($self->{_debug});
|
||||
$self->{resource_attribs} = {
|
||||
ERROR_CODE => 'CANTCONN',
|
||||
ERROR_MESSAGE => 'Cannot connect to server'
|
||||
};
|
||||
$self->{resource_downloaded} = 1;
|
||||
$self->{flag_err} = 1;
|
||||
}
|
||||
|
||||
# store the socket
|
||||
return($self->{'sock'} = $sock);
|
||||
}
|
||||
|
||||
sub parse_url {
|
||||
#-------------------------------------------------------------------------------
|
||||
# parses the url and converts it into host
|
||||
# assume HTTP/1.0 format
|
||||
#
|
||||
my ( $url, $host, $port, $path, $file, $page, @dirs, $param, %params, $item );
|
||||
$url = shift;
|
||||
|
||||
# first, do the initial parse
|
||||
$url =~ s/^[a-zA-Z]+:\/\///i;
|
||||
($host, $port, $path, $param) = $url =~ /([\w.\-]*):?(\d*)\/?([^?]*)\??(.*)/;
|
||||
$port ||= 443;
|
||||
|
||||
# now handle the paths
|
||||
$path =~ s/^\s*(.*?)\s*$/$1/;
|
||||
|
||||
# the paths are not simply directories
|
||||
if ( $path !~ /\/$/ ) {
|
||||
$path =~ s/([^\/]*)$//;
|
||||
$file = $1;
|
||||
}
|
||||
|
||||
# handle the directories
|
||||
@dirs = grep $_, split '/', $path;
|
||||
|
||||
# and finally parse out the parameters if there are any
|
||||
foreach $item ( split /&/, $param ) {
|
||||
if ( $item =~ /^(.*?)=(.*)$/ ) {
|
||||
push @{$params{$1}}, $2;
|
||||
}
|
||||
else {
|
||||
push @{$params{$item}}, undef;
|
||||
}
|
||||
}
|
||||
|
||||
# now return the collection
|
||||
return( $host, $port, \@dirs, $file, \%params );
|
||||
|
||||
}
|
||||
|
||||
sub deparse_url {
|
||||
#-------------------------------------------------------------------------------
|
||||
# takes a number of parameters and builds the request url
|
||||
my ( $url, $host, $port, $path, $file, $page, $dirs, $param, $params, $item );
|
||||
|
||||
( $host, $port, $dirs, $file, $params ) = @_;
|
||||
|
||||
$url = $host;
|
||||
|
||||
# build the host portion
|
||||
$url .= ":$port" unless ($port == 443);
|
||||
|
||||
# now build the page access
|
||||
$url .= "/" . build_path( $dirs, $file );
|
||||
|
||||
# and finally handle the parameters if any
|
||||
if ( $params and keys %{$params} ) {
|
||||
$url .= "?" . build_parameters( $params );
|
||||
}
|
||||
|
||||
return $url;
|
||||
|
||||
}
|
||||
|
||||
sub get {
|
||||
#-------------------------------------------------------------------------------
|
||||
# simple blocking method to get some data
|
||||
my ( $uri, $options, $HTTPS, $doc );
|
||||
|
||||
shift; # get rid of the first parameter
|
||||
$uri = shift or return undef;
|
||||
$options = ( @_ and ref $_[0] ) ? shift : { @_ };
|
||||
$HTTPS = new GT::URI::HTTPS($options);
|
||||
$HTTPS->fetch($uri);
|
||||
|
||||
while ( not defined( $doc = $HTTPS->do_iteration(0) ) ) {};
|
||||
|
||||
return $doc;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
GT::URI::HTTPS - HTTPS request broker. Can be used stand-alone or through GT::URI
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use GT::URI::HTTPS;
|
||||
|
||||
print GT::URI::HTTPS->get( "http://www.gossamer-threads.com" );
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
GT::URI::HTTPS makes requests and retrieves resources from http servers (not limited to text).
|
||||
|
||||
=head1 Method List
|
||||
|
||||
Socket Handling
|
||||
|
||||
sub pending() Returns true if data awaiting
|
||||
sub EOF() Returns open/closed status of socket
|
||||
sub gulp_read() Alias to do_iteration
|
||||
sub do_iteration() Basic looping function that downloads resources in the background
|
||||
|
||||
Acquisition
|
||||
|
||||
sub fetch() Tell the object which URL to acquire
|
||||
sub method() The method of acquisition
|
||||
sub load_parameter() Add a item for CGI parameters
|
||||
sub delete_parameter() Delete a CGI parameter
|
||||
sub resource_attrib() Headers related to resource and server
|
||||
sub get() Simple resource aquisition function
|
||||
|
||||
Support Methods (must be imported)
|
||||
|
||||
sub parse_url() Decomposes a URL into constituent parts
|
||||
sub deparse_url() Takes those parts and builds an URL
|
||||
sub build_path() Takes a list of directories and builds a path
|
||||
sub build_parameters() Takes a hash of parameter->values and builds a CGI request string
|
||||
|
||||
=head1 Basics
|
||||
|
||||
=head2 Getting a resource, the simple way
|
||||
|
||||
|
||||
Just want a single item? Call GT::URI::HTTPS->get and all the magic will be done for you.
|
||||
|
||||
use GT::URI::HTTPS;
|
||||
my $buf = GT::URI::HTTPS->get( "http://www.gossamer-threads.com/" );
|
||||
|
||||
Get based requests are permissable as well:
|
||||
|
||||
use GT::URI::HTTPS;
|
||||
my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads" );
|
||||
|
||||
If extra options need to be set, simply append the options to the parameter list, like follows.
|
||||
|
||||
use GT::URI::HTTPS;
|
||||
my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads", { request_method => 'POST' } );
|
||||
|
||||
=head2 When just the document is not enough
|
||||
|
||||
|
||||
If a new GT::URI::HTTPS object is instantiated, much more control is available, including facilities for non-blocking downloading of pages.
|
||||
|
||||
To create a GT::URI::HTTPS object, call new with all the options required:
|
||||
|
||||
use GT::URI::HTTPS;
|
||||
|
||||
my $http = new GT::URI::HTTPS(
|
||||
|
||||
# URL to acquire (optional)
|
||||
'URL' => '',
|
||||
|
||||
# Can also be set to POST/GET/HEAD (optional)
|
||||
'request_method' => 'GET',
|
||||
|
||||
# a hash of keys pointing to an arrayref of values to be sent to the server
|
||||
# {
|
||||
# 'key' => [ 'value1', 'value2'... ],
|
||||
# }
|
||||
# (optional)
|
||||
'parameters' => {},
|
||||
|
||||
# Name portion of the User-Agent: string the server acquires (optional)
|
||||
'agent_name' => 'Mozilla/4.73 [en]',
|
||||
|
||||
# Host-from portion of the User-Agent: string the server acquires (optional)
|
||||
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
|
||||
|
||||
# To prevent downloading of 80Tb files, but if you still wanted to, set this to 0 (optional)
|
||||
'max_down' => 200000
|
||||
|
||||
);
|
||||
|
||||
If URL has been specified in the options, for interactions with a CGI, you can set extra parameters with $http->load_parameter(). Finally, loop on $http->do_iteration() until the value is defined. To replicate the "simple get" example:
|
||||
|
||||
use GT::URI::HTTPS;
|
||||
$|++;
|
||||
|
||||
my $http = new GT::URI::HTTPS(
|
||||
|
||||
URL => 'http://search.yahoo.com/bin/search',
|
||||
|
||||
# can also use the following:
|
||||
parameters => {
|
||||
'p' => [ 'gossamer threads' ]
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
my $doc;
|
||||
while ( not defined( $doc = $http->do_iteration() ) ) {
|
||||
# do something here while waiting for the resource to arrive
|
||||
print "."
|
||||
}
|
||||
|
||||
print $doc, "\n\n";
|
||||
|
||||
Beyond the resource, the http server often supplies extra information in a header. To access this information, use $http->resource_attrib().
|
||||
|
||||
Appending this code to the previous example, a list of all the associated server headers can be seen:
|
||||
|
||||
my $attribs = $http->resource_attrib();
|
||||
foreach my $key ( sort keys %{$attribs} ) {
|
||||
print "$key => $attribs->{$key}\n";
|
||||
}
|
||||
|
||||
=head2 Support Methods
|
||||
|
||||
In addition to the basic fetching abilities, since the module must parse HTTPS URLs, the methods used to do so have been made public.
|
||||
|
||||
These methods decompose URLs into datastructures that make URLs easily studied or modified and then reconstructed.
|
||||
|
||||
However, these routines have not been polished for useability so beware! The following is a very basic example:
|
||||
|
||||
use GT::URI::HTTPS qw/ parse_url deparse_url build_path build_parameters /;
|
||||
|
||||
# fragment the URL
|
||||
my ( $host, $port, $dirs, $file, $params ) = parse_url( 'http://www.gossamer-threads.com/perl/forum/showflat.pl?Cat=&Board=GosDisc&Number=113355&page=0&view=' );
|
||||
|
||||
print "Parsed Data:\n\n";
|
||||
print "Host: $host\n";
|
||||
print "Port: $port\n";
|
||||
print "Dirs:\n";
|
||||
foreach my $dir ( @{$dirs} ) {
|
||||
print " $dir/\n";
|
||||
}
|
||||
print "Resource Filename: $file\n";
|
||||
print "Params:\n";
|
||||
foreach my $key ( sort keys %{$params} ) {
|
||||
print " $key: ";
|
||||
my $values = ( $params->{$key} || {} );
|
||||
foreach my $value ( sort @{$values} ) {
|
||||
print "'", quotemeta($value), "' ";
|
||||
}
|
||||
print "\n";
|
||||
}
|
||||
|
||||
# put the data back together again
|
||||
my $url = deparse_url( $host, $port, $dirs, $file, $params );
|
||||
print "\nDeparsed Data:\n\n";
|
||||
print "URL: http://$url\n";
|
||||
|
||||
=head1 Methods List
|
||||
|
||||
=head2 build_path ( dir ARRAYREF, [ page STRING ] ) : STRING
|
||||
|
||||
Takes an array ref of directory names and an optional filename and returns a filepath.
|
||||
|
||||
use GT::URI::HTTPS qw/ build_path /;
|
||||
print build_path( [ 'topdir', 'middir', 'bottomdir' ], 'file.html' );
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 build_parameters ( parameter HASHREF ) : STRING
|
||||
|
||||
Builds a CGI request string from list of keys and values. The function has the ability to handle keys with more than one parameter, simply use an arrayref with multiple values.
|
||||
|
||||
use GT::URI::HTTPS qw/ build_parameters /;
|
||||
my $params = {
|
||||
'simplekey' => 'value'
|
||||
'onekey' => [ 'one value' ],
|
||||
'anotherkey' => [ 'another value', 'and yet anotherone!' ],
|
||||
};
|
||||
print build_parameters($params);
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 delete_parameter ( keys ARRAYREF/ARRAY )
|
||||
|
||||
When loading the object with parameters before a request, it is possible to delete an entire set of keys and values.
|
||||
|
||||
=head2 deparse_url ( host STRING, [ port STRING, [ dirs ARRAYREF, [ file STRING, [ params HASH ] ] ] ] ) : STRING
|
||||
|
||||
This builds an entire URL from basic parameters.
|
||||
|
||||
For an example of this function, see the example in "Support Methods".
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 do_iteration ( tics INTEGER ) : STRING
|
||||
|
||||
The basic iteration function. This function will return undef until the resource is received which, upon receipt will return the resource data.
|
||||
|
||||
The function can return an empty string, so it is important to checked defined'ness. If the return is an empty string, check the ERROR_CODE in resource_attrib to find out if the script simply can't connect to the host or the resource is empty.
|
||||
|
||||
=head2 EOF () : BOOLEAN
|
||||
|
||||
Returns '1' or '0' depending if the object has stopped receiving/sending data to the remote server.
|
||||
|
||||
=head2 fetch ( url STRING, [ parameters HASHREF ] )
|
||||
|
||||
Tells the server the URL to retreive the resource of. If CGI parameters are required pass in a hash of keys and values.
|
||||
|
||||
=head2 GT::URI::HTTPS->get ( url, [ options HASH/HASHREF ] ) : RESOURCE_DATA
|
||||
|
||||
Simplest resource aquision method. Give it the URL and any options and the function will return after the resource has been downloaded.
|
||||
|
||||
=head2 gulp_read ( tic INTEGER ) : RESOURCE_DATA
|
||||
|
||||
This is just an alias to the function do_iteration. This method is used by GT::URI in its mass resource aquisition runs.
|
||||
|
||||
Unless you feel like being different, you shouldn't need to use this.
|
||||
|
||||
=head2 load_parameter ( params HASH/HASHREF ) : HASHREF
|
||||
|
||||
Takes a list of keys and values and loads the values into the list of CGI parameters to be sent to the remote server.
|
||||
|
||||
=head2 method ( method STRING ) : STRING
|
||||
|
||||
Sets the acquisition method for the resource. Currently, GET/POST/HEAD are supported.
|
||||
|
||||
If no parameters are supplied the function simply returns the current acquisition method.
|
||||
|
||||
=head2 parse_url ( url STRING ) : host STRING, port INTEGER, dirs ARRAYREF, file STRING, params HASHREF
|
||||
|
||||
Takes an URL and decomposes it into easily manipulated datastructures. The output can be fed back into deparse_url to reconstruct an URL.
|
||||
|
||||
This function must be imported.
|
||||
|
||||
=head2 pending ( tics INTEGER ) : BOOLEAN
|
||||
|
||||
If there is data available to be downloaded, this function returns '1', otherwise '0'. This is another function used by GT::URI in it's mass downloads and unlikely to be of any use to anyone using this module directly. This function exists because it is lighter than do_iteration which can be quite a load as opposed to this if there were 100 racked downloads, all being polled every tenth of a second!
|
||||
|
||||
=head2 resource_attrib ( [ key STRING ] ) STRING or HASHREF
|
||||
|
||||
If a key is requested, function will return the value associated with the resource attribute. If not, the function will return a hashref keyed by server parameter to its corresponding value.
|
||||
|
||||
All the server keys have been converted into lower-case. This prevents conflict with two very important keys, ERROR_CODE, and ERROR_MESSAGE, which carry the HTTPS error code and message associated with the aquisition of this page.
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 HTTPS get example
|
||||
|
||||
#!/usr/bin/perl
|
||||
|
||||
use GT::URI::HTTPS;
|
||||
|
||||
if ( not @ARGV ) {
|
||||
print qq!
|
||||
SYNOPSIS
|
||||
|
||||
$0 url [-f/-h] [ cgi_parameter1=value1 cgi_parameter2=value2 ... ]
|
||||
|
||||
basic HTTPS requestor
|
||||
|
||||
OPTIONS
|
||||
|
||||
-f : full information; headers and resource. Usually only a dump of the resource is provided.
|
||||
-h : just the headers\n\n!;
|
||||
exit;
|
||||
}
|
||||
|
||||
# parse out the command line
|
||||
# first argument, URL
|
||||
$url = shift @ARGV;
|
||||
|
||||
# next arguments, parameters
|
||||
foreach my $item ( @ARGV ) {
|
||||
|
||||
# ... check for special requests
|
||||
if ( $item =~ /^-([fd])$/ ) {
|
||||
$mode = $1;
|
||||
}
|
||||
|
||||
# ... is not a special request, but probably a parameter
|
||||
( $key, $value ) = ( $item =~ /([^=]+)=(.*)/ );
|
||||
$key ||= $item;
|
||||
push @{$parameters->{$key}}, $value;
|
||||
}
|
||||
|
||||
# setup and send the request
|
||||
$http = new GT::URI::HTTPS(
|
||||
|
||||
# if we're only looking to use the head
|
||||
request_method => ( $mode eq 'h' ? 'HEAD' : 'GET' )
|
||||
|
||||
);
|
||||
$http->fetch( $url, $parameters );
|
||||
|
||||
# get the resource
|
||||
while ( not defined ( $doc = $http->do_iteration(-1) ) ) {}
|
||||
|
||||
# and print out the headers if wanted
|
||||
if ( $mode ) {
|
||||
$headers = $http->resource_attrib();
|
||||
foreach $key ( sort keys %{$headers || {}} ) {
|
||||
print "$key: $headers->{$key}\n";
|
||||
}
|
||||
print "\n";
|
||||
}
|
||||
|
||||
# and output the resource...
|
||||
print $doc;
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
||||
http://www.gossamer-threads.com/
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
Revision: $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $
|
||||
|
||||
=cut
|
||||
|
Reference in New Issue
Block a user