602 lines
20 KiB
Perl
602 lines
20 KiB
Perl
# ==================================================================
|
|
# Gossamer Threads Module Library
|
|
#
|
|
# GT::URI::HTTP
|
|
# Author : Aki Mimoto (support@gossamer-threads.com)
|
|
# CVS Info : 087,071,086,086,085
|
|
# $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $
|
|
#
|
|
# Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
|
# ==================================================================
|
|
#
|
|
# Description:
|
|
# Gets HTTP data
|
|
#
|
|
|
|
package GT::URI::HTTPS;
|
|
# ===============================================================
|
|
|
|
|
|
use strict;
|
|
use lib '..';
|
|
use GT::Socket;
|
|
use GT::Base;
|
|
use GT::URI::HTTP qw/ build_path build_parameters /;
|
|
use Net::SSLeay qw(die_now die_if_ssl_error) ;
|
|
use Exporter ();
|
|
use vars qw/$ATTRIBS $EOL @ISA $DEBUG @EXPORT_OK $ERRORS/;
|
|
|
|
$ERRORS = {
|
|
'NOSSLCTX' => 'Failed to create SSL_CTX: %s',
|
|
'NOSSL' => 'Failed to create SSL: %s',
|
|
'CTXOPTIONS' => 'Failed to set SSL CTX options: %s',
|
|
'NOSSLCONNECT' => 'Failed to connect ssl',
|
|
'CANTCONN' => 'Cannot connect to server'
|
|
};
|
|
|
|
@ISA = ('GT::URI::HTTP', 'GT::Base', 'Exporter');
|
|
|
|
$EOL = "\015\012";
|
|
$DEBUG = 0;
|
|
|
|
$ATTRIBS = {
|
|
'URL' => '',
|
|
'request_method' => 'GET',
|
|
'parameters' => {},
|
|
'agent_name' => 'Mozilla/4.73 [en]',
|
|
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
|
|
'headers' => {},
|
|
'cookies' => {},
|
|
'resource_attribs' => {},
|
|
'resource_data' => undef,
|
|
'max_down' => 200000,
|
|
'debug' => 0,
|
|
'ctx' => undef,
|
|
'ssl' => undef,
|
|
'cipher' => undef,
|
|
};
|
|
|
|
@EXPORT_OK = qw/ parse_url deparse_url build_path build_parameters /;
|
|
|
|
Net::SSLeay::load_error_strings();
|
|
Net::SSLeay::SSLeay_add_ssl_algorithms();
|
|
Net::SSLeay::randomize();
|
|
|
|
sub pending {
|
|
#-------------------------------------------------------------------------------
|
|
# return true if there is some data to be picked up
|
|
my $self = shift;
|
|
my $tics = shift;
|
|
return if ( $self->{resource_downloaded} );
|
|
my $sock = $self->{'sock'} or return 1;
|
|
return $sock->pending();
|
|
}
|
|
|
|
sub EOF {
|
|
#-------------------------------------------------------------------------------
|
|
# return true if the system has finished downloading
|
|
my $self = shift;
|
|
my $sock = $self->{'sock'};
|
|
return ( $sock and $sock->fh() ? $self->{resource_downloaded} : 1 );
|
|
}
|
|
|
|
sub do_iteration {
|
|
#-------------------------------------------------------------------------------
|
|
# useful in a loop, this is a generally non blocking way to get a page
|
|
# it will return nulls until it gets the page (and returns the page)
|
|
#
|
|
my ( $self, $fh, $sock, $pause );
|
|
$self = shift;
|
|
$pause = shift;
|
|
$sock = $self->{'sock'};
|
|
|
|
# if we haven't done the request yet
|
|
if ( not( $self->{'resource_downloaded'} or $sock ) and $self->{URL} ) {
|
|
$self->fetch() or return '';
|
|
$sock = $self->{'sock'};
|
|
return;
|
|
}
|
|
|
|
# if we can't connect
|
|
if ( $self->{'flag_err'} ) {
|
|
return '';
|
|
}
|
|
|
|
$sock->fh() or return;
|
|
# only if the resource hasn't been downloaded yet..,
|
|
if ( !$self->{'resource_downloaded'} ) {
|
|
# if there's data pending, download it!
|
|
if ( $sock->pending( $pause ) ) {
|
|
# get as much data as we can
|
|
my $ssl = $self->{'ssl'};
|
|
my $buf = Net::SSLeay::read($ssl);
|
|
if ( $buf ) {
|
|
$self->{'resource_data'} .= $buf;
|
|
# if we need to parse the server headers
|
|
$self->_parse_server_headers();
|
|
}
|
|
# we've run out of data, let's stop
|
|
else {
|
|
$self->{resource_downloaded} = 1;
|
|
return $self->{'resource_data'};
|
|
Net::SSLeay::free ($ssl);
|
|
Net::SSLeay::CTX_free ($self->{'ctx'});
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
else {
|
|
return $self->{'resource_data'};
|
|
}
|
|
|
|
}
|
|
|
|
|
|
sub fetch {
|
|
#-------------------------------------------------------------------------------
|
|
# fetches a page and handles CGI if requested
|
|
#
|
|
my ( $self, $url, $parameters, $sock, $request_str, $fh );
|
|
|
|
$self = shift;
|
|
$url = shift || $self->{'URL'} or return undef;
|
|
$parameters = shift || $self->{'parameters'};
|
|
|
|
# save the headers
|
|
$self->{'URL'} = $url;
|
|
$self->{'parameters'} = $parameters;
|
|
|
|
my ($host, $port, $dirs, $file, $params) = parse_url($url);
|
|
|
|
# see if we can connect to the host
|
|
$sock = GT::Socket->open({
|
|
'host' => $host,
|
|
'port' => $port,
|
|
'_debug' => $self->{_debug},
|
|
'debug' => $self->{debug},
|
|
'max_down' => $self->{max_down}
|
|
});
|
|
|
|
# submit the request to the host
|
|
if ( $sock ) {
|
|
$fh = $sock->fh();
|
|
|
|
# The network connection is now open, lets fire up SSL
|
|
my $ctx = $self->{'ctx'} = Net::SSLeay::CTX_new() or return $self->error("NOSSLCTX", 'WARN', $!);
|
|
Net::SSLeay::CTX_set_options($ctx, &Net::SSLeay::OP_ALL) and return $self->error("CTXOPTIONS", 'WARN');
|
|
my $ssl = $self->{'ssl'} = Net::SSLeay::new($ctx) or return $self->error("NOSSL", 'WARN', $!);
|
|
Net::SSLeay::set_fd($ssl, fileno($fh));
|
|
my $res = $self->{'res'} = Net::SSLeay::connect($ssl);# and return $self->error( "NOSSLCONNECT", 'WARN' );
|
|
|
|
$self->{'cipher'} = Net::SSLeay::get_cipher($ssl);
|
|
|
|
$self->debug( "Connected to $url" ) if ($self->{_debug});
|
|
$request_str = $self->_create_request( $url, $parameters, $host, $port, $dirs, $file, $params );
|
|
$self->debug( "Sent request:\n$request_str" ) if ($self->{_debug});
|
|
$res = Net::SSLeay::write($ssl, $request_str);
|
|
}
|
|
else {
|
|
$self->debug( "Couldn't connect to $url" ) if ($self->{_debug});
|
|
$self->{resource_attribs} = {
|
|
ERROR_CODE => 'CANTCONN',
|
|
ERROR_MESSAGE => 'Cannot connect to server'
|
|
};
|
|
$self->{resource_downloaded} = 1;
|
|
$self->{flag_err} = 1;
|
|
}
|
|
|
|
# store the socket
|
|
return($self->{'sock'} = $sock);
|
|
}
|
|
|
|
sub parse_url {
|
|
#-------------------------------------------------------------------------------
|
|
# parses the url and converts it into host
|
|
# assume HTTP/1.0 format
|
|
#
|
|
my ( $url, $host, $port, $path, $file, $page, @dirs, $param, %params, $item );
|
|
$url = shift;
|
|
|
|
# first, do the initial parse
|
|
$url =~ s/^[a-zA-Z]+:\/\///i;
|
|
($host, $port, $path, $param) = $url =~ /([\w.\-]*):?(\d*)\/?([^?]*)\??(.*)/;
|
|
$port ||= 443;
|
|
|
|
# now handle the paths
|
|
$path =~ s/^\s*(.*?)\s*$/$1/;
|
|
|
|
# the paths are not simply directories
|
|
if ( $path !~ /\/$/ ) {
|
|
$path =~ s/([^\/]*)$//;
|
|
$file = $1;
|
|
}
|
|
|
|
# handle the directories
|
|
@dirs = grep $_, split '/', $path;
|
|
|
|
# and finally parse out the parameters if there are any
|
|
foreach $item ( split /&/, $param ) {
|
|
if ( $item =~ /^(.*?)=(.*)$/ ) {
|
|
push @{$params{$1}}, $2;
|
|
}
|
|
else {
|
|
push @{$params{$item}}, undef;
|
|
}
|
|
}
|
|
|
|
# now return the collection
|
|
return( $host, $port, \@dirs, $file, \%params );
|
|
|
|
}
|
|
|
|
sub deparse_url {
|
|
#-------------------------------------------------------------------------------
|
|
# takes a number of parameters and builds the request url
|
|
my ( $url, $host, $port, $path, $file, $page, $dirs, $param, $params, $item );
|
|
|
|
( $host, $port, $dirs, $file, $params ) = @_;
|
|
|
|
$url = $host;
|
|
|
|
# build the host portion
|
|
$url .= ":$port" unless ($port == 443);
|
|
|
|
# now build the page access
|
|
$url .= "/" . build_path( $dirs, $file );
|
|
|
|
# and finally handle the parameters if any
|
|
if ( $params and keys %{$params} ) {
|
|
$url .= "?" . build_parameters( $params );
|
|
}
|
|
|
|
return $url;
|
|
|
|
}
|
|
|
|
sub get {
|
|
#-------------------------------------------------------------------------------
|
|
# simple blocking method to get some data
|
|
my ( $uri, $options, $HTTPS, $doc );
|
|
|
|
shift; # get rid of the first parameter
|
|
$uri = shift or return undef;
|
|
$options = ( @_ and ref $_[0] ) ? shift : { @_ };
|
|
$HTTPS = new GT::URI::HTTPS($options);
|
|
$HTTPS->fetch($uri);
|
|
|
|
while ( not defined( $doc = $HTTPS->do_iteration(0) ) ) {};
|
|
|
|
return $doc;
|
|
}
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
GT::URI::HTTPS - HTTPS request broker. Can be used stand-alone or through GT::URI
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use GT::URI::HTTPS;
|
|
|
|
print GT::URI::HTTPS->get( "http://www.gossamer-threads.com" );
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
GT::URI::HTTPS makes requests and retrieves resources from http servers (not limited to text).
|
|
|
|
=head1 Method List
|
|
|
|
Socket Handling
|
|
|
|
sub pending() Returns true if data awaiting
|
|
sub EOF() Returns open/closed status of socket
|
|
sub gulp_read() Alias to do_iteration
|
|
sub do_iteration() Basic looping function that downloads resources in the background
|
|
|
|
Acquisition
|
|
|
|
sub fetch() Tell the object which URL to acquire
|
|
sub method() The method of acquisition
|
|
sub load_parameter() Add a item for CGI parameters
|
|
sub delete_parameter() Delete a CGI parameter
|
|
sub resource_attrib() Headers related to resource and server
|
|
sub get() Simple resource aquisition function
|
|
|
|
Support Methods (must be imported)
|
|
|
|
sub parse_url() Decomposes a URL into constituent parts
|
|
sub deparse_url() Takes those parts and builds an URL
|
|
sub build_path() Takes a list of directories and builds a path
|
|
sub build_parameters() Takes a hash of parameter->values and builds a CGI request string
|
|
|
|
=head1 Basics
|
|
|
|
=head2 Getting a resource, the simple way
|
|
|
|
|
|
Just want a single item? Call GT::URI::HTTPS->get and all the magic will be done for you.
|
|
|
|
use GT::URI::HTTPS;
|
|
my $buf = GT::URI::HTTPS->get( "http://www.gossamer-threads.com/" );
|
|
|
|
Get based requests are permissable as well:
|
|
|
|
use GT::URI::HTTPS;
|
|
my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads" );
|
|
|
|
If extra options need to be set, simply append the options to the parameter list, like follows.
|
|
|
|
use GT::URI::HTTPS;
|
|
my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads", { request_method => 'POST' } );
|
|
|
|
=head2 When just the document is not enough
|
|
|
|
|
|
If a new GT::URI::HTTPS object is instantiated, much more control is available, including facilities for non-blocking downloading of pages.
|
|
|
|
To create a GT::URI::HTTPS object, call new with all the options required:
|
|
|
|
use GT::URI::HTTPS;
|
|
|
|
my $http = new GT::URI::HTTPS(
|
|
|
|
# URL to acquire (optional)
|
|
'URL' => '',
|
|
|
|
# Can also be set to POST/GET/HEAD (optional)
|
|
'request_method' => 'GET',
|
|
|
|
# a hash of keys pointing to an arrayref of values to be sent to the server
|
|
# {
|
|
# 'key' => [ 'value1', 'value2'... ],
|
|
# }
|
|
# (optional)
|
|
'parameters' => {},
|
|
|
|
# Name portion of the User-Agent: string the server acquires (optional)
|
|
'agent_name' => 'Mozilla/4.73 [en]',
|
|
|
|
# Host-from portion of the User-Agent: string the server acquires (optional)
|
|
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
|
|
|
|
# To prevent downloading of 80Tb files, but if you still wanted to, set this to 0 (optional)
|
|
'max_down' => 200000
|
|
|
|
);
|
|
|
|
If URL has been specified in the options, for interactions with a CGI, you can set extra parameters with $http->load_parameter(). Finally, loop on $http->do_iteration() until the value is defined. To replicate the "simple get" example:
|
|
|
|
use GT::URI::HTTPS;
|
|
$|++;
|
|
|
|
my $http = new GT::URI::HTTPS(
|
|
|
|
URL => 'http://search.yahoo.com/bin/search',
|
|
|
|
# can also use the following:
|
|
parameters => {
|
|
'p' => [ 'gossamer threads' ]
|
|
}
|
|
|
|
);
|
|
|
|
my $doc;
|
|
while ( not defined( $doc = $http->do_iteration() ) ) {
|
|
# do something here while waiting for the resource to arrive
|
|
print "."
|
|
}
|
|
|
|
print $doc, "\n\n";
|
|
|
|
Beyond the resource, the http server often supplies extra information in a header. To access this information, use $http->resource_attrib().
|
|
|
|
Appending this code to the previous example, a list of all the associated server headers can be seen:
|
|
|
|
my $attribs = $http->resource_attrib();
|
|
foreach my $key ( sort keys %{$attribs} ) {
|
|
print "$key => $attribs->{$key}\n";
|
|
}
|
|
|
|
=head2 Support Methods
|
|
|
|
In addition to the basic fetching abilities, since the module must parse HTTPS URLs, the methods used to do so have been made public.
|
|
|
|
These methods decompose URLs into datastructures that make URLs easily studied or modified and then reconstructed.
|
|
|
|
However, these routines have not been polished for useability so beware! The following is a very basic example:
|
|
|
|
use GT::URI::HTTPS qw/ parse_url deparse_url build_path build_parameters /;
|
|
|
|
# fragment the URL
|
|
my ( $host, $port, $dirs, $file, $params ) = parse_url( 'http://www.gossamer-threads.com/perl/forum/showflat.pl?Cat=&Board=GosDisc&Number=113355&page=0&view=' );
|
|
|
|
print "Parsed Data:\n\n";
|
|
print "Host: $host\n";
|
|
print "Port: $port\n";
|
|
print "Dirs:\n";
|
|
foreach my $dir ( @{$dirs} ) {
|
|
print " $dir/\n";
|
|
}
|
|
print "Resource Filename: $file\n";
|
|
print "Params:\n";
|
|
foreach my $key ( sort keys %{$params} ) {
|
|
print " $key: ";
|
|
my $values = ( $params->{$key} || {} );
|
|
foreach my $value ( sort @{$values} ) {
|
|
print "'", quotemeta($value), "' ";
|
|
}
|
|
print "\n";
|
|
}
|
|
|
|
# put the data back together again
|
|
my $url = deparse_url( $host, $port, $dirs, $file, $params );
|
|
print "\nDeparsed Data:\n\n";
|
|
print "URL: http://$url\n";
|
|
|
|
=head1 Methods List
|
|
|
|
=head2 build_path ( dir ARRAYREF, [ page STRING ] ) : STRING
|
|
|
|
Takes an array ref of directory names and an optional filename and returns a filepath.
|
|
|
|
use GT::URI::HTTPS qw/ build_path /;
|
|
print build_path( [ 'topdir', 'middir', 'bottomdir' ], 'file.html' );
|
|
|
|
This function must be imported.
|
|
|
|
=head2 build_parameters ( parameter HASHREF ) : STRING
|
|
|
|
Builds a CGI request string from list of keys and values. The function has the ability to handle keys with more than one parameter, simply use an arrayref with multiple values.
|
|
|
|
use GT::URI::HTTPS qw/ build_parameters /;
|
|
my $params = {
|
|
'simplekey' => 'value'
|
|
'onekey' => [ 'one value' ],
|
|
'anotherkey' => [ 'another value', 'and yet anotherone!' ],
|
|
};
|
|
print build_parameters($params);
|
|
|
|
This function must be imported.
|
|
|
|
=head2 delete_parameter ( keys ARRAYREF/ARRAY )
|
|
|
|
When loading the object with parameters before a request, it is possible to delete an entire set of keys and values.
|
|
|
|
=head2 deparse_url ( host STRING, [ port STRING, [ dirs ARRAYREF, [ file STRING, [ params HASH ] ] ] ] ) : STRING
|
|
|
|
This builds an entire URL from basic parameters.
|
|
|
|
For an example of this function, see the example in "Support Methods".
|
|
|
|
This function must be imported.
|
|
|
|
=head2 do_iteration ( tics INTEGER ) : STRING
|
|
|
|
The basic iteration function. This function will return undef until the resource is received which, upon receipt will return the resource data.
|
|
|
|
The function can return an empty string, so it is important to checked defined'ness. If the return is an empty string, check the ERROR_CODE in resource_attrib to find out if the script simply can't connect to the host or the resource is empty.
|
|
|
|
=head2 EOF () : BOOLEAN
|
|
|
|
Returns '1' or '0' depending if the object has stopped receiving/sending data to the remote server.
|
|
|
|
=head2 fetch ( url STRING, [ parameters HASHREF ] )
|
|
|
|
Tells the server the URL to retreive the resource of. If CGI parameters are required pass in a hash of keys and values.
|
|
|
|
=head2 GT::URI::HTTPS->get ( url, [ options HASH/HASHREF ] ) : RESOURCE_DATA
|
|
|
|
Simplest resource aquision method. Give it the URL and any options and the function will return after the resource has been downloaded.
|
|
|
|
=head2 gulp_read ( tic INTEGER ) : RESOURCE_DATA
|
|
|
|
This is just an alias to the function do_iteration. This method is used by GT::URI in its mass resource aquisition runs.
|
|
|
|
Unless you feel like being different, you shouldn't need to use this.
|
|
|
|
=head2 load_parameter ( params HASH/HASHREF ) : HASHREF
|
|
|
|
Takes a list of keys and values and loads the values into the list of CGI parameters to be sent to the remote server.
|
|
|
|
=head2 method ( method STRING ) : STRING
|
|
|
|
Sets the acquisition method for the resource. Currently, GET/POST/HEAD are supported.
|
|
|
|
If no parameters are supplied the function simply returns the current acquisition method.
|
|
|
|
=head2 parse_url ( url STRING ) : host STRING, port INTEGER, dirs ARRAYREF, file STRING, params HASHREF
|
|
|
|
Takes an URL and decomposes it into easily manipulated datastructures. The output can be fed back into deparse_url to reconstruct an URL.
|
|
|
|
This function must be imported.
|
|
|
|
=head2 pending ( tics INTEGER ) : BOOLEAN
|
|
|
|
If there is data available to be downloaded, this function returns '1', otherwise '0'. This is another function used by GT::URI in it's mass downloads and unlikely to be of any use to anyone using this module directly. This function exists because it is lighter than do_iteration which can be quite a load as opposed to this if there were 100 racked downloads, all being polled every tenth of a second!
|
|
|
|
=head2 resource_attrib ( [ key STRING ] ) STRING or HASHREF
|
|
|
|
If a key is requested, function will return the value associated with the resource attribute. If not, the function will return a hashref keyed by server parameter to its corresponding value.
|
|
|
|
All the server keys have been converted into lower-case. This prevents conflict with two very important keys, ERROR_CODE, and ERROR_MESSAGE, which carry the HTTPS error code and message associated with the aquisition of this page.
|
|
|
|
=head1 EXAMPLES
|
|
|
|
=head2 HTTPS get example
|
|
|
|
#!/usr/bin/perl
|
|
|
|
use GT::URI::HTTPS;
|
|
|
|
if ( not @ARGV ) {
|
|
print qq!
|
|
SYNOPSIS
|
|
|
|
$0 url [-f/-h] [ cgi_parameter1=value1 cgi_parameter2=value2 ... ]
|
|
|
|
basic HTTPS requestor
|
|
|
|
OPTIONS
|
|
|
|
-f : full information; headers and resource. Usually only a dump of the resource is provided.
|
|
-h : just the headers\n\n!;
|
|
exit;
|
|
}
|
|
|
|
# parse out the command line
|
|
# first argument, URL
|
|
$url = shift @ARGV;
|
|
|
|
# next arguments, parameters
|
|
foreach my $item ( @ARGV ) {
|
|
|
|
# ... check for special requests
|
|
if ( $item =~ /^-([fd])$/ ) {
|
|
$mode = $1;
|
|
}
|
|
|
|
# ... is not a special request, but probably a parameter
|
|
( $key, $value ) = ( $item =~ /([^=]+)=(.*)/ );
|
|
$key ||= $item;
|
|
push @{$parameters->{$key}}, $value;
|
|
}
|
|
|
|
# setup and send the request
|
|
$http = new GT::URI::HTTPS(
|
|
|
|
# if we're only looking to use the head
|
|
request_method => ( $mode eq 'h' ? 'HEAD' : 'GET' )
|
|
|
|
);
|
|
$http->fetch( $url, $parameters );
|
|
|
|
# get the resource
|
|
while ( not defined ( $doc = $http->do_iteration(-1) ) ) {}
|
|
|
|
# and print out the headers if wanted
|
|
if ( $mode ) {
|
|
$headers = $http->resource_attrib();
|
|
foreach $key ( sort keys %{$headers || {}} ) {
|
|
print "$key: $headers->{$key}\n";
|
|
}
|
|
print "\n";
|
|
}
|
|
|
|
# and output the resource...
|
|
print $doc;
|
|
|
|
=head1 COPYRIGHT
|
|
|
|
Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
|
|
http://www.gossamer-threads.com/
|
|
|
|
=head1 VERSION
|
|
|
|
Revision: $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $
|
|
|
|
=cut
|
|
|