discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/admin/GT/URI/HTTPS.pm
2024-06-17 21:49:12 +10:00

602 lines
20 KiB
Perl

# ==================================================================
# Gossamer Threads Module Library
#
# GT::URI::HTTP
# Author : Aki Mimoto (support@gossamer-threads.com)
# CVS Info : 087,071,086,086,085
# $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $
#
# Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description:
# Gets HTTP data
#
package GT::URI::HTTPS;
# ===============================================================
use strict;
use lib '..';
use GT::Socket;
use GT::Base;
use GT::URI::HTTP qw/ build_path build_parameters /;
use Net::SSLeay qw(die_now die_if_ssl_error) ;
use Exporter ();
use vars qw/$ATTRIBS $EOL @ISA $DEBUG @EXPORT_OK $ERRORS/;
$ERRORS = {
'NOSSLCTX' => 'Failed to create SSL_CTX: %s',
'NOSSL' => 'Failed to create SSL: %s',
'CTXOPTIONS' => 'Failed to set SSL CTX options: %s',
'NOSSLCONNECT' => 'Failed to connect ssl',
'CANTCONN' => 'Cannot connect to server'
};
@ISA = ('GT::URI::HTTP', 'GT::Base', 'Exporter');
$EOL = "\015\012";
$DEBUG = 0;
$ATTRIBS = {
'URL' => '',
'request_method' => 'GET',
'parameters' => {},
'agent_name' => 'Mozilla/4.73 [en]',
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
'headers' => {},
'cookies' => {},
'resource_attribs' => {},
'resource_data' => undef,
'max_down' => 200000,
'debug' => 0,
'ctx' => undef,
'ssl' => undef,
'cipher' => undef,
};
@EXPORT_OK = qw/ parse_url deparse_url build_path build_parameters /;
Net::SSLeay::load_error_strings();
Net::SSLeay::SSLeay_add_ssl_algorithms();
Net::SSLeay::randomize();
sub pending {
#-------------------------------------------------------------------------------
# return true if there is some data to be picked up
my $self = shift;
my $tics = shift;
return if ( $self->{resource_downloaded} );
my $sock = $self->{'sock'} or return 1;
return $sock->pending();
}
sub EOF {
#-------------------------------------------------------------------------------
# return true if the system has finished downloading
my $self = shift;
my $sock = $self->{'sock'};
return ( $sock and $sock->fh() ? $self->{resource_downloaded} : 1 );
}
sub do_iteration {
#-------------------------------------------------------------------------------
# useful in a loop, this is a generally non blocking way to get a page
# it will return nulls until it gets the page (and returns the page)
#
my ( $self, $fh, $sock, $pause );
$self = shift;
$pause = shift;
$sock = $self->{'sock'};
# if we haven't done the request yet
if ( not( $self->{'resource_downloaded'} or $sock ) and $self->{URL} ) {
$self->fetch() or return '';
$sock = $self->{'sock'};
return;
}
# if we can't connect
if ( $self->{'flag_err'} ) {
return '';
}
$sock->fh() or return;
# only if the resource hasn't been downloaded yet..,
if ( !$self->{'resource_downloaded'} ) {
# if there's data pending, download it!
if ( $sock->pending( $pause ) ) {
# get as much data as we can
my $ssl = $self->{'ssl'};
my $buf = Net::SSLeay::read($ssl);
if ( $buf ) {
$self->{'resource_data'} .= $buf;
# if we need to parse the server headers
$self->_parse_server_headers();
}
# we've run out of data, let's stop
else {
$self->{resource_downloaded} = 1;
return $self->{'resource_data'};
Net::SSLeay::free ($ssl);
Net::SSLeay::CTX_free ($self->{'ctx'});
}
}
return;
}
else {
return $self->{'resource_data'};
}
}
sub fetch {
#-------------------------------------------------------------------------------
# fetches a page and handles CGI if requested
#
my ( $self, $url, $parameters, $sock, $request_str, $fh );
$self = shift;
$url = shift || $self->{'URL'} or return undef;
$parameters = shift || $self->{'parameters'};
# save the headers
$self->{'URL'} = $url;
$self->{'parameters'} = $parameters;
my ($host, $port, $dirs, $file, $params) = parse_url($url);
# see if we can connect to the host
$sock = GT::Socket->open({
'host' => $host,
'port' => $port,
'_debug' => $self->{_debug},
'debug' => $self->{debug},
'max_down' => $self->{max_down}
});
# submit the request to the host
if ( $sock ) {
$fh = $sock->fh();
# The network connection is now open, lets fire up SSL
my $ctx = $self->{'ctx'} = Net::SSLeay::CTX_new() or return $self->error("NOSSLCTX", 'WARN', $!);
Net::SSLeay::CTX_set_options($ctx, &Net::SSLeay::OP_ALL) and return $self->error("CTXOPTIONS", 'WARN');
my $ssl = $self->{'ssl'} = Net::SSLeay::new($ctx) or return $self->error("NOSSL", 'WARN', $!);
Net::SSLeay::set_fd($ssl, fileno($fh));
my $res = $self->{'res'} = Net::SSLeay::connect($ssl);# and return $self->error( "NOSSLCONNECT", 'WARN' );
$self->{'cipher'} = Net::SSLeay::get_cipher($ssl);
$self->debug( "Connected to $url" ) if ($self->{_debug});
$request_str = $self->_create_request( $url, $parameters, $host, $port, $dirs, $file, $params );
$self->debug( "Sent request:\n$request_str" ) if ($self->{_debug});
$res = Net::SSLeay::write($ssl, $request_str);
}
else {
$self->debug( "Couldn't connect to $url" ) if ($self->{_debug});
$self->{resource_attribs} = {
ERROR_CODE => 'CANTCONN',
ERROR_MESSAGE => 'Cannot connect to server'
};
$self->{resource_downloaded} = 1;
$self->{flag_err} = 1;
}
# store the socket
return($self->{'sock'} = $sock);
}
sub parse_url {
#-------------------------------------------------------------------------------
# parses the url and converts it into host
# assume HTTP/1.0 format
#
my ( $url, $host, $port, $path, $file, $page, @dirs, $param, %params, $item );
$url = shift;
# first, do the initial parse
$url =~ s/^[a-zA-Z]+:\/\///i;
($host, $port, $path, $param) = $url =~ /([\w.\-]*):?(\d*)\/?([^?]*)\??(.*)/;
$port ||= 443;
# now handle the paths
$path =~ s/^\s*(.*?)\s*$/$1/;
# the paths are not simply directories
if ( $path !~ /\/$/ ) {
$path =~ s/([^\/]*)$//;
$file = $1;
}
# handle the directories
@dirs = grep $_, split '/', $path;
# and finally parse out the parameters if there are any
foreach $item ( split /&/, $param ) {
if ( $item =~ /^(.*?)=(.*)$/ ) {
push @{$params{$1}}, $2;
}
else {
push @{$params{$item}}, undef;
}
}
# now return the collection
return( $host, $port, \@dirs, $file, \%params );
}
sub deparse_url {
#-------------------------------------------------------------------------------
# takes a number of parameters and builds the request url
my ( $url, $host, $port, $path, $file, $page, $dirs, $param, $params, $item );
( $host, $port, $dirs, $file, $params ) = @_;
$url = $host;
# build the host portion
$url .= ":$port" unless ($port == 443);
# now build the page access
$url .= "/" . build_path( $dirs, $file );
# and finally handle the parameters if any
if ( $params and keys %{$params} ) {
$url .= "?" . build_parameters( $params );
}
return $url;
}
sub get {
#-------------------------------------------------------------------------------
# simple blocking method to get some data
my ( $uri, $options, $HTTPS, $doc );
shift; # get rid of the first parameter
$uri = shift or return undef;
$options = ( @_ and ref $_[0] ) ? shift : { @_ };
$HTTPS = new GT::URI::HTTPS($options);
$HTTPS->fetch($uri);
while ( not defined( $doc = $HTTPS->do_iteration(0) ) ) {};
return $doc;
}
1;
__END__
=head1 NAME
GT::URI::HTTPS - HTTPS request broker. Can be used stand-alone or through GT::URI
=head1 SYNOPSIS
use GT::URI::HTTPS;
print GT::URI::HTTPS->get( "http://www.gossamer-threads.com" );
=head1 DESCRIPTION
GT::URI::HTTPS makes requests and retrieves resources from http servers (not limited to text).
=head1 Method List
Socket Handling
sub pending() Returns true if data awaiting
sub EOF() Returns open/closed status of socket
sub gulp_read() Alias to do_iteration
sub do_iteration() Basic looping function that downloads resources in the background
Acquisition
sub fetch() Tell the object which URL to acquire
sub method() The method of acquisition
sub load_parameter() Add a item for CGI parameters
sub delete_parameter() Delete a CGI parameter
sub resource_attrib() Headers related to resource and server
sub get() Simple resource aquisition function
Support Methods (must be imported)
sub parse_url() Decomposes a URL into constituent parts
sub deparse_url() Takes those parts and builds an URL
sub build_path() Takes a list of directories and builds a path
sub build_parameters() Takes a hash of parameter->values and builds a CGI request string
=head1 Basics
=head2 Getting a resource, the simple way
Just want a single item? Call GT::URI::HTTPS->get and all the magic will be done for you.
use GT::URI::HTTPS;
my $buf = GT::URI::HTTPS->get( "http://www.gossamer-threads.com/" );
Get based requests are permissable as well:
use GT::URI::HTTPS;
my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads" );
If extra options need to be set, simply append the options to the parameter list, like follows.
use GT::URI::HTTPS;
my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads", { request_method => 'POST' } );
=head2 When just the document is not enough
If a new GT::URI::HTTPS object is instantiated, much more control is available, including facilities for non-blocking downloading of pages.
To create a GT::URI::HTTPS object, call new with all the options required:
use GT::URI::HTTPS;
my $http = new GT::URI::HTTPS(
# URL to acquire (optional)
'URL' => '',
# Can also be set to POST/GET/HEAD (optional)
'request_method' => 'GET',
# a hash of keys pointing to an arrayref of values to be sent to the server
# {
# 'key' => [ 'value1', 'value2'... ],
# }
# (optional)
'parameters' => {},
# Name portion of the User-Agent: string the server acquires (optional)
'agent_name' => 'Mozilla/4.73 [en]',
# Host-from portion of the User-Agent: string the server acquires (optional)
'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586',
# To prevent downloading of 80Tb files, but if you still wanted to, set this to 0 (optional)
'max_down' => 200000
);
If URL has been specified in the options, for interactions with a CGI, you can set extra parameters with $http->load_parameter(). Finally, loop on $http->do_iteration() until the value is defined. To replicate the "simple get" example:
use GT::URI::HTTPS;
$|++;
my $http = new GT::URI::HTTPS(
URL => 'http://search.yahoo.com/bin/search',
# can also use the following:
parameters => {
'p' => [ 'gossamer threads' ]
}
);
my $doc;
while ( not defined( $doc = $http->do_iteration() ) ) {
# do something here while waiting for the resource to arrive
print "."
}
print $doc, "\n\n";
Beyond the resource, the http server often supplies extra information in a header. To access this information, use $http->resource_attrib().
Appending this code to the previous example, a list of all the associated server headers can be seen:
my $attribs = $http->resource_attrib();
foreach my $key ( sort keys %{$attribs} ) {
print "$key => $attribs->{$key}\n";
}
=head2 Support Methods
In addition to the basic fetching abilities, since the module must parse HTTPS URLs, the methods used to do so have been made public.
These methods decompose URLs into datastructures that make URLs easily studied or modified and then reconstructed.
However, these routines have not been polished for useability so beware! The following is a very basic example:
use GT::URI::HTTPS qw/ parse_url deparse_url build_path build_parameters /;
# fragment the URL
my ( $host, $port, $dirs, $file, $params ) = parse_url( 'http://www.gossamer-threads.com/perl/forum/showflat.pl?Cat=&Board=GosDisc&Number=113355&page=0&view=' );
print "Parsed Data:\n\n";
print "Host: $host\n";
print "Port: $port\n";
print "Dirs:\n";
foreach my $dir ( @{$dirs} ) {
print " $dir/\n";
}
print "Resource Filename: $file\n";
print "Params:\n";
foreach my $key ( sort keys %{$params} ) {
print " $key: ";
my $values = ( $params->{$key} || {} );
foreach my $value ( sort @{$values} ) {
print "'", quotemeta($value), "' ";
}
print "\n";
}
# put the data back together again
my $url = deparse_url( $host, $port, $dirs, $file, $params );
print "\nDeparsed Data:\n\n";
print "URL: http://$url\n";
=head1 Methods List
=head2 build_path ( dir ARRAYREF, [ page STRING ] ) : STRING
Takes an array ref of directory names and an optional filename and returns a filepath.
use GT::URI::HTTPS qw/ build_path /;
print build_path( [ 'topdir', 'middir', 'bottomdir' ], 'file.html' );
This function must be imported.
=head2 build_parameters ( parameter HASHREF ) : STRING
Builds a CGI request string from list of keys and values. The function has the ability to handle keys with more than one parameter, simply use an arrayref with multiple values.
use GT::URI::HTTPS qw/ build_parameters /;
my $params = {
'simplekey' => 'value'
'onekey' => [ 'one value' ],
'anotherkey' => [ 'another value', 'and yet anotherone!' ],
};
print build_parameters($params);
This function must be imported.
=head2 delete_parameter ( keys ARRAYREF/ARRAY )
When loading the object with parameters before a request, it is possible to delete an entire set of keys and values.
=head2 deparse_url ( host STRING, [ port STRING, [ dirs ARRAYREF, [ file STRING, [ params HASH ] ] ] ] ) : STRING
This builds an entire URL from basic parameters.
For an example of this function, see the example in "Support Methods".
This function must be imported.
=head2 do_iteration ( tics INTEGER ) : STRING
The basic iteration function. This function will return undef until the resource is received which, upon receipt will return the resource data.
The function can return an empty string, so it is important to checked defined'ness. If the return is an empty string, check the ERROR_CODE in resource_attrib to find out if the script simply can't connect to the host or the resource is empty.
=head2 EOF () : BOOLEAN
Returns '1' or '0' depending if the object has stopped receiving/sending data to the remote server.
=head2 fetch ( url STRING, [ parameters HASHREF ] )
Tells the server the URL to retreive the resource of. If CGI parameters are required pass in a hash of keys and values.
=head2 GT::URI::HTTPS->get ( url, [ options HASH/HASHREF ] ) : RESOURCE_DATA
Simplest resource aquision method. Give it the URL and any options and the function will return after the resource has been downloaded.
=head2 gulp_read ( tic INTEGER ) : RESOURCE_DATA
This is just an alias to the function do_iteration. This method is used by GT::URI in its mass resource aquisition runs.
Unless you feel like being different, you shouldn't need to use this.
=head2 load_parameter ( params HASH/HASHREF ) : HASHREF
Takes a list of keys and values and loads the values into the list of CGI parameters to be sent to the remote server.
=head2 method ( method STRING ) : STRING
Sets the acquisition method for the resource. Currently, GET/POST/HEAD are supported.
If no parameters are supplied the function simply returns the current acquisition method.
=head2 parse_url ( url STRING ) : host STRING, port INTEGER, dirs ARRAYREF, file STRING, params HASHREF
Takes an URL and decomposes it into easily manipulated datastructures. The output can be fed back into deparse_url to reconstruct an URL.
This function must be imported.
=head2 pending ( tics INTEGER ) : BOOLEAN
If there is data available to be downloaded, this function returns '1', otherwise '0'. This is another function used by GT::URI in it's mass downloads and unlikely to be of any use to anyone using this module directly. This function exists because it is lighter than do_iteration which can be quite a load as opposed to this if there were 100 racked downloads, all being polled every tenth of a second!
=head2 resource_attrib ( [ key STRING ] ) STRING or HASHREF
If a key is requested, function will return the value associated with the resource attribute. If not, the function will return a hashref keyed by server parameter to its corresponding value.
All the server keys have been converted into lower-case. This prevents conflict with two very important keys, ERROR_CODE, and ERROR_MESSAGE, which carry the HTTPS error code and message associated with the aquisition of this page.
=head1 EXAMPLES
=head2 HTTPS get example
#!/usr/bin/perl
use GT::URI::HTTPS;
if ( not @ARGV ) {
print qq!
SYNOPSIS
$0 url [-f/-h] [ cgi_parameter1=value1 cgi_parameter2=value2 ... ]
basic HTTPS requestor
OPTIONS
-f : full information; headers and resource. Usually only a dump of the resource is provided.
-h : just the headers\n\n!;
exit;
}
# parse out the command line
# first argument, URL
$url = shift @ARGV;
# next arguments, parameters
foreach my $item ( @ARGV ) {
# ... check for special requests
if ( $item =~ /^-([fd])$/ ) {
$mode = $1;
}
# ... is not a special request, but probably a parameter
( $key, $value ) = ( $item =~ /([^=]+)=(.*)/ );
$key ||= $item;
push @{$parameters->{$key}}, $value;
}
# setup and send the request
$http = new GT::URI::HTTPS(
# if we're only looking to use the head
request_method => ( $mode eq 'h' ? 'HEAD' : 'GET' )
);
$http->fetch( $url, $parameters );
# get the resource
while ( not defined ( $doc = $http->do_iteration(-1) ) ) {}
# and print out the headers if wanted
if ( $mode ) {
$headers = $http->resource_attrib();
foreach $key ( sort keys %{$headers || {}} ) {
print "$key: $headers->{$key}\n";
}
print "\n";
}
# and output the resource...
print $doc;
=head1 COPYRIGHT
Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved.
http://www.gossamer-threads.com/
=head1 VERSION
Revision: $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $
=cut