# ================================================================== # Gossamer Threads Module Library # # GT::URI::HTTP # Author : Aki Mimoto (support@gossamer-threads.com) # CVS Info : 087,071,086,086,085 # $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $ # # Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved. # ================================================================== # # Description: # Gets HTTP data # package GT::URI::HTTPS; # =============================================================== use strict; use lib '..'; use GT::Socket; use GT::Base; use GT::URI::HTTP qw/ build_path build_parameters /; use Net::SSLeay qw(die_now die_if_ssl_error) ; use Exporter (); use vars qw/$ATTRIBS $EOL @ISA $DEBUG @EXPORT_OK $ERRORS/; $ERRORS = { 'NOSSLCTX' => 'Failed to create SSL_CTX: %s', 'NOSSL' => 'Failed to create SSL: %s', 'CTXOPTIONS' => 'Failed to set SSL CTX options: %s', 'NOSSLCONNECT' => 'Failed to connect ssl', 'CANTCONN' => 'Cannot connect to server' }; @ISA = ('GT::URI::HTTP', 'GT::Base', 'Exporter'); $EOL = "\015\012"; $DEBUG = 0; $ATTRIBS = { 'URL' => '', 'request_method' => 'GET', 'parameters' => {}, 'agent_name' => 'Mozilla/4.73 [en]', 'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586', 'headers' => {}, 'cookies' => {}, 'resource_attribs' => {}, 'resource_data' => undef, 'max_down' => 200000, 'debug' => 0, 'ctx' => undef, 'ssl' => undef, 'cipher' => undef, }; @EXPORT_OK = qw/ parse_url deparse_url build_path build_parameters /; Net::SSLeay::load_error_strings(); Net::SSLeay::SSLeay_add_ssl_algorithms(); Net::SSLeay::randomize(); sub pending { #------------------------------------------------------------------------------- # return true if there is some data to be picked up my $self = shift; my $tics = shift; return if ( $self->{resource_downloaded} ); my $sock = $self->{'sock'} or return 1; return $sock->pending(); } sub EOF { #------------------------------------------------------------------------------- # return true if the system has finished downloading my $self = shift; my $sock = $self->{'sock'}; return ( $sock and $sock->fh() ? $self->{resource_downloaded} : 1 ); } sub do_iteration { #------------------------------------------------------------------------------- # useful in a loop, this is a generally non blocking way to get a page # it will return nulls until it gets the page (and returns the page) # my ( $self, $fh, $sock, $pause ); $self = shift; $pause = shift; $sock = $self->{'sock'}; # if we haven't done the request yet if ( not( $self->{'resource_downloaded'} or $sock ) and $self->{URL} ) { $self->fetch() or return ''; $sock = $self->{'sock'}; return; } # if we can't connect if ( $self->{'flag_err'} ) { return ''; } $sock->fh() or return; # only if the resource hasn't been downloaded yet.., if ( !$self->{'resource_downloaded'} ) { # if there's data pending, download it! if ( $sock->pending( $pause ) ) { # get as much data as we can my $ssl = $self->{'ssl'}; my $buf = Net::SSLeay::read($ssl); if ( $buf ) { $self->{'resource_data'} .= $buf; # if we need to parse the server headers $self->_parse_server_headers(); } # we've run out of data, let's stop else { $self->{resource_downloaded} = 1; return $self->{'resource_data'}; Net::SSLeay::free ($ssl); Net::SSLeay::CTX_free ($self->{'ctx'}); } } return; } else { return $self->{'resource_data'}; } } sub fetch { #------------------------------------------------------------------------------- # fetches a page and handles CGI if requested # my ( $self, $url, $parameters, $sock, $request_str, $fh ); $self = shift; $url = shift || $self->{'URL'} or return undef; $parameters = shift || $self->{'parameters'}; # save the headers $self->{'URL'} = $url; $self->{'parameters'} = $parameters; my ($host, $port, $dirs, $file, $params) = parse_url($url); # see if we can connect to the host $sock = GT::Socket->open({ 'host' => $host, 'port' => $port, '_debug' => $self->{_debug}, 'debug' => $self->{debug}, 'max_down' => $self->{max_down} }); # submit the request to the host if ( $sock ) { $fh = $sock->fh(); # The network connection is now open, lets fire up SSL my $ctx = $self->{'ctx'} = Net::SSLeay::CTX_new() or return $self->error("NOSSLCTX", 'WARN', $!); Net::SSLeay::CTX_set_options($ctx, &Net::SSLeay::OP_ALL) and return $self->error("CTXOPTIONS", 'WARN'); my $ssl = $self->{'ssl'} = Net::SSLeay::new($ctx) or return $self->error("NOSSL", 'WARN', $!); Net::SSLeay::set_fd($ssl, fileno($fh)); my $res = $self->{'res'} = Net::SSLeay::connect($ssl);# and return $self->error( "NOSSLCONNECT", 'WARN' ); $self->{'cipher'} = Net::SSLeay::get_cipher($ssl); $self->debug( "Connected to $url" ) if ($self->{_debug}); $request_str = $self->_create_request( $url, $parameters, $host, $port, $dirs, $file, $params ); $self->debug( "Sent request:\n$request_str" ) if ($self->{_debug}); $res = Net::SSLeay::write($ssl, $request_str); } else { $self->debug( "Couldn't connect to $url" ) if ($self->{_debug}); $self->{resource_attribs} = { ERROR_CODE => 'CANTCONN', ERROR_MESSAGE => 'Cannot connect to server' }; $self->{resource_downloaded} = 1; $self->{flag_err} = 1; } # store the socket return($self->{'sock'} = $sock); } sub parse_url { #------------------------------------------------------------------------------- # parses the url and converts it into host # assume HTTP/1.0 format # my ( $url, $host, $port, $path, $file, $page, @dirs, $param, %params, $item ); $url = shift; # first, do the initial parse $url =~ s/^[a-zA-Z]+:\/\///i; ($host, $port, $path, $param) = $url =~ /([\w.\-]*):?(\d*)\/?([^?]*)\??(.*)/; $port ||= 443; # now handle the paths $path =~ s/^\s*(.*?)\s*$/$1/; # the paths are not simply directories if ( $path !~ /\/$/ ) { $path =~ s/([^\/]*)$//; $file = $1; } # handle the directories @dirs = grep $_, split '/', $path; # and finally parse out the parameters if there are any foreach $item ( split /&/, $param ) { if ( $item =~ /^(.*?)=(.*)$/ ) { push @{$params{$1}}, $2; } else { push @{$params{$item}}, undef; } } # now return the collection return( $host, $port, \@dirs, $file, \%params ); } sub deparse_url { #------------------------------------------------------------------------------- # takes a number of parameters and builds the request url my ( $url, $host, $port, $path, $file, $page, $dirs, $param, $params, $item ); ( $host, $port, $dirs, $file, $params ) = @_; $url = $host; # build the host portion $url .= ":$port" unless ($port == 443); # now build the page access $url .= "/" . build_path( $dirs, $file ); # and finally handle the parameters if any if ( $params and keys %{$params} ) { $url .= "?" . build_parameters( $params ); } return $url; } sub get { #------------------------------------------------------------------------------- # simple blocking method to get some data my ( $uri, $options, $HTTPS, $doc ); shift; # get rid of the first parameter $uri = shift or return undef; $options = ( @_ and ref $_[0] ) ? shift : { @_ }; $HTTPS = new GT::URI::HTTPS($options); $HTTPS->fetch($uri); while ( not defined( $doc = $HTTPS->do_iteration(0) ) ) {}; return $doc; } 1; __END__ =head1 NAME GT::URI::HTTPS - HTTPS request broker. Can be used stand-alone or through GT::URI =head1 SYNOPSIS use GT::URI::HTTPS; print GT::URI::HTTPS->get( "http://www.gossamer-threads.com" ); =head1 DESCRIPTION GT::URI::HTTPS makes requests and retrieves resources from http servers (not limited to text). =head1 Method List Socket Handling sub pending() Returns true if data awaiting sub EOF() Returns open/closed status of socket sub gulp_read() Alias to do_iteration sub do_iteration() Basic looping function that downloads resources in the background Acquisition sub fetch() Tell the object which URL to acquire sub method() The method of acquisition sub load_parameter() Add a item for CGI parameters sub delete_parameter() Delete a CGI parameter sub resource_attrib() Headers related to resource and server sub get() Simple resource aquisition function Support Methods (must be imported) sub parse_url() Decomposes a URL into constituent parts sub deparse_url() Takes those parts and builds an URL sub build_path() Takes a list of directories and builds a path sub build_parameters() Takes a hash of parameter->values and builds a CGI request string =head1 Basics =head2 Getting a resource, the simple way Just want a single item? Call GT::URI::HTTPS->get and all the magic will be done for you. use GT::URI::HTTPS; my $buf = GT::URI::HTTPS->get( "http://www.gossamer-threads.com/" ); Get based requests are permissable as well: use GT::URI::HTTPS; my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads" ); If extra options need to be set, simply append the options to the parameter list, like follows. use GT::URI::HTTPS; my $buf = GT::URI::HTTPS->get( "http://search.yahoo.com/bin/search?p=gossamer+threads", { request_method => 'POST' } ); =head2 When just the document is not enough If a new GT::URI::HTTPS object is instantiated, much more control is available, including facilities for non-blocking downloading of pages. To create a GT::URI::HTTPS object, call new with all the options required: use GT::URI::HTTPS; my $http = new GT::URI::HTTPS( # URL to acquire (optional) 'URL' => '', # Can also be set to POST/GET/HEAD (optional) 'request_method' => 'GET', # a hash of keys pointing to an arrayref of values to be sent to the server # { # 'key' => [ 'value1', 'value2'... ], # } # (optional) 'parameters' => {}, # Name portion of the User-Agent: string the server acquires (optional) 'agent_name' => 'Mozilla/4.73 [en]', # Host-from portion of the User-Agent: string the server acquires (optional) 'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586', # To prevent downloading of 80Tb files, but if you still wanted to, set this to 0 (optional) 'max_down' => 200000 ); If URL has been specified in the options, for interactions with a CGI, you can set extra parameters with $http->load_parameter(). Finally, loop on $http->do_iteration() until the value is defined. To replicate the "simple get" example: use GT::URI::HTTPS; $|++; my $http = new GT::URI::HTTPS( URL => 'http://search.yahoo.com/bin/search', # can also use the following: parameters => { 'p' => [ 'gossamer threads' ] } ); my $doc; while ( not defined( $doc = $http->do_iteration() ) ) { # do something here while waiting for the resource to arrive print "." } print $doc, "\n\n"; Beyond the resource, the http server often supplies extra information in a header. To access this information, use $http->resource_attrib(). Appending this code to the previous example, a list of all the associated server headers can be seen: my $attribs = $http->resource_attrib(); foreach my $key ( sort keys %{$attribs} ) { print "$key => $attribs->{$key}\n"; } =head2 Support Methods In addition to the basic fetching abilities, since the module must parse HTTPS URLs, the methods used to do so have been made public. These methods decompose URLs into datastructures that make URLs easily studied or modified and then reconstructed. However, these routines have not been polished for useability so beware! The following is a very basic example: use GT::URI::HTTPS qw/ parse_url deparse_url build_path build_parameters /; # fragment the URL my ( $host, $port, $dirs, $file, $params ) = parse_url( 'http://www.gossamer-threads.com/perl/forum/showflat.pl?Cat=&Board=GosDisc&Number=113355&page=0&view=' ); print "Parsed Data:\n\n"; print "Host: $host\n"; print "Port: $port\n"; print "Dirs:\n"; foreach my $dir ( @{$dirs} ) { print " $dir/\n"; } print "Resource Filename: $file\n"; print "Params:\n"; foreach my $key ( sort keys %{$params} ) { print " $key: "; my $values = ( $params->{$key} || {} ); foreach my $value ( sort @{$values} ) { print "'", quotemeta($value), "' "; } print "\n"; } # put the data back together again my $url = deparse_url( $host, $port, $dirs, $file, $params ); print "\nDeparsed Data:\n\n"; print "URL: http://$url\n"; =head1 Methods List =head2 build_path ( dir ARRAYREF, [ page STRING ] ) : STRING Takes an array ref of directory names and an optional filename and returns a filepath. use GT::URI::HTTPS qw/ build_path /; print build_path( [ 'topdir', 'middir', 'bottomdir' ], 'file.html' ); This function must be imported. =head2 build_parameters ( parameter HASHREF ) : STRING Builds a CGI request string from list of keys and values. The function has the ability to handle keys with more than one parameter, simply use an arrayref with multiple values. use GT::URI::HTTPS qw/ build_parameters /; my $params = { 'simplekey' => 'value' 'onekey' => [ 'one value' ], 'anotherkey' => [ 'another value', 'and yet anotherone!' ], }; print build_parameters($params); This function must be imported. =head2 delete_parameter ( keys ARRAYREF/ARRAY ) When loading the object with parameters before a request, it is possible to delete an entire set of keys and values. =head2 deparse_url ( host STRING, [ port STRING, [ dirs ARRAYREF, [ file STRING, [ params HASH ] ] ] ] ) : STRING This builds an entire URL from basic parameters. For an example of this function, see the example in "Support Methods". This function must be imported. =head2 do_iteration ( tics INTEGER ) : STRING The basic iteration function. This function will return undef until the resource is received which, upon receipt will return the resource data. The function can return an empty string, so it is important to checked defined'ness. If the return is an empty string, check the ERROR_CODE in resource_attrib to find out if the script simply can't connect to the host or the resource is empty. =head2 EOF () : BOOLEAN Returns '1' or '0' depending if the object has stopped receiving/sending data to the remote server. =head2 fetch ( url STRING, [ parameters HASHREF ] ) Tells the server the URL to retreive the resource of. If CGI parameters are required pass in a hash of keys and values. =head2 GT::URI::HTTPS->get ( url, [ options HASH/HASHREF ] ) : RESOURCE_DATA Simplest resource aquision method. Give it the URL and any options and the function will return after the resource has been downloaded. =head2 gulp_read ( tic INTEGER ) : RESOURCE_DATA This is just an alias to the function do_iteration. This method is used by GT::URI in its mass resource aquisition runs. Unless you feel like being different, you shouldn't need to use this. =head2 load_parameter ( params HASH/HASHREF ) : HASHREF Takes a list of keys and values and loads the values into the list of CGI parameters to be sent to the remote server. =head2 method ( method STRING ) : STRING Sets the acquisition method for the resource. Currently, GET/POST/HEAD are supported. If no parameters are supplied the function simply returns the current acquisition method. =head2 parse_url ( url STRING ) : host STRING, port INTEGER, dirs ARRAYREF, file STRING, params HASHREF Takes an URL and decomposes it into easily manipulated datastructures. The output can be fed back into deparse_url to reconstruct an URL. This function must be imported. =head2 pending ( tics INTEGER ) : BOOLEAN If there is data available to be downloaded, this function returns '1', otherwise '0'. This is another function used by GT::URI in it's mass downloads and unlikely to be of any use to anyone using this module directly. This function exists because it is lighter than do_iteration which can be quite a load as opposed to this if there were 100 racked downloads, all being polled every tenth of a second! =head2 resource_attrib ( [ key STRING ] ) STRING or HASHREF If a key is requested, function will return the value associated with the resource attribute. If not, the function will return a hashref keyed by server parameter to its corresponding value. All the server keys have been converted into lower-case. This prevents conflict with two very important keys, ERROR_CODE, and ERROR_MESSAGE, which carry the HTTPS error code and message associated with the aquisition of this page. =head1 EXAMPLES =head2 HTTPS get example #!/usr/bin/perl use GT::URI::HTTPS; if ( not @ARGV ) { print qq! SYNOPSIS $0 url [-f/-h] [ cgi_parameter1=value1 cgi_parameter2=value2 ... ] basic HTTPS requestor OPTIONS -f : full information; headers and resource. Usually only a dump of the resource is provided. -h : just the headers\n\n!; exit; } # parse out the command line # first argument, URL $url = shift @ARGV; # next arguments, parameters foreach my $item ( @ARGV ) { # ... check for special requests if ( $item =~ /^-([fd])$/ ) { $mode = $1; } # ... is not a special request, but probably a parameter ( $key, $value ) = ( $item =~ /([^=]+)=(.*)/ ); $key ||= $item; push @{$parameters->{$key}}, $value; } # setup and send the request $http = new GT::URI::HTTPS( # if we're only looking to use the head request_method => ( $mode eq 'h' ? 'HEAD' : 'GET' ) ); $http->fetch( $url, $parameters ); # get the resource while ( not defined ( $doc = $http->do_iteration(-1) ) ) {} # and print out the headers if wanted if ( $mode ) { $headers = $http->resource_attrib(); foreach $key ( sort keys %{$headers || {}} ) { print "$key: $headers->{$key}\n"; } print "\n"; } # and output the resource... print $doc; =head1 COPYRIGHT Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved. http://www.gossamer-threads.com/ =head1 VERSION Revision: $Id: HTTPS.pm,v 1.10 2004/08/23 20:07:44 jagerman Exp $ =cut