# ================================================================== # Gossamer Threads Module Library # # GT::URI::HTTP # Author : Aki Mimoto (support@gossamer-threads.com) # CVS Info : 087,071,086,086,085 # $Id: HTTP.pm,v 1.30 2002/06/27 18:36:02 aki Exp $ # # Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved. # ================================================================== # # Description: # Gets HTTP data # package GT::URI::HTTP; # =============================================================== use strict; use GT::Socket; use GT::Base; use GT::CGI; use Exporter (); use vars qw/$ATTRIBS $EOL @ISA $DEBUG @EXPORT_OK $BASE_PORT /; @ISA = ('GT::Base', 'Exporter'); $EOL = "\015\012"; $DEBUG = 0; $ATTRIBS = { URL => '', request_method => 'GET', parameters => {}, agent_name => 'Mozilla/4.73 [en]', agent_host => 'X11; I; Linux 2.2.15-4mdk i586', headers => {}, cookies => {}, resource_attribs => {}, resource_data => undef, max_down => 200000, debug => 0 }; @EXPORT_OK = qw/ parse_url deparse_url build_path build_parameters /; sub pending { #------------------------------------------------------------------------------- # return true if there is some data to be picked up my $self = shift; my $tics = shift; my $sock = $self->{'sock'} or return 1; return $sock->pending(); } sub EOF { #------------------------------------------------------------------------------- # return true if the system has finished downloading my $self = shift; my $sock = $self->{'sock'}; return ( $sock and $sock->fh() ? $sock->EOF() : 1 ); } sub gulp_read { #------------------------------------------------------------------------------- my $self = shift; my $tics = shift; return $self->do_iteration( $tics ); } sub resource_attrib { #------------------------------------------------------------------------------- my $self = shift; my $attrib = shift; my $attribs = $self->{resource_attribs} || {}; return $attrib ? $attribs->{$attrib} : $attribs; } sub do_iteration { #------------------------------------------------------------------------------- # useful in a loop, this is a generally non blocking way to get a page # it will return nulls until it gets the page (and returns the page) # my ( $self, $fh, $sock, $pause ); $self = shift; $pause = shift; $sock = $self->{'sock'}; # if we haven't done the request yet if ( not( $self->{'resource_downloaded'} or $sock ) and $self->{URL} ) { $self->fetch(); $sock = $self->{'sock'}; return; } # if we can't connect if ( $self->{'flag_err'} ) { return ''; } $sock->fh() or return; # only if the resource hasn't been downloaded yet.., if ( !$self->{'resource_downloaded'} ) { # if there's data pending, download it! if ( $sock->pending( $pause ) ) { # get as much data as we can while ( my $buf = $sock->gulpread() ) { $self->{'resource_data'} .= $buf; } # if we need to parse the server headers $self->_parse_server_headers(); } # we've run out of data, let's stop if ( $sock->EOF() ) { # make sure that the object know $self->{'resource_downloaded'} = 1; return $self->{'resource_data'}; } return; } else { return $self->{'resource_data'}; } } sub fetch { #------------------------------------------------------------------------------- # fetches a page and handles CGI if requested # my ( $self, $url, $parameters, $sock, $request_str, $host, $port, $dirs, $file, $params, $fh, $order ); $self = shift; $url = shift || $self->{'URL'} or return undef; $parameters = shift || $self->{'parameters'}; # save the headers $self->{'URL'} = $url; $self->{'parameters'} = $parameters; ( $host, $port, $dirs, $file, $params, $order ) = parse_url($url); # see if we can connect to the host $sock = GT::Socket->open({ 'host' => $host, 'port' => $port, '_debug' => $self->{_debug}, 'debug' => $self->{debug}, 'max_down' => $self->{max_down} }); # submit the request to the host if ( $sock ) { $fh = $sock->fh(); $self->debug( "Connected to $url" ) if ($self->{_debug}); $request_str = $self->_create_request( $url, $parameters, $host, $port, $dirs, $file, $params, $order ); $self->debug( "Sent request:\n$request_str" ) if ($self->{_debug}); print $fh $request_str; } else { $self->debug( "Couldn't connect to $url. Reason: $GT::Socket::error" ) if ($self->{_debug}); $self->{resource_attribs} = { ERROR_CODE => 'CANTCONN', ERROR_MESSAGE => 'Cannot connect to server. Reason: $GT::Socket::error' }; $self->{resource_downloaded} = 1; $self->{flag_err} = 1; } # store the socket return( $self->{'sock'} = $sock ); } sub _parse_server_headers { #------------------------------------------------------------------------------- # this next part parses the server headers # my ( $self, $doc, $fh, $sock, $line, $error_code, $error_message, %resource_attribs, @lines ); $self = shift; $doc = shift || $self->{'resource_data'}; if ( !$self->{'headers_parsed'} ) { # check is there is a full header yet ( $doc =~ s/((.*?))(\012\015\012\015|\015\012\015\012|\015\015|\012\012)//so ) or return undef; $line = $1; $self->{'resource_data'} = $doc; $line =~ s/(\012\015|\015\012)/\n/g; # now get all the lines of the header.., @lines = grep { not /^(\n|\r)+$/ } split /(\n|\r)/, $line; # read the first header, which is the status line $line = shift @lines; ($error_code, $error_message) = $line =~ /[^ ]*[ ]*(\d*)(.*)/; # loop until we're done the headers while ( $line = shift @lines ) { if ( $line =~ /^(.*?): (.*)/ ) { $resource_attribs{lc($1)} = $2; } } $resource_attribs{ERROR_CODE} = $error_code; $resource_attribs{ERROR_MESSAGE} = $error_message; # now parse cookies since we're dealing in http if ( $resource_attribs{'set-cookie'} ) { $resource_attribs{'COOKIE'} = _parse_cookie_string( $resource_attribs{'set-cookie'} ); } # add the headers to the local object $self->{'resource_attribs'} = \%resource_attribs; # now set the headers parsed flag $self->{'headers_parsed'} = 1; } return \%resource_attribs; } sub _parse_cookie_string { #------------------------------------------------------------------------------- my $str = shift; my @cookie = grep $_, map { s/^\s*(.*?)\s*$/$1/; $_ } split /[;]/, $str; # as first element of the cookie is a hash my $value = shift @cookie; my %segments = map { /^([^=]*)=(.*)$/ ? ( lc($1) => $2 ) : ( $_ => undef ) } @cookie; if ( $value =~ /^([^=]*)=(.*)$/ ) { $segments{name} = GT::CGI::unescape($1); $segments{value} = GT::CGI::unescape($2); } else { $segments{name} = GT::CGI::unescape($value); $segments{value} = undef; }; return \%segments; } sub _hash_to_phash { #------------------------------------------------------------------------------- # creates a parameter hash from a standard hash my $hash = shift or return undef; my $phash = { map { $_ => [ $$hash{$_} ] } keys %{$hash} }; return $phash; } sub _combine_phash { #------------------------------------------------------------------------------- my ( $a, $b ) = @_; foreach my $key ( %$b ) { my $container = $a->{$key}; if ( ref $container ) { push @{$container}, @{$b->{$key}}; } else { $a->{$key} = [ $container, @{$b->{$key}} ]; } } return $a; } sub _create_request { #------------------------------------------------------------------------------- # creates the http request to be submitted to the server my ( $sock, $fh, $get_str, $req, $headers, $method, $tmp ); my ( $self, $url, $parameters, $host, $port, $dirs, $file, $params, $order ) = @_; $method = lc($self->{'request_method'}); # now handle the request $get_str = uc($method) . ' /' . build_path($dirs,$file); # if this is a get request, if there are any parameters we want to pass, handle that if ( $method eq 'get' ) { $tmp = build_parameters( _combine_phash( $params, _hash_to_phash($parameters) ), $order ); $get_str.= $tmp ? "?$tmp" : ""; } $get_str .= " HTTP/1.0$EOL"; # handle the host field..., $get_str .= "Host: $host" . ( ( $port == 80 ) ? "" : ":$port" ) . $EOL; # handle the useragent field $get_str .= "User-Agent: $self->{agent_name} ($self->{agent_host})" . $EOL; $get_str .= "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*" . $EOL; $get_str .= "Accept-Language: en" . $EOL; $get_str .= "Accept-Charset: iso-8859-1,*,utf-8" . $EOL; $get_str .= "Connection: close" . $EOL; # handle cookies if ( my $cookiestr = $self->_create_cookie_request() ) { $get_str .= $cookiestr . $EOL; } # handle any extra headers $headers = $self->{'headers'}; $get_str .= join( "", map { $_ = ucfirst; "$_: $$headers{$_}$EOL"; } keys %{$headers} ); # handle if the request is a post if ( $method eq 'post' ) { my $request = build_parameters( { %{$params}, %{_hash_to_phash($parameters)} } ); $get_str .= "Content-type: application/x-www-form-urlencoded$EOL" . "Content-length: " .length($request)."$EOL$EOL". $request; } else { $get_str .= "$EOL$EOL"; } return $get_str; } sub cookie { } sub _create_cookie_request { #------------------------------------------------------------------------------- # creates the string the client sends to the server for a cookie my $self = shift; my $cookies = $self->{'cookies'}; my $str = join "; ", grep $_, map { my $value = $cookies->{$_}; if ( ref $value eq 'GT::CGI::Cookie' ) { GT::CGI::escape($value->{-name}) . "=" . GT::CGI::escape( $value->{-value} ); } elsif ( ref $value eq 'HASH' ) { GT::CGI::escape($value->{name}) . "=" . GT::CGI::escape( $value->{value} ); } elsif ( $value and not ref $value ) { GT::CGI::escape($_) . "=" . GT::CGI::escape($value); } } keys %{$cookies}; return $str ? "Cookie: $str" : ""; } sub method { #------------------------------------------------------------------------------- # sets the acquisition method for parameter quering # my $self = shift; my $method = shift or return $self->{'form_method'}; if ( lc $method eq 'get' ) { $self->{'form_method'} = lc $method; } elsif ( lc $method eq 'post' ) { $self->{'form_method'} = lc $method; } elsif ( lc $method eq 'head' ) { $self->{'form_method'} = lc $method; } return $self->{'form_method'}; } sub load_parameter { #------------------------------------------------------------------------------- # loads the passed parameters into the database my ( $self, $params, $key, $parameters ); $self = shift; $params = ( ref $_[0] ? $_[0] : { @_ } ) or return undef; $parameters = $self->{'parameters'}; # add each entry into the object's attribs foreach $key ( keys %{$params} ) { $$parameters{ $key } = $$params{ $key }; } return $self->{'parameters'}; } sub delete_parameter { #------------------------------------------------------------------------------- # deletes the named parameters my ( $self, $params, $key, $parameters ); $self = shift; $params = ref $_[0] ? $_[0] : [ @_ ]; $parameters = $self->{'parameters'}; # now delete the parameters foreach $key ( @{$params} ) { delete $$parameters{$key}; } } sub parse_url { #------------------------------------------------------------------------------- # parses the url and converts it into host # assume HTTP/1.0 format # my ( $url, $host, $port, $path, $file, $page, @dirs, $param, %params, $item, @order ); $url = shift; # first, do the initial parse $url =~ s/^[a-zA-Z]+:\/\///i; $url =~ /([\w.\-]*):?(\d*)\/?([^?]*)\??(.*)/i; $host = $1; $port = $2 || 80; $path = $3; $param = $4; # now handle the paths $path =~ s/^\s*(.*?)\s*$/$1/; # the paths are not simply directories if ( $path !~ /\/$/ ) { $path =~ s/([^\/]*)$//; $file = $1; } # handle the directories @dirs = grep $_, split /\//, $path; # and finally parse out the parameters if there are any foreach $item ( split /\&/, $param ) { if ( $item =~ /^(.*?)=(.*?)$/ ) { push @order, $1; push @{$params{$1}}, $2; } else { push @order, $item; push @{$params{$item}}, undef; } } # now return the collection return ( $host, $port, \@dirs, $file, \%params, \@order ); } sub deparse_url { #------------------------------------------------------------------------------- # takes a number of parameters and builds the request url my ( $url, $host, $port, $path, $file, $page, $dirs, $param, $params, $item ); ( $host, $port, $dirs, $file, $params ) = @_; $url = $host; # build the host portion if ( $port != 80 ) { $url .= ":80"; } # now build the page access $url .= "/" . build_path( $dirs, $file ); # and finally handle the parameters if any if ( $params and keys %{$params} ) { $url .= "?" . build_parameters( $params ); } return $url; } sub build_path { #------------------------------------------------------------------------------- # takes a dir array and builds a path my ( $path, $dirs, $page ); $dirs = shift; $page = shift; if ( $dirs ) { $path = join( "/", grep $_, @{$dirs} ); $path &&= "$path/"; } return $path .= $page; } sub build_parameters { #------------------------------------------------------------------------------- # takes a hash of parameters and builds a string with it my ( $params, $paramstr, @paramlist, $order ); $params = shift or return undef; $order = shift; if ( keys %{$params} ) { # Deep copy hash and array refs only. my $tmp; while (my ($k, $v) = each %$params) { if (! ref $v) { $tmp->{$k} = $v; } elsif (ref $v eq 'HASH') { $tmp->{$k} = {}; foreach my $k1 (keys %{$params->{$k}}) { $tmp->{$k}->{$k1} = $params->{$k}->{$k1}; } } elsif (ref $v eq 'ARRAY') { $tmp->{$k} = []; foreach my $v1 (@{$params->{$k}}) { push @{$tmp->{$k}}, $v1; } } else { $tmp->{$k} = $v; } } if ( $order ) { foreach my $key ( @$order ) { my $value = shift @{$tmp->{$key} || []}; if ( $tmp->{$key} and not @{$tmp->{$key}} ) { delete $tmp->{$key} } $key = GT::CGI::escape( $key ); push @paramlist, ( defined $value ? "$key=$value" : $key ); } } require GT::CGI; $paramstr = join "&", ( @paramlist, map { my $key = GT::CGI::escape($_); my $list = $tmp->{$_}; ( ref $list eq 'ARRAY' ) ? join "&", map { "$key" . ( defined $_ ? "=". GT::CGI::escape($_) : '' ); } @{$list} : "$key=$list"; } keys %{$tmp} ); } return $paramstr; } sub get { #------------------------------------------------------------------------------- # simple blocking method to get some data my ( $uri, $options, $HTTP, $doc ); shift; # get rid of the first parameter $uri = shift or return undef; $options = ( @_ and ref $_[0] ) ? shift : {@_}; $HTTP = new GT::URI::HTTP($options); $HTTP->fetch($uri); while ( not defined( $doc = $HTTP->do_iteration(-1) ) ) {}; return $doc; } 1; __END__ =head1 NAME GT::URI::HTTP - HTTP request broker. =head1 SYNOPSIS use GT::URI::HTTP; print GT::URI::HTTP->get( "http://www.gossamer-threads.com" ); =head1 DESCRIPTION GT::URI::HTTP makes requests and retrieves resources from http servers (not limited to text). Can be used stand-alone or through GT::URI =head1 Method List Socket Handling sub pending() Returns true if data awaiting sub EOF() Returns open/closed status of socket sub gulp_read() Alias to do_iteration sub do_iteration() Basic looping function that downloads resources in the background Acquisition sub fetch() Tell the object which URL to acquire sub method() The method of acquisition sub load_parameter() Add a item for CGI parameters sub delete_parameter() Delete a CGI parameter sub resource_attrib() Headers related to resource and server sub get() Simple resource aquisition function Support Methods (must be imported) sub parse_url() Decomposes a URL into constituent parts sub deparse_url() Takes those parts and builds an URL sub build_path() Takes a list of directories and builds a path sub build_parameters() Takes a hash of parameter->values and builds a CGI request string =head1 Basics =head2 Getting a resource, the simple way Just want a single item? Call GT::URI::HTTP->get and all the magic will be done for you. use GT::URI::HTTP; my $buf = GT::URI::HTTP->get( "http://www.gossamer-threads.com/" ); Get based requests are permissable as well: use GT::URI::HTTP; my $buf = GT::URI::HTTP->get( "http://search.yahoo.com/bin/search?p=gossamer+threads" ); If extra options need to be set, simply append the options to the parameter list, like follows. use GT::URI::HTTP; my $buf = GT::URI::HTTP->get( "http://search.yahoo.com/bin/search?p=gossamer+threads", { request_method => 'POST' } ); =head2 When just the document is not enough If a new GT::URI::HTTP object is instantiated, much more control is available, including facilities for non-blocking downloading of pages. To create a GT::URI::HTTP object, call new with all the options required: use GT::URI::HTTP; my $http = new GT::URI::HTTP( # URL to acquire (optional) 'URL' => '', # Can also be set to POST/GET/HEAD (optional) 'request_method' => 'GET', # a hash of keys pointing to an arrayref of values to be sent to the server # { # 'key' => [ 'value1', 'value2'... ], # } # (optional) 'parameters' => {}, # Name portion of the User-Agent: string the server acquires (optional) 'agent_name' => 'Mozilla/4.73 [en]', # Host-from portion of the User-Agent: string the server acquires (optional) 'agent_host' => 'X11; I; Linux 2.2.15-4mdk i586', # To prevent downloading of 80Tb files, but if you still wanted to, set this to 0 (optional) 'max_down' => 200000 ); If URL has been specified in the options, for interactions with a CGI, you can set extra parameters with $http->load_parameter(). Finally, loop on $http->do_iteration() until the value is defined. To replicate the "simple get" example: use GT::URI::HTTP; $|++; my $http = new GT::URI::HTTP( URL => 'http://search.yahoo.com/bin/search', # can also use the following: parameters => { 'p' => [ 'gossamer threads' ] } ); my $doc; while ( not defined( $doc = $http->do_iteration() ) ) { # do something here while waiting for the resource to arrive print "." } print $doc, "\n\n"; Beyond the resource, the http server often supplies extra information in a header. To access this information, use $http->resource_attrib(). Appending this code to the previous example, a list of all the associated server headers can be seen: my $attribs = $http->resource_attrib(); foreach my $key ( sort keys %{$attribs} ) { print "$key => $attribs->{$key}\n"; } =head2 Support Methods In addition to the basic fetching abilities, since the module must parse HTTP URLs, the methods used to do so have been made public. These methods decompose URLs into datastructures that make URLs easily studied or modified and then reconstructed. However, these routines have not been polished for useability so beware! The following is a very basic example: use GT::URI::HTTP qw/ parse_url deparse_url build_path build_parameters /; # fragment the URL my ( $host, $port, $dirs, $file, $params ) = parse_url( 'http://www.gossamer-threads.com/perl/forum/showflat.pl?Cat=&Board=GosDisc&Number=113355&page=0&view=' ); print "Parsed Data:\n\n"; print "Host: $host\n"; print "Port: $port\n"; print "Dirs:\n"; foreach my $dir ( @{$dirs} ) { print " $dir/\n"; } print "Resource Filename: $file\n"; print "Params:\n"; foreach my $key ( sort keys %{$params} ) { print " $key: "; my $values = ( $params->{$key} || {} ); foreach my $value ( sort @{$values} ) { print "'", quotemeta($value), "' "; } print "\n"; } # put the data back together again my $url = deparse_url( $host, $port, $dirs, $file, $params ); print "\nDeparsed Data:\n\n"; print "URL: http://$url\n"; =head1 Methods List =head2 build_path ( dir ARRAYREF, [ page STRING ] ) : STRING Takes an array ref of directory names and an optional filename and returns a filepath. use GT::URI::HTTP qw/ build_path /; print build_path( [ 'topdir', 'middir', 'bottomdir' ], 'file.html' ); This function must be imported. =head2 build_parameters ( parameter HASHREF ) : STRING Builds a CGI request string from list of keys and values. The function has the ability to handle keys with more than one parameter, simply use an arrayref with multiple values. use GT::URI::HTTP qw/ build_parameters /; my $params = { 'simplekey' => 'value' 'onekey' => [ 'one value' ], 'anotherkey' => [ 'another value', 'and yet anotherone!' ], }; print build_parameters($params); This function must be imported. =head2 delete_parameter ( keys ARRAYREF/ARRAY ) When loading the object with parameters before a request, it is possible to delete an entire set of keys and values. =head2 deparse_url ( host STRING, [ port STRING, [ dirs ARRAYREF, [ file STRING, [ params HASH ] ] ] ] ) : STRING This builds an entire URL from basic parameters. For an example of this function, see the example in "Support Methods". This function must be imported. =head2 do_iteration ( tics INTEGER ) : STRING The basic iteration function. This function will return undef until the resource is received which, upon receipt will return the resource data. The function can return an empty string, so it is important to checked defined'ness. If the return is an empty string, check the ERROR_CODE in resource_attrib to find out if the script simply can't connect to the host or the resource is empty. =head2 EOF () : BOOLEAN Returns '1' or '0' depending if the object has stopped receiving/sending data to the remote server. =head2 fetch ( url STRING, [ parameters HASHREF ] ) Tells the server the URL to retreive the resource of. If CGI parameters are required pass in a hash of keys and values. =head2 GT::URI::HTTP->get ( url, [ options HASH/HASHREF ] ) : RESOURCE_DATA Simplest resource aquision method. Give it the URL and any options and the function will return after the resource has been downloaded. =head2 gulp_read ( tic INTEGER ) : RESOURCE_DATA This is just an alias to the function do_iteration. This method is used by GT::URI in its mass resource aquisition runs. Unless you feel like being different, you shouldn't need to use this. =head2 load_parameter ( params HASH/HASHREF ) : HASHREF Takes a list of keys and values and loads the values into the list of CGI parameters to be sent to the remote server. =head2 method ( method STRING ) : STRING Sets the acquisition method for the resource. Currently, GET/POST/HEAD are supported. If no parameters are supplied the function simply returns the current acquisition method. =head2 parse_url ( url STRING ) : host STRING, port INTEGER, dirs ARRAYREF, file STRING, params HASHREF Takes an URL and decomposes it into easily manipulated datastructures. The output can be fed back into deparse_url to reconstruct an URL. This function must be imported. =head2 pending ( tics INTEGER ) : BOOLEAN If there is data available to be downloaded, this function returns '1', otherwise '0'. This is another function used by GT::URI in it's mass downloads and unlikely to be of any use to anyone using this module directly. This function exists because it is lighter than do_iteration which can be quite a load as opposed to this if there were 100 racked downloads, all being polled every tenth of a second! =head2 resource_attrib ( [ key STRING ] ) STRING or HASHREF If a key is requested, function will return the value associated with the resource attribute. If not, the function will return a hashref keyed by server parameter to its corresponding value. All the server keys have been converted into lower-case. This prevents conflict with two very important keys, ERROR_CODE, and ERROR_MESSAGE, which carry the HTTP error code and message associated with the aquisition of this page. =head1 EXAMPLES =head2 HTTP get example #!/usr/bin/perl use GT::URI::HTTP; if ( not @ARGV ) { print qq! SYNOPSIS $0 url [-f/-h] [ cgi_parameter1=value1 cgi_parameter2=value2 ... ] basic HTTP requestor OPTIONS -f : full information; headers and resource. Usually only a dump of the resource is provided. -h : just the headers\n\n!; exit; } # parse out the command line # first argument, URL $url = shift @ARGV; # next arguments, parameters foreach my $item ( @ARGV ) { # ... check for special requests if ( $item =~ /^-([fd])$/ ) { $mode = $1; } # ... is not a special request, but probably a parameter ( $key, $value ) = ( $item =~ /([^=]+)=(.*)/ ); $key ||= $item; push @{$parameters->{$key}}, $value; } # setup and send the request $http = new GT::URI::HTTP( # if we're only looking to use the head request_method => ( $mode eq 'h' ? 'HEAD' : 'GET' ) ); $http->fetch( $url, $parameters ); # get the resource while ( not defined ( $doc = $http->do_iteration(-1) ) ) {} # and print out the headers if wanted if ( $mode ) { $headers = $http->resource_attrib(); foreach $key ( sort keys %{$headers || {}} ) { print "$key: $headers->{$key}\n"; } print "\n"; } # and output the resource... print $doc; =head1 COPYRIGHT Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved. http://www.gossamer-threads.com/ =head1 VERSION Revision: $Id: HTTP.pm,v 1.30 2002/06/27 18:36:02 aki Exp $ =cut