# ==================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::WWW::http # Author: Jason Rhinelander # CVS Info : 087,071,086,086,085 # $Id: http.pm,v 1.31 2005/04/08 19:20:00 jagerman Exp $ # # Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. # ==================================================================== # # Description: # Handles HTTP GT::WWW connections # package GT::WWW::http; use strict; use Carp; use GT::Socket::Client qw/:crlf/; use GT::WWW::http::Response; use GT::WWW::http::Header; use vars qw/@ISA $VERSION/; @ISA = 'GT::WWW'; $VERSION = sprintf "%d.%03d", q$Revision: 1.31 $ =~ /(\d+)\.(\d+)/; use constants BUFLENGTH => 4096, # The amount to read at a time from files to upload CHUNK_SIZE => 64 * 1024; # The default maximum chunk size, if not specified my $valid_username = '\x00-\x08\x0a-\x1f\x7f'; # Find out if Compress::Zlib of at least 1.14 is available. Versions prior to # 1.14 had a "memory overwrite" bug in inflate(), which this module uses. my $have_zlib = eval { require Compress::Zlib; Compress::Zlib->require_version(1.14) } and import Compress::Zlib; sub get { my $self = shift; $self->{redirected} = {}; $self->{redirect} = undef; my $ret = $self->_get; $self->chunk(undef); $ret; } sub _get { # May recurse, if a 3xx status is encountered my ($self, $head) = @_; $self->{port} ||= $self->default_port; $self->{host} or croak 'You must set a host before calling ' . ($head ? 'head()' : 'get()'); $self->{cancelled} = undef; my $request_header = $self->header(); $request_header->replace_header( Host => ($self->{port} == 80 ? $self->{host} : "$self->{host}:$self->{port}") ); $self->_set_basic_headers; CONNECT: { $self->debug("Connecting...") if $self->{debug} >= 2; $self->_connect or return undef; # _connect generates the error $self->debug("Connected.") if $self->{debug} >= 2; $self->{path} = '/' unless defined $self->{path} and length $self->{path}; my $path = $self->_escape_path; if (my $query = $self->query_string) { $path .= "?$query"; } my $header = ($head ? "HEAD" : "GET") . " $path HTTP/"; $header .= $self->{http_10} ? "1.0" : "1.1"; $header .= CRLF; $header .= $request_header; $self->debug("Sending header:\n$header") if $self->{debug} >= 2; $self->send($header) or return $self->error("Could not write to the server: $!"); $self->debug("Header sent") if $self->{debug}; $self->debug("Reading header") if $self->{debug} >= 2; my $read_header = $self->_read_header; if (!$read_header and $head and $self->{reused}) { $self->debug("Assuming broken server sending a body for a HEAD response; reconnecting") if $self->{debug}; delete $self->{sock}; delete $self->{reused}; redo CONNECT; } $read_header or return; $self->debug($self->{debug} >= 2 ? "Read header: $self->{response}->{header}" : "Read header") if $self->{debug}; } unless ($head or $self->response->status == 304) { $self->debug("Reading body") if $self->{debug}; $self->_read_body or return; $self->debug("Read body") if $self->{debug}; if ($self->{cancelled}) { $self->debug("Closing socket (action cancelled in mid-download)") if $self->{debug} >= 2; close $self->{sock}; $self->{connected} = undef; } } elsif (!$head and $self->{debug} >= 2) { $self->debug("Not reading body: Server returned status: 304 " . $self->response->status); } my $code = int($self->response->status); if ($code == 300 or $code == 301 or $code == 302 or $code == 303 or $code == 307) { $self->debug("Server returned status $code - redirect possible") if $self->{debug}; unless ($self->{no_redirect}) { if (my $location = $self->response->header("Location")) { my ($protocol, $username, $password, $host, $port, $path, $query_string) = $self->parse_url($location); if (!$protocol and substr($location, 0, 1) eq '/') { $self->debug("Server returned standard-violating non-absolute Location: $location") if $self->{debug}; if ($self->{strict}) { $self->debug("Strict mode enabled; not following local redirect") if $self->{debug}; } else { my $hrelaxed = '(?:[^\x00-\x08\x0a-\x1f#%/?]|%[0-9A-Fa-f]{2})+'; my $qrelaxed = '(?:[^\x00-\x08\x0a-\x1f#%]|%[0-9A-Fa-f]{2})+'; if ($location =~ m( ^ ( /(?:$hrelaxed(?:/(?:$hrelaxed)?)*)? # Match the path (/foo/bar.cgi) ) (?: \?($qrelaxed)? )? (?: \#.* # Allow a possible anchor - but we don't care about it )? $ )iox) { $self->debug("Strict mode not enabled; following local redirect") if $self->{debug}; $protocol = $self->protocol; $host = $self->host; $port = $self->port; $path = $1; $query_string = $2; } elsif ($self->{debug}) { $self->debug("Unable to extract relative URL from '$location'"); } } } if ($protocol and $protocol eq $self->protocol and not $self->{redirected}->{$location}++) { $self->debug("Redirecting to: $location") if $self->{debug}; push @{$self->{redirect}}, $self->{response}; $self->host($host); $self->port($port) if $port; if ($username) { $self->username($username); $self->password($password) if $self->password; } elsif ($self->{debug} >= 2) { $self->debug("Not resetting username/password for redirect"); } $self->path($path); $self->query_string($query_string); return $self->_get($head); } elsif ($self->{debug}) { if ($protocol ne $self->protocol) { $self->debug("Not redirecting: Unable to redirect to a different protocol"); } elsif ($protocol) { $self->debug("Not redirecting: Redirect loop detected"); } else { $self->debug("Not redirecting: '$location' is not valid (or not supported by GT::WWW)"); } } } elsif ($self->{debug}) { $self->debug("Not redirecting: server header did not include a Location"); } } elsif ($self->{debug}) { $self->debug("Not redirecting - no_redirect option is enabled"); } } return $self->response if defined wantarray; } sub post { my $self = shift; $self->{port} ||= $self->default_port; $self->{host} or croak 'You must set a host before calling post()'; $self->{cancelled} = undef; my $request_header = $self->header; $request_header->replace_header( Host => ($self->{port} == 80 ? $self->{host} : "$self->{host}:$self->{port}") ); my @param = $self->parameters; my $file_upload; for (my $i = 1; $i < @param; $i += 2) { if ($param[$i] and UNIVERSAL::isa($param[$i], 'GT::WWW::File')) { $file_upload++; last; } } my (@ct) = $request_header->header_words('Content-Type'); unless (@ct) { if ($file_upload) { @ct = ('multipart/form-data' => undef); } else { @ct = ('application/x-www-form-urlencoded' => undef); } $request_header->replace_header('Content-Type' => \@ct); } elsif ($ct[0] eq 'form-data') { $ct[0] = 'multipart/form-data'; $request_header->replace_header('Content-Type' => \@ct); } my ($content, $chunked); my @transfer_encoding = $request_header->header_words('Transfer-Encoding'); for (my $i = 0; $i < @transfer_encoding; $i += 2) { if ($transfer_encoding[$i] eq 'chunked') { $chunked++; splice @transfer_encoding, $i, 2; $i -= 2; } } if (@param) { my %ct = @ct; if (exists $ct{'multipart/form-data'}) { my ($boundary, $boundary_i); for (my $i = 0; $i < @ct; $i += 2) { if (lc $ct[$i] eq 'boundary') { $boundary = $ct[$boundary_i = $i + 1]; } } ($content, $boundary, my $content_length) = $self->_form_data($boundary); # N.B.: The RFC recommends that boundary NOT be quoted, which means "token" characters only if ($boundary_i) { $ct[$boundary_i] = $boundary; } else { push @ct, boundary => $boundary; } $request_header->replace_header('Content-Type' => \@ct); $content_length or $chunked++; # If the content length cannot be obtained, we have to use chunked encoding } else { $content = $self->query_string; $request_header->replace_header('Content-Length' => length($content)); } } elsif (my $data = $self->post_data) { $content = $data; $chunked = 1 if ref $data; $request_header->replace_header('Content-Length' => length($content)) if (! ref $data); } if ($chunked) { unshift @transfer_encoding, chunked => undef; $request_header->replace_header('Transfer-Encoding' => \@transfer_encoding); } $self->_set_basic_headers; $self->debug("Connecting...") if $self->{debug} >= 2; $self->_connect or return undef; $self->debug("Connected.") if $self->{debug} >= 2; $self->{path} = '/' unless defined $self->{path} and length $self->{path}; my $path = $self->_escape_path; my $header = "POST $path HTTP/"; $header .= $self->{http_10} ? "1.0" : "1.1"; $header .= CRLF; $header .= $request_header; $self->debug("Sending header:\n$header") if $self->{debug} >= 2; $self->send($header) or return $self->error("Could not write to the server: $!"); $self->debug("Header sent") if $self->{debug} >= 2; if ($request_header->contains(Expect => '100-continue')) { $self->_read_header(1) or return; my $response = $self->response; int($response->status) == 100 or return $response; } if (ref $content) { # Content is either a string, or a code ref $self->debug("Sending content to server using chunked encoding") if $self->{debug} >= 2; while (defined(my $c = $content->())) { if ($chunked) { $self->debug(sprintf("Chunk: %x", length($c))) if $self->{debug} >= 3; $self->send(sprintf("%x$CRLF", length($c))); } $self->debug("Content:\n$c\n") if $self->{debug} >= 3; $self->send($c); } $self->debug("Chunked content sent") if $self->{debug} >= 2; } elsif (defined $content) { $self->debug("Sending content") if $self->{debug} >= 2; $self->debug("Content:\n$content\n") if $self->{debug} >= 3; $self->send($content); $self->debug("Content sent") if $self->{debug} >= 2; } $self->debug("Reading header") if $self->{debug} >= 2; $self->_read_header or return; $self->debug("Read header: $self->{response}->{header}") if $self->{debug} >= 2; $self->debug("Reading body") if $self->{debug} >= 2; $self->_read_body or return; $self->debug("Read body") if $self->{debug} >= 2; if ($self->{cancelled}) { $self->debug("Closing socket (action cancelled in mid-download)") if $self->{debug} >= 2; close $self->{sock}; $self->{connected} = undef; } $self->chunk(undef); return $self->response if defined wantarray; } sub head { my $self = shift; $self->{redirected} = {}; $self->{redirect} = undef; $self->_get(1); } sub default_port { 80 } sub send { my ($self, $data) = @_; my $sock = $self->{sock}; print $sock $data; } sub cancel { my $self = shift; $self->{cancelled} = 1; } sub cancelled { my $self = shift; $self->{cancelled}; } # URL escapes the path sub _escape_path { my $self = shift; my $path = $self->{path}; $path =~ s#([^\w/\$.+!*'()|,;~&=:\@-])#sprintf "%%%02X", ord $1#eg; $path; } sub _form_data { my ($self, $boundary) = @_; my @param = $self->parameters; my (@parts, $fh_parts); for (my $i = 0; $i < @param; $i += 2) { my $part = $self->new_form_part(@param[$i, $i + 1]); push @parts, $part; $fh_parts++ if $part->fh; } if (!$boundary) { if ($fh_parts) { $boundary = boundary(8); # 512 random characters; hopefully good enough } else { $boundary = boundary(); my $bno = 1; CHECK_BOUNDARY: { for (@parts) { if (index($_->content, $boundary) >= 0) { # The data contains this boundary - look for a better one ++$bno; $boundary = boundary($bno > 8 ? 8 : $bno); redo CHECK_BOUNDARY; } } } } } for (my $i = 0; $i < @parts; $i += 2) { splice @parts, $i, 0, \"--$boundary"; } push @parts, \"--$boundary--"; my $content_length = 0; for (@parts) { if (ref eq 'SCALAR') { $content_length += length($$_) + length(CRLF); } else { if (my $size = $_->size) { $content_length += $size + length(CRLF); } else { # We can't determine the size $content_length = 0; last; } } } $self->{header}->replace_header('Content-Length' => $content_length) if $content_length; my $reading; my $content = sub { if ($reading) { my $n = read($reading, my $buf, BUFLENGTH); if ($n) { $content_length -= $n if $content_length; return $buf; } else { close $reading; $reading = undef; $content_length -= length(CRLF) if $content_length; return CRLF; } } while () { if (!@parts) { defined $content_length and $content_length != 0 and croak "Data sent did not match calculated content-length (still have $content_length). This probably means an uploaded file changed in size during transfer."; return; } elsif ($content_length and $content_length < 0) { croak "Data sent exceeded calculated content-length. This probably means an upload file changed in size during transfer."; } my $p = shift @parts; if (ref $p eq 'SCALAR') { $$p .= CRLF; while (@parts and ref $parts[0] eq 'SCALAR') { $$p .= ${shift @parts} . CRLF; } $content_length -= length $$p if $content_length; return $$p; } elsif (my $fh = $p->fh) { my $buf = "" . $p->header; my $n = read($fh, $buf, BUFLENGTH, length($buf)); # Append to $buf if ($n) { $reading = $fh; } else { close $fh; } if (my $len = length $buf) { $content_length -= $len if $content_length; return $buf; } } else { my $buf = $p->header . $p->content . CRLF; $content_length -= length($buf) if $content_length; return $buf; } } }; return wantarray ? ($content, $boundary, $content_length) : $content; } # Sets up the generic headers (such as Authorization and Accept-Encoding) for # any request (GET, POST, or HEAD). sub _set_basic_headers { my $self = shift; my $header = $self->header; if (!$header->header('Accept')) { $header->header(Accept => '*/*'); } if ($have_zlib and !$header->header('Accept-Encoding')) { $header->header('Accept-Encoding' => ['gzip' => undef, 'deflate' => undef]); } unless ($header->header('Connection')) { if ($self->{http_10}) { $header->header(Connection => 'close'); } else { $header->header(Connection => 'keep-alive'); } } unless ($header->header('User-Agent')) { $header->replace_header('User-Agent' => $self->agent); } if (defined(my $username = $self->username)) { # RFC 2616 my $password = $self->password; $password = '' if not defined $password; $username =~ /[$valid_username]/ and croak "Invalid HTTP username: contains CTL's"; $username =~ /:/ and croak "Invalid HTTP username: Username cannot contain ':'"; my $encoded = encode_base64("$username:$password"); $header->replace_header(Authorization => "Basic $encoded"); } 1; } sub boundary { my $c = shift or return "XyZzAYz"; my $return = "XyZzAYz"; for (1 .. $c ** 2) { $return .= ('a' .. 'z', 'A' .. 'Z', 0 .. 9)[rand 62]; } $return; } # Returns the header object for the current object, creating a new one if # necessary. Note that this is the _request_ header - the response header is # available through the header() method of the response object (see response()) sub header { my $self = shift; $self->{header} ||= $self->new_header; if (@_) { $self->{header}->header(@_); } else { $self->{header}; } } # Attempts to read the header from the server. Note that a true argument tells # the method NOT to continue if a "100 Continue" header is received - without # the true argument, the 100 Continue header will be skipped. sub _read_header { my ($self, $ignore_100) = @_; my $response = $self->new_response; my $header = $self->new_header; $self->{response} = $response; $response->{header} = $header; $self->{sock}->readline(my $status); defined $status or return $self->error("Server did not send status line"); if ($self->{reused} and $status !~ /\S/) { # When using keep-alive connections, there will (sometimes?) be a blank line here $self->{sock}->readline($status); } $status =~ m|^HTTP/(1\.[10]) ([1-5]\d\d)(?: ([^\r\n]*))?| or return $self->error("Server returned invalid status line: $status"); $response->server_version($1); my $int_status = $2; $response->status($int_status, $3 || ''); my ($line, $last, $last_value); while ($self->{sock}->readline($line) and $line =~ /[^\015\012]/) { $line =~ y/\015\012//d; $line =~ s/%([0-9a-fA-F]{2})/chr hex $1/eg; if ($last and $line =~ s/^[ \t]+/ /) { $last_value .= $line; } else { if ($last) { $header->header($last, $last_value); } ($last, $last_value) = split /:\s*/, $line, 2; for ($last, $last_value) { s/%([0-9a-fA-F]{2})/chr hex $1/eg } } } if ($last) { $header->header($last, $last_value); } # If we received a 100 status, read the header over again return $self->_read_header if $int_status == 100 and not $ignore_100; 1; } sub _read_body { my $self = shift; my $response = $self->response; my $header = $response->header; my ($chunked, $gzip, $deflate); my @te = $header->header_words('Transfer-Encoding'); for (my $i = 0; $i < @te; $i += 2) { if ($te[$i] eq 'chunked') { $chunked++; } if ($have_zlib) { if ($te[$i] eq 'gzip') { $gzip++; $deflate++; } elsif ($te[$i] eq 'deflate') { $deflate++; } } } if ($have_zlib and !$deflate) { my @ce = $header->header_words('Content-Encoding'); for (my $i = 0; $i < @ce; $i += 2) { if ($ce[$i] eq 'gzip') { $gzip++; $deflate++; } elsif ($ce[$i] eq 'deflate') { $deflate++; } } } if ($self->{debug}) { my $encoding = join ', ', ($gzip ? 'gzipped' : ()), ($deflate && !$gzip ? 'deflated' : ()), ($chunked ? 'chunked' : ()); $encoding ||= 'normal (not chunked or compressed)'; $self->debug("Server headers indicate $encoding encoding"); } my $body; $body = '' unless $self->{chunk_code}; if ($chunked) { $self->debug("Server sending in chunked encoding") if $self->{debug}; my ($bytes, $chunks, $data_length, $content_length) = (0, 0, 0, 0); my ($gzip_buffer, $inflator, $gzip_header_removed) = (''); if ($deflate) { # A negative WindowBits is an "undocumented" (it is referenced only as # a comment in the zlib source) zlib feature to not look for a zlib # header. Only "gzip" data needs this option, since the gzip header is # different - and handled seperately. $inflator = inflateInit($gzip ? (-WindowBits => -MAX_WBITS()) : ()); } CHUNK: while () { $self->{sock}->readline(my $line) or return $self->error("Could not read from server: " . $self->{sock}->error); $line =~ /^([0-9A-Fa-f]+)\s*$/ or return $self->error("Server told about chunk encoding, but did not transfer properly (read '$line', wanted hex)"); $bytes = hex($1); unless ($bytes) { $self->debug("Finished reading chunked data") if $self->{debug} >= 2; last; } $self->debug("Reading $bytes byte chunk") if $self->{debug} >= 2; my $b = $bytes; $chunks++; if ($deflate) { $self->{sock}->readblock(my $chunk, $b); $data_length += length $chunk; $gzip_buffer .= $chunk; if ($gzip and not $gzip_header_removed) { _remove_gzip_header(\$gzip_buffer) and $gzip_header_removed++; } my ($output, $status) = $inflator->inflate(\$gzip_buffer); if ($status != Z_OK() and $status != Z_STREAM_END()) { return $self->error("Decompression failed (zlib error status: $status)"); } my $size = length($output); $content_length += $size; $self->debug("After inflation, read:\n$output") if $self->{debug} >= 3; if ($self->{chunk_code}) { my $chunk_size = $self->{chunk_size} || CHUNK_SIZE; while (length($output)) { my $block = \substr($output, 0, $chunk_size); $self->debug("Passing block to chunk_code") if $self->{debug} >= 2; $self->{chunk_code}->($block); $$block = ''; # Clears the $output substring referenced above if ($self->{cancelled}) { $self->debug("Current operation cancelled!") if $self->{debug}; return 1; } } } else { $body .= $output; } } else { if ($self->{chunk_code}) { my $chunk_size = $self->{chunk_size} || CHUNK_SIZE; while ($b > $chunk_size) { $self->debug("Reading $chunk_size bytes ($b remaining in current chunk)") if $self->{debug} >= 2; $self->{sock}->readblock(my $block, $chunk_size); my $size = length $block; $data_length += $size; $b -= $size; $self->debug("Read chunk:\n$block") if $self->{debug} >= 3; $self->debug("Passing chunk to chunk_code") if $self->{debug} >= 2; $self->{chunk_code}->(\$block); if ($self->{cancelled}) { $self->debug("Current operation cancelled!") if $self->{debug}; return 1; } } } if ($b) { $self->debug("Reading $b bytes of chunk") if $self->{debug} >= 2; $self->{sock}->readblock(my $block, $b); $data_length += length $block; $self->debug("Read chunk:\n$block") if $self->{debug} >= 3; if ($self->{chunk_code}) { $self->debug("Passing chunk to chunk_code") if $self->{debug} >= 2; $self->{chunk_code}->(\$block); if ($self->{cancelled}) { $self->debug("Current operation cancelled!") if $self->{debug}; return 1; } } else { $body .= $block; } } } $self->{sock}->readline($line); # Read a CRLF after the chunk $self->debug("Finished reading chunk") if $self->{debug} >= 2; } $self->debug("Read $data_length bytes " . ($deflate ? "of $content_length bytes compresses data " : '') . "in $chunks chunk" . ($chunks == 1 ? "" : "s")) if $chunks and $self->{debug}; } elsif ($deflate) { my ($read, $size); my $remaining; if (my $content_length = $header->header('Content-Length')) { $remaining = $content_length; } # A negative WindowBits is an "undocumented" (it is referenced only as # a comment in the zlib source) zlib feature to not look for a zlib # header. Only "gzip" data needs this option, since the gzip header is # different - and handled seperately. my $inflator = inflateInit($gzip ? (-WindowBits => -MAX_WBITS()) : ()); if ($self->{chunk_code}) { my $header_removed; my ($buffer, $data) = ('', ''); my $chunk_size = $self->{chunk_size} || CHUNK_SIZE; while () { # From my testing, a typical HTML page gzips to anywhere from # 1/10 to 1/2 the size, so for these purposes we try to read # and inflate about 1/2 the maximum block size at a time, in # the hopes that what we grab will be close to the proper # amount of data. This value is essentially arbitrary. my $read_size = int($chunk_size * 0.5) || 1; $read_size = $remaining if $remaining and $remaining < $chunk_size; $self->debug("Reading block of $read_size compressed bytes" . ($remaining ? " ($remaining remaining)" : "")) if $self->{debug} >= 2; $self->{sock}->readblock(my $block, $read_size); my $block_size = length($block); if (defined $remaining) { if (!$block_size or $block_size < $read_size) { return $self->error("Could not read all content from server: " . ($self->{sock}->error || "Unknown error")); } } else { if (!$block_size or $block_size < $read_size) { last; } } $read += $block_size; $remaining -= $block_size if defined $remaining; if (not $header_removed and $gzip) { $data .= $block; if (_remove_gzip_header(\$data)) { $header_removed = 1; } else { if (defined $remaining and $remaining <= 0 and not length $data) { return $self->error("No content after removing gzip header from gzip compressed data"); } next; } $block = $data; } my ($output, $status) = $inflator->inflate(\$block); if ($status != Z_OK() and $status != Z_STREAM_END()) { return $self->error("Decompression failed (zlib error status: $status)"); } $size += length($output); $buffer .= $output; $self->debug("After inflation, read:\n$output") if $self->{debug} >= 3; while (length($buffer) > $chunk_size) { my $block = \substr($buffer, 0, $chunk_size); $self->debug("Passing block to chunk_code") if $self->{debug} >= 2; $self->{chunk_code}->($block); $$block = ''; # Clears the $buffer substring referenced above if ($self->{cancelled}) { $self->debug("Current operation cancelled!") if $self->{debug}; return 1; } } last if defined $remaining and $remaining <= 0; } if (length $buffer) { $self->debug("Passing remaining data to chunk_code") if $self->{debug} >= 2; $self->{chunk_code}->(\$buffer); if ($self->{cancelled}) { $self->debug("Current operation cancelled!") if $self->{debug}; return 1; } } } else { if ($self->{debug} >= 2) { if (defined $remaining) { $self->debug("Reading $remaining compressed bytes from server") } else { $self->debug("Reading all compressed bytes from server") } } $self->{sock}->readblock(my $block, defined($remaining) ? $remaining : -1); $read = length($block); $body = Compress::Zlib::memGunzip(\$block); $size = length($body); defined $body or return $self->error("Decompression failed"); $self->debug("After inflation, read:\n$body") if $self->{debug} >= 3; } $self->debug("Read $read gziped bytes, containing $size bytes of uncompressed data") if $self->{debug}; } else { my $chunk_size = $self->{chunk_size} || CHUNK_SIZE; my $content_length = $header->header('Content-Length'); my $remaining; my $read_size; if ($content_length) { $read_size = $chunk_size < $content_length ? $chunk_size : $content_length; $remaining = $content_length; } elsif (defined $content_length) { # i.e. $content_length == 0 $self->debug("Document contains no data (headers contain Content-Length: 0)") if $self->{debug}; } else { $read_size = -1; } my $read; while ($read_size) { $read_size = $remaining if $remaining and $remaining < $read_size; if ($self->{debug} >= 2) { if ($read_size > 0) { $self->debug("Reading block of $read_size bytes ($remaining remaining) from server") } else { $self->debug("Reading all available data from server (blocking read)") } } my $bytes_read = $self->{sock}->readblock(my $block, $read_size); $read += $bytes_read; $self->debug("Read body:\n$block") if $self->{debug} >= 3; if (defined $remaining) { if (!$bytes_read or $bytes_read < $read_size) { return $self->error("Could not read all content from server: " . ($self->{sock}->error || "Unknown error")); } } else { $self->debug("Read $bytes_read bytes from server") if $self->{debug}; last if not $bytes_read; } $remaining -= $bytes_read if defined $remaining; if ($self->{chunk_code}) { $self->debug("Passing data to chunk_code") if $self->{debug} >= 2; $self->{chunk_code}->(\$block); if ($self->{cancelled}) { $self->debug("Current operation cancelled!") if $self->{debug}; return 1; } if (defined $content_length) { last if $read >= $content_length; } elsif (defined $remaining) { last if $bytes_read < $read_size; } else { last; } } else { $body .= $block; if (defined $content_length) { last if length($body) >= $content_length; } elsif (defined $remaining) { last if $bytes_read < $read_size } else { last; } } } } $response->{content} = $body; 1; } # gzip constants use constants MAGIC1 => 0x1f, MAGIC2 => 0x8b, OSCODE => 3, FTEXT => 1, FHCRC => 2, FEXTRA => 4, FNAME => 8, FCOMMENT => 16, NULL => chr(0), RESERVED => 0xE0, MIN_HDR_SIZE => 10; # This sub returns undef if the scalar ref passed in does not contain a gzip # header. If a header is found, it is removed and the value 1 is returned. sub _remove_gzip_header { my $gzip = shift; return undef if length($$gzip) < MIN_HDR_SIZE; my $substr_base = 0; # C = unsigned char, V = unsigned long my ($magic1, $magic2, $method, $flags, $time, $xflags, $oscode) = unpack('CCCCVCC', $$gzip); return undef unless $magic1 == MAGIC1 and $magic2 == MAGIC2 and $method == Z_DEFLATED() and !($flags & RESERVED); $substr_base += MIN_HDR_SIZE; # FEXTRA defines an extra field. We skip it. if ($flags & FEXTRA) { return undef if length($$gzip) - $substr_base < 2; my ($extra_len) = unpack("x${substr_base}v", $$gzip); # v = unsigned short $extra_len += 2; return undef if length($$gzip) - $substr_base < $extra_len; $substr_base += $extra_len; } # FNAME stores the original file name - we skip it as well. if ($flags & FNAME) { my $name_end = index($$gzip, NULL, $substr_base); return undef if $name_end < 0; $substr_base += $name_end + 1; } # FCOMMENT means the gzip header contains a comment if ($flags & FCOMMENT) { my $comment_end = index($$gzip, NULL, $substr_base); return undef if $comment_end < 0; $substr_base += $comment_end + 1; } # FHCRC stores the CRC for the gzip file, which we can't use if ($flags & FHCRC) { return undef if length($$gzip) - $substr_base < 2 ; $substr_base += 2; } substr($$gzip, 0, $substr_base) = ''; return 1; } sub response { $_[0]->{response} } sub protocol { my $self = shift; if (@_ and $_[0] ne $self->{protocol}) { croak "You cannot change protocols once a protocol has been set"; } return $self->{protocol}; } # Sets HTTP/1.0 mode. By default, all connections use HTTP/1.1 sub http_10 { my $self = shift; if (@_) { if ($self->{sock}) { croak "Cannot change HTTP connection version once connected"; } $self->{http_10} = shift && 1 || undef; # Force a value of 1 or undef } $self->{http_10}; } sub no_redirect { my $self = shift; if (@_) { $self->{no_redirect} = shift && 1 || undef; } $self->{no_redirect}; } sub redirects { my $self = shift; return $self->{redirect} ? @{$self->{redirect}} : undef; } sub _connect { my $self = shift; my $reuse = 0; my ($connect_time, $r, $rheader, $close, $sclose, $timeout); # Here for debugging purposes if (not $self->{http_10} and $self->{sock} and $connect_time = $self->{connected}) { $self->debug("Check existing connection for keep-alive appropriateness") if $self->{debug} >= 2; if ($r = $self->response and not ($close = $self->header->contains(Connection => 'close')) and $r->server_version eq '1.1' and $rheader = $r->header and $self->{host} eq $self->{connected_host} and $self->{port} eq $self->{connected_port}) { unless ($sclose = $rheader->contains(Connection => 'close')) { if ($rheader->header('Keep-Alive')) { # Apache is nice enough to send a header such as: Keep-Alive: timeout=15, max=100 # IIS, on the other hand, doesn't let us know when it's going to time out, nor is # it consistent in the amount of time it gives before timing out. my @p = $rheader->header_words("Keep-Alive"); for (my $i = 0; $i < @p; $i += 2) { $timeout = $p[$i + 1] if $p[$i] eq 'timeout'; } } $timeout ||= 10; # Completely arbitary value - Apache's default is 15, IIS takes some # arbitrary number of seconds to timeout, but it seems to be at least a minute. # If the last connection was made within the last ($timeout) seconds, then # we can attempt to reuse the connection. $reuse = 1 if $connect_time > time - $timeout; } } } if ($reuse) { $self->debug("Reusing established Keep-Alive connection") if $self->{debug}; $self->{connected} = time; $self->{reused} = 1; } else { if ($self->{sock}) { $self->debug("Connection not persisting") if $self->{debug}; if ($self->{debug} >= 2) { if ($self->{http_10}) { $self->debug("... The 'http_10' option has been enabled") } elsif (!$self->response) { $self->debug("... No response object from last request") } elsif ($close) { $self->debug("... Current header specifies 'Connection: close' for next request; honouring for current request") } elsif ($r->server_version ne '1.1') { $self->debug("... Server HTTP version (" . ($r->server_version) . ") is not 1.1") } elsif (!$rheader) { $self->debug("... No response header from last request") } elsif ($self->{host} ne $self->{connected_host}) { $self->debug("... Host has changed since last request") } elsif ($self->{port} ne $self->{connected_port}) { $self->debug("... Server port has changed since last request") } elsif ($sclose) { $self->debug("... Server indicated 'Connection: close' in last response header") } elsif ($connect_time > time - $timeout) { $self->debug("... Too long as passed since last request") } else { $self->debug("Unknown reason [bug]") } # Shouldn't happen - hence "[bug]" } } my $conn_timeout = $self->connection_timeout; # If a connection is already established, reassigning to $self->{sock} will free # the old one, thereby implicitly closing it. $self->{sock} = GT::Socket::Client->open( host => $self->{host}, port => $self->{port}, ($conn_timeout ? (timeout => $conn_timeout) : ()), debug => $self->{debug}, (UNIVERSAL::isa($self, 'GT::WWW::https') ? (ssl => 1) : ()) ); $self->{sock} or return $self->error("Could not connect to $self->{host}: " . GT::Socket::Client->error); $self->{connected} = time; $self->{connected_host} = $self->{host}; $self->{connected_port} = $self->{port}; delete $self->{reused}; } return 1; } # Not really "real" base64 encoding - this doesn't break into lines of 76 # characters, and so doesn't have an EOL character either. sub encode_base64 { my $res = ''; pos($_[0]) = 0; # In case something has previously adjusted pos while ($_[0] =~ /(.{1,45})/gs) { $res .= substr(pack(u => $1), 1, -1); } $res =~ tr|` -_|AA-Za-z0-9+/|; my $padding = (3 - length($_[0]) % 3) % 3; $res =~ s/.{$padding}$/'=' x $padding/e if $padding; $res; } sub new_header { shift; GT::WWW::http::Header->new(@_) } sub new_response { shift; GT::WWW::http::Response->new(@_) } sub new_form_part { shift; GT::WWW::http::Part->new(@_) } package GT::WWW::http::Part; use GT::Socket::Client qw/:crlf/; use Carp; sub new { my ($class, $k, $v) = @_; my $self = {}; bless $self, ref($class) || $class; my $h = $self->{header} = $self->new_header; $h->_separator(';'); if (!ref $v) { $self->{content} = $v; $h->_force_quotes; $h->header( 'Content-Disposition' => ['form-data' => undef, name => $k] ); $h->_unforce_quotes; } elsif (ref $v and UNIVERSAL::isa($v, 'GT::WWW::File')) { $h->_force_quotes; $h->header( 'Content-Disposition' => ['form-data' => undef, name => $k, filename => $v->filename] ); $h->_unforce_quotes; if ($h->header('Content')) { $self->{content} = $h->header('Content'); $h->delete_header('Content'); } else { my $fh = $v->fh; my $size; unless (-f $fh and $size = -s _) { # If we can't get the proper size (for example, /proc/cpuinfo), # or if this filehandle isn't a filehandle (socket, block # device, etc.) then we need to slurp the file. local $/; $self->{content} = <$fh>; close $fh; $size = length($self->{content}); } else { $self->{content} = $fh; } require GT::MIMETypes; $h->header( 'Content-Type' => scalar GT::MIMETypes->guess_type($v->filename), 'Content-Length' => $size ); } } $self; } # Returns the size of the part, in bytes, _IF_ it can be retrieved. Cases where # it can't be retrieved include where you passed a file to new() which is not a # regular file (e.g. /dev/audio) or a dynamic file (e.g. /proc/mounts), which # reports a size of 0. (This also means that an empty file will cause the size # to be unavailable). Note that the size includes the size of the header. sub size { my $self = shift; if (ref $self->{content} and -f $self->{content} and my $size = -s _) { my $header = $self->header; return length("$header") + $size; } elsif (!ref $self->{content}) { my $header = $self->header; return length("$header") + length($self->{content}); } return undef; } sub fh { my $self = shift; if (ref $self->{content} eq 'GLOB') { return $self->{content}; } else { return undef; } } sub header { my $self = shift; $self->{header}; } sub content { my $self = shift; $self->{content}; } sub new_header { shift; GT::WWW::http::Header->new(@_) } 1; __END__ =head1 NAME GT::WWW::http - HTTP interface for GT::WWW =head1 SYNOPSIS use GT::WWW; my $www = GT::WWW->new(); $www->protocol('http'); # any valid GT::WWW methods here # ... my $header = $www->header; $header->header("Some-Http-Header" => $value); $header->delete_header("Some-Other-Http-Header"); my $response = $www->get() or die "Could not connect to server: " . $www->error; my $status = $response->status; my $response_code = int $status; # For example, 200, 404, 500, etc. my $response_str = "$status"; # For example, 'OK', 'Not Found', 'Internal Server Error', etc. if ($status) { # This will be true if the status code is a successful one - in other # words, true for 2xx responses, false for others print "Response successful. Content:\n$response\n"; } else { die "Request was not successful ($response_code $response_str)\n"; } =head1 DESCRIPTION GT::WWW::http handles HTTP connections for GT::WWW. It uses some overloading to assist in the ease of use without sacrificing functionality. This document does not cover the basics of a GT::WWW object; those are covered by L. =head1 METHODS =head2 header This method returns the GT::WWW::http::Header object that will be (or has been) sent to the HTTP server. See L for information on using and manipulating a header object. Note that you can add headers without first getting a header object by simply specifying the headers as arguments to header(). Normally, you would call: $www->header->header('X-Foo' => 'bar'); This shortcut allows for: $www->header('X-Foo' => 'bar'); Check L for valid arguments to the header() method. =head2 http_10 This method can be used I initiating a request on the object to force HTTP/1.0 communication with the HTTP server. By default HTTP/1.1 connections are used. Note that HTTP/1.1 is strongly recommended as this module supports keep-alive connections only when using HTTP/1.1. To force HTTP/1.0 communication, pass a true value to this method, or a false value to use the default HTTP/1.1 connections. Returns true if HTTP/1.0 connections will be used. =head2 strict This works as described in GT::WWW. Specifically, in addition to the loose query string restrictions, this allows relative URL Location: redirects (HTTP/1.1 specifically states that Location: redirects MUST be absolute). =head2 no_redirect This method is used before a request to indicate that automatic, seemless redirection should B take place. By default, when a server responds with an acceptable, properly-formed 3xx response allowing a redirection, this module will automatically perform the redirection, unless this option has been enabled. To enable, call this method with a true value, or to disable, a false value. Returns true if automatic redirection is enabled. Note that redirections will only be performed on GET requests. =head2 redirects If redirections are enabled (i.e. the no_redirect option has not been turned on), you can call the redirects() method to get a list of response objects created while performing redirections. Typically this will be just one, but more are possible. =head2 response Returns the response object for the last request. When automatic redirection is enabled, this will be the response object for the final request. The response object can be used is multiple ways, which are described below, in the L section, and in L. =head2 cancel This works as described in L, with one exception: if cancelling a request immediately before a redirect takes place, only the current request is cancelled - the redirect still occurs. Note that cancelling is likely to be a resource hit in such a case because the connection cannot be reused and a new one must be established - typically, to the same server. =head1 RETURN VALUES The return values of the L|GT::WWW/get>, L|GT::WWW/head>, and L|GT::WWW/post> methods are simply the response object for the request, which can also be obtained by calling the L|/response> method after completing the request. The full documentation for the response object is covered in L, however the below description is provided for a brief overview. The following examples assume that "C<$response>" is an object that has been obtained by calling get(), head(), post(), or response(). =head2 Status The status of the request is available via the ->status method of the response object. It is made up of three pieces of data - status code, status string, and success. To get the status code (e.g. 500, 200, etc.), simply use the status as a number: my $status = int($response->status); To get the status string (e.g. "500 Internal Server Error", "200 OK"), use the status as a string: my $status = "" . $response->status; And finally, to get the success of the request, simply use status in boolean context: if ($response->status) { Success for HTTP is defined by any 200-level response status code. A request that returns "200 OK" will be pass the above if statement, while a request that returned "500 Internal Server Error" will fail. =head2 Content The content of the last request is available by simply using the response object as a string: my $content = "$response"; You should take note, however, that if you are using the L|GT::WWW/chunk> method no content will be available in this way. Also note that the response object is an object, not a string, so anything beyond basic string comparison/concatenation should not occur on the response object itself. See L for more details. =head2 Headers The header() method of the response object returns a GT::WWW::http::Header object which gives you easy access to the headers returned by the server with the request. As a special shortcut, calling header() with arguments will call the L method of the header object with the arguments provided. This allows you to optionally change this: my $location = $response->header->header('Location'); into the shorter and clearer: my $location = $response->header('Location'); Calling header() without arguments returns the header object for the response. =head1 SEE ALSO L L L RFC 2616: L =head1 MAINTAINER Jason Rhinelander =head1 COPYRIGHT Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. http://www.gossamer-threads.com/ =head1 VERSION Revision: $Id: http.pm,v 1.31 2005/04/08 19:20:00 jagerman Exp $ =cut