1117 lines
34 KiB
Perl
1117 lines
34 KiB
Perl
|
# ====================================================================
|
||
|
# Gossamer Threads Module Library - http://gossamer-threads.com/
|
||
|
#
|
||
|
# GT::WWW
|
||
|
# Author: Jason Rhinelander
|
||
|
# CVS Info :
|
||
|
# $Id: WWW.pm,v 1.25 2005/04/08 19:25:31 jagerman Exp $
|
||
|
#
|
||
|
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||
|
# ====================================================================
|
||
|
#
|
||
|
# Description:
|
||
|
# Implements retrieving and posting to WWW sites, using HTTP or HTTPS. HTTPS
|
||
|
# support requires Net::SSLeay.
|
||
|
#
|
||
|
# GT::URI should be considered deprecated in favour of this module.
|
||
|
#
|
||
|
|
||
|
package GT::WWW;
|
||
|
use strict;
|
||
|
use Carp;
|
||
|
use Symbol;
|
||
|
use Net::servent;
|
||
|
use vars qw/%PUBLIC $ERRORS %PROTOCOL $HOST_RE $VERSION @EXPORT_OK @ISA/;
|
||
|
require Exporter;
|
||
|
|
||
|
$VERSION = sprintf "%d.%03d", q$Revision: 1.25 $ =~ /(\d+)\.(\d+)/;
|
||
|
|
||
|
@EXPORT_OK = qw/get post head getprint postprint/;
|
||
|
@ISA = 'Exporter';
|
||
|
|
||
|
%PUBLIC = ( # Public methods, passable via new(). The values are the equivelant methods.
|
||
|
server => 'server',
|
||
|
port => 'port',
|
||
|
path => 'path',
|
||
|
username => 'username',
|
||
|
password => 'password',
|
||
|
header => 'header',
|
||
|
parameters => 'parameters',
|
||
|
query_string => 'query_string',
|
||
|
debug => 'debug_level',
|
||
|
debug_level => 'debug_level',
|
||
|
agent => 'agent',
|
||
|
chunk => 'chunk',
|
||
|
chunk_size => 'chunk_size'
|
||
|
);
|
||
|
|
||
|
%PROTOCOL = (
|
||
|
http => __PACKAGE__ . "::http",
|
||
|
https => __PACKAGE__ . "::https",
|
||
|
# ftp => __PACKAGE__ . "::ftp"
|
||
|
);
|
||
|
|
||
|
# This must capture two and only two subpatterns - host and port.
|
||
|
# The following _REQUIRES_ /x regex modifier! I'd use (?x), but it doesn't
|
||
|
# work the same way in 5.004 that it does now.
|
||
|
$HOST_RE = '(
|
||
|
(?: \w(?:[-\w]*\w)? \. )* # "foo.foo." of "foo.foo.com"
|
||
|
\w(?:[-\w]*\w)? # and the "com", or possibly just a host like "penguin"
|
||
|
|
|
||
|
(?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3} # Match an IP, but only 0-255 in
|
||
|
(?:[01]?\d?\d|2[0-4]\d|25[0-5]) # each field - nothing > 255
|
||
|
)
|
||
|
(?::(\d+)?)? # port - The RFC says: http://foo:/path is valid
|
||
|
';
|
||
|
|
||
|
|
||
|
#use Socket; # GT::Socket's read/write methods aren't sufficient and are too
|
||
|
# buggy. The only way to make use of it would be to use it to connect, then
|
||
|
# directly access the filehandle, so instead of loading all the useless code,
|
||
|
# GT::WWW modules should use GT::Socket::Client instead.
|
||
|
|
||
|
sub new {
|
||
|
my $class = shift;
|
||
|
my $self = {};
|
||
|
bless $self, ref($class) || $class;
|
||
|
if (@_) {
|
||
|
if (ref $_[0] eq 'HASH' or @_ % 2 == 0) {
|
||
|
my $input = ref $_[0] eq 'HASH' ? shift : {@_};
|
||
|
$self->debug_level(delete $input->{debug}) if exists $input->{debug};
|
||
|
# Handle the protocol before the other options, because some might be available to all protocols
|
||
|
if (exists $input->{protocol}) {
|
||
|
my $proto = delete $input->{protocol};
|
||
|
$self->protocol($proto);
|
||
|
}
|
||
|
for my $key (keys %$input) {
|
||
|
croak "Invalid parameter '$key'"
|
||
|
unless substr($key, 0, 1) ne '_'
|
||
|
and $self->can($key);
|
||
|
$self->$key($input->{$key});
|
||
|
}
|
||
|
}
|
||
|
elsif (@_ == 1 and not ref $_[0]) {
|
||
|
$self->url(@_);
|
||
|
}
|
||
|
else {
|
||
|
croak "Unknown arguments: @_";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
$self->{debug} ||= 0; # So that: 'if $self->{debug} >= 2' won't generate warnings
|
||
|
|
||
|
$self;
|
||
|
}
|
||
|
|
||
|
# Figures out the arguments, and returns a properly set up GT::WWW subclass.
|
||
|
# Used by the quick get* and post* methods/functions below.
|
||
|
sub _quick_args {
|
||
|
(my $called_from = (caller(1))[3]) =~ s/.*:://;
|
||
|
my ($class, $url);
|
||
|
if (@_ == 0 or @_ == 1 and UNIVERSAL::isa($_[0], __PACKAGE__)) {
|
||
|
# Subclasses override get(), so this is either a broken subclass or
|
||
|
# calling get() on a GT::WWW object.
|
||
|
croak "Usage: $called_from(URL), GT::WWW->$called_from(URL), or set a URL first";
|
||
|
# As soon as a URL is set on the object, the object is reblessed.
|
||
|
}
|
||
|
elsif (@_ == 1) {
|
||
|
# function: (GT::WWW::)get(URL)
|
||
|
($class, $url) = (__PACKAGE__, shift);
|
||
|
}
|
||
|
elsif (@_ == 2) {
|
||
|
if (ref $_[0] eq __PACKAGE__) {
|
||
|
croak "Usage: GT::WWW->$called_from(URL) or $called_from(URL); \$gtwww->$called_from(URL) is not permitted";
|
||
|
}
|
||
|
elsif (ref $_[0] or $_[0] ne __PACKAGE__) {
|
||
|
# $other->get(URL) or Other->get(URL) - if it gets here, $other's
|
||
|
# class is broken
|
||
|
croak "Usage: GT::WWW->$called_from(URL) or $called_from(URL) (Subclass " .
|
||
|
(ref $_[0] or $_[0]) . " is probably broken: ->$called_from() method came to GT::WWW)";
|
||
|
}
|
||
|
else {
|
||
|
($class, $url) = (@_);
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
croak "Usage: GT::WWW->$called_from(URL), \$gtwww->$called_from(URL), or $called_from(URL)";
|
||
|
}
|
||
|
|
||
|
$url or croak 'Error: No URL specified';
|
||
|
|
||
|
$class->new({ fatal_errors => 1, url => $url });
|
||
|
}
|
||
|
|
||
|
# This should be subclassed by all subclasses. The function below should be
|
||
|
# called only as a function (perl -MGT::WWW=get -e 'print get("...url...")')
|
||
|
sub get {
|
||
|
my $www = &_quick_args;
|
||
|
|
||
|
$www->get;
|
||
|
}
|
||
|
|
||
|
# Basically, this accomplishes the same as: print GT::WWW->get(URL), except
|
||
|
# that it also sets up chunked downloading so that it can efficiently be used
|
||
|
# with large pages.
|
||
|
sub getprint {
|
||
|
my $www = &_quick_args;
|
||
|
|
||
|
$www->chunk(sub { print ${$_[0]} });
|
||
|
$www->chunk_size(4096);
|
||
|
|
||
|
$www->get();
|
||
|
}
|
||
|
|
||
|
sub post {
|
||
|
my $www = &_quick_args;
|
||
|
|
||
|
$www->post();
|
||
|
}
|
||
|
|
||
|
sub postprint {
|
||
|
my $www = &_quick_args;
|
||
|
|
||
|
$www->chunk(sub { print ${$_[0]} });
|
||
|
$www->chunk_size(4096);
|
||
|
|
||
|
$www->post();
|
||
|
}
|
||
|
|
||
|
sub head {
|
||
|
my $www = &_quick_args;
|
||
|
|
||
|
$www->head;
|
||
|
}
|
||
|
|
||
|
sub cancel { } # No ops by default
|
||
|
sub cancelled { }
|
||
|
|
||
|
sub parse_url {
|
||
|
my ($self, $url) = splice @_, 0, 2;
|
||
|
my $strict = shift if not ref $self;
|
||
|
|
||
|
my $valid_user = '(?:[\w\$.+!*\'()|,;?&=-]|%[0-9A-Fa-f]{2})+';
|
||
|
my $hsegment = '(?:[\w\$.+!*\'()|,;~&=:@-]|%[0-9A-Fa-f]{2})+';
|
||
|
my $hrelaxed = '(?:[^\x00-\x08\x0a-\x1f#%/?]|%[0-9A-Fa-f]{2})+';
|
||
|
my $qrelaxed = '(?:[^\x00-\x08\x0a-\x1f#%]|%[0-9A-Fa-f]{2})+';
|
||
|
my $proto_re = join "|", map quotemeta, keys %PROTOCOL;
|
||
|
# The following regex is designed primarily for http and https URL's, and
|
||
|
# thus allows things that don't necessarily make sense for other protocols.
|
||
|
# i.e.: ftp://jagerman:password@my.server/foo/bar.txt?asdf=zxcv#anchor
|
||
|
# ^^^^^^^^^^^^^^^^^
|
||
|
# This _could_ allow for protocol extensions, for example 'type=ascii' for
|
||
|
# FTP requests.
|
||
|
my ($protocol, $username, $password, $server, $port, $path, $query_string) =
|
||
|
(ref $self ? $self->{strict} : $strict)
|
||
|
? $url =~ m(
|
||
|
^
|
||
|
($proto_re)
|
||
|
://
|
||
|
(?:
|
||
|
($valid_user) # username
|
||
|
(?:
|
||
|
:($valid_user)? # password
|
||
|
)?
|
||
|
@
|
||
|
)?
|
||
|
$HOST_RE
|
||
|
(
|
||
|
/(?:$hsegment(?:/(?:$hsegment)?)*)? # Match the path (/foo/bar.cgi)
|
||
|
)?
|
||
|
(?:
|
||
|
\?($hsegment)?
|
||
|
)?
|
||
|
(?:
|
||
|
\#.* # Allow a possible anchor - but we don't care about it
|
||
|
)?
|
||
|
$
|
||
|
)iox
|
||
|
: $url =~ m(
|
||
|
^
|
||
|
($proto_re)
|
||
|
://
|
||
|
(?:
|
||
|
($valid_user) # username
|
||
|
(?:
|
||
|
:($valid_user)? # password
|
||
|
)?
|
||
|
@
|
||
|
)?
|
||
|
$HOST_RE
|
||
|
(
|
||
|
/(?:$hrelaxed(?:/(?:$hrelaxed)?)*)? # Match the path (/foo/bar.cgi)
|
||
|
)?
|
||
|
(?:
|
||
|
\?($qrelaxed)?
|
||
|
)?
|
||
|
(?:
|
||
|
\#.* # Allow a possible anchor - but we don't care about it
|
||
|
)?
|
||
|
$
|
||
|
)iox;
|
||
|
|
||
|
return ($protocol, $username, $password, $server, $port, $path, $query_string);
|
||
|
}
|
||
|
|
||
|
sub url {
|
||
|
my $self = shift;
|
||
|
|
||
|
if (!@_) {
|
||
|
my $proto = $self->protocol;
|
||
|
my $host = $self->host;
|
||
|
$proto and $host or return undef;
|
||
|
my $url = "$proto://";
|
||
|
my $username = $self->username;
|
||
|
my $password = $self->password;
|
||
|
if ($username) {
|
||
|
$url .= $username;
|
||
|
$url .= ":$password" if $password;
|
||
|
$url .= "@";
|
||
|
}
|
||
|
$url .= "$host";
|
||
|
my $port = $self->port;
|
||
|
$url .= ":$port" unless $port == $self->default_port;
|
||
|
my $path = $self->path;
|
||
|
$url .= $path if $path;
|
||
|
my $query = $self->query_string;
|
||
|
$url .= "?$query" if $query;
|
||
|
return $url;
|
||
|
}
|
||
|
|
||
|
my $url = shift;
|
||
|
|
||
|
my ($protocol, $username, $password, $server, $port, $path, $query_string) =
|
||
|
$self->parse_url($url);
|
||
|
|
||
|
unless ($protocol) {
|
||
|
croak "Invalid URL: '$url'";
|
||
|
}
|
||
|
|
||
|
$self->protocol($protocol);
|
||
|
|
||
|
if (defined $username) {
|
||
|
$username =~ s/%([0-9A-Fa-f])/chr hex $1/eg;
|
||
|
$self->username($username);
|
||
|
}
|
||
|
else { $self->reset_username() }
|
||
|
|
||
|
if (defined $password) {
|
||
|
$password =~ s/%([0-9A-Fa-f])/chr hex $1/eg;
|
||
|
$self->password($password);
|
||
|
}
|
||
|
else { $self->reset_password() }
|
||
|
|
||
|
$self->host($server);
|
||
|
|
||
|
$port ? $self->port($port) : $self->reset_port;
|
||
|
|
||
|
$self->path($path);
|
||
|
|
||
|
defined $query_string ? $self->query_string($query_string) : $self->reset_parameters;
|
||
|
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
sub protocol {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
my $protocol = shift;
|
||
|
croak "Protocol '$protocol' not supported" unless $self->protocol_supported($protocol);
|
||
|
$self->{protocol} = lc $protocol;
|
||
|
my $pkg = $PROTOCOL{$self->{protocol}};
|
||
|
bless $self, $pkg;
|
||
|
}
|
||
|
$self->{protocol};
|
||
|
}
|
||
|
|
||
|
sub protocol_supported {
|
||
|
my ($self, $protocol) = @_;
|
||
|
unless ($protocol) {
|
||
|
$self->debug("Protocol not supported: No protocol entered") if ref $self and $self->{debug};
|
||
|
return undef;
|
||
|
}
|
||
|
my $pkg = $PROTOCOL{lc $protocol};
|
||
|
unless ($pkg) {
|
||
|
$self->debug("Protocol '$pkg' not supported") if ref $self and $self->{debug};
|
||
|
return undef;
|
||
|
}
|
||
|
(my $mod = $pkg) =~ s|::|/|g;
|
||
|
$mod .= ".pm";
|
||
|
my $loaded_ok = eval { require $mod };
|
||
|
unless ($loaded_ok) {
|
||
|
$self->debug("Protocol '$pkg' not supported: require $mod failed: $@") if ref $self and $self->{debug};
|
||
|
return undef;
|
||
|
}
|
||
|
unless ($pkg->isa(__PACKAGE__)) {
|
||
|
$self->debug("Protocol '$pkg' not supported: does not inherit from " . __PACKAGE__) if ref $self and $self->{debug};
|
||
|
return undef;
|
||
|
}
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
# Takes a host and port ("host:port") - returns true if valid in scalar
|
||
|
# context, (host, port) in list context, or undef if invalid.
|
||
|
sub valid_host {
|
||
|
my ($self, $host) = @_;
|
||
|
return $host && $host =~ /^$HOST_RE$/x;
|
||
|
}
|
||
|
|
||
|
sub host {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
my $host = shift;
|
||
|
$self->{host} = undef;
|
||
|
croak "Invalid hostname '$host' specified" unless ($host, my $port) = $self->valid_host($host);
|
||
|
$self->{host} = $host;
|
||
|
|
||
|
$port ? $self->port($port) : $self->reset_port;
|
||
|
}
|
||
|
$self->{host};
|
||
|
}
|
||
|
|
||
|
sub port {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
my $port = shift;
|
||
|
unless ($port and $port =~ /^[0-9]+$/) {
|
||
|
my $s = getservbyname($port) or croak "No such port: '$port'";
|
||
|
$port = $s->port;
|
||
|
}
|
||
|
$self->{port} = $port;
|
||
|
}
|
||
|
$self->{port} || $self->default_port;
|
||
|
}
|
||
|
|
||
|
sub post_data {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{post_data} = shift;
|
||
|
$self->debug("Setting post_data to '$self->{post_data}'") if $self->{debug} >= 2;
|
||
|
}
|
||
|
$self->{post_data};
|
||
|
}
|
||
|
|
||
|
sub reset_port {
|
||
|
my $self = shift;
|
||
|
delete $self->{port};
|
||
|
}
|
||
|
|
||
|
sub username {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{username} = shift;
|
||
|
$self->debug("Setting username to '$self->{username}'") if $self->{debug} >= 2;
|
||
|
}
|
||
|
$self->{username};
|
||
|
}
|
||
|
|
||
|
sub reset_username {
|
||
|
my $self = shift;
|
||
|
$self->debug("Username reset") if $self->{username} and $self->{debug} >= 2;
|
||
|
delete $self->{username};
|
||
|
}
|
||
|
|
||
|
sub password {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{password} = shift;
|
||
|
}
|
||
|
$self->{password};
|
||
|
}
|
||
|
|
||
|
sub reset_password {
|
||
|
my $self = shift;
|
||
|
delete $self->{password};
|
||
|
}
|
||
|
|
||
|
sub connection_timeout {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{conn_timeout} = shift;
|
||
|
}
|
||
|
exists $self->{conn_timeout} ? $self->{conn_timeout} : 10;
|
||
|
}
|
||
|
|
||
|
sub path {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
my $path = shift;
|
||
|
$path = '' unless defined $path;
|
||
|
$path =~ s/%([0-9A-Fa-f]{2})/chr hex $1/eg;
|
||
|
$path = '/' . $path unless substr($path, 0, 1) eq '/';
|
||
|
$self->{path} = $path;
|
||
|
}
|
||
|
$self->{path};
|
||
|
}
|
||
|
|
||
|
# Replaces the parameters hash after parsing the query string. Optionally takes
|
||
|
# a third argument - if true, the parameters are added, not replaced. Note that
|
||
|
# the query_string is only an interface to the parameters() method, and will be
|
||
|
# recreated before being sent to the server.
|
||
|
# Calling query_string without arguments in produces a query string (with
|
||
|
# necessary escaping) from all parameters that have been set.
|
||
|
sub query_string {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
my ($query, $add) = @_;
|
||
|
$query = '' if not defined $query; # An empty or undefined query string can be used to clear parameters
|
||
|
my $hsegment = '(?:[\w\$.+!*\'()|,;~&=:@-]|%[0-9A-Fa-f]{2})*';
|
||
|
my $qrelaxed = '(?:[^\x00-\x08\x0a-\x1f#%]|%[0-9A-Fa-f]{2})*';
|
||
|
$self->{strict} ? $query =~ /^$hsegment$/o : $query =~ /^$qrelaxed$/ or croak "Invalid query string '$query'";
|
||
|
unless ($add) {
|
||
|
$self->{params} = [];
|
||
|
}
|
||
|
for (split /[&;]/, $query) {
|
||
|
my @kv = split /=/, $_, 2;
|
||
|
$kv[0] =~ s/%([0-9A-Fa-f]{2})/chr hex $1/eg;
|
||
|
$kv[0] =~ y/+/ /;
|
||
|
if ($kv[1]) {
|
||
|
$kv[1] =~ s/%([0-9A-Fa-f]{2})/chr hex $1/eg;
|
||
|
$kv[1] =~ y/+/ /;
|
||
|
}
|
||
|
push @{$self->{params}}, @kv[0, 1]; # Will use "undef" for b in this case: a=1&b&c=2
|
||
|
}
|
||
|
return 1;
|
||
|
}
|
||
|
else {
|
||
|
my $ret = '';
|
||
|
return $ret if !$self->{params} or !@{$self->{params}};
|
||
|
for (my $i = 0; $i < @{$self->{params}}; $i += 2) {
|
||
|
my ($k, $v) = @{$self->{params}}[$i, $i + 1];
|
||
|
$k =~ s/([^\w. -])/sprintf "%%%02X", ord $1/eg;
|
||
|
$k =~ y/ /+/;
|
||
|
if ($v) {
|
||
|
$v =~ s/([^\w. -])/sprintf "%%%02X", ord $1/eg;
|
||
|
$v =~ y/ /+/;
|
||
|
}
|
||
|
$ret .= $k;
|
||
|
$ret .= "=$v" if defined $v;
|
||
|
$ret .= '&';
|
||
|
}
|
||
|
chop $ret;
|
||
|
return $ret;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub parameters {
|
||
|
my $self = shift;
|
||
|
my @ret;
|
||
|
@ret = @{$self->{params}} if $self->{params} and defined wantarray;
|
||
|
if (@_) {
|
||
|
my @params = @_;
|
||
|
@params = @{$params[0]} if @params == 1 and ref $params[0] eq 'ARRAY';
|
||
|
my $add;
|
||
|
$add = pop @params if @params % 2;
|
||
|
if ($add) {
|
||
|
push @{$self->{params}}, @params;
|
||
|
}
|
||
|
else {
|
||
|
$self->{params} = \@params;
|
||
|
}
|
||
|
}
|
||
|
@ret;
|
||
|
}
|
||
|
|
||
|
sub reset_parameters {
|
||
|
my $self = shift;
|
||
|
delete $self->{params};
|
||
|
}
|
||
|
|
||
|
sub strict {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{strict} = shift() ? 1 : undef;
|
||
|
}
|
||
|
$self->{strict};
|
||
|
}
|
||
|
|
||
|
sub agent {
|
||
|
my $self = shift;
|
||
|
my $ret = $self->{agent};
|
||
|
|
||
|
if (@_) {
|
||
|
my $agent = shift;
|
||
|
$agent =~ /[\x00-\x08\x0a-\x1f\x7f]/ and croak "Invalid User-Agent '$agent'";
|
||
|
$self->{agent} = $agent;
|
||
|
}
|
||
|
|
||
|
unless ($ret) { $ret = $self->default_agent() }
|
||
|
else {
|
||
|
if ($ret =~ /[\s,:;]$/) { $ret .= $self->default_agent() }
|
||
|
if ($ret =~ /^[\s,:;]/) { $ret = $self->default_agent() . $ret }
|
||
|
}
|
||
|
|
||
|
$ret;
|
||
|
}
|
||
|
|
||
|
sub default_agent {
|
||
|
my $self = shift;
|
||
|
my $pkg = ref $self || $self || __PACKAGE__;
|
||
|
|
||
|
my $pstash;
|
||
|
if ($pstash = $::{"$pkg\::"} and $pstash->{VERSION} and my $scalar = *{\$pstash->{VERSION}}{SCALAR}) {
|
||
|
return "$pkg/$$scalar";
|
||
|
}
|
||
|
else {
|
||
|
return __PACKAGE__ . '/' . $VERSION;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub chunk {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
my $coderef = shift;
|
||
|
ref $coderef eq 'CODE' or not defined $coderef or croak "Usage: \$www->chunk(CODEREF | undef)";
|
||
|
$self->{chunk_code} = $coderef;
|
||
|
return 1;
|
||
|
}
|
||
|
$self->{chunk_code};
|
||
|
}
|
||
|
|
||
|
sub chunk_size {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
my $chunk_size = shift;
|
||
|
defined $chunk_size and $chunk_size > 0
|
||
|
or croak 'Usage: $www->chunk_size(BYTES)';
|
||
|
$self->{chunk_size} = $chunk_size;
|
||
|
}
|
||
|
$self->{chunk_size};
|
||
|
}
|
||
|
|
||
|
sub debug {
|
||
|
my $self = @_ > 1 ? shift : __PACKAGE__;
|
||
|
$self = ref $self if ref $self;
|
||
|
carp "$self: @_";
|
||
|
}
|
||
|
|
||
|
sub debug_level {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{debug} = shift;
|
||
|
}
|
||
|
$self->{debug};
|
||
|
}
|
||
|
|
||
|
sub fatal_errors {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{fatal_errors} = shift;
|
||
|
}
|
||
|
$self->{fatal_errors};
|
||
|
}
|
||
|
|
||
|
sub error {
|
||
|
my $self = shift;
|
||
|
if (@_) {
|
||
|
$self->{error} = shift;
|
||
|
croak $self->{error} if $self->{fatal_errors};
|
||
|
$self->debug("Error: $self->{error}") if $self->{debug};
|
||
|
return undef;
|
||
|
}
|
||
|
$self->{error};
|
||
|
}
|
||
|
|
||
|
sub file {
|
||
|
shift if @_ and UNIVERSAL::isa($_[0], __PACKAGE__);
|
||
|
|
||
|
GT::WWW::File->new(@_);
|
||
|
}
|
||
|
|
||
|
package GT::WWW::File;
|
||
|
# This package is used when a protocol can take a file as input (for example,
|
||
|
# an HTTP POST file upload). To get a "file" object, call GT::WWW->file(params).
|
||
|
# The parameters are defined in the POD.
|
||
|
use Carp;
|
||
|
|
||
|
sub new {
|
||
|
my ($class, $filename, $handle) = @_;
|
||
|
|
||
|
my $self = {};
|
||
|
bless $self, ref($class) || $class;
|
||
|
|
||
|
defined $filename and $filename =~ m{([^\x00\x1f\x7f\\/]+)$} and $filename ne '.' and $filename ne '..'
|
||
|
or croak 'No (or invalid) filename specified. Usage: ' . __PACKAGE__ . '->new(FILENAME, PATH_OR_GLOBREF)';
|
||
|
|
||
|
$self->{filename} = $1;
|
||
|
|
||
|
if ($handle and ref $handle eq 'GLOB' and fileno($handle)) {
|
||
|
$self->{fh} = $handle;
|
||
|
}
|
||
|
else {
|
||
|
$handle = $filename if not defined $handle;
|
||
|
if ($handle) {
|
||
|
if (!-r $handle) {
|
||
|
croak "File specified ($handle) does not exist, or is not readable";
|
||
|
}
|
||
|
elsif (-d _) {
|
||
|
croak "File specified ($handle) is a directory";
|
||
|
}
|
||
|
else {
|
||
|
my $fh = \do { local *FH; *FH };
|
||
|
open $fh, "< $handle" or croak "Open to open file specified ($handle): $!";
|
||
|
binmode $fh;
|
||
|
$self->{fh} = $fh;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
croak 'No opened globref or filename specified. Usage: ' . __PACKAGE__ . '->new(FILENAME, PATH_OR_GLOBREF)';
|
||
|
}
|
||
|
}
|
||
|
|
||
|
$self;
|
||
|
}
|
||
|
|
||
|
sub fh {
|
||
|
my $self = shift;
|
||
|
return $self->{fh};
|
||
|
}
|
||
|
|
||
|
# Returns the size of the file, if available, or undef if it can't be
|
||
|
# determined (such as a socket, device (e.g. /dev/audio), or special file (e.g.
|
||
|
# /proc/cpuinfo)).
|
||
|
sub size {
|
||
|
my $self = shift;
|
||
|
if (-f $self->{fh} and my $size = -s _) {
|
||
|
return $size;
|
||
|
}
|
||
|
return undef;
|
||
|
}
|
||
|
|
||
|
sub filename {
|
||
|
my $self = shift;
|
||
|
$self->{filename};
|
||
|
}
|
||
|
|
||
|
1;
|
||
|
|
||
|
__END__
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
GT::WWW - Multi-protocol retrieving and posting, related in functionality to
|
||
|
LWP.
|
||
|
|
||
|
=head1 DESCRIPTION
|
||
|
|
||
|
GT::WWW is designed to provide a common interface for multiple protocols (as of
|
||
|
this writing, only HTTP and HTTPS, however others are planned) and handles
|
||
|
HEAD, GET, and POST requests. For non-HTTP-based protocols, what, exactly, a
|
||
|
"HEAD", "GET", or "POST" request is depends on the module in question. For
|
||
|
example, with FTP "GET" might download a file, while "POST" might upload one to
|
||
|
the server, and "HEAD" might return just the size of the file.
|
||
|
|
||
|
The modules under GT::WWW B<should not> be used directly; this module should be
|
||
|
used instead. The documentation here describes the options common to all
|
||
|
protocols - however you should check the POD of the protocol subclasses
|
||
|
(GT::WWW::http, GT::WWW::https, etc.) to see any extra options or methods that
|
||
|
those modules provide.
|
||
|
|
||
|
=head1 SYNOPSIS
|
||
|
|
||
|
Quick way:
|
||
|
|
||
|
use GT::WWW;
|
||
|
my $www = GT::WWW->get("http://server.com/page");
|
||
|
... = GT::WWW->post("http://server.com/page");
|
||
|
... = GT::WWW->head("http://server.com/page");
|
||
|
... = GT::WWW->...("http://user:pass@server.com/page");
|
||
|
... = GT::WWW->...("https://server.com/page");
|
||
|
|
||
|
# This query string will be parsed and passed as POST input:
|
||
|
... = GT::WWW->post("http://server.com/page?foo=bar;bar=foo");
|
||
|
|
||
|
Longer, but more capable way:
|
||
|
|
||
|
use GT::WWW;
|
||
|
my $request = GT::WWW->new();
|
||
|
|
||
|
$request->protocol("http");
|
||
|
$request->host("server.com");
|
||
|
$request->port(8080);
|
||
|
$request->path("/path/foo.cgi");
|
||
|
$request->username("user");
|
||
|
$request->password("pass");
|
||
|
$request->parameters(foo => "bar", bar => "foo");
|
||
|
|
||
|
equivelant to the above, using ->url():
|
||
|
|
||
|
$request->url("http://user:pass@server.com:8080/path/foo.cgi?foo=bar;bar=foo");
|
||
|
|
||
|
Now call $request->get(), $request->post(), or $request->head().
|
||
|
|
||
|
|
||
|
Very quick way to print a page:
|
||
|
|
||
|
perl -MGT::WWW=get -e 'print get("http://server.com/page?foo=bar&bar=foo")'
|
||
|
|
||
|
=head1 METHODS
|
||
|
|
||
|
Note that all methods that set values (such as host(), port(), etc.) also
|
||
|
return the value when called without any argument.
|
||
|
|
||
|
=head2 new
|
||
|
|
||
|
Call new() to get a new GT::WWW object. You can call it without arguments to
|
||
|
get a generic GT::WWW object, or use arguments as described below.
|
||
|
|
||
|
=over 4
|
||
|
|
||
|
=item URL
|
||
|
|
||
|
You can call new with a single scalar argument - a URL to be parsed. The URL is
|
||
|
of the same format as taken by the url() method.
|
||
|
|
||
|
=item HASH
|
||
|
|
||
|
You can alternatively call new() with a hash (or hash reference) of options.
|
||
|
Each of the methods described below can be passed in to new in the form of
|
||
|
C<key =E<gt> value> pairs - the methods will be called with the values
|
||
|
specified automatically.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head2 head
|
||
|
|
||
|
=head2 get
|
||
|
|
||
|
=head2 post
|
||
|
|
||
|
These are the methods used to tell the module to actually connect to the server
|
||
|
and download the requested page.
|
||
|
|
||
|
When used as GT::WWW class methods or function calls (but B<NOT> as methods on
|
||
|
GT::WWW objects or sub-objects), they take a single URL as an argument. This
|
||
|
call creates an internal GT::WWW object, turns on
|
||
|
L<C<fatal_errors(1)>|/fatal_errors>, passes the URL to L<C<url()>|/url>, then
|
||
|
calls the appropriate C<get()>, C<head()>, or C<post()> method of the resulting
|
||
|
protocol-specific object.
|
||
|
|
||
|
Note, however, that once you have specified a protocol (either via
|
||
|
L<C<protocol()>|/protocol>, or as part of a url passed to L<C<url()>|/url>)
|
||
|
your object ceases to be a GT::WWW object and becomes a protocol-specific
|
||
|
GT::WWW subclass. All subclasses provide their own versions of these methods.
|
||
|
|
||
|
The subclassed methods are not described here because they may not be supported
|
||
|
for all protocols, and their return value(s) may differ from one protocol to
|
||
|
the next. For more details, see the modules listed in the
|
||
|
L<SEE ALSO|/"SEE ALSO"> section.
|
||
|
|
||
|
Generally, get() and post() return an overloaded object that can be used as a
|
||
|
string to get the content (i.e. for printing), but see the notes in the CAVEATS
|
||
|
section of L<GT::WWW::http::Response> for anything more complicated than
|
||
|
concatenation or printing.
|
||
|
|
||
|
=head2 url
|
||
|
|
||
|
Takes a URL as argument. The URL is parsed into several fields: C<protocol>,
|
||
|
C<username>, C<password>, C<host>, C<port>, C<path>, and C<query_string>, then
|
||
|
each of those properties are set for the current object. Also note that
|
||
|
calling url() on an existing object resets the host, port, username, password,
|
||
|
and all parameters.
|
||
|
|
||
|
Interally, this method calls L<C<parse_url()>|/"parse_url">.
|
||
|
|
||
|
=head2 parse_url
|
||
|
|
||
|
Takes a URI, and returns the following 7 element list:
|
||
|
|
||
|
# 0 1 2 3 4 5 6
|
||
|
($protocol, $username, $password, $host, $port, $path, $query_string) =
|
||
|
GT::WWW->parse_url($url);
|
||
|
|
||
|
URL's require, at a minimum, protocol and host, in URI form:
|
||
|
|
||
|
PROTOCOL://HOST
|
||
|
|
||
|
The URL can extend up to:
|
||
|
|
||
|
PROTOCOL://USERNAME:PASSWORD@HOST:PORT/PATH?QUERY_STRING
|
||
|
|
||
|
Only protocols known to GT::WWW are acceptable. To check if a URL is valid,
|
||
|
check C<$protocol>.
|
||
|
|
||
|
This method can be called as a class or object method, but not as a function.
|
||
|
If called as an object method, the strict option as currently set for the
|
||
|
object will be used; as a class method or function, an optional second
|
||
|
parameter can be passed in - if true, strict query string parsing mode will be
|
||
|
enabled.
|
||
|
|
||
|
=head2 protocol
|
||
|
|
||
|
Takes a protocol, such as 'http', 'https', 'ftp', etc. Note that when you call
|
||
|
protocol, you object ceases being a GT::WWW object, by becoming a GT::WWW subclass
|
||
|
(such as GT::WWW::http, GT::WWW::https, etc.). Before trying an unknown protocol,
|
||
|
you should generally call the L</protocol_supported> method - calling
|
||
|
C<protocol(...)> with an unsupported protocol will result in a fatal error.
|
||
|
|
||
|
=head2 protocol_supported
|
||
|
|
||
|
This method takes a protocol, such as 'http', 'https', 'ftp', etc. In order to
|
||
|
make sure the protocol is supported, this checks to see that it is an
|
||
|
internally supported protocol, and also tries to load the module to make sure
|
||
|
that the module can be loaded.
|
||
|
|
||
|
=head2 valid_host
|
||
|
|
||
|
Returns true in scalar context if the host appears valid, or the host and port
|
||
|
in list context if the host is valid. Note that no check is performed to see
|
||
|
whether or not the host resolves or is reachable - this simply verifies that
|
||
|
the host is at least valid enough to warrant a lookup.
|
||
|
|
||
|
=head2 host
|
||
|
|
||
|
Sets the host, and optionally the port (assuming the argument is of the form:
|
||
|
'hostname:port'). Returns a fatal error if the host is not valid. Note that
|
||
|
setting the host will B<reset> the port to the protocol's default value, so
|
||
|
this method B<must> be called before port().
|
||
|
|
||
|
=head2 port
|
||
|
|
||
|
Sets the port for the connection. This can be a name, such as "smtp", or a
|
||
|
numeric value. Note that the port value B<will be reset> when the host()
|
||
|
method is called, so setting a port B<must> happen after setting the host.
|
||
|
|
||
|
=head2 reset_port
|
||
|
|
||
|
Resets the port so that the next request will use the default port.
|
||
|
|
||
|
=head2 username
|
||
|
|
||
|
Sets or retrieves the login username.
|
||
|
|
||
|
=head2 reset_username
|
||
|
|
||
|
Removes the login username.
|
||
|
|
||
|
=head2 password
|
||
|
|
||
|
Sets the login password.
|
||
|
|
||
|
=head2 reset_password
|
||
|
|
||
|
Removes the login password.
|
||
|
|
||
|
=head2 connection_timeout
|
||
|
|
||
|
Specifies a timeout for connections, in seconds. By default, a value of 10 is
|
||
|
used. If you specify a false value here, the connection time out will be
|
||
|
system dependent; typically this is from one to several minutes. Note,
|
||
|
however, that the timeout is not supported on Windows systems and so should not
|
||
|
be depended on in code that runs on Windows systems.
|
||
|
|
||
|
=head2 path
|
||
|
|
||
|
Sets the path for the request. Any HTTP escapes (e.g. %20) are automatically
|
||
|
converted to the actual value (e.g. " "). If required, the path will be
|
||
|
automatically re-escaped before being sent to the server.
|
||
|
|
||
|
=head2 parameters
|
||
|
|
||
|
Takes a list (not a hash, since duplicate keys are permitted) of key => value
|
||
|
pairs. Optionally takes an extra argument - if true, the parameters are added,
|
||
|
not replaced - if omitted (or false), any existing parameters are deleted.
|
||
|
|
||
|
To specify a valueless parameter without a value, such as b in this example
|
||
|
query string:
|
||
|
|
||
|
a=1&b&c=3
|
||
|
|
||
|
Pass undef as b's value. Passing "" as the value will result in:
|
||
|
|
||
|
a=1&b=&c=3
|
||
|
|
||
|
For example, to set to two query strings above would require the following two
|
||
|
sets of arguments, respectively:
|
||
|
|
||
|
$www->parameters(a => 1, b => undef, c => 3);
|
||
|
|
||
|
$www->parameters(a => 1, b => "", c => 3);
|
||
|
|
||
|
To then add a "d=4" parameter to either one, you would call:
|
||
|
|
||
|
$www->parameters(d => 4, 1);
|
||
|
|
||
|
Omitting the extra "1" would cause you to erase the previously set parameters.
|
||
|
|
||
|
Values specified should B<not> be URL encoded.
|
||
|
|
||
|
|
||
|
If called without arguments, the list of key/value pairs is returned.
|
||
|
|
||
|
=head2 reset_parameters
|
||
|
|
||
|
Resets the parameters. You want to make sure you do this between each request
|
||
|
on the same object, unless using L<C<url()>|/url>, which calls this for you.
|
||
|
|
||
|
=head2 query_string
|
||
|
|
||
|
This function serves the same purpose as L<C<parameters()>|/parameters>, except
|
||
|
that it takes a query string as input instead of a list. Like C<parameters()>,
|
||
|
the default behaviour is to replace any existing parameters unless a second,
|
||
|
true argument is provided.
|
||
|
|
||
|
Note that if you already have your parameters in some sort of list, it is
|
||
|
preferable to pass them to C<parameters()> than to join them into a query
|
||
|
string and pass them into this function, because this function just splits them
|
||
|
back up into a list again.
|
||
|
|
||
|
You can also provide a query string (along with a host, path, and possibly
|
||
|
other data) using the L<C<url()>|/url> method.
|
||
|
|
||
|
|
||
|
If called without arguments, the current parameters will be joined into a valid
|
||
|
query string and returned.
|
||
|
|
||
|
=head2 strict
|
||
|
|
||
|
This function is used to tell the GT::WWW object to allow/disallow
|
||
|
standard-violating responses. This has a global effect of allowing query
|
||
|
strings to contain _any_ characters except for "\r", "\n", and "#" - normally,
|
||
|
characters such as /, ?, and various extended characters much be escaped into
|
||
|
%XX format. The C<strict> option may have other protocol-specific effects,
|
||
|
which will be indicated in each protocol's documentation.
|
||
|
|
||
|
The option defaults to non-strict.
|
||
|
|
||
|
=head2 post_data
|
||
|
|
||
|
This function allows you to pass in raw data to be posted. The data will not be
|
||
|
encoded. If you pass in a code reference, the data will be posted in chunks.
|
||
|
|
||
|
=head2 agent
|
||
|
|
||
|
Used to set or retrieve the User-Agent string that will be sent to the server.
|
||
|
If the agent string you pass starts or ends with whitespace or a comma, the
|
||
|
default agent will be added at the beginning of end of the User-Agent string,
|
||
|
respectively. This value is only meaningful to protocols supporting something
|
||
|
similar to the HTTP User-Agent.
|
||
|
|
||
|
=head2 default_agent
|
||
|
|
||
|
Returns the default user agent string. This will be automatically used if no
|
||
|
agent has been set, or if an agent ending with whitespace is specified. This
|
||
|
value is dependent on the protocol being used, but is typically something like
|
||
|
"GT::WWW::http/1.23". This method is read-only.
|
||
|
|
||
|
=head2 chunk
|
||
|
|
||
|
=head2 chunk_size
|
||
|
|
||
|
C<chunk> and C<chunk_size> are used to perform a large download in chunks. The
|
||
|
C<chunk()> method takes a code reference that will be called when a chunk of
|
||
|
data has been retrieved from the server, or a value of C<undef> to clear any
|
||
|
currently set chunk code. C<chunk_size()> takes a integer containing the
|
||
|
number bytes that you wish to retrieve at a time from the server; the C<chunk>
|
||
|
code reference will be called with a scalar reference containing up to
|
||
|
C<chunk_size> bytes.
|
||
|
|
||
|
Note that when using chunked downloading, the data will not be available using
|
||
|
the normal content retrieval interface.
|
||
|
|
||
|
Also note that, as of 1.024, the chunk code reference only applies to the next
|
||
|
get() or post() request - after each get() or post() request, the chunk_code is
|
||
|
cleared (in order to avoid self-references and possible memory leaks).
|
||
|
|
||
|
=head2 cancel
|
||
|
|
||
|
=head2 cancelled
|
||
|
|
||
|
The C<cancel> method can be used in conjunction with the L<C<chunk>|/chunk>
|
||
|
option to abort a download in progress. The chunk code will not be called
|
||
|
again, and the server connection will be closed. This should be used sparingly
|
||
|
and with care. C<cancelled> simply return a true/false value indicating
|
||
|
whether the operation has been cancelled. This value is reset at the beginning
|
||
|
of each operation.
|
||
|
|
||
|
Note that cancelling an operation is never performed automatically, and only
|
||
|
happens - if ever - in the C<chunk> code reference, so checking the
|
||
|
cancellation status is rarely needed.
|
||
|
|
||
|
=head2 debug_level
|
||
|
|
||
|
This is used to set or retrieve the debug level.
|
||
|
0 = no debugging
|
||
|
1 = debugging related to current operation
|
||
|
2 = adds operation details to debugging level 1
|
||
|
3 = adds data debugging (very large!) to debugging level 2
|
||
|
|
||
|
When passed as part of a hash to new(), the key for this option can be specified
|
||
|
as C<debug> instead of C<debug_level>.
|
||
|
|
||
|
=head2 error
|
||
|
|
||
|
This method will return a string containing an error that has occured. Note
|
||
|
that an error may be generated even for methods that _seem_ to be correct - for
|
||
|
example, if a server unexpectedly closes the connection before properly
|
||
|
finishing the transfer, a successful return will result since the transfer was
|
||
|
partially successful, but an error message will still be set.
|
||
|
|
||
|
=head2 fatal_errors
|
||
|
|
||
|
This method will alter the current object's error handling behaviour such that
|
||
|
any errors that occur will be propogated to fatal errors. It is enabled
|
||
|
automatically when using the quick (i.e. objectless) forms of C<get()>,
|
||
|
C<head()>, and C<post()> methods which have no associated object on which
|
||
|
->error can be called.
|
||
|
|
||
|
=head2 file
|
||
|
|
||
|
This method is used to create a parameter for uploading a file. It takes
|
||
|
either one or two arguments:
|
||
|
|
||
|
2 argument form:
|
||
|
First argument is a B<remote> filename, second argument is either a B<local>
|
||
|
filename, or a GLOB reference to an open filehandle.
|
||
|
|
||
|
1 argument form:
|
||
|
Argument is a filename to read.
|
||
|
|
||
|
Example usage:
|
||
|
|
||
|
my $file = $www->file("foo.txt");
|
||
|
$www->parameters(foobar => $file, 1);
|
||
|
my $response = $www->post();
|
||
|
|
||
|
This will upload the file from disk named "foo.txt", using a form parameter
|
||
|
named "foobar". This is similar to uploading a file named "foo.txt" via the
|
||
|
following HTML element:
|
||
|
|
||
|
<input type="file" name="foobar">
|
||
|
|
||
|
The two argument form with two filenames is used to lie to the server about the
|
||
|
actual name of the file. Using a filehandle as the second argument is for use
|
||
|
when a filename is not available - such as an opened socket, or a file that has
|
||
|
been opened elsewhere in the code.
|
||
|
|
||
|
Examples:
|
||
|
|
||
|
my $file = $www->file("foo.txt", "bar.txt");
|
||
|
my $file2 = $www->file("foo2.txt", \*FH);
|
||
|
$www->parameters(foobar => $file, foobar2 => $file2, 1);
|
||
|
my $response = $www->post();
|
||
|
|
||
|
This will upload two files - a file named F<foo.txt> (which is actually read
|
||
|
from the C<bar.txt> file) specified as form parameter C<foobar>, and a second
|
||
|
file, specified as parameter C<foobar2>, whose content is read from the
|
||
|
filehandle C<FH>.
|
||
|
|
||
|
=head1 SEE ALSO
|
||
|
|
||
|
L<GT::WWW::http>
|
||
|
L<GT::WWW::https>
|
||
|
|
||
|
=head1 MAINTAINER
|
||
|
|
||
|
Jason Rhinelander
|
||
|
|
||
|
=head1 COPYRIGHT
|
||
|
|
||
|
Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
|
||
|
http://www.gossamer-threads.com/
|
||
|
|
||
|
=head1 VERSION
|
||
|
|
||
|
Revision: $Id: WWW.pm,v 1.25 2005/04/08 19:25:31 jagerman Exp $
|
||
|
|
||
|
=cut
|