discourse-legacysite-perl/site/glist/lib/GT/Mail/Parse.pm
2024-06-17 21:49:12 +10:00

789 lines
24 KiB
Perl

# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Mail::Parse
# Author : Scott Beck
# CVS Info :
# $Id: Parse.pm,v 1.79 2004/10/23 02:16:39 brewt Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
package GT::Mail::Parse;
# =============================================================================
# If MIME::Base64 is installed use it - must eval before hand or 5.004_04 wipes
# our ISA.
my $have_b64 = eval {
local $SIG{__DIE__};
require MIME::Base64;
import MIME::Base64;
if ($] < 5.005) { local $^W; decode_base64('brok'); }
1;
};
$have_b64 or *decode_base64 = \&gt_old_decode_base64;
my $use_decode_qp;
if ($have_b64 and
$MIME::Base64::VERSION >= 2.16 and # Prior versions had decoding bugs
defined &MIME::QuotedPrint::decode_qp and (
not defined &MIME::QuotedPrint::old_decode_qp or
\&MIME::QuotedPrint::decode_qp != \&MIME::QuotedPrint::old_decode_qp
)
) {
$use_decode_qp = 1;
}
# Pragmas
use strict;
use vars qw($VERSION $DEBUG $ERRORS $CRLF $CR_LN @ISA);
# System modules
use Fcntl;
# Internal modules
use GT::Mail::Parts;
use GT::Base;
# Inherent from GT::Base for errors and debug
@ISA = qw(GT::Base);
# Debugging mode
$DEBUG = 0;
# The package version, both in 1.23 style *and* usable by MakeMaker:
$VERSION = substr q$Revision: 1.79 $, 10;
# The CRLF sequence:
$CRLF = "\n";
# The length of a crlf
$CR_LN = 1;
# Error messages
$ERRORS = {
PARSE => "An error occured while parsing: %s",
DECODE => "An error occured while decoding: %s",
NOPARTS => "Email has no parts!",
DEEPPARTS => "Deep recursion dected, email appears to have more than 50 parts!",
MALFORMED => "Found (%s) before finding the start of the boundary. Message malformed"
};
my %DecoderFor = (
# Standard...
'7bit' => 'NBit',
'8bit' => 'NBit',
'base64' => 'Base64',
'binary' => 'Binary',
'none' => 'Binary',
'quoted-printable' => 'QuotedPrint',
# Non-standard...
'x-uu' => 'UU',
'x-uuencode' => 'UU',
);
sub new {
# --------------------------------------------------------------------------
# CLASS->new (
# naming => \&naming,
# in_file => '/path/to/file/to/parse',
# handle => \*FH
# );
# ----------------------------------------------
# Class method to get a new object. Calles init if there are any additional
# argument. To set the arguments that are passed to naming call naming
# directly.
#
my $this = shift;
my $class = ref $this || $this;
my $self = bless {
file_handle => undef,
parts => [],
head_part => undef,
headers_intact => 1,
_debug => $DEBUG,
}, $class;
$self->init(@_) if @_;
$self->debug("Created new object ($self).") if $self->{_debug} > 1;
return $self;
}
sub init {
# --------------------------------------------------------------------------
# $obj->init (%opts);
# -------------------
# Sets the options for the current object.
#
my $self = shift;
my $opt = {};
if (@_ == 1 and ref $_[0] eq 'HASH') { $opt = shift }
elsif (defined $_[0] and not @_ % 2) { $opt = {@_} }
else { return $self->error("BADARGS", "FATAL", "init") }
$self->{_debug} = exists($opt->{debug}) ? $opt->{debug} : $DEBUG;
$self->{headers_intact} = exists($opt->{headers_intact}) ? $opt->{headers_intact} : 1;
for my $m (qw(crlf in_file in_handle in_string attach_rfc822)) {
$self->$m($opt->{$m}) if defined $opt->{$m};
}
}
sub attach_rfc822 {
# --------------------------------------------------------------------------
my $self = shift;
if (@_) {
$self->{attach_rfc822} = shift;
}
return $self->{attach_rfc822};
}
sub crlf {
# --------------------------------------------------------------------------
$CRLF = pop || return $CRLF;
$CR_LN = length($CRLF);
}
sub parse {
# --------------------------------------------------------------------------
# my $top = $obj->parse;
# ----------------------
# Parses the email set in new or init. Also calls init if there are any
# arguments passed in.
# Returns the top level part object.
#
my ($self, @opts) = @_;
# Any additional arguments goto init
$self->init(@opts) if @opts;
($self->{string} and ref($self->{string}) eq 'SCALAR')
or return $self->error('BADARGS', 'FATAL', "No input was given to parse before parse() was called");
# Recursive function to parse
$self->_parse_part(undef, $self->{string}); # parse!
# Return top part
return $self->{head_part};
}
sub parse_head {
# --------------------------------------------------------------------------
# my $head = $obj->parse_head;
# ----------------------------
# Passes any additional arguments to init. Parses only the top level header.
# This saves some overhead if for example all you need to do it find out who
# an email is to on a POP3 server.
#
my ($self, $in, @opts) = @_;
unless (ref $self) {
$self = $self->new(@opts);
}
$in ||= $self->{string};
$in || return $self->error("BADARGS", "FATAL", "No string to parse set!");
# Parse the head
return $self->_parse_head($in);
}
#--------------------------------------------
# Access
#--------------------------------------------
sub in_handle {
# --------------------------------------------------------------------------
# $obj->in_handle (\*FH);
# --------------------
# Pass in a file handle to parse from when parse is called.
#
my ($self, $value) = @_;
if (@_ > 1 and ref $value and defined fileno $value) {
read $value, ${$self->{string}}, -s $value;
}
return $self->{string};
}
sub in_file {
# --------------------------------------------------------------------------
# $obj->in_file ('/path/to/file');
# --------------------------------
# Pass in the path to a file to parse when parse is called
#
my $self = shift;
my $file = shift;
my $io = \do { local *FH; *FH };
open $io, "<$file" or return $self->error("READOPEN", "FATAL", $file, $!);
return $self->in_handle($io);
}
sub in_string {
# --------------------------------------------------------------------------
my ($self, $string) = @_;
return $self->{string} unless (@_ > 1);
if (ref($string) eq 'SCALAR') {
$self->{string} = $string;
}
else {
$self->{string} = \$string;
}
return $self->{string};
}
sub size {
# --------------------------------------------------------------------------
# my $email_size = $obj->size;
# ----------------------------
# Returns the total size of an email. Call this method after the email has
# been parsed.
#
my $self = shift;
(@{$self->{parts}} > 0) or return $self->error("NOPARTS", "WARN");
my $size = 0;
foreach (@{$self->{parts}}) {
$size += $_->size;
}
return $size;
}
sub all_parts {
# --------------------------------------------------------------------------
# my @parts = $obj->all_parts;
# ----------------------------
# Returns a list of all the part object for the current parsed email. If the
# email is not multipart this will be just the header part.
#
return @{shift()->{parts}}
}
sub top_part {
# --------------------------------------------------------------------------
return ${shift()->{parts}}[0];
}
#---------------------------------------------
# Internal Methods
#---------------------------------------------
sub _parse_head {
# --------------------------------------------------------------------------
# Internal Method
# ---------------
# Parse just the head. Returns the part object.
#
my ($self, $in) = @_;
# Get a new part object
my $part = GT::Mail::Parts->new(headers_intact => $self->{headers_intact});
if (ref $in eq 'ARRAY') {
$part->extract($in) or return $self->error("PARSE", "WARN", "Couldn't parse head!");
return $part;
}
$part->extract([map { $_ . $CRLF } split($CRLF => $$in)]) or return $self->error($GT::Mail::Parts::error, 'WARN');
return $part;
}
sub _parse_part {
# --------------------------------------------------------------------------
# Internal Method
# ---------------
# Parses all the parts of an email and stores them in there parts object.
# This function is recursive.
#
my ($self, $outer_bound, $in, $part) = @_;
my $state = 'OK';
# First part is going to be the top level part
if (!$part) {
$part = GT::Mail::Parts->new(headers_intact => $self->{headers_intact});
$self->{head_part} = $part;
}
push @{$self->{parts}}, $part;
# Get the header for this part
my $indx;
if (($indx = index($$in, $CRLF)) == 0) {
substr($$in, 0, $CR_LN) = '';
}
else {
$indx = index($$in, ($CRLF . $CRLF));
if ($indx == -1) {
$self->debug('Message has no body.') if $self->{_debug};
$indx = length($$in);
}
$part->extract([map { $_ . $CRLF } split($CRLF => substr($$in, 0, $indx))])
or return $self->error($GT::Mail::Parts::error, 'WARN');
substr($$in, 0, $indx + ($CR_LN * 2)) = '';
}
# Get the mime type
my ($type, $subtype) = split('/', $part->mime_type);
$type ||= 'text';
$subtype ||= 'plain';
if ($self->{_debug}) {
my $name = $part->recommended_filename || '[unnamed]';
$self->debug("Type is '$type/$subtype' ($name)");
}
# Deal with the multipart type with some recursion
if ($type eq 'multipart') {
my $retype = (($subtype eq 'digest') ? 'message/rfc822' : '');
# Find the multipart boundary
my $inner_bound = $part->multipart_boundary;
$self->debug("Boundary is $inner_bound") if $self->{_debug} > 1;
defined $inner_bound or return $self->error("PARSE", "WARN", "No multipart boundary in multipart message.");
index($inner_bound, $CRLF) == -1 or return $self->error("PARSE", "WARN", "CR or LF in multipart boundary.");
# Parse the Preamble
$self->debug("Parsing preamble.") if $self->{_debug} > 1;
$state = $self->_parse_preamble($inner_bound, $in, $part) or return;
chomp($part->preamble->[-1]) if @{$part->preamble};
# Get all the parts of the multipart message
my $partno = 0;
my $parts;
while (1) {
++$partno < 200 or return $self->error('DEEPPARTS', 'WARN');
$self->debug("Parsing part $partno.") if $self->{_debug};
($parts, $state) = $self->_parse_part($inner_bound, $in, GT::Mail::Parts->new(headers_intact => $self->{headers_intact})) or return;
($state eq 'EOF') and return $self->error('PARSE', 'WARN', 'Unexpected EOF before close.');
$parts->mime_type($retype) if $retype;
push(@{$part->{parts}}, $parts);
last if $state eq 'CLOSE';
}
# Parse the epilogue
$self->debug("Parsing epilogue.") if $self->{_debug} > 1;
$state = $self->_parse_epilogue($outer_bound, $in, $part) or return;
chomp($part->epilogue->[-1]) if @{$part->epilogue} and $state ne 'EOF';
}
# We are on a single part
else {
$self->debug("Decoding single part.") if $self->{_debug} > 1;
# Find the encoding for the body of the part
my $encoding = $part->mime_encoding || 'binary';
if (!exists($DecoderFor{lc($encoding)})) {
$self->debug("Unsupported encoding '$encoding': using 'binary'... \n" .
"The entity will have an effective MIME type of \n" .
"application/octet-stream, as per RFC-2045.")
if $self->{_debug};
$part->effective_type('application/octet-stream');
$encoding = 'binary';
}
my $reparse;
$reparse = ("$type/$subtype" eq "message/rfc822") unless $self->{attach_rfc822};
my $encoded = "";
# If we have boundaries we parse the body to the boundary
if (defined $outer_bound) {
$self->debug("Parsing to boundary.") if $self->{_debug} > 1;
$state = $self->_parse_to_bound($outer_bound, $in, \$encoded) or return;
}
# Else we would parse the rest of the input stream as the rest of the message
else {
$self->debug("No Boundries.") if $self->{_debug} > 1;
$encoded = $$in;
$state = 'EOF';
}
# Normal part so we get the body and decode it.
if (!$reparse) {
$self->debug("Not reparsing.") if $self->{_debug} > 1;
$part->{body_in} = 'MEMORY';
my $decoder = $DecoderFor{lc($encoding)};
$self->debug("Decoding part using: " . lc($encoding)) if $self->{_debug};
$part->{data} = '';
my $out = '';
my $res = $self->$decoder(\$encoded, \$out);
undef $encoded;
$res or return;
$part->{data} = $out;
undef $out;
}
else {
# If have an embeded email we reparse it.
$self->debug("Reparsing enclosed message.") if $self->{_debug};
my $out = '';
my $decoder = $DecoderFor{lc($encoding)};
$self->debug("Decoding " . lc($encoding)) if $self->{_debug};
my $res = $self->$decoder(\$encoded, \$out);
undef $encoded;
$res or return;
my $p = GT::Mail::Parts->new(headers_intact => $self->{headers_intact});
push @{$part->{parts}}, $p;
$self->_parse_part(undef, \$out, $p) or return;
}
}
return ($part, $state);
}
sub _parse_to_bound {
# --------------------------------------------------------------------------
# This method takes a boundary ($bound), an input string ref ($in), and an
# output string ref ($out). It will place into $$out the data contained by
# $bound, and remove the entire region (including boundary) from $$in.
#
my ($self, $bound, $in, $out) = @_;
# Set up strings for faster checking:
my ($delim, $close) = ("--$bound", "--$bound--");
$self->debug("Parsing bounds. Skip until\n\tdelim ($delim)\n\tclose ($close)") if $self->{_debug} > 1;
my ($pos, $ret);
# Place our part in $$out.
$$out = undef;
if (defined($pos = index($$in, "$CRLF$delim$CRLF")) and $pos != -1) {
$$out = substr($$in, 0, $pos);
substr($$in, 0, $pos + length("$CRLF$delim$CRLF")) = "";
$ret = 'DELIM';
}
elsif (index($$in, "$delim$CRLF") == 0) {
substr($$in, 0, length("$delim$CRLF")) = "";
$$out = "";
$ret = 'DELIM';
}
elsif (defined($pos = index($$in, "$CRLF$close$CRLF")) and $pos != -1) {
$$out = $$in;
substr($$out, -(length($$out) - $pos)) = '';
my $len = (length($$in) - (length("$CRLF$close$CRLF") + $pos)) * -1;
if ($len == 0) {
$$in = '';
}
else {
$$in = substr($$in, $len);
}
$ret = 'CLOSE';
}
elsif (index($$in, "$CRLF$close") == (length($$in) - length("$CRLF$close"))) {
$$out = substr($$in, 0, length($$in) - length("$CRLF$close"));
$$in = "";
$ret = 'CLOSE';
}
elsif (index($$in, "$close$CRLF") == 0) {
$$out = "";
substr($$in, 0, length("$close$CRLF")) = "";
$ret = 'CLOSE';
}
elsif (index($$in, $close) == 0 and (length($$in) == length($close))) {
$$out = "";
$$in = "";
$ret = 'CLOSE';
}
if (defined $$out) {
return $ret;
}
else {
# Broken Email, retype to text/plain
$self->{parts}->[$#{$self->{parts}}]->set('content-type' => 'text/plain');
$$out = $$in;
return 'CLOSE';
}
}
sub _parse_preamble {
# --------------------------------------------------------------------------
# Internal Method
# ---------------
# Parses preamble and sets it in part.
#
my ($self, $inner_bound, $in, $part) = @_;
my $loc;
my ($delim, $close) = ("--$inner_bound", "--$inner_bound--");
$self->debug("Parsing preamble. Skip until\n\tdelim ($delim)\n\tclose ($close)") if $self->{_debug} > 1;
my @saved;
$part->preamble(\@saved);
my ($data, $pos, $len);
if (index($$in, "$delim$CRLF") == 0) {
$data = '';
substr($$in, 0, length("$delim$CRLF")) = '';
}
else {
$pos = index($$in, "$CRLF$delim$CRLF");
if ($pos >= 0) {
$data = substr($$in, 0, $pos);
substr($$in, 0, $pos + length("$CRLF$delim$CRLF")) = '';
}
elsif ($pos == -1) {
return $self->error('PARSE', 'WARN', "Unable to find opening boundary: " .
"$delim\n" .
"Message is probably corrupt.");
}
}
push @saved, split $CRLF => $data;
undef $data;
return 'DELIM';
}
sub _parse_epilogue {
# --------------------------------------------------------------------------
# Internal Method
# ---------------
# Parses epilogue and sets it in part.
#
my ($self, $outer_bound, $in, $part) = @_;
my ($delim, $close, $loc);
($delim, $close) = ("--$outer_bound", "--$outer_bound--") if defined $outer_bound;
$self->debug("Parsing epilogue. Skip until\n\tdelim (" . ($delim || '') .
")\n\tclose (" . ($close || '') . ")")
if $self->{_debug} > 1;
my @saved;
$part->epilogue(\@saved);
if (defined $outer_bound) {
if ($$in =~ s/(.*?)(?:\A|$CRLF)\Q$delim\E$CRLF//s) {
push(@saved, split($CRLF => $1));
$self->debug("Found delim($delim)") if $self->{_debug};
return 'DELIM'
}
elsif ($$in =~ s/(.*?)(?:\A|$CRLF)\Q$close\E(?:\Z|$CRLF)//s) {
push(@saved, split($CRLF => $1));
$self->debug("Found close($close)") if $self->{_debug};
return 'CLOSE'
}
}
push(@saved, split($CRLF => $$in));
$$in = '';
$self->debug("EOF: epilogue is " . length(join '', @saved) . " bytes") if $self->{_debug};
return 'EOF';
}
sub Base64 {
# --------------------------------------------------------------------------
my ($self, $in, $out) = @_;
# Remove any non base64 characters.
$$in =~ tr{A-Za-z0-9+/}{}cd;
# Must pass multiple of 4 to decode_base64. Store any remainder in $rem_str and
# pad it with trailing equal signs.
my $rem = length($$in) % 4;
my ($rem_str);
if ($rem) {
my $pad = '=' x (4 - $rem);
$rem_str = substr($$in, length($$in) - $rem);
$rem_str .= $pad;
substr($$in, $rem * -1) = '';
}
$$out = decode_base64($$in);
if ($rem) {
$$out .= decode_base64($rem_str);
}
return 1;
}
sub Binary {
# --------------------------------------------------------------------------
my ($self, $in, $out) = @_;
$$out = $$in;
return 1;
}
sub NBit {
# --------------------------------------------------------------------------
my ($self, $in, $out) = @_;
$$out = $$in;
return 1;
}
sub QuotedPrint {
# --------------------------------------------------------------------------
my ($self, $in, $out) = @_;
if ($use_decode_qp) {
$$out = MIME::QuotedPrint::decode_qp($$in);
}
else {
$$out = $$in;
$$out =~ s/\r\n/\n/g; # normalize newlines
$$out =~ s/[ \t]+\n/\n/g; # rule #3 (trailing whitespace must be deleted)
$$out =~ s/=\n//g; # rule #5 (soft line breaks)
$$out =~ s/=([\da-fA-F]{2})/chr hex $1/ge;
}
return 1;
}
sub UU {
# --------------------------------------------------------------------------
my ($self, $in, $out) = @_;
my ($mode, $file);
# Find beginning...
while ($$in =~ s/^(.+$CRLF)//o) {
local $_ = $1;
last if ($mode, $file) = /^begin\s*(\d*)\s*(\S*)/;
}
return $self->error("uu decoding: no begin found", 'WARN') if (!defined($_));
# Decode:
while ($$in =~ s/^(.+$CRLF)//o) {
local $_ = $1;
last if /^end/;
next if /[a-z]/;
next unless int((((ord() - 32) & 077) + 2) / 3) == int(length($_) / 4);
$$out .= unpack('u', $_);
}
return 1;
}
sub gt_old_decode_base64 {
# --------------------------------------------------------------------------
my $str = shift;
my $res = "";
$str =~ tr|A-Za-z0-9+=/||cd;
$str =~ s/=+$//;
$str =~ tr|A-Za-z0-9+/| -_|;
return "" unless length $str;
my $uustr = '';
my ($i, $l);
$l = length($str) - 60;
for ($i = 0; $i <= $l; $i += 60) {
$uustr .= "M" . substr($str, $i, 60);
}
$str = substr($str, $i);
# and any leftover chars
if ($str ne "") {
$uustr .= chr(32 + length($str)*3/4) . $str;
}
return unpack("u", $uustr);
}
1;
__END__
=head1 NAME
GT::Mail::Parse - MIME Parse
=head1 SYNOPSIS
use GT::Mail::Parse
my $parser = new GT::Mail::Parse (
naming => \&name_files,
in_file => '/path/to/file.eml',
debug => 1
);
my $top = $parser->parse or die $GT::Mail::Parse::error;
- or -
my $parser = new GT::Mail::Parse;
open FH, '/path/to/file.eml' or die $!;
my $top = $parser->parse (
naming => \&name_files,
handle => \*FH,
debug => 1
) or die $GT::Mail::Parse::error;
close FH;
- or -
my $parser = new GT::Mail::Parse;
my $top_head = $parser->parse_head (
naming => \&name_files,
in_file => '/path/to/file.eml',
debug => 1
) or die $GT::Mail::Parse::error;
=head1 DESCRIPTION
GT::Mail::Parse is a 100% rfc822 email MIME parser that supports unlimited
nested levels of MIME. Emails are parsed into L<GT::Mail::Parts> objects. Each
part knows where it's body is and each part contains it's sub parts. See
L<GT::Mail::Parts> for details on parts methods.
=head2 new - Constructor method
This is the constructor method to get a GT::Mail::Parse object, which you
need to access all the methods (there are no Class methods). new() takes
a hash or hash ref as it's arguments. Each key has an accessor method by the
same name except debug, which can only be set by passing debug to new(), parse()
or parse_head().
=over 4
=item debug
Sets the debug level for this insance of the class.
=item naming
Specify a code reference to use as a naming convention for each part of the
email being parsed. This is useful to keep file IO down when you want the emails
seperated into each part as a file. If this is not specified GT::Mail::Parse
uses a default naming, which is to start at one and incriment that number for each
attachment. The attachments would go in the current working directory.
=item in_file
Specify the path to the file that contains the email to be parsed. One of in_file
and handle must be specified.
=item handle
Specify the file handle or IO stream that contains the email to be parsed.
=back
=item attach_rfc822
By default, the parser will decode any embeded emails, and flatten out all the
parts. If you prefer to leave embeded emails unparsed, pass in 1 to this option
and the parser will treat it as an attachment.
=back
=head2 parse - Parse an email
Instance method. Parses the email specified by either in_file or handle. Returns
the top level L<GT::Mail::Parts> object. Any additional parameters passed in are
treated the same as if they were passed to the constuctor.
=head2 parse_head - Parse just the header of the email
Instance method. This method is exactly the same as parse except only the top
level header is parsed and it's part object returned. This is useful to keep
overhead down if you only need to know about the header of the email.
=head2 size - Get the size
Instance method. Returns the total size in bytes of the parsed unencoded email. This
method will return undef if no email has been parsed.
=head2 all_parts - Get all parts
Instance method. Returns all the parts in the parsed email. This is a flatened
list of the objects. Somewhat similar to what MIME::Tools does. All the parts
still contain their sub parts.
=head1 COPYRIGHT
Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
http://www.gossamer-threads.com/
=head1 VERSION
Revision: $Id: Parse.pm,v 1.79 2004/10/23 02:16:39 brewt Exp $