# ================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::Mail::Parse # Author : Scott Beck # CVS Info : 087,071,086,086,085 # $Id: Parse.pm,v 1.90 2008/10/29 23:32:07 brewt Exp $ # # Copyright (c) 2005 Gossamer Threads Inc. All Rights Reserved. # ================================================================== package GT::Mail::Parse; # ============================================================================= # If MIME::Base64 is installed use it - must eval before hand or 5.004_04 wipes # our ISA. my $have_b64 = eval { local $SIG{__DIE__}; require MIME::Base64; import MIME::Base64; if ($] < 5.005) { local $^W; decode_base64('brok'); } 1; }; $have_b64 or *decode_base64 = \>_old_decode_base64; my $use_decode_qp; if ($have_b64 and $MIME::Base64::VERSION ge 2.16 and # Prior versions had decoding bugs defined &MIME::QuotedPrint::decode_qp and ( not defined &MIME::QuotedPrint::old_decode_qp or \&MIME::QuotedPrint::decode_qp != \&MIME::QuotedPrint::old_decode_qp ) ) { $use_decode_qp = 1; } # Pragmas use strict; use vars qw($VERSION $DEBUG $ERRORS @ISA); # System modules use Fcntl; # Internal modules use GT::Mail::Parts; use GT::Base; # Inherent from GT::Base for errors and debug @ISA = qw(GT::Base); # Debugging mode $DEBUG = 0; $VERSION = sprintf "%d.%03d", q$Revision: 1.90 $ =~ /(\d+)\.(\d+)/; # Error messages $ERRORS = { PARSE => "An error occurred while parsing: %s", DECODE => "An error occurred while decoding: %s", NOPARTS => "Email has no parts!", DEEPPARTS => "Deep recursion dected, email appears to have more than 50 parts!", MALFORMED => "Found (%s) before finding the start of the boundary. Message malformed" }; my %DecoderFor = ( # Standard... '7bit' => 'NBit', '8bit' => 'NBit', 'base64' => 'Base64', 'binary' => 'Binary', 'none' => 'Binary', 'quoted-printable' => 'QuotedPrint', # Non-standard... 'x-uu' => 'UU', 'x-uuencode' => 'UU', ); sub new { # -------------------------------------------------------------------------- # CLASS->new ( # naming => \&naming, # in_file => '/path/to/file/to/parse', # handle => \*FH # ); # ---------------------------------------------- # Class method to get a new object. Calles init if there are any additional # argument. To set the arguments that are passed to naming call naming # directly. # my $this = shift; my $class = ref $this || $this; my $self = bless { file_handle => undef, parts => [], head_part => undef, headers_intact => 1, _debug => $DEBUG, eol => "\012" }, $class; $self->init(@_) if @_; $self->debug("Created new object ($self).") if $self->{_debug} > 1; return $self; } sub init { # -------------------------------------------------------------------------- # $obj->init (%opts); # ------------------- # Sets the options for the current object. # my $self = shift; my $opt = {}; if (@_ == 1 and ref $_[0] eq 'HASH') { $opt = shift } elsif (defined $_[0] and not @_ % 2) { $opt = {@_} } else { return $self->error("BADARGS", "FATAL", "init") } $self->{_debug} = exists($opt->{debug}) ? $opt->{debug} : $DEBUG; $self->{headers_intact} = exists($opt->{headers_intact}) ? $opt->{headers_intact} : 1; for my $m (qw(crlf in_file in_handle in_string attach_rfc822)) { $self->$m($opt->{$m}) if defined $opt->{$m}; } } sub attach_rfc822 { # -------------------------------------------------------------------------- my $self = shift; if (@_) { $self->{attach_rfc822} = shift; } return $self->{attach_rfc822}; } sub crlf { # ----------------------------------------------------------------------------- # Sets the end-of-line character sequence to use when parsing. This defaults # to \012 (\n); you'll likely want to use \015\012 at times (for example, when # parsing mail downloaded from a POP3 server). This is set on a per-parser # basis (it used to be global, but that was significantly broken). # my ($self, $eol) = @_; $self->{eol} = $eol; } sub parse { # -------------------------------------------------------------------------- # my $top = $obj->parse; # ---------------------- # Parses the email set in new or init. Also calls init if there are any # arguments passed in. # Returns the top level part object. # my ($self, @opts) = @_; # Any additional arguments goto init $self->init(@opts) if @opts; ($self->{string} and ref($self->{string}) eq 'SCALAR') or return $self->error('BADARGS', 'FATAL', "No input was given to parse before parse() was called"); # Recursive function to parse $self->_parse_part(undef, $self->{string}); # parse! # Return top part return $self->{head_part}; } sub parse_head { # -------------------------------------------------------------------------- # my $head = $obj->parse_head; # ---------------------------- # Passes any additional arguments to init. Parses only the top level header. # This saves some overhead if for example all you need to do it find out who # an email is to on a POP3 server. # my ($self, $in, @opts) = @_; unless (ref $self) { $self = $self->new(@opts); } $in ||= $self->{string}; $in || return $self->error("BADARGS", "FATAL", "No string to parse set!"); # Parse the head return $self->_parse_head($in); } #-------------------------------------------- # Access #-------------------------------------------- sub in_handle { # -------------------------------------------------------------------------- # $obj->in_handle (\*FH); # -------------------- # Pass in a file handle to parse from when parse is called. # my ($self, $value) = @_; if (@_ > 1 and ref $value and defined fileno $value) { read $value, ${$self->{string}}, -s $value; } return $self->{string}; } sub in_file { # -------------------------------------------------------------------------- # $obj->in_file ('/path/to/file'); # -------------------------------- # Pass in the path to a file to parse when parse is called # my $self = shift; my $file = shift; my $io = \do { local *FH; *FH }; open $io, "<$file" or return $self->error("READOPEN", "FATAL", $file, $!); return $self->in_handle($io); } sub in_string { # -------------------------------------------------------------------------- my ($self, $string) = @_; return $self->{string} unless (@_ > 1); if (ref($string) eq 'SCALAR') { $self->{string} = $string; } else { $self->{string} = \$string; } return $self->{string}; } sub size { # -------------------------------------------------------------------------- # my $email_size = $obj->size; # ---------------------------- # Returns the total size of an email. Call this method after the email has # been parsed. # my $self = shift; (@{$self->{parts}} > 0) or return $self->error("NOPARTS", "WARN"); my $size = 0; foreach (@{$self->{parts}}) { $size += $_->size; } return $size; } sub all_parts { # -------------------------------------------------------------------------- # my @parts = $obj->all_parts; # ---------------------------- # Returns a list of all the part object for the current parsed email. If the # email is not multipart this will be just the header part. # return @{shift()->{parts}} } sub top_part { # -------------------------------------------------------------------------- return ${shift()->{parts}}[0]; } #--------------------------------------------- # Internal Methods #--------------------------------------------- sub _parse_head { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parse just the head. Returns the part object. # my ($self, $in) = @_; # Get a new part object my $part = GT::Mail::Parts->new(headers_intact => $self->{headers_intact}); if (ref $in eq 'ARRAY') { $part->extract($in) or return $self->error("PARSE", "WARN", "Couldn't parse head!"); return $part; } $part->extract([map "$_$self->{eol}", split /\Q$self->{eol}/, $$in]) or return $self->error($GT::Mail::Parts::error, 'WARN'); return $part; } sub _parse_part { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parses all the parts of an email and stores them in there parts object. # This function is recursive. # my ($self, $outer_bound, $in, $part) = @_; my $state = 'OK'; # First part is going to be the top level part if (!$part) { $part = GT::Mail::Parts->new(headers_intact => $self->{headers_intact}); $self->{head_part} = $part; } push @{$self->{parts}}, $part; # Get the header for this part =for comment According to rfc2045 and rfc2046, the MIME part headers are optional, so for parsing out the headers, we have the following cases: 1) no headers, no body EOL--boundary 2) no headers, body EOLbodyEOL--boundary 3) headers, no body headers[EOL]EOL--boundary 4) headers, body headersEOLbodyEOL--boundary _parse_to_bound parses everything after the header to EOL--boundary, so this header parsing must be careful not to remove the EOL before the --boundary (cases 1 and 3), or _parse_to_bound will parse more than it should. =cut my $eol_len = length $self->{eol}; if (defined $outer_bound and substr($$in, 0, length "$self->{eol}--$outer_bound") eq "$self->{eol}--$outer_bound") { # do nothing } elsif (substr($$in, 0, $eol_len) eq $self->{eol}) { substr($$in, 0, $eol_len) = ''; } else { my $indx = index($$in, $self->{eol} x 2); if ($indx == -1) { $self->debug('Message has no body.') if $self->{_debug}; $indx = length($$in); } $part->extract([map "$_$self->{eol}", split /\Q$self->{eol}/, substr $$in, 0, $indx]) or return $self->warn($GT::Mail::Parts::error); my $trim_len = $eol_len * 2; if (defined $outer_bound) { my $next_bound = "$self->{eol}$self->{eol}--$outer_bound"; if (substr($$in, $indx, length $next_bound) eq $next_bound) { $trim_len = $eol_len; } } substr($$in, 0, $indx + $trim_len) = ''; } # Get the mime type my ($type, $subtype) = split m{/}, $part->mime_type; $type ||= 'text'; $subtype ||= 'plain'; if ($self->{_debug}) { my $name = $part->recommended_filename || '[unnamed]'; $self->debug("Type is '$type/$subtype' ($name)"); } # Deal with the multipart type with some recursion if ($type eq 'multipart') { my $retype = (($subtype eq 'digest') ? 'message/rfc822' : ''); # Find the multipart boundary my $inner_bound = $part->multipart_boundary; $self->debug("Boundary is $inner_bound") if $self->{_debug} > 1; defined $inner_bound or return $self->error("PARSE", "WARN", "No multipart boundary in multipart message."); index($inner_bound, $self->{eol}) == -1 or return $self->error("PARSE", "WARN", "End-of-line character in multipart boundary."); # Parse the Preamble $self->debug("Parsing preamble.") if $self->{_debug} > 1; $state = $self->_parse_preamble($inner_bound, $in, $part) or return; chomp($part->preamble->[-1]) if @{$part->preamble}; # Get all the parts of the multipart message my $partno = 0; my $parts; while (1) { ++$partno < 200 or return $self->error('DEEPPARTS', 'WARN'); $self->debug("Parsing part $partno.") if $self->{_debug}; ($parts, $state) = $self->_parse_part($inner_bound, $in, GT::Mail::Parts->new(headers_intact => $self->{headers_intact})) or return; $parts->mime_type($retype) if $retype; push(@{$part->{parts}}, $parts); if ($state eq 'EOF') { $self->warn(PARSE => 'Unexpected EOF before close.'); return ($part, 'EOF'); } last if $state eq 'CLOSE'; } # Parse the epilogue $self->debug("Parsing epilogue.") if $self->{_debug} > 1; $state = $self->_parse_epilogue($outer_bound, $in, $part) or return; chomp($part->epilogue->[-1]) if @{$part->epilogue} and $state ne 'EOF'; } # We are on a single part else { $self->debug("Decoding single part.") if $self->{_debug} > 1; # Find the encoding for the body of the part my $encoding = $part->mime_encoding || 'binary'; if (!exists($DecoderFor{lc($encoding)})) { $self->debug("Unsupported encoding '$encoding': using 'binary'... \n" . "The entity will have an effective MIME type of \n" . "application/octet-stream, as per RFC-2045.") if $self->{_debug}; $part->effective_type('application/octet-stream'); $encoding = 'binary'; } my $reparse; $reparse = ("$type/$subtype" eq "message/rfc822") unless $self->{attach_rfc822}; my $encoded = ""; # If we have boundaries we parse the body to the boundary if (defined $outer_bound) { $self->debug("Parsing to boundary.") if $self->{_debug} > 1; $state = $self->_parse_to_bound($outer_bound, $in, \$encoded) or return; } # Else we would parse the rest of the input stream as the rest of the message else { $self->debug("No Boundries.") if $self->{_debug} > 1; $encoded = $$in; $state = 'EOF'; } # Normal part so we get the body and decode it. if (!$reparse) { $self->debug("Not reparsing.") if $self->{_debug} > 1; $part->{body_in} = 'MEMORY'; my $decoder = $DecoderFor{lc($encoding)}; $self->debug("Decoding part using: " . lc($encoding)) if $self->{_debug}; $part->{data} = ''; my $out = ''; my $res = $self->$decoder(\$encoded, \$out); undef $encoded; $res or return; $part->{data} = $out; undef $out; } else { # If have an embeded email we reparse it. $self->debug("Reparsing enclosed message.") if $self->{_debug}; my $out = ''; my $decoder = $DecoderFor{lc($encoding)}; $self->debug("Decoding " . lc($encoding)) if $self->{_debug}; my $res = $self->$decoder(\$encoded, \$out); undef $encoded; $res or return; my $p = GT::Mail::Parts->new(headers_intact => $self->{headers_intact}); push @{$part->{parts}}, $p; $self->_parse_part(undef, \$out, $p) or return; } } return ($part, $state); } sub _parse_to_bound { # -------------------------------------------------------------------------- # This method takes a boundary ($bound), an input string ref ($in), and an # output string ref ($out). It will place into $$out the data contained by # $bound, and remove the entire region (including boundary) from $$in. # my ($self, $bound, $in, $out) = @_; # Set up strings for faster checking: $self->debug("Parsing bounds. Skip until\n\tdelim (--$bound)\n\tclose (--$bound--)") if $self->{_debug} > 1; my $ret; # Various shortcut variables - 'e' is eol, 'd' is delimiter, 'c' is closing delimiter: my ($ede, $de, $ece, $ec, $ce) = ( "$self->{eol}--$bound$self->{eol}", "--$bound$self->{eol}", "$self->{eol}--$bound--$self->{eol}", "$self->{eol}--$bound--", "--$bound--$self->{eol}" ); # Place our part in $$out. $$out = undef; # eoldelimeol found anywhere: if ((my $pos = index $$in, $ede) >= 0) { $$out = substr($$in, 0, $pos); substr($$in, 0, $pos + length $ede) = ''; $ret = 'DELIM'; } # delimeol at beginning of string: elsif (substr($$in, 0, length $de) eq $de) { substr($$in, 0, length $de) = ''; $$out = ''; $ret = 'DELIM'; } # eolcloseeol found anywhere: elsif (($pos = index($$in, $ece)) >= 0) { # This code could be much more clearly written as: # #$$out = substr($$in, 0, $pos); #substr($$in, 0, $pos + length $ece) = ''; # # However, that can cause excessive memory usage in some cases (changed in revision 1.59). $$out = $$in; substr($$out, -(length($$out) - $pos)) = ''; my $len = $pos + length($ece) - length($$in); $$in = $len == 0 ? '' : substr($$in, $len); $ret = 'CLOSE'; } # The first eolclose occurs at the end of the string: elsif (index($$in, $ec) == (length($$in) - length($ec))) { $$out = substr($$in, 0, -length($ec)); $$in = ''; $ret = 'CLOSE'; } # closeeol at beginning of string: elsif (substr($$in, 0, length $ce) eq $ce) { $$out = ''; substr($$in, 0, length $ce) = ''; $ret = 'CLOSE'; } # The only thing in the string is the closing boundary: elsif ($$in eq "--$bound--") { $$out = ''; $$in = ''; $ret = 'CLOSE'; } if (defined $$out) { return $ret; } else { # Broken e-mail - we hit the end of the message without finding a boundary. # Assume that everything left is the part body. $$out = $$in; $$in = ''; return 'EOF'; } } sub _parse_preamble { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parses preamble and sets it in part. # my ($self, $inner_bound, $in, $part) = @_; my $delim = "--$inner_bound"; $self->debug("Parsing preamble. Skip until delim ($delim)") if $self->{_debug} > 1; my @saved; $part->preamble(\@saved); my $data; if (substr($$in, 0, length "$delim$self->{eol}") eq "$delim$self->{eol}") { $data = ''; substr($$in, 0, length "$delim$self->{eol}") = ''; } else { if ((my $pos = index($$in, "$self->{eol}$delim$self->{eol}")) >= 0) { $data = substr($$in, 0, $pos); substr($$in, 0, $pos + length("$self->{eol}$delim$self->{eol}")) = ''; } else { return $self->warn(PARSE => "Unable to find opening boundary: $delim\nMessage is probably corrupt."); } } push @saved, split /\Q$self->{eol}/, $data; undef $data; return 'DELIM'; } sub _parse_epilogue { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parses epilogue and sets it in part. # my ($self, $outer_bound, $in, $part) = @_; $self->debug("Parsing epilogue.") if $self->{_debug} > 1; $part->epilogue(\my @saved); if (defined $outer_bound) { my ($delim, $close) = ("--$outer_bound", "--$outer_bound--"); $self->debug("Skip until\n\tdelim ($delim)\n\tclose($close)") if $self->{_debug} > 1; if ($$in =~ s/(.*?)(?:\A|\Q$self->{eol}\E)\Q$delim$self->{eol}//s) { push @saved, split /\Q$self->{eol}/, $1; $self->debug("Found delim($delim)") if $self->{_debug}; return 'DELIM' } elsif ($$in =~ s/(.*?)(?:\A|\Q$self->{eol}\E)\Q$close\E(?:\Z|\Q$self->{eol}\E)//s) { push @saved, split /\Q$self->{eol}/, $1; $self->debug("Found close($close)") if $self->{_debug}; return 'CLOSE' } } push @saved, split /\Q$self->{eol}/, $$in; $$in = ''; $self->debug("EOF: epilogue is " . length(join '', @saved) . " bytes") if $self->{_debug}; return 'EOF'; } sub Base64 { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; # Remove any non base64 characters. $$in =~ tr{A-Za-z0-9+/}{}cd; # Must pass multiple of 4 to decode_base64. Store any remainder in $rem_str and # pad it with trailing equal signs. my $rem = length($$in) % 4; my ($rem_str); if ($rem) { my $pad = '=' x (4 - $rem); $rem_str = substr($$in, length($$in) - $rem); $rem_str .= $pad; substr($$in, $rem * -1) = ''; } $$out = decode_base64($$in); if ($rem) { $$out .= decode_base64($rem_str); } return 1; } sub Binary { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; $$out = $$in; return 1; } sub NBit { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; $$out = $$in; return 1; } sub QuotedPrint { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; if ($use_decode_qp) { $$out = MIME::QuotedPrint::decode_qp($$in); } else { $$out = $$in; $$out =~ s/\r\n/\n/g; # normalize newlines $$out =~ s/[ \t]+\n/\n/g; # rule #3 (trailing whitespace must be deleted) $$out =~ s/=\n//g; # rule #5 (soft line breaks) $$out =~ s/=([\da-fA-F]{2})/chr hex $1/ge; } return 1; } sub UU { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; my ($mode, $file); # Find beginning... while ($$in =~ s/^(.+\Q$self->{eol}\E)//) { local $_ = $1; last if ($mode, $file) = /^begin\s*(\d*)\s*(\S*)/; } return $self->warn("uu decoding: no begin found") if not defined $file; # Decode: while ($$in =~ s/^(.+\Q$self->{eol}\E)//) { local $_ = $1; last if /^end/; next if /[a-z]/; next unless int((((ord() - 32) & 077) + 2) / 3) == int(length($_) / 4); $$out .= unpack('u', $_); } return 1; } sub gt_old_decode_base64 { # -------------------------------------------------------------------------- my $str = shift; my $res = ""; $str =~ tr|A-Za-z0-9+=/||cd; $str =~ s/=+$//; $str =~ tr|A-Za-z0-9+/| -_|; return "" unless length $str; my $uustr = ''; my ($i, $l); $l = length($str) - 60; for ($i = 0; $i <= $l; $i += 60) { $uustr .= "M" . substr($str, $i, 60); } $str = substr($str, $i); # and any leftover chars if ($str ne "") { $uustr .= chr(32 + length($str)*3/4) . $str; } return unpack("u", $uustr); } 1; __END__ =head1 NAME GT::Mail::Parse - MIME Parse =head1 SYNOPSIS use GT::Mail::Parse my $parser = new GT::Mail::Parse ( naming => \&name_files, in_file => '/path/to/file.eml', debug => 1 ); my $top = $parser->parse or die $GT::Mail::Parse::error; - or - my $parser = new GT::Mail::Parse; open FH, '/path/to/file.eml' or die $!; my $top = $parser->parse ( naming => \&name_files, handle => \*FH, debug => 1 ) or die $GT::Mail::Parse::error; close FH; - or - my $parser = new GT::Mail::Parse; my $top_head = $parser->parse_head ( naming => \&name_files, in_file => '/path/to/file.eml', debug => 1 ) or die $GT::Mail::Parse::error; =head1 DESCRIPTION GT::Mail::Parse is a 100% rfc822 email MIME parser that supports unlimited nested levels of MIME. Emails are parsed into L objects. Each part knows where it's body is and each part contains it's sub parts. See L for details on parts methods. =head2 new - Constructor method This is the constructor method to get a GT::Mail::Parse object, which you need to access all the methods (there are no Class methods). new() takes a hash or hash ref as it's arguments. Each key has an accessor method by the same name except debug, which can only be set by passing debug to new(), parse() or parse_head(). =over 4 =item debug Sets the debug level for this insance of the class. =item naming Specify a code reference to use as a naming convention for each part of the email being parsed. This is useful to keep file IO down when you want the emails seperated into each part as a file. If this is not specified GT::Mail::Parse uses a default naming, which is to start at one and incriment that number for each attachment. The attachments would go in the current working directory. =item in_file Specify the path to the file that contains the email to be parsed. One of in_file and handle must be specified. =item handle Specify the file handle or IO stream that contains the email to be parsed. =item attach_rfc822 By default, the parser will decode any embeded emails, and flatten out all the parts. If you prefer to leave embeded emails unparsed, pass in 1 to this option and the parser will treat it as an attachment. =back =head2 parse - Parse an email Instance method. Parses the email specified by either in_file or handle. Returns the top level L object. Any additional parameters passed in are treated the same as if they were passed to the constuctor. =head2 parse_head - Parse just the header of the email Instance method. This method is exactly the same as parse except only the top level header is parsed and it's part object returned. This is useful to keep overhead down if you only need to know about the header of the email. =head2 size - Get the size Instance method. Returns the total size in bytes of the parsed unencoded email. This method will return undef if no email has been parsed. =head2 all_parts - Get all parts Instance method. Returns all the parts in the parsed email. This is a flatened list of the objects. Somewhat similar to what MIME::Tools does. All the parts still contain their sub parts. =head1 COPYRIGHT Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved. http://www.gossamer-threads.com/ =head1 VERSION Revision: $Id: Parse.pm,v 1.90 2008/10/29 23:32:07 brewt Exp $