discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/GT/Text/Tools.pm
2024-06-17 21:49:12 +10:00

214 lines
7.3 KiB
Perl

# ==================================================================
# Gossamer Threads Module Library - http://gossamer-threads.com/
#
# GT::Text::Tools
# Author : Scott Beck
# CVS Info : 087,071,086,086,085
# $Id: Tools.pm,v 1.9 2005/06/09 23:42:16 brewt Exp $
#
# Copyright (c) 2004 Gossamer Threads Inc. All Rights Reserved.
# ==================================================================
#
# Description: A general purpose text parsing module.
#
package GT::Text::Tools;
# ==================================================================
# Pragmas
use strict;
# Internal mules
use bases 'GT::Base' => '';
sub linesplit {
# --------------------------------------------------------------------
# my @words = GT::Text::Tools->linesplit($regex, $line);
# ------------------------------------------------------
# Splits $line by $regex outside of quotes ['"]
# If regex is false defaults to \s+.
#
# Ganged and modified from Text::ParseWords
local $^W;
my ($class, $delimiter, $line) = @_;
$delimiter ||= '\s+';
$delimiter =~ s/(\s)/\\$1/g;
my ($quote, $quoted, $unquoted, $delim, $word, @pieces);
while (length($line)) {
($quote, $quoted, undef, $unquoted, $delim, undef) =
$line =~ m/^(["']) # a $quote
((?:\\.|(?!\1)[^\\])*) # and $quoted text
\1 # followed by the same quote
([\000-\377]*) # and the rest
| # --OR--
^((?:\\.|[^\\"'])*?) # an $unquoted text
(\Z(?!\n)|(?:$delimiter)|(?!^)(?=["']))
# plus EOL, delimiter, or quote
([\000-\377]*) # the rest
/x; # extended layout
return () unless ( $quote || length($unquoted) || length($delim));
$line = $+;
$quoted = "$quote$quoted$quote";
$word .= defined $quote ? $quoted : $unquoted;
if (length($delim)) {
push(@pieces, $word);
undef $word;
}
if (!length($line)) {
push(@pieces, $word);
}
}
return (@pieces);
}
sub linewrap {
# --------------------------------------------------------------------
# GT::Text::Tools->linewrap( $string, $number, {
# nowrap => $regexs,
# eol => "\n",
# max_line_length => 50000
# });
# ----------------------------------------------
# linewrap takes a string, a number of characters per line and a
# hash ref of options. String will be wrapped to the number of
# characters specified on spaces.
# The following options apply:
# nowrap => array ref of regexes that if matched, will
# not be wrapped.
# eol => What to wrap the lines with, defaults to
# \n.
# eol_match => What to use to match eol characters; defaults to
# \r?\n
# max_line_length => The maximum length a line can be that will
# be wrapped on a space. Any line reaching
# this length will be wrapped without
# looking for spaces. Defaults to 50_000, set
# to non-true value to avoid this affect.
#
my ($class, $string, $i, $opts) = @_;
my $max_len = exists($opts->{max_line_length}) ? $opts->{max_line_length} : 50_000;
my $regexs = $opts->{nowrap} || [];
my $nl = $opts->{eol} || "\n";
my $eolre = $opts->{eol_match} || "\r?\n";
$regexs = (ref($regexs) eq 'ARRAY') ? $regexs : [$regexs || ()];
my @t = split /$eolre/, $string;
my $r = "";
while (@t) {
my $match = 0;
if (length $t[0] <= $i) {
$r .= shift(@t) . $nl;
$match = 1;
}
elsif ($t[0] =~ /^\s*$/) {
my $spaces = shift @t;
# Keep the string of spaces unless it's too long (don't bother wrapping them)
$r .= (length $spaces <= $i ? $spaces : '') . $nl;
$match = 1;
}
elsif ($max_len and length $t[0] > $max_len) { # Line is too long.
my $line = shift @t;
while ($line) {
$r .= substr($line, 0, $i) . $nl;
substr($line, 0, $i) = '';
}
$match = 1;
}
elsif (@{$regexs}) {
my $regex = join('|', @{$regexs});
if ($t[0] =~ m/$regex/) {
my $eos = ''; # Store any incomplete lines
while ($t[0] =~ s/^(.*?)(\s?)((?:$regex)\s?)//) {
my $pre = _wrap($i, $nl, $eos . $1);
$eos = '';
my $s = $2 || '';
my $mat = $3;
if (!length($pre) or $pre =~ /$nl$/) {
$r .= $pre;
if (length $mat > $i) {
$r .= $mat . $nl;
}
else {
$eos = $mat;
}
}
else {
$pre =~ s/($nl|^)(.*?)$//;
$r .= $pre . $1;
my $leftover = $2;
if (length($leftover . $s . $mat) <= $i) {
$eos = $leftover . $s . $mat;
}
else {
$r .= $leftover . $nl;
if (length $mat > $i) {
$r .= $mat . $nl;
}
else {
$eos = $mat;
}
}
}
}
$eos .= $t[0] if length $t[0];
if (length $eos) {
$r .= _wrap($i, $nl, $eos) . $nl;
}
shift(@t);
$match = 1;
}
}
next if $match;
$r .= _wrap($i, $nl, shift(@t) || '') . $nl;
}
return $r;
}
sub _wrap {
# --------------------------------------------------------------------
# _wrap($length, $newline, $string);
# ----------------------------
# Internal method called by linewrap() to wrap a line.
#
my ($i, $e);
$i = $e = shift;
my $nl = shift;
my $r;
defined $_[0] or return '';
if (length $_[0] < $i) { return $_[0]; }
while (@_) {
defined($_[0]) or last;
if ($_[0] =~ /^(.{$i})\s(.+)$/) {
shift() and $r .= $1 . $nl;
$i = $e;
if (defined($2) and length($2) <= $e) {
$r .= $2;
$r .= $nl if length($2) == $e;
}
else {
unshift(@_, $2);
}
}
elsif ($i-- == 0) {
$i = $e;
shift() =~ /^(.{$i})(.+)$/ and $r .= $1 . $nl;
if (defined($2) and length($2) <= $e) {
$r .= $2;
$r .= $nl if length($2) == $e;
}
else {
unshift(@_, $2)
}
}
}
return defined($r) ? $r : '';
}
1;