package Text::ParseWords; require 5.000; require Exporter; require AutoLoader; use Carp; @ISA = qw(Exporter AutoLoader); @EXPORT = qw(shellwords quotewords); @EXPORT_OK = qw(old_shellwords); # This code needs updating to use new Perl 5 features (regexp etc). # ParseWords.pm # # Usage: # use ParseWords; # @words = "ewords($delim, $keep, @lines); # @words = &shellwords(@lines); # @words = &old_shellwords(@lines); # Hal Pomeranz (pomeranz@netcom.com), 23 March 1994 # Permission to use and distribute under the same terms as Perl. # No warranty expressed or implied. # Basically an update and generalization of the old shellwords.pl. # Much code shamelessly stolen from the old version (author unknown). # # "ewords() accepts a delimiter (which can be a regular expression) # and a list of lines and then breaks those lines up into a list of # words ignoring delimiters that appear inside quotes. # # The $keep argument is a boolean flag. If true, the quotes are kept # with each word, otherwise quotes are stripped in the splitting process. # $keep also defines whether unprotected backslashes are retained. # 1; __END__ sub shellwords { # A &shellwords() replacement is included to demonstrate the new package. # This version differs from the original in that it will _NOT_ default # to using $_ if no arguments are given. I personally find the old behavior # to be a mis-feature. local(@lines) = @_; $lines[$#lines] =~ s/\s+$//; "ewords('\s+', 0, @lines); } sub quotewords { # "ewords() works by simply jamming all of @lines into a single # string in $_ and then pulling off words a bit at a time until $_ # is exhausted. # # The inner "for" loop builds up each word (or $field) one $snippet # at a time. A $snippet is a quoted string, a backslashed character, # or an unquoted string. We fall out of the "for" loop when we reach # the end of $_ or when we hit a delimiter. Falling out of the "for" # loop, we push the $field we've been building up onto the list of # @words we'll be returning, and then loop back and pull another word # off of $_. # # The first two cases inside the "for" loop deal with quoted strings. # The first case matches a double quoted string, removes it from $_, # and assigns the double quoted string to $snippet in the body of the # conditional. The second case handles single quoted strings. In # the third case we've found a quote at the current beginning of $_, # but it didn't match the quoted string regexps in the first two cases, # so it must be an unbalanced quote and we croak with an error (which can # be caught by eval()). # # The next case handles backslashed characters, and the next case is the # exit case on reaching the end of the string or finding a delimiter. # # Otherwise, we've found an unquoted thing and we pull of characters one # at a time until we reach something that could start another $snippet-- # a quote of some sort, a backslash, or the delimiter. This one character # at a time behavior was necessary if the delimiter was going to be a # regexp (love to hear it if you can figure out a better way). local($delim, $keep, @lines) = @_; local(@words,$snippet,$field,$_); $_ = join('', @lines); while ($_) { $field = ''; for (;;) { $snippet = ''; if (s/^"(([^"\\]|\\[\\"])*)"//) { $snippet = $1; $snippet = "\"$snippet\"" if ($keep); } elsif (s/^'(([^'\\]|\\[\\'])*)'//) { $snippet = $1; $snippet = "'$snippet'" if ($keep); } elsif (/^["']/) { croak "Unmatched quote"; } elsif (s/^\\(.)//) { $snippet = $1; $snippet = "\\$snippet" if ($keep); } elsif (!$_ || s/^$delim//) { last; } else { while ($_ && !(/^$delim/ || /^['"\\]/)) { $snippet .= substr($_, 0, 1); substr($_, 0, 1) = ''; } } $field .= $snippet; } push(@words, $field); } @words; } sub old_shellwords { # Usage: # use ParseWords; # @words = old_shellwords($line); # or # @words = old_shellwords(@lines); local($_) = join('', @_); my(@words,$snippet,$field); s/^\s+//; while ($_ ne '') { $field = ''; for (;;) { if (s/^"(([^"\\]|\\.)*)"//) { ($snippet = $1) =~ s#\\(.)#$1#g; } elsif (/^"/) { croak "Unmatched double quote: $_"; } elsif (s/^'(([^'\\]|\\.)*)'//) { ($snippet = $1) =~ s#\\(.)#$1#g; } elsif (/^'/) { croak "Unmatched single quote: $_"; } elsif (s/^\\(.)//) { $snippet = $1; } elsif (s/^([^\s\\'"]+)//) { $snippet = $1; } else { s/^\s+//; last; } $field .= $snippet; } push(@words, $field); } @words; }