6. Pattern Matching

Introduction

#-----------------------------
match( $string, $pattern );
subst( $string, $pattern, $replacement );
#-----------------------------
$meadow =~ m/sheep/;   # True if $meadow contains "sheep"
$meadow !~ m/sheep/;   # True if $meadow doesn't contain "sheep"
$meadow =~ s/old/new/; # Replace "old" with "new" in $meadow
#-----------------------------
# Fine bovines demand fine toreadors.
# Muskoxen are a polar ovibovine species.
# Grooviness went out of fashion decades ago.
#-----------------------------
# Ovines are found typically in oviaries.
#-----------------------------
if ($meadow =~ /\bovines?\b/i) { print "Here be sheep!" }
#-----------------------------
$string = "good food";
$string =~ s/o*/e/;
#-----------------------------
# good food
# 
# geod food
# 
# geed food
# 
# geed feed
# 
# ged food
# 
# ged fed
# 
# egood food
#-----------------------------
#% echo ababacaca | perl -ne 'print "$&\n" if /(a|ba|b)+(a|ac)+/'
#ababa
#-----------------------------
#% echo ababacaca | 
#    awk 'match($0,/(a|ba|b)+(a|ac)+/) { print substr($0, RSTART, RLENGTH) }'
#ababacaca
#-----------------------------
while (m/(\d+)/g) {
    print "Found number $1\n";
}
#-----------------------------
@numbers = m/(\d+)/g;
#-----------------------------
$digits = "123456789";
@nonlap = $digits =~ /(\d\d\d)/g;
@yeslap = $digits =~ /(?=(\d\d\d))/g;
print "Non-overlapping:  @nonlap\n";
print "Overlapping:      @yeslap\n";
# Non-overlapping:  123 456 789

# Overlapping:      123 234 345 456 567 678 789
#-----------------------------
$string = "And little lambs eat ivy";
$string =~ /l[^s]*s/;
print "($`) ($&) ($')\n";
# (And ) (little lambs) ( eat ivy)
#-----------------------------

Copying and Substituting Simultaneously

#-----------------------------
$dst = $src;
$dst =~ s/this/that/;
#-----------------------------
($dst = $src) =~ s/this/that/;
#-----------------------------
# strip to basename
($progname = $0)        =~ s!^.*/!!;

# Make All Words Title-Cased
($capword  = $word)     =~ s/(\w+)/\u\L$1/g;

# /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1
($catpage  = $manpage)  =~ s/man(?=\d)/cat/;
#-----------------------------
@bindirs = qw( /usr/bin /bin /usr/local/bin );
for (@libdirs = @bindirs) { s/bin/lib/ }
print "@libdirs\n";
# /usr/lib /lib /usr/local/lib
#-----------------------------
($a =  $b) =~ s/x/y/g;      # copy $b and then change $a
 $a = ($b  =~ s/x/y/g);     # change $b, count goes in $a
#-----------------------------

Matching Letters

#-----------------------------
if ($var =~ /^[A-Za-z]+$/) {
    # it is purely alphabetic
}
#-----------------------------
use locale;
if ($var =~ /^[^\W\d_]+$/) {
    print "var is purely alphabetic\n";
}
#-----------------------------
use locale;
use POSIX 'locale_h';

# the following locale string might be different on your system
unless (setlocale(LC_ALL, "fr_CA.ISO8859-1")) {
    die "couldn't set locale to French Canadian\n";
}

while (<DATA>) {
    chomp;
    if (/^[^\W\d_]+$/) {
        print "$_: alphabetic\n";
    } else {
        print "$_: line noise\n";
    }
}

#__END__
#silly
#façade
#coöperate
#niño
#Renée
#Molière
#hæmoglobin
#naïve
#tschüß
#random!stuff#here
#-----------------------------

Matching Words

#-----------------------------
#/\S+/               # as many non-whitespace bytes as possible
#/[A-Za-z'-]+/       # as many letters, apostrophes, and hyphens
#-----------------------------
#/\b([A-Za-z]+)\b/            # usually best
#/\s([A-Za-z]+)\s/            # fails at ends or w/ punctuation
#-----------------------------

Commenting Regular Expressions

#-----------------------------
# download the following standalone program
#!/usr/bin/perl -p
# resname - change all "foo.bar.com" style names in the input stream
# into "foo.bar.com [204.148.40.9]" (or whatever) instead

use Socket;                 # load inet_addr
s{                          #
    (                       # capture the hostname in $1
        (?:                 # these parens for grouping only
            (?! [-_]  )     # lookahead for neither underscore nor dash
            [\w-] +         # hostname component
            \.              # and the domain dot
        ) +                 # now repeat that whole thing a bunch of times
        [A-Za-z]            # next must be a letter
        [\w-] +             # now trailing domain part
    )                       # end of $1 capture
}{                          # replace with this:
    "$1 " .                 # the original bit, plus a space
           ( ($addr = gethostbyname($1))   # if we get an addr
            ? "[" . inet_ntoa($addr) . "]" #        format it
            : "[???]"                      # else mark dubious
           )
}gex;               # /g for global
                    # /e for execute
                    # /x for nice formatting

#-----------------------------
s/                  # replace
  \#                #   a pound sign
  (\w+)             #   the variable name
  \#                #   another pound sign
/${$1}/xg;          # with the value of the global variable
##-----------------------------
s/                  # replace
\#                  #   a pound sign
(\w+)               #   the variable name
\#                  #   another pound sign
/'$' . $1/xeeg;     # ' with the value of *any* variable
#-----------------------------

Finding the Nth Occurrence of a Match

#-----------------------------
# One fish two fish red fish blue fish
#-----------------------------
$WANT = 3;
$count = 0;
while (/(\w+)\s+fish\b/gi) {
    if (++$count == $WANT) {
        print "The third fish is a $1 one.\n";
        # Warning: don't `last' out of this loop
    }
}
# The third fish is a red one.
#-----------------------------
/(?:\w+\s+fish\s+){2}(\w+)\s+fish/i;
#-----------------------------
# simple way with while loop
$count = 0;
while ($string =~ /PAT/g) {
    $count++;               # or whatever you'd like to do here
}

# same thing with trailing while
$count = 0;
$count++ while $string =~ /PAT/g;

# or with for loop
for ($count = 0; $string =~ /PAT/g; $count++) { }
    
# Similar, but this time count overlapping matches
$count++ while $string =~ /(?=PAT)/g;
#-----------------------------
$pond  = 'One fish two fish red fish blue fish';

# using a temporary
@colors = ($pond =~ /(\w+)\s+fish\b/gi);      # get all matches
$color  = $colors[2];                         # then the one we want

# or without a temporary array
$color = ( $pond =~ /(\w+)\s+fish\b/gi )[2];  # just grab element 3

print "The third fish in the pond is $color.\n";
# The third fish in the pond is red.
#-----------------------------
$count = 0;
$_ = 'One fish two fish red fish blue fish';
@evens = grep { $count++ % 2 == 1 } /(\w+)\s+fish\b/gi;
print "Even numbered fish are @evens.\n";
# Even numbered fish are two blue.
#-----------------------------
$count = 0;
s{
   \b               # makes next \w more efficient
   ( \w+ )          # this is what we'll be changing
   (
     \s+ fish \b
   )
}{
    if (++$count == 4) {
        "sushi" . $2;
    } else {
         $1   . $2;
    }
}gex;
# One fish two fish red fish sushi fish
#-----------------------------
$pond = 'One fish two fish red fish blue fish swim here.';
$color = ( $pond =~ /\b(\w+)\s+fish\b/gi )[-1];
print "Last fish is $color.\n";
# Last fish is blue.
#-----------------------------
m{
    A               # find some pattern A
    (?!             # mustn't be able to find
        .*          # something
        A           # and A
    )
    $               # through the end of the string
}x
#-----------------------------
$pond = 'One fish two fish red fish blue fish swim here.';
if ($pond =~ m{
                    \b  (  \w+) \s+ fish \b
                (?! .* \b fish \b )
            }six )
{
    print "Last fish is $1.\n";
} else {
    print "Failed!\n";
}
# Last fish is blue.
#-----------------------------

Matching Multiple Lines

#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# killtags - very bad html tag killer
undef $/;           # each read is whole file
while (<>) {        # get one whole file at a time
    s/<.*?>//gs;    # strip tags (terribly)
    print;          # print file to STDOUT
}

#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# headerfy: change certain chapter headers to html
$/ = '';
while ( <> ) {              # fetch a paragraph
    s{
        \A                  # start of record
        (                   # capture in $1
            Chapter         # text string
            \s+             # mandatory whitespace
            \d+             # decimal number
            \s*             # optional whitespace
            :               # a real colon
            . *             # anything not a newline till end of line
        )
    }{<H1>$1</H1>}gx;
    print;
}

#-----------------------------
#% perl -00pe 's{\A(Chapter\s+\d+\s*:.*)}{<H1>$1</H1>}gx' datafile
#-----------------------------
$/ = '';            # paragraph read mode for readline access
while (<ARGV>) {
    while (m#^START(.*?)^END#sm) {  # /s makes . span line boundaries
                                    # /m makes ^ match near newlines
        print "chunk $. in $ARGV has <<$1>>\n";
    }
}
#-----------------------------

Reading Records with a Pattern Separator

#-----------------------------
undef $/;
@chunks = split(/pattern/, <FILEHANDLE>);
#-----------------------------
# .Ch, .Se and .Ss divide chunks of STDIN
{
    local $/ = undef;
    @chunks = split(/^\.(Ch|Se|Ss)$/m, <>);
}
print "I read ", scalar(@chunks), " chunks.\n";
#-----------------------------

Extracting a Range of Lines

#-----------------------------
while (<>) {
    if (/BEGIN PATTERN/ .. /END PATTERN/) {
        # line falls between BEGIN and END in the
        # text, inclusive.
    }
}

while (<>) {
    if ($FIRST_LINE_NUM .. $LAST_LINE_NUM) {
        # operate only between first and last line, inclusive.
    }
}
#-----------------------------
while (<>) {
    if (/BEGIN PATTERN/ ... /END PATTERN/) {
        # line is between BEGIN and END on different lines
    }
}

while (<>) {
    if ($FIRST_LINE_NUM ... $LAST_LINE_NUM) {
        # operate only between first and last line, but not same
    }
}
#-----------------------------
# command-line to print lines 15 through 17 inclusive (see below)
perl -ne 'print if 15 .. 17' datafile

# print out all <XMP> .. </XMP> displays from HTML doc
while (<>) {
    print if m#<XMP>#i .. m#</XMP>#i;
}
    
# same, but as shell command
# perl -ne 'print if m#<XMP>#i .. m#</XMP>#i' document.html
#-----------------------------
# perl -ne 'BEGIN { $top=3; $bottom=5 }  print if $top .. $bottom' /etc/passwd        # previous command FAILS
# perl -ne 'BEGIN { $top=3; $bottom=5 } \
#     print if $. == $top .. $. ==     $bottom' /etc/passwd    # works
# perl -ne 'print if 3 .. 5' /etc/passwd   # also works
#-----------------------------
print if /begin/ .. /end/;
print if /begin/ ... /end/;
#-----------------------------
while (<>) {
    $in_header =   1  .. /^$/;
    $in_body   = /^$/ .. eof();
}
#-----------------------------
%seen = ();
while (<>) {
    next unless /^From:?\s/i .. /^$/;
    while (/([^<>(),;\s]+\@[^<>(),;\s]+)/g) {
        print "$1\n" unless $seen{$1}++;
    }
}
#-----------------------------

Matching Shell Globs as Regular Expressions

#-----------------------------
sub glob2pat {
    my $globstr = shift;
    my %patmap = (
         '*' => '.*',
         '?' => '.',
         '[' => '[',
         ']' => ']',
    );
    $globstr =~ s{(.)} { $patmap{$1} || "\Q$1" }ge;
    return '^' . $globstr . '$'; #'
}
#-----------------------------

Speeding Up Interpolated Matches

#-----------------------------
while ($line = <>) {
    if ($line =~ /$pattern/o) {
        # do something
    }
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep1 - grep for abbreviations of places that say "pop"
# version 1: slow but obvious way
@popstates = qw(CO ON MI WI MN);
LINE: while (defined($line = <>)) {
    for $state (@popstates) {
        if ($line =~ /\b$state\b/) {
            print; next LINE;
       }
    }
}

#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep2 - grep for abbreviations of places that say "pop"
# version 2: eval strings; fast but hard to quote
@popstates = qw(CO ON MI WI MN);
$code = 'while (defined($line = <>)) {';
for $state (@popstates) {
    $code .= "\tif (\$line =~ /\\b$state\\b/) { print \$line; next; }\n";
}
$code .= '}';
print "CODE IS\n----\n$code\n----\n" if 0;  # turn on to debug
eval $code;
die if $@;

#-----------------------------
while (defined($line = <>)) {
     if ($line =~ /\bCO\b/) { print $line; next; }
     if ($line =~ /\bON\b/) { print $line; next; }
     if ($line =~ /\bMI\b/) { print $line; next; }
     if ($line =~ /\bWI\b/) { print $line; next; }
     if ($line =~ /\bMN\b/) { print $line; next; }
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep3 - grep for abbreviations of places that say "pop"
# version 3: use build_match_func algorithm
@popstates = qw(CO ON MI WI MN);
    $expr = join('||', map { "m/\\b\$popstates[$_]\\b/o" } 0..$#popstates);
$match_any = eval "sub { $expr }";
die if $@;
while (<>) {
    print if &$match_any;
}

#-----------------------------
sub {
      m/\b$popstates[0]\b/o || m/\b$popstates[1]\b/o ||
      m/\b$popstates[2]\b/o || m/\b$popstates[3]\b/o ||
      m/\b$popstates[4]\b/o
  }
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# grepauth - print lines that mention both Tom and Nat

$multimatch = build_match_all(q/Tom/, q/Nat/);
while (<>) {
    print if &$multimatch;
}
exit;

sub build_match_any { build_match_func('||', @_) }
sub build_match_all { build_match_func('&&', @_) }
sub build_match_func {
    my $condition = shift;
    my @pattern = @_;  # must be lexical variable, not dynamic one
    my $expr = join $condition => map { "m/\$pattern[$_]/o" } (0..$#pattern);
    my $match_func = eval "sub { local \$_ = shift if \@_; $expr }";
    die if $@;  # propagate $@; this shouldn't happen!
    return $match_func;
}

#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep4 - grep for abbreviations of places that say "pop"
# version 4: use Regexp module
use Regexp;
@popstates = qw(CO ON MI WI MN);
@poppats   = map { Regexp->new( '\b' . $_ . '\b') } @popstates;
while (defined($line = <>)) {
    for $patobj (@poppats) {
        print $line if $patobj->match($line);
    }
}

#-----------------------------

Testing for a Valid Pattern

#-----------------------------
do {
    print "Pattern? ";
    chomp($pat = <>);
    eval { "" =~ /$pat/ };
    warn "INVALID PATTERN $@" if $@;
} while $@;
#-----------------------------
sub is_valid_pattern {
    my $pat = shift;
    return eval { "" =~ /$pat/; 1 } || 0;
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# paragrep - trivial paragraph grepper
die "usage: $0 pat [files]\n" unless @ARGV;
$/ = '';
$pat = shift;
eval { "" =~ /$pat/; 1 }      or die "$0: Bad pattern $pat: $@\n";
while (<>) {
    print "$ARGV $.: $_" if /$pat/o;
}

#-----------------------------
$pat = "You lose @{[ system('rm -rf *')]} big here";
#-----------------------------
$safe_pat = quotemeta($pat);
something() if /$safe_pat/;
#-----------------------------
something() if /\Q$pat/;
#-----------------------------

Honoring Locale Settings in Regular Expressions

#-----------------------------
use locale;
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# localeg - demonstrate locale effects

use locale;
use POSIX 'locale_h';

$name = "andreas k\xF6nig";
@locale{qw(German English)} = qw(de_DE.ISO_8859-1 us-ascii);
setlocale(LC_CTYPE, $locale{English})
  or die "Invalid locale $locale{English}";
@english_names = ();
while ($name =~ /\b(\w+)\b/g) {
        push(@english_names, ucfirst($1));
}
setlocale(LC_CTYPE, $locale{German})
  or die "Invalid locale $locale{German}";
@german_names = ();
while ($name =~ /\b(\w+)\b/g) {
        push(@german_names, ucfirst($1));
}
print "English names: @english_names\n";
print "German names:  @german_names\n";

English names: Andreas K Nig

German names:  Andreas König
#-----------------------------

Approximate Matching

#-----------------------------
use String::Approx qw(amatch);

if (amatch("PATTERN", @list)) {
    # matched
}

@matches = amatch("PATTERN", @list);
#-----------------------------
use String::Approx qw(amatch);
open(DICT, "/usr/dict/words")               or die "Can't open dict: $!";
while(<DICT>) {
    print if amatch("balast");
}

ballast

balustrade

blast

blastula

sandblast
#-----------------------------

Matching from Where the Last Pattern Left Off

#-----------------------------
while (/(\d+)/g) {
    print "Found $1\n";
}
#-----------------------------
$n = "   49 here";
$n =~ s/\G /0/g;
print $n;
00049 here
#-----------------------------
while (/\G,?(\d+)/g) {
    print "Found number $1\n";
}
#-----------------------------
$_ = "The year 1752 lost 10 days on the 3rd of September";

while (/(\d+)/gc) {
    print "Found number $1\n";
}

if (/\G(\S+)/g) {
    print "Found $1 after the last number.\n";
}

#Found number 1752
#
#Found number 10
#
#Found number 3
#
#Found rd after the last number.
#-----------------------------
print "The position in \$a is ", pos($a);
pos($a) = 30;
print "The position in \$_ is ", pos;
pos = 30;
#-----------------------------

Greedy and Non-Greedy Matches

#-----------------------------
# greedy pattern
s/<.*>//gs;                     # try to remove tags, very badly

# non-greedy pattern
s/<.*?>//gs;                    # try to remove tags, still rather badly
#-----------------------------
#<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>
#-----------------------------
m{ <b><i>(.*?)</i></b> }sx
#-----------------------------
/BEGIN((?:(?!BEGIN).)*)END/
#-----------------------------
m{ <b><i>(  (?: (?!</b>|</i>). )*  ) </i></b> }sx
#-----------------------------
m{ <b><i>(  (?: (?!</[ib]>). )*  ) </i></b> }sx
#-----------------------------
m{
    <b><i> 
    [^<]*  # stuff not possibly bad, and not possibly the end.
    (?:
 # at this point, we can have '<' if not part of something bad
     (?!  </?[ib]>  )   # what we can't have
     <                  # okay, so match the '<'
     [^<]*              # and continue with more safe stuff
    ) *
    </i></b>
 }sx
#-----------------------------

Detecting Duplicate Words

#-----------------------------
$/ = '';                      # paragrep mode
while (<>) {
    while ( m{
                \b            # start at a word boundary (begin letters)
                (\S+)         # find chunk of non-whitespace
                \b            # until another word boundary (end letters)
                (
                    \s+       # separated by some whitespace
                    \1        # and that very same chunk again
                    \b        # until another word boundary
                ) +           # one or more sets of those
             }xig
         )
    {
        print "dup word '$1' at paragraph $.\n";
    }
}
#-----------------------------
This is a test
test of the duplicate word finder.
#-----------------------------
$a = 'nobody';
$b = 'bodysnatcher';
if ("$a $b" =~ /^(\w+)(\w+) \2(\w+)$/) {
    print "$2 overlaps in $1-$2-$3\n";
}
body overlaps in no-body-snatcher
#-----------------------------
/^(\w+?)(\w+) \2(\w+)$/, 
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# prime_pattern -- find prime factors of argument using pattern matching
for ($N = ('o' x shift); $N =~ /^(oo+?)\1+$/; $N =~ s/$1/o/g) {
    print length($1), " ";
}
print length ($N), "\n";

#-----------------------------
# solve for 12x + 15y + 16z = 281, maximizing x
if (($X, $Y, $Z)  =
   (('o' x 281)  =~ /^(o*)\1{11}(o*)\2{14}(o*)\3{15}$/))
{
    ($x, $y, $z) = (length($X), length($Y), length($Z));
    print "One solution is: x=$x; y=$y; z=$z.\n";
} else {
    print "No solution.\n";
}
#One solution is: x=17; y=3; z=2.
#-----------------------------
('o' x 281)  =~ /^(o+)\1{11}(o+)\2{14}(o+)\3{15}$/;
#One solution is: x=17; y=3; z=2

('o' x 281)  =~ /^(o*?)\1{11}(o*)\2{14}(o*)\3{15}$/;
#One solution is: x=0; y=7; z=11.

('o' x 281)  =~ /^(o+?)\1{11}(o*)\2{14}(o*)\3{15}$/;
#One solution is: x=1; y=3; z=14.
#-----------------------------

Expressing AND, OR, and NOT in a Single Pattern

#-----------------------------
chomp($pattern = <CONFIG_FH>);
if ( $data =~ /$pattern/ ) { ..... }
#-----------------------------
/ALPHA|BETA/;
#-----------------------------
/^(?=.*ALPHA)(?=.*BETA)/s;
#-----------------------------
/ALPHA.*BETA|BETA.*ALPHA/s;
#-----------------------------
/^(?:(?!PAT).)*$/s;
#-----------------------------
/(?=^(?:(?!BAD).)*$)GOOD/s;
#-----------------------------
if (!($string =~ /pattern/)) { something() }   # ugly
if (  $string !~ /pattern/)  { something() }   # preferred
#-----------------------------
if ($string =~ /pat1/ && $string =~ /pat2/ ) { 
something
() }
#-----------------------------
if ($string =~ /pat1/ || $string =~ /pat2/ ) { 
something
() }
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# minigrep - trivial grep
$pat = shift;
while (<>) {
    print if /$pat/o;
}

#-----------------------------
 "labelled" =~ /^(?=.*bell)(?=.*lab)/s
#-----------------------------
$string =~ /bell/ && $string =~ /lab/
#-----------------------------
 if ($murray_hill =~ m{
             ^              # start of string
            (?=             # zero-width lookahead
                .*          # any amount of intervening stuff
                bell        # the desired bell string
            )               # rewind, since we were only looking
            (?=             # and do the same thing
                .*          # any amount of intervening stuff
                lab         # and the lab part
            )
         }sx )              # /s means . can match newline
{
    print "Looks like Bell Labs might be in Murray Hill!\n";
}
#-----------------------------
"labelled" =~ /(?:^.*bell.*lab)|(?:^.*lab.*bell)/
#-----------------------------
$brand = "labelled";
if ($brand =~ m{
        (?:                 # non-capturing grouper
            ^ .*?           # any amount of stuff at the front
              bell          # look for a bell
              .*?           # followed by any amount of anything
              lab           # look for a lab
          )                 # end grouper
    |                       # otherwise, try the other direction
        (?:                 # non-capturing grouper
            ^ .*?           # any amount of stuff at the front
              lab           # look for a lab
              .*?           # followed by any amount of anything
              bell          # followed by a bell
          )                 # end grouper
    }sx )                   # /s means . can match newline
{
    print "Our brand has bell and lab separate.\n";
}
#-----------------------------
$map =~ /^(?:(?!waldo).)*$/s
#-----------------------------
if ($map =~ m{
        ^                   # start of string
        (?:                 # non-capturing grouper
            (?!             # look ahead negation
                waldo       # is he ahead of us now?
            )               # is so, the negation failed
            .               # any character (cuzza /s)
        ) *                 # repeat that grouping 0 or more
        $                   # through the end of the string
    }sx )                   # /s means . can match newline
{
    print "There's no waldo here!\n";
}
#-----------------------------
 7:15am  up 206 days, 13:30,  4 users,  load average: 1.04, 1.07, 1.04

USER     TTY      FROM              LOGIN@  IDLE   JCPU   PCPU  WHAT

tchrist  tty1                       5:16pm 36days 24:43   0.03s  xinit

tchrist  tty2                       5:19pm  6days  0.43s  0.43s  -tcsh

tchrist  ttyp0    chthon            7:58am  3days 23.44s  0.44s  -tcsh

gnat     ttyS4    coprolith         2:01pm 13:36m  0.30s  0.30s  -tcsh
#-----------------------------
#% w | minigrep '^(?!.*ttyp).*tchrist'
#-----------------------------
m{
    ^                       # anchored to the start
    (?!                     # zero-width look-ahead assertion
        .*                  # any amount of anything (faster than .*?)
        ttyp                # the string you don't want to find
    )                       # end look-ahead negation; rewind to start
    .*                      # any amount of anything (faster than .*?)
    tchrist                 # now try to find Tom
}x
#-----------------------------
#% w | grep tchrist | grep -v ttyp
#-----------------------------
#% grep -i 'pattern' files
#% minigrep '(?i)pattern' files
#-----------------------------

Matching Multiple-Byte Characters

#-----------------------------
my $eucjp = q{                 # EUC-JP encoding subcomponents:
    [\x00-\x7F]                # ASCII/JIS-Roman (one-byte/character)
  | \x8E[\xA0-\xDF]            # half-width katakana (two bytes/char)
  | \x8F[\xA1-\xFE][\xA1-\xFE] # JIS X 0212-1990 (three bytes/char)
  | [\xA1-\xFE][\xA1-\xFE]     # JIS X 0208:1997 (two bytes/char)
};
#-----------------------------
/^ (?: $eucjp )*?  \xC5\xEC\xB5\xFE/ox # Trying to find Tokyo
#-----------------------------
/^ (  (?:eucjp)*? ) $Tokyo/$1$Osaka/ox
#-----------------------------
/\G (  (?:eucjp)*? ) $Tokyo/$1$Osaka/gox
#-----------------------------
@chars = /$eucjp/gox; # One character per list element
#-----------------------------
while (<>) {
  my @chars = /$eucjp/gox; # One character per list element
  for my $char (@chars) {
    if (length($char) == 1) {
      # Do something interesting with this one-byte character
    } else {
      # Do something interesting with this multiple-byte character
    }
  }
  my $line = join("",@chars); # Glue list back together
  print $line;
}
#-----------------------------
$is_eucjp = m/^(?:$eucjp)*$/xo;
#-----------------------------
$is_eucjp = m/^(?:$eucjp)*$/xo;
$is_sjis  = m/^(?:$sjis)*$/xo;
#-----------------------------
while (<>) {
  my @chars = /$eucjp/gox; # One character per list element
  for my $euc (@chars) {
    my $uni = $euc2uni{$char};
    if (defined $uni) {
        $euc = $uni;
    } else {
        ## deal with unknown EUC->Unicode mapping here.
    }
  }
  my $line = join("",@chars);
  print $line;
}
#-----------------------------

Matching a Valid Mail Address

#-----------------------------
1 while $addr =~ s/\([^()]*\)//g;
#-----------------------------
Dear someuser@host.com,

Please confirm the mail address you gave us Wed May  6 09:38:41
MDT 1998 by replying to this message.  Include the string
"Rumpelstiltskin" in that reply, but spelled in reverse; that is,
start with "Nik...".  Once this is done, your confirmed address will
be entered into our records.
#-----------------------------

Matching Abbreviations

#-----------------------------
chomp($answer = <>);
if    ("SEND"  =~ /^\Q$answer/i) { print "Action is send\n"  }
elsif ("STOP"  =~ /^\Q$answer/i) { print "Action is stop\n"  }
elsif ("ABORT" =~ /^\Q$answer/i) { print "Action is abort\n" }
elsif ("LIST"  =~ /^\Q$answer/i) { print "Action is list\n"  }
elsif ("EDIT"  =~ /^\Q$answer/i) { print "Action is edit\n"  }
#-----------------------------
use Text::Abbrev;
$href = abbrev qw(send abort list edit);
for (print "Action: "; <>; print "Action: ") {
    chomp;
    my $action = $href->{ lc($_) };
    print "Action is $action\n";
}
#-----------------------------
$name = 'send';
&$name();
#-----------------------------
# assumes that &invoke_editor, &deliver_message,
# $file and $PAGER are defined somewhere else.
use Text::Abbrev;
my($href, %actions, $errors);
%actions = (
    "edit"  => \&invoke_editor,
    "send"  => \&deliver_message,
    "list"  => sub { system($PAGER, $file) },
    "abort" => sub {
                    print "See ya!\n";
                    exit;
               },
    ""      => sub {
                    print "Unknown command: $cmd\n";
                    $errors++;
               },
);

$href = abbrev(keys %actions);

local $_;
for (print "Action: "; <>; print "Action: ") {
    s/^\s+//;       # trim leading  white space
    s/\s+$//;       # trim trailing white space
    next unless $_;
    $actions->{ $href->{ lc($_) } }->();
}
#-----------------------------
$abbreviation = lc($_);
$expansion    = $href->{$abbreviation};
$coderef      = $actions->{$expansion};
&$coderef();
#-----------------------------

Program: urlify

#-----------------------------
#% gunzip -c ~/mail/archive.gz | urlify > archive.urlified
#-----------------------------
#% urlify ~/mail/*.inbox > ~/allmail.urlified
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# urlify - wrap HTML links around URL-like constructs

$urls = '(http|telnet|gopher|file|wais|ftp)';
$ltrs = '\w';
$gunk = '/#~:.?+=&%@!\-';
$punc = '.:?\-';
$any  = "${ltrs}${gunk}${punc}";

while (<>) {
    s{
      \b                    # start at word boundary
      (                     # begin $1  {
       $urls     :          # need resource and a colon
       [$any] +?            # followed by on or more
                            #  of any valid character, but
                            #  be conservative and take only
                            #  what you need to....
      )                     # end   $1  }
      (?=                   # look-ahead non-consumptive assertion
       [$punc]*             # either 0 or more punctuation
       [^$any]              #   followed by a non-url char
       |                    # or else
       $                    #   then end of the string
      )
     }{<A HREF="$1">$1</A>}igox;
    print;
}

#-----------------------------

Program: tcgrep

#-----------------------------
#% tcgrep -ril '^From: .*kate' ~/mail
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# tcgrep: tom christiansen's rewrite of grep
# v1.0: Thu Sep 30 16:24:43 MDT 1993
# v1.1: Fri Oct  1 08:33:43 MDT 1993
# v1.2: Fri Jul 26 13:37:02 CDT 1996
# v1.3: Sat Aug 30 14:21:47 CDT 1997
# v1.4: Mon May 18 16:17:48 EDT 1998

use strict;
                                  # globals
use vars qw($Me $Errors $Grand_Total $Mult %Compress $Matches);

my ($matcher, $opt);              # matcher - anon. sub to check for matches
                                  # opt - ref to hash w/ command line options

init();                           # initialize globals

($opt, $matcher) = parse_args();  # get command line options and patterns

matchfile($opt, $matcher, @ARGV); # process files

exit(2) if $Errors;
exit(0) if $Grand_Total;
exit(1);

###################################

sub init {
    ($Me = $0) =~ s!.*/!!;        # get basename of program, "tcgrep"
    $Errors = $Grand_Total = 0;   # initialize global counters
    $Mult = "";                   # flag for multiple files in @ARGV
    $| = 1;                       # autoflush output

    %Compress = (                 # file extensions and program names
        z  => 'gzcat',            # for uncompressing
        gz => 'gzcat',
        Z  => 'zcat',
    );
}

###################################

sub usage {
        die <<EOF
usage: $Me [flags] [files]

Standard grep options:
        i   case insensitive
        n   number lines
        c   give count of lines matching
        C   ditto, but >1 match per line possible
        w   word boundaries only
        s   silent mode
        x   exact matches only
        v   invert search sense (lines that DON'T match)
        h   hide filenames
        e   expression (for exprs beginning with -)
        f   file with expressions
        l   list filenames matching

Specials:
        1   1 match per file
        H   highlight matches
        u   underline matches
        r   recursive on directories or dot if none
        t   process directories in 'ls -t' order
        p   paragraph mode (default: line mode)
        P   ditto, but specify separator, e.g. -P '%%\\n'
        a   all files, not just plain text files
        q   quiet about failed file and dir opens
        T   trace files as opened

May use a TCGREP environment variable to set default options.
EOF
}

###################################

sub parse_args {
    use Getopt::Std;

    my ($optstring, $zeros, $nulls, %opt, $pattern, @patterns, $match_code);
    my ($SO, $SE);

    if ($_ = $ENV{TCGREP}) {      # get envariable TCGREP
        s/^([^\-])/-$1/;          # add leading - if missing
        unshift(@ARGV, $_);       # add TCGREP opt string to @ARGV
    }

    $optstring = "incCwsxvhe:f:l1HurtpP:aqT";

    $zeros = 'inCwxvhelut';       # options to init to 0 (prevent warnings)
    $nulls = 'pP';                # options to init to "" (prevent warnings)

    @opt{ split //, $zeros } = ( 0 )  x length($zeros);
    @opt{ split //, $nulls } = ( '' ) x length($nulls);

    getopts($optstring, \%opt)              or usage();

    if ($opt{f}) {                # -f patfile
        open(PATFILE, $opt{f})          or die qq($Me: Can't open '$opt{f}': $!);

                                  # make sure each pattern in file is valid
        while ( defined($pattern = <PATFILE>) ) {
            chomp $pattern;
            eval { 'foo' =~ /$pattern/, 1 } or
                die "$Me: $opt{f}:$.: bad pattern: $@";
            push @patterns, $pattern;
        }
        close PATFILE;
    }
    else {                        # make sure pattern is valid
        $pattern = $opt{e} || shift(@ARGV) || usage();
        eval { 'foo' =~ /$pattern/, 1 } or
            die "$Me: bad pattern: $@";
        @patterns = ($pattern);
    }

    if ($opt{H} || $opt{u}) {     # highlight or underline
        my $term = $ENV{TERM} || 'vt100';
        my $terminal;

        eval {                    # try to look up escapes for stand-out
            require POSIX;        # or underline via Term::Cap
            use Term::Cap;

            my $termios = POSIX::Termios->new();
            $termios->getattr;
            my $ospeed = $termios->getospeed;

            $terminal = Tgetent Term::Cap { TERM=>undef, OSPEED=>$ospeed }
        };

        unless ($@) {             # if successful, get escapes for either
            local $^W = 0;        # stand-out (-H) or underlined (-u)
            ($SO, $SE) = $opt{H}
                ? ($terminal->Tputs('so'), $terminal->Tputs('se'))
                : ($terminal->Tputs('us'), $terminal->Tputs('ue'));
        }
        else {                    # if use of Term::Cap fails,
            ($SO, $SE) = $opt{H}  # use tput command to get escapes
                ? (`tput -T $term smso`, `tput -T $term rmso`)
                : (`tput -T $term smul`, `tput -T $term rmul`)
        }
    }

    if ($opt{i}) {
        @patterns = map {"(?i)$_"} @patterns;
    }

    if ($opt{p} || $opt{P}) {
        @patterns = map {"(?m)$_"} @patterns;
    }

    $opt{p}   && ($/ = '');
    $opt{P}   && ($/ = eval(qq("$opt{P}")));     # for -P '%%\n'
    $opt{w}   && (@patterns = map {'\b' . $_ . '\b'} @patterns);
    $opt{'x'} && (@patterns = map {"^$_\$"} @patterns);
    if (@ARGV) {
        $Mult = 1 if ($opt{r} || (@ARGV > 1) || -d $ARGV[0]) && !$opt{h};
    }
    $opt{1}   += $opt{l};                   # that's a one and an ell
    $opt{H}   += $opt{u};
    $opt{c}   += $opt{C};
    $opt{'s'} += $opt{c};
    $opt{1}   += $opt{'s'} && !$opt{c};     # that's a one

    @ARGV = ($opt{r} ? '.' : '-') unless @ARGV;
    $opt{r} = 1 if !$opt{r} && grep(-d, @ARGV) == @ARGV;

    $match_code  = '';
    $match_code .= 'study;' if @patterns > 5; # might speed things up a bit

    foreach (@patterns) { s(/)(\\/)g }

    if ($opt{H}) {
        foreach $pattern (@patterns) {
            $match_code .= "\$Matches += s/($pattern)/${SO}\$1${SE}/g;";
        }
    }
    elsif ($opt{v}) {
        foreach $pattern (@patterns) {
            $match_code .= "\$Matches += !/$pattern/;";
        }
    }
    elsif ($opt{C}) {
        foreach $pattern (@patterns) {
            $match_code .= "\$Matches++ while /$pattern/g;";
        }
    }
    else {
        foreach $pattern (@patterns) {
            $match_code .= "\$Matches++ if /$pattern/;";
        }
    }

    $matcher = eval "sub { $match_code }";
    die if $@;

    return (\%opt, $matcher);
}

###################################

sub matchfile {
    $opt = shift;                 # reference to option hash
    $matcher = shift;             # reference to matching sub

    my ($file, @list, $total, $name);
    local($_);
    $total = 0;

FILE: while (defined ($file = shift(@_))) {

        if (-d $file) {
            if (-l $file && @ARGV != 1) {
                warn "$Me: \"$file\" is a symlink to a directory\n"
                    if $opt->{T};
                next FILE;
            }
            if (!$opt->{r}) {
                warn "$Me: \"$file\" is a directory, but no -r given\n"
                    if $opt->{T};
                next FILE;
            }
            unless (opendir(DIR, $file)) {
                unless ($opt->{'q'}) {
                    warn "$Me: can't opendir $file: $!\n";
                    $Errors++;
                }
                next FILE;
            }
            @list = ();
            for (readdir(DIR)) {
                push(@list, "$file/$_") unless /^\.{1,2}$/;
            }
            closedir(DIR);
            if ($opt->{t}) {
                my (@dates);
                for (@list) { push(@dates, -M) }
                @list = @list[sort { $dates[$a] <=> $dates[$b] } 0..$#dates];
            }
            else {
                @list = sort @list;
            }
            matchfile($opt, $matcher, @list);    # process files
            next FILE;
        }

        if ($file eq '-') {
            warn "$Me: reading from stdin\n" if -t STDIN && !$opt->{'q'};
            $name = '<STDIN>';
        }
        else {
            $name = $file;
            unless (-e $file) {
                warn qq($Me: file "$file" does not exist\n) unless $opt->{'q'};
                $Errors++;
                next FILE;
            }
            unless (-f $file || $opt->{a}) {
                warn qq($Me: skipping non-plain file "$file"\n) if $opt->{T};
                next FILE;
            }

            my ($ext) = $file =~ /\.([^.]+)$/;
            if (defined $ext && exists $Compress{$ext}) {
                $file = "$Compress{$ext} <$file |";
            }
            elsif (! (-T $file  || $opt->{a})) {
                warn qq($Me: skipping binary file "$file"\n) if $opt->{T};
                next FILE;
            }
        }

        warn "$Me: checking $file\n" if $opt->{T};

        unless (open(FILE, $file)) {
            unless ($opt->{'q'}) {
                warn "$Me: $file: $!\n";
                $Errors++;
            }
            next FILE;
        }

        $total = 0;

        $Matches = 0;

LINE:  while (<FILE>) {
            $Matches = 0;
    
            ##############
            &{$matcher}();        # do it! (check for matches)
            ##############

            next LINE unless $Matches;

            $total += $Matches;

            if ($opt->{p} || $opt->{P}) {
                s/\n{2,}$/\n/ if $opt->{p};
                chomp         if $opt->{P};
            }

            print("$name\n"), next FILE if $opt->{l};

            $opt->{'s'} || print $Mult && "$name:",
                $opt->{n} ? "$.:" : "",
                $_,
                ($opt->{p} || $opt->{P}) && ('-' x 20) . "\n";

            next FILE if $opt->{1};                 # that's a one
        }
    }
    continue {
        print $Mult && "$name:", $total, "\n" if $opt->{c};
    }
    $Grand_Total += $total;
}

#-----------------------------

Regular Expression Grabbag

#-----------------------------
m/^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$/i
#-----------------------------
s/(\S+)(\s+)(\S+)/$3$2$1/
#-----------------------------
m/(\w+)\s*=\s*(.*)\s*$/             # keyword is $1, value is $2
#-----------------------------
m/.{80,}/
#-----------------------------
m|(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)|
#-----------------------------
s(/usr/bin)(/usr/local/bin)g
#-----------------------------
s/%([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge
#-----------------------------
s{
    /\*                    # Match the opening delimiter
    .*?                    # Match a minimal number of characters
    \*/                    # Match the closing delimiter
} []gsx;
#-----------------------------
s/^\s+//;
s/\s+$//;
#-----------------------------
s/\\n/\n/g;
#-----------------------------
s/^.*:://
#-----------------------------
m/^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\.
   ([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$/;
#-----------------------------
s(^.*/)()
#-----------------------------
$cols = ( ($ENV{TERMCAP} || " ") =~ m/:co#(\d+):/ ) ? $1 : 80;
#-----------------------------
($name = " $0 @ARGV") =~ s, /\S+/, ,g;
#-----------------------------
die "This isn't Linux" unless $^O =~ m/linux/i;
#-----------------------------
s/\n\s+/ /g
#-----------------------------
@nums = m/(\d+\.?\d*|\.\d+)/g;
#-----------------------------
@capwords = m/(\b[^\Wa-z0-9_]+\b)/g;
#-----------------------------
@lowords = m/(\b[^\WA-Z0-9_]+\b)/g;
#-----------------------------
@icwords = m/(\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b)/;
#-----------------------------
@links = m/<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>/sig;   #"'
#-----------------------------
($initial) = m/^\S+\s+(\S)\S*\s+\S/ ? $1 : "";
#-----------------------------
s/"([^"]*)"/``$1''/g   #"
#-----------------------------
{ local $/ = "";
  while (<>) {
    s/\n/ /g;
    s/ {3,}/  /g;
    push @sentences, m/(\S.*?[!?.])(?=  |\Z)/g;
  }
}
#-----------------------------
m/(\d{4})-(\d\d)-(\d\d)/            # YYYY in $1, MM in $2, DD in $3
#-----------------------------
m/ ^
      (?:
       1 \s (?: \d\d\d \s)?            # 1, or 1 and area code
       |                               # ... or ...
       \(\d\d\d\) \s                   # area code with parens
       |                               # ... or ...
       (?: \+\d\d?\d? \s)?             # optional +country code
       \d\d\d ([\s\-])                 # and area code
      )
      \d\d\d (\s|\1)                   # prefix (and area code separator)
      \d\d\d\d                         # exchange
        $
 /x
#-----------------------------
m/\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/i
#-----------------------------
push(@lines, $1)
    while ($input =~ s/^([^\012\015]*)(\012\015?|\015\012?)//);
#-----------------------------