PLEAC-Python
Prev		Next

6. Pattern Matching

Introduction

# Note: regexes are used less often in Python than in Perl as tasks are often
# covered by string methods, or specialised objects, modules, or packages.

import re                   # "re" is the regular expression module.
re.search("sheep",meadow)   # returns a MatchObject if meadow contains "sheep", else None.
if not re.search("sheep",meadow):
    print "no sheep on this meadow only a fat python."
# Plain substring replacement needs no regular expression; use str.replace().
meadow = meadow.replace("old","new")   # replace "old" with "new" and assign result.
#-----------------------------
re.search("ovine",meadow)

meadow = """Fine bovines demand fine toreadors.
Muskoxen are polar ovibovine species.
Grooviness went out of fashion decades ago."""

meadow = "Ovines are found typically in ovaries."

if re.search(r"\bovines\b",meadow,re.I) : print "Here be sheep!"
#-----------------------------
# The tricky bit
mystr = "good food"
re.sub("o*","e",mystr,1) # gives 'egood food'

echo ababacaca | python -c "import sys,re; print re.search('(a|ba|b)+(a|ac)+',sys.stdin.read()).group()"
#-----------------------------
# pattern matching modifiers
# assume perl code iterates over some file
import re, fileinput
# Report the first run of digits on every input line that has one.
# fileinput.input() iterates over the files named on the command line
# (or stdin when there are none).
for ln in fileinput.input():
    fnd = re.findall(r"(\d+)", ln)    # raw string keeps "\d" a regex escape
    if fnd:
        print("Found number %s" % fnd[0])
# ----------------------------
digits = "123456789"
# findall() consumes what it matches, so successive matches never overlap.
nonlap = re.findall(r"(\d\d\d)", digits)
# A lookahead is zero-width: nothing is consumed, the scan advances one
# character at a time, and the group inside captures every overlapping triple.
yeslap = re.findall(r"(?=(\d\d\d))", digits)
print("Non-overlapping: " + ",".join(nonlap))
print("Overlapping    : " + ",".join(yeslap))
# ----------------------------
mystr = "And little lambs eat ivy"
fnd = re.search("(l[^s]*s)", mystr)
print "(%s) (%s) (%s)" % (mystr[:fnd.start()], fnd.group(), mystr[fnd.end():])
# (And ) (little lambs) ( eat ivy)

Copying and Substituting Simultaneously

import re
dst = re.sub("this","that",src)
#-----------------------------
# strip to basename
basename = re.sub(".*/(?=[^/]+)","",progname)

# Make All Words Title-Cased
# DON'T DO THIS - use str.title() instead
def cap(mo):
    """Return the whole text matched by *mo*, capitalized via str.capitalize()."""
    matched_text = mo.group()
    return matched_text.capitalize()
re.sub("(?P<n>\w+)",cap,"make all words title-cased")

# /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1
manpage = "/usr/man/man3/foo.1"
catpage  = re.sub("man(?=\d)","cat",manpage)
#-----------------------------
bindirs = "/usr/bin /bin /usr/local/bin".split()
libdirs = [d.replace("bin", "lib") for d in bindirs]

print " ".join(libdirs)
#=> /usr/lib /lib /usr/local/lib
#-----------------------------
# strings are never modified in place.
#-----------------------------

Matching Letters

##---------------------------

# DON'T DO THIS.  use line[:-1].isalpha() [this probably goes for the
#    remainder of this section too!]
import re
if re.match("^[A-Za-z]+$",line):
    print "pure alphabetic"
##---------------------------
if re.match(r"^[^\W\d_]+$", line, re.LOCALE):
    print "pure alphabetic"
##---------------------------
import re
import locale

try:
    locale.setlocale(locale.LC_ALL, 'fr_CA.ISO8859-1')
except:
    print "couldn't set locale to French Cnadian"
    raise SystemExit

DATA="""
silly
façade
coöperate
niño
Renée
Molière 
hæmoglobin
naïve
tschüß
random!stuff#here
"""

for ln in DATA.split():
    ln = ln.rstrip()
    if re.match(r"^[^\W\d_]+$",ln,re.LOCALE):
        print "%s: alphabetic" % (ln)
    else:
        print "%s: line noise" % (ln)
# although i dont think "coöperate" should be in canadian
##---------------------------

Matching Words

# Matching Words
"\S+"          # as many non-whitespace bytes as possible
"[A-Za-z'-]+"  # as many letters, apostrophes, and hyphens

# string split is similar to splitting on "\s+"
"A text   with some\tseparator".split()

"\b*([A-Za-z]+)\b*"   # word boundaries 
"\s*([A-Za-z]+)\s*"   # might work too as on letters are allowed.

re.search("\Bis\B","this thistle") # matches on thistle not on this
re.search("\Bis\B","vis-a-vis")    # does not match

Commenting Regular Expressions

#-----------------------------
#!/usr/bin/python
# resname - change all "foo.bar.com" style names in the input stream
# into "foo.bar.com [204.148.40.9]" (or whatever) instead

import socket               # load inet_addr
import fileinput
import re

match = re.compile("""(?P<hostname>  # capture hostname
                         (?:         # these parens for grouping only
                            [\w-]+   # hostname component
                            \.       # ant the domain dot
                         ) +         # now repeat that whole thing a bunch of times
                         [A-Za-z]    # next must be a letter
                         [\w-] +     # now trailing domain part
                      )              # end of hostname capture
                   """,re.VERBOSE)   # for nice formatting

def repl(match_obj):
    """Return the matched host name followed by its address in brackets.

    *match_obj* must provide a group named "hostname".  When the name
    cannot be resolved (socket.gaierror), "???" stands in for the address.
    """
    host = match_obj.group("hostname")
    try:
        return "%s [%s]" % (host, socket.gethostbyname(host))
    except socket.gaierror:
        return "%s [%s]" % (host, "???")

for ln in fileinput.input():
    print match.sub(repl, ln)
#-----------------------------
re.sub("""(?x)     # nicer formatting
          \#       #   a pound sign
          (\w+)    #   the variable name
          \#       #   another pound sign
          """,
          lambda m: eval(m.group(1)),  # replace with the value of the global variable
          line
      )
##-----------------------------
re.sub("""(?x)     # nicer formatting
          \#       #   a pound sign
          (\w+)    #   the variable name
          \#       #   another pound sign
          """,
          lambda m: eval(eval(m.group(1))),  # replace with the value of *any* variable
          line
      )
##-----------------------------

Finding the Nth Occurrence of a Match

import re
pond = "one fish two fish red fish blue fish"
fishes = re.findall(r"(?i)(\w+)\s+fish\b",pond)
if len(fishes)>2:
    print "The third fish is a %s one." % (fishes[2])
##-----------------------------
re.findall(r"(?i)(?:\w+\s+fish\s+){2}(\w+)\s+fish",pond)
##-----------------------------
count = 0
for match_object in re.finditer(r"PAT", mystr):
    count += 1   # or whatever you want to do here

# "progressive" matching might be better if one wants match 5 from 50.
# to count use
count = len(re.findall(r"PAT",mystr))
count = len(re.findall(r"aba","abaababa"))

# "count" overlapping matches
count = len(re.findall(r"(?=aba)","abaababa"))

# FASTEST non-overlapping might be str.count
"abaababa".count("aba")
##-----------------------------
pond = "one fish two fish red fish blue fish"
colors = re.findall(r"(?i)(\w+)\s+fish\b",pond)   # get all matches
color = colors[2]                                 # then the one we want

# or without a temporary list
color = re.findall(r"(?i)(\w+)\s+fish\b",pond)[2] # just grab element 3

print "The third fish in the pond is %s." % (color)
##-----------------------------
import re

pond = "one fish two fish red fish blue fish"
matches = re.findall(r"(\w+)\s+fish\b",pond)
evens = [fish for (i, fish) in enumerate(matches) if i%2]
print "Even numbered fish are %s." % (" ".join(evens))
##-----------------------------
count = 0    # matches seen so far; updated by four_is_sushi()
def four_is_sushi(match_obj):
    """Substitution callback: rewrite only the fourth match it is called for.

    Every call bumps the module-level ``count``.  On the fourth call the
    first group is replaced by "sushi" (group 2 is kept); on every other
    call the groups are joined back together unchanged.
    """
    global count
    count = count + 1
    is_fourth = (count == 4)
    if not is_fourth:
        return "".join(match_obj.groups())
    return "sushi%s" % match_obj.group(2)

re.sub(r"""(?x)               # VERBOSE
           \b                 # makes next \w more efficient
           ( \w+ )            # this is what we'll be changing
           (
             \s+ fish \b
           )""",
           four_is_sushi,
           pond)
# one fish two fish red fish sushi fish
##-----------------------------
# greedily
last_fish = re.findall(r"(?i).*\b(\w+)\s+fish\b",pond)
##-----------------------------
pond = "One fish two fish red fish blue fish swim here"
color = re.findall(r"(?i)\b(\w+)\s+fish\b",pond)[-1]
print "Last fish is "+color+"."
# FASTER using string.
lastfish = pond.rfind("fish")
color = pond[:lastfish].split()[-1]
##-----------------------------
r"""(?x)
    A             # find some pattern A
    (?!           # mustn't be able to find
      .*          # something
      A           # and A
    )
    $             # through the end of string
 """

pond = "One fish two fish red fish blue fish swim here"
fnd = re.findall(r"""(?xis)                # VERBOSE, CASEINSENSITIVE, DOTALL
                  \b ( \w+ ) \s+ fish \b
                  (?! .* \b fish \b )""",
                  pond)
if len(fnd):
    print "Last fish is %s." % (fnd[0])
else:
    print "Failed!"

Matching Multiple Lines

# Matching Multiple Lines
#
#!/usr/bin/python
# killtags - very bad html tag killer
import re
import sys

text = open(sys.argv[1]).read()        # read the whole file
text = re.sub("(?ms)<.*?>","",text)    # strip tags (terrible
print text
## ----------------------------
#!/usr/bin/python
# headerfy: change certain chapter headers to html
import sys, re

match = re.compile(r"""(?xms)          # re.VERBOSE, re.MULTILINE, and re.DOTALL
                       \A              # start of the string
                       (?P<chapter>    # capture in g<chapter>
                         Chapter       # literal string
                         \s+           # mandatory whitespace
                         \d+           # decimal number
                         \s*           # optional whitespace
                         :             # a real colon
                         . *           # anything not a newline till end of line
                       )
                    """)
text = open(sys.argv[1]).read()        # read the whole file
for paragraph in text.split("\n"):   # split on unix end of lines
    p = match.sub("<h1>\g<chapter></h1>",paragraph)
    print p
## ----------------------------
# the one liner does not run.
# python -c 'import sys,re; for p in open(sys.argv[1]).read().split("\n\n"): print re.sub(r"(?ms)\A(Chapter\s+\d+\s*:.*)","<h1>\g0</h1>",p)'
## ----------------------------
match = re.compile(r"(?ms)^START(.*?)^END")
     # s makes . span line boundaries
     # m makes ^ match at the beginning of the string and at the beginning of each line

chunk = 0
for paragraph in open(sys.argv[1]).read().split("\n\n"):
    chunk += 1
    fnd = match.findall(paragraph)
    if fnd:
        print "chunk %d in %s has <<%s>>" % (chunk,sys.argv[1],">>,<<".join(fnd))
## ----------------------------

Reading Records with a Pattern Separator

import sys
# Read the whole file and split
chunks = open(sys.argv[1]).read().split()      # on whitespace
chunks = open(sys.argv[1]).read().split("\n")  # on line ends

# splitting on pattern
import re
pattern = r"x"
chunks = re.split(pattern, open(sys.argv[1]).read())
##-----------------------------
chunks = re.split(r"(?m)^\.(Ch|Se|Ss)$",open(sys.argv[1]).read())
print "I read %d chunks." % (len(chunks))
# without delimiters
chunks = re.split(r"(?m)^\.(?:Ch|Se|Ss)$",open(sys.argv[1]).read())

# with delimiters
chunks = re.split(r"(?m)^(\.(?:Ch|Se|Ss))$",open(sys.argv[1]).read())

# with delimiters at chunkstart
chunks = re.findall(r"""(?xms)       # multiline, dot matches lineend, allow comments
                          ((?:^\.)?  # consume the separator if present
                           .*?)      # match everything but not greedy
                          (?=        # end the match on this but dont consume it
                            (?:                  # dont put into group [1]
                               ^\.(?:Ch|Se|Ss)$  # either end on one of the roff commands
                               |\Z               # or end of text
                            )
                          )""",
                    open(sys.argv[1]).read())
# [1] if "?:" is removed the result holds tuples: ('.Ch\nchapter x','.Ch')
#     which might be more useful.

Extracting a Range of Lines

##-----------------------------
# Python doesn't have Perl's range operators.
# If you want to use only a selected line range, use enumerate
# (though note that indexing starts at zero):
for i, line in enumerate(myfile):
    if firstlinenum <= i < lastlinenum:
        dosomethingwith(line)

# Using patterned ranges is slightly trickier -
# You need to search for the first pattern then
# search for the next pattern:
import re
for line in myfile:
    if re.match(pat1, line):
        break

dosomethingwith(line)    # Only if pat1 can be on same line as pat2

for line in myfile:
    if re.match(pat2, line):
        break
    dosomethingwith(line)
##-----------------------------
# If you need to extract ranges a lot, the following generator funcs
# may be useful:
def extract_range(myfile, start, finish):
    """Yield the items of *myfile* whose zero-based index i satisfies
    start <= i < finish, and stop reading once index *finish* is reached.
    """
    for index, text in enumerate(myfile):
        if index == finish:
            # Past the requested window: no need to read further.
            return
        if index >= start and index < finish:
            yield text

for line in extract_range(open("/etc/passwd"), 3, 5):
    print line

def patterned_range(myfile, startpat, endpat=None):
    """Yield the lines of *myfile* from the first line matching *startpat*
    through the first following line matching *endpat*, both inclusive.

    startpat -- regex (string or compiled) tested with match() against each
                line until the range opens.
    endpat   -- optional regex that closes the range; with None the range
                runs to the end of *myfile*.

    The end pattern is consulted only once the range is open, so a line
    matching *endpat* before the start line does not end the scan.  A single
    line matching both patterns opens and closes the range.
    """
    start_re = re.compile(startpat)
    end_re = None
    if endpat is not None:
        end_re = re.compile(endpat)
    in_range = False
    for line in myfile:
        if not in_range and start_re.match(line):
            in_range = True
        if in_range:
            yield line
            if end_re is not None and end_re.match(line):
                break

# DO NOT DO THIS.  Use the email module instead
for line in patterned_range(msg, "^From:?", "^$"):
    pass #...

Matching Shell Globs as Regular Expressions

tests = (("list.?",r"^list\..$"),
        ("project.*",r"^project\..*$"),
        ("*old",r"^.*old$"),
        ("type*.[ch]",r"^type.*\.[ch]$"),
        ("*.*",r"^.*\..*$"),
        ("*",r"^.*$"),
        )

# The book says convert "*","?","[","]" all other characters will be quoted.
# The book uses "\Q" which escapes any characters that would otherwise be
# treated as regular expression.
# Escaping every char fails as "\s" is not "s" in a regex.

def glob2pat(globstr):
    """Translate the shell glob *globstr* into an anchored regex string.

    "*" becomes ".*", "?" becomes ".", and "[...]" classes are passed
    through.  Letters, digits and "_" are copied as-is; every other
    character is backslash-escaped so that regex metacharacters such as
    "+", "(" or "$" match literally.  Only non-alphanumerics are escaped
    because a backslash before a letter (e.g. "\\s") changes its meaning.
    """
    wildcards = {"*": ".*", "?": "."}
    out = []
    in_class = False
    for ch in globstr:
        if in_class:
            # Inside [...] keep the text verbatim (ranges like a-z must
            # survive); only a backslash is doubled to stay literal.
            out.append("\\\\" if ch == "\\" else ch)
            if ch == "]":
                in_class = False
        elif ch == "[":
            in_class = True
            out.append(ch)
        elif ch in wildcards:
            out.append(wildcards[ch])
        elif ch.isalnum() or ch == "_":
            out.append(ch)
        else:
            out.append("\\" + ch)
    return "^" + "".join(out) + "$"

for globstr, patstr in tests:
    g2p = glob2pat(globstr)
    if g2p != patstr:
        print globstr, "failed! Should be", patstr, "but was", g2p

Speeding Up Interpolated Matches


# download the following standalone program
#!/usr/bin/python
# popgrep1 - grep for abbreviations of places that say "pop"
# version 1: slow but obvious way
import fileinput
import re
popstates = ["CO","ON","MI","WI","MN"]
# Print every input line that mentions one of the state abbreviations as a
# whole word.  The pattern is rebuilt for each state on each line, which is
# what makes this version the slow one.
for line in fileinput.input():
    for state in popstates:
        if re.search(r"\b"+state+r"\b",line):
            print(line)
            break    # one hit is enough; don't repeat the line per state



#-----------------------------
# download the following standalone program
#!/usr/bin/python
# popgrep2 - grep for abbreviations of places that say "pop"
# version 2: compile the patterns
import fileinput
import re
popstates = ["CO","ON","MI","WI","MN"]
# Compile one whole-word pattern per state up front so the loop below only
# has to run searches.
state_re = [re.compile(r"\b"+state+r"\b") for state in popstates]
for line in fileinput.input():
    for state in state_re:
        if state.search(line):
            print(line)
            break    # one hit is enough; don't repeat the line per state


#-----------------------------
# download the following standalone program
#!/usr/bin/python
# popgrep3 - grep for abbreviations of places that say "pop"
# version 3: compile a single pattern
import fileinput
import re
popstates = ["CO","ON","MI","WI","MN"]
state_re = re.compile(r"\b(?:"+"|".join(popstates)+r")\b")
for line in fileinput.input():
    if state_re.search(line):
        print line


#-----------------------------
# download the following standalone program
#!/usr/bin/python
# grepauth - print lines that mention both Tom and Nat
import fileinput
import re

def build_match_any(words):
    """Compile a pattern that matches any one of *words* (joined with "|")."""
    alternation = "|".join(words)
    return re.compile(alternation)
def uniq(arr):
    """Return the distinct items of *arr* as the keys of a dict."""
    return dict.fromkeys(arr).keys()
def build_match_all(words):
    """Return a predicate telling whether a line contains all of *words*.

    The predicate collects every hit of the "|"-joined alternation and
    checks that the number of distinct hits reaches len(words).
    """
    any_word = re.compile("|".join(words))

    def contains_all(line):
        distinct_hits = uniq(any_word.findall(line))
        return len(distinct_hits) >= len(words)

    return contains_all

any = build_match_any(("Tom","Nat"))
all = build_match_all(("Tom","Nat"))
for line in fileinput.input():
    if any.search(line):
        print "any:", line
    if all(line):
        print "all:", line



#-----------------------------

Testing for a Valid Pattern

# Testing for a Valid Pattern

import re
while True:
    pat = raw_input("Pattern? ")
    try:
        re.compile(pat)
    except re.error, err:
        print "INVALID PATTERN", err
        continue
    break

# ----
def is_valid_pattern(pat):
    """Return True if *pat* compiles as a regular expression, False if
    re.compile() rejects it with re.error."""
    valid = True
    try:
        re.compile(pat)
    except re.error:
        valid = False
    return valid

# ----

# download the following standalone program
#!/usr/bin/python
# paragrep - trivial paragraph grepper
#
# differs from perl version in parano.
# python version displays paragraph in current file.

import sys, os.path, re
if len(sys.argv)<=1:
        print "usage: %s pat [files]\n" % sys.argv[0]
        sys.exit(1)

pat = sys.argv[1]
try:
        pat_re = re.compile(pat)
except:
        print "%s: bad pattern %s: %s" % (sys.argv[1], pat, sys.exc_info()[1])
        sys.exit(1)
for filename in filter(os.path.isfile,sys.argv[2:]):
        parano = 0
        for para in open(filename).read().split("\n\n"):
                parano += 1
                if pat_re.search(para):
                        print filename, parano, para, "\n"
                        


# ----

# As we don't evaluate patterns, the attack::
#
#   $pat = "You lose @{[ system('rm -rf *') ]} big here";
#
# does not work.

Honoring Locale Settings in Regular Expressions


# download the following standalone program
#!/usr/bin/python
# localeg - demonstrates locale effects
#
# re must be told to respect locale either in the regexp
# "(?L)" or as flag to the call (python 2.4) "re.LOCALE".

import sys
import re, string
from locale import LC_CTYPE, setlocale, getlocale

name = "andreas k\xF6nig"
locale = {"German" : "de_DE.ISO_8859-1", "English" : "en_US"}
# us-ascii is not supported on linux py23
# none works in activestate py24

try:
    setlocale(LC_CTYPE, locale["English"])
except:
    print "Invalid locale %s" % locale["English"]
    sys.exit(1)
english_names = []
for n in re.findall(r"(?L)\b(\w+)\b",name):
    english_names.append(n.capitalize())

try:
    setlocale(LC_CTYPE, locale["German"])
except:
    print "Invalid locale %s" % locale["German"]
    sys.exit(1)
german_names = map(string.capitalize, re.findall(r"(?L)\b(\w+)\b",name))

print "English names: %s" % " ".join(english_names)
print "German names: %s" % " ".join(german_names)

Approximate Matching

##-----------------------------
import difflib
matchlist = ["ape", "apple", "lapel", "peach", "puppy"]
print difflib.get_close_matches("appel", matchlist)
#=> ['lapel', 'apple', 'ape']
##-----------------------------
# Also see:
#     http://www.personal.psu.edu/staff/i/u/iua1/python/apse/
#     http://www.bio.cam.ac.uk/~mw263/pyagrep.html

Matching from Where the Last Pattern Left Off

##-----------------------------
# To search (potentially) repeatedly for a pattern, use re.finditer():

# DO NOT DO THIS.  Split on commas and convert elems using int()
mystr = "3,4,5,9,120"
for match in re.finditer("(\d+)", mystr):
    n = match.group(0)
    if n == "9":
        break # '120' will never be matched
    print "Found number", n

# matches know their end position
mystr = "The year 1752 lost 10 days on the 3rd of September"
x = re.finditer("(\d+)", mystr)
for match in x:
    n = match.group(0)
    print "Found number", n

tail = re.match("(\S+)", mystr[match.end():])
if tail:
    print "Found %s after the last number."%tail.group(0)

Greedy and Non-Greedy Matches

# Python's regexes are based on Perl's, so it has the non-greedy 
# '*?', '+?', and '??' versions of '*', '+', and '?'.
# DO NOT DO THIS. import htmllib, formatter, etc, instead
#-----------------------------
# greedy pattern
txt = re.sub("<.*>", "", txt) # try to remove tags, very badly

# non-greedy pattern
txt = re.sub("<.*?>", "", txt) # try to remove tags, still rather badly
#-----------------------------
txt = "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>"

# Non-greedy, but the match still begins at the first "<b><i>", so it runs
# on until the first "</i></b>" and swallows too much.
print(re.findall("<b><i>(.*?)</i></b>", txt))
##-----------------------------
# General shape: text between BEGIN and END that contains no further BEGIN.
# (No surrounding slashes: in Python they would be literal characters.)
print(re.findall("BEGIN((?:(?!BEGIN).)*)END", txt))
##-----------------------------
print(re.findall("<b><i>((?:(?!<b>|<i>).)*)</i></b>", txt))
##-----------------------------
print(re.findall("<b><i>((?:(?!<[ib]>).)*)</i></b>", txt))
##-----------------------------
# re.findall(pattern, string, flags): the flags go last.
print(re.findall("""
    <b><i> 
    [^<]*  # stuff not possibly bad, and not possibly the end.
    (?:    # at this point, we can have '<' if not part of something bad
     (?!  </?[ib]>  )   # what we can't have
     <                  # okay, so match the '<'
     [^<]*              # and continue with more safe stuff
    ) *
    </i></b>
    """, txt, re.VERBOSE))
##-----------------------------

Detecting Duplicate Words

##-----------------------------
text = """
This is a test
test of the duplicate word finder.
"""
words = text.split()
for curr, next in zip(words[:-1], words[1:]):
    if curr.upper() == next.upper():
            print "Duplicate word '%s' found." % curr

# DON'T DO THIS
import re
pat = r"""
      \b            # start at a word boundary (begin letters)
      (\S+)         # find chunk of non-whitespace
      \b            # until another word boundary (end letters)
      (
          \s+       # separated by some whitespace
          \1        # and that very same chunk again
          \b        # until another word boundary
      ) +           # one or more sets of those
      """
for match in re.finditer(pat, text, flags=re.VERBOSE|re.IGNORECASE):
    print "Duplicate word '%s' found." % match.group(1)
##-----------------------------
a = 'nobody';
b = 'bodysnatcher';

text = a+" "+b
pat = r"^(\w+)(\w+) \2(\w+)$"
for match in re.finditer(pat, text):
    m1, m2, m3 = match.groups()
    print m2, "overlaps in %s-%s-%s"%(m1, m2, m3)
##-----------------------------
pat = r"^(\w+?)(\w+) \2(\w+)$"
##-----------------------------
try:
    while True:
        factor = re.match(r"^(oo+?)\1+$", n).group(1)
        n = re.sub(factor, "o", n)
        print len(factor)
except AttributeError:
    print len(n)
##-----------------------------
def diaphantine(n, x, y, z):
    """Print one solution in non-negative integers of x*a + y*b + z*c == n.

    A string of *n* "o" characters is matched against a pattern in which
    each captured run of "o"s must appear x, y and z times respectively
    (the group itself plus x-1, y-1, z-1 backreferences).  The lengths of
    the three groups are the solution; the message labels them x, y, z.
    Prints "No solutions." when the pattern cannot match.
    """
    pat = r"^(o*)\1{%s}(o*)\2{%s}(o*)\3{%s}$"%(x-1, y-1, z-1)
    text = "o"*n
    found = re.match(pat, text)
    if found is None:
        # re.match() returns None on failure; there is no exception to catch.
        print("No solutions.")
    else:
        vals = [len(v) for v in found.groups()]
        print("One solution is: x=%s, y=%s, z=%s."%tuple(vals))
        
diaphantine(n=281, x=12, y=15, z=16)

Expressing AND, OR, and NOT in a Single Pattern

##-----------------------------
# Pass any of the following patterns to re.match(), etc
pat = "ALPHA|BETA"
pat = "^(?=.*ALPHA)(?=.*BETA)"
pat = "ALPHA.*BETA|BETA.*ALPHA"
pat = "^(?:(?!PAT).)*$"
pat = "(?=^(?:(?!BAD).)*$)GOOD"
##-----------------------------
if not re.match(pattern, text):
    something()
##-----------------------------
if re.match(pat1, text) and re.match(pat2, text):
    something()
##-----------------------------
if re.match(pat1, text) or re.match(pat2, text):
    something()
##-----------------------------
# DON'T DO THIS.
"""minigrep - trivial grep"""
import sys, re

pat = sys.argv[1]
for line in sys.stdin:
    if re.match(pat, line):
        print line[:-1]
##-----------------------------
if re.match(r"^(?=.*bell)(?=.*lab)", "labelled"):
    something()
##-----------------------------
if re.search("bell", s) and re.search("lab", s):
    something()
##-----------------------------
if re.match("""
             ^              # start of string
            (?=             # zero-width lookahead
                .*          # any amount of intervening stuff
                bell        # the desired bell string
            )               # rewind, since we were only looking
            (?=             # and do the same thing
                .*          # any amount of intervening stuff
                lab         # and the lab part
            )
            """,
            murray_hill,
            re.DOTALL | re.VERBOSE):
    print "Looks like Bell Labs might be in Murray Hill!"
##-----------------------------
if re.match(r"(?:^.*bell.*lab)|(?:^.*lab.*bell)", "labelled"):
    something()
##-----------------------------
brand = "labelled"
if re.match("""
            (?:                 # non-capturing grouper
                ^ .*?           # any amount of stuff at the front
                bell            # look for a bell
                .*?             # followed by any amount of anything
                lab             # look for a lab
            )                   # end grouper
            |                   # otherwise, try the other direction
            (?:                 # non-capturing grouper
                ^ .*?           # any amount of stuff at the front
                lab             # look for a lab
                .*?             # followed by any amount of anything
                bell            # followed by a bell
            )                   # end grouper
            """,
            brand,
            re.DOTALL | re.VERBOSE):
    print "Our brand has bell and lab separate."
##-----------------------------
x = "odlaw"
if re.match("^(?:(?!waldo).)*$", x):
   print "There's no waldo here!"
##-----------------------------
if re.match("""
            ^                   # start of string
            (?:                 # non-capturing grouper
                (?!             # look ahead negation
                    waldo       # is he ahead of us now?
                )               # is so, the negation failed
                .               # any character (cuzza /s)
            ) *                 # repeat that grouping 0 or more
            $                   # through the end of the string
            """,
            x,
            re.VERBOSE | re.DOTALL):
    print "There's no waldo here!\n";
##-----------------------------

Matching Multiple-Byte Characters

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Matching a Valid Mail Address

##-----------------------------
from email._parseaddr import AddressList

print AddressList("fred&barney@stonehenge.com").addresslist[0]

print AddressList("fred&barney@stonehenge.com (Hanna Barbara)").addresslist[0]

name, address = AddressList("Mr Fooby Blah <me@nowhere.com>").addresslist[0]
print "%s's address is '%s'"%(name, address)

Matching Abbreviations

##-----------------------------
# Assuming the strings all start with different letters, or you don't
# mind there being precedence, use the startswith string method:

def get_action(answer):
    """Return the first known action that starts with *answer*
    (compared in lower case), or None when no action does."""
    prefix = answer.lower()
    candidates = ("send", "stop", "abort", "list", "end")
    matches = [name for name in candidates if name.startswith(prefix)]
    if matches:
        return matches[0]
    return None

print "Action is %s."%get_action("L")
#=> Action is list.
##-----------------------------
#DON'T DO THIS:
import re
answer = "ab"
answer = re.escape(answer.strip())
for action in ("SEND", "STOP", "ABORT", "LIST", "EDIT"):
    if re.match(answer, action, flags=re.IGNORECASE):
        print "Action is %s."%action.lower()
##-----------------------------
import re, sys
def handle_cmd(cmd):
    """Run the action of the first command whose name starts with *cmd*.

    *cmd* is stripped and regex-escaped, then matched case-insensitively
    against the start of each command name.  Prints "Unknown command: ..."
    (showing the escaped text) when no name matches.
    """
    cmd = re.escape(cmd.strip())
    # An ordered sequence rather than a dict: the first matching name wins,
    # so precedence between commands sharing a prefix is well defined.
    # NOTE(review): invoke_editor, deliver_message, system, pager and myfile
    # are not defined in the visible source -- confirm where they come from.
    commands = (("edit", invoke_editor),
                ("send", deliver_message),
                ("list", lambda: system(pager, myfile)),
                ("abort", sys.exit),
                )
    for name, action in commands:
        if re.match(cmd, name, flags=re.IGNORECASE):
            action()
            break
    else:
        print("Unknown command: %s" % cmd)
handle_cmd("ab")

Program: urlify

##-----------------------------
# urlify - wrap HTML links around URL-like constructs
import re, sys, fileinput

def urlify_string(s):
    """Return *s* with every URL-like substring wrapped in an <A HREF=...> tag.

    A URL is one of the known schemes, a colon, and the shortest run of
    URL characters that is followed by optional punctuation and then a
    non-URL character or the end of the string.
    """
    scheme_alt = r'(http|telnet|gopher|file|wais|ftp)'
    trailing = r'.:?\-'
    url_chars = r'\w' + r'/#~:.?+=&%@!\-' + trailing
    # Keys are the placeholder names used in the template below.
    fields = {"urls": scheme_alt, "any": url_chars, "punc": trailing}

    template = r"""
      \b                    # start at word boundary
      (                     # begin \1  {
       %(urls)s  :          # need resource and a colon
       [%(any)s] +?         # followed by one or more
                            #  of any valid character, but
                            #  be conservative and take only
                            #  what you need to....
      )                     # end   \1  }
      (?=                   # look-ahead non-consumptive assertion
       [%(punc)s]*          # either 0 or more punctuation
       [^%(any)s]           #   followed by a non-url char
       |                    # or else
       $                    #   then end of the string
      )
    """
    url_re = re.compile(template % fields, re.VERBOSE | re.IGNORECASE)
    return url_re.sub(r"<A HREF=\1>\1</A>", s)

if __name__ == "__main__":
    for line in fileinput.input():
        print urlify_string(line)

Program: tcgrep

##-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@

Regular Expression Grabbag

# The majority of regexes in this section are either partially
# or completely The Wrong Thing to Do.
##-----------------------------
# DON'T DO THIS.  Use a Roman Numeral module, etc. (since
# you need one anyway to calculate values)
pat = r"^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$"
re.match(pat, "mcmlxcvii")
##-----------------------------
txt = "one two three four five"

# If the words are cleanly delimited just split and rejoin:
word1, word2, rest = txt.split(" ", 2)
print " ".join([word2, word1, rest])

# Otherwise:
frompat = r"(\S+)(\s+)(\S+)"
topat =  r"\3\2\1"
print re.sub(frompat, topat, txt)

##-----------------------------
print str.split("=")

# DON'T DO THIS
pat = r"(\w+)\s*=\s*(.*)\s*$"
print re.match(pat, "key=val").groups()
##-----------------------------
line = "such a very very very very very very very very very very very very very long line"
if len(line) > 80:
    process(line)

# DON'T DO THIS
pat = ".{80,}"
if re.match(pat, line):
    process(line)
##-----------------------------
dt = time.strptime("12/11/05 12:34:56", "%d/%m/%y %H:%M:%S")

# DON'T DO THIS
pat = r"(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)"
dt = re.match(pat, "12/11/05 12:34:56").groups()
##-----------------------------
txt = "/usr/bin/python"
print txt.replace("/usr/bin", "/usr/local/bin")
# Alternatively for file operations use os.path, shutil, etc.

# DON'T DO THIS
print re.sub("/usr/bin", "/usr/local/bin", txt)
##-----------------------------
import re

def unescape_hex(matchobj):
    """Return the character whose code is the hex number captured in group 1."""
    hex_digits = matchobj.group(1)
    code_point = int(hex_digits, 16)
    return chr(code_point)
txt = re.sub(r"%([0-9A-Fa-f][0-9A-Fa-f])", unescape_hex, txt)

# Assuming that the hex escaping is well-behaved, an alternative is:
def unescape_hex(seg):
    """Decode the two leading hex digits of *seg* into one character and
    return it followed by the rest of *seg*."""
    hex_digits, remainder = seg[:2], seg[2:]
    decoded = chr(int(hex_digits, 16))
    return decoded + remainder

segs = txt.split("%")
txt = segs[0] + "".join(unescape_hex(seg) for seg in segs[1:])
##-----------------------------
# Remove /* ... */ comments.  The flags must be passed by keyword: the fourth
# positional argument of re.sub() is ``count``, not ``flags``.  DOTALL lets
# ".*?" cross line ends so comments spanning several lines are removed too.
txt = re.sub(r"""
             /\*                    # Match the opening delimiter
             .*?                    # Match a minimal number of characters
             \*/                    # Match the closing delimiter
             """, "", txt, flags=re.VERBOSE | re.DOTALL)
##-----------------------------
txt.strip()

# DON'T DO THIS
txt = re.sub(r"^\s+", "", txt)
txt = re.sub(r"\s+$", "", txt)
##-----------------------------
txt.replace("\\n", "\n")

# DON'T DO THIS
txt = re.sub("\\n", "\n", txt)
##-----------------------------
# Drop everything up to and including the last "::" on the first line.
txt = re.sub("^.*::", "", txt)
##-----------------------------
import socket
socket.inet_aton(txt) # Will raise an error if incorrect

# DON'T DO THIS.
# One octet, 0-255.  The second digit is optional so that single-digit
# octets such as the "1" in "10.0.0.1" are accepted.
octseg =r"([01]?\d\d?|2[0-4]\d|25[0-5])"
dot = r"\."
pat = "^" + octseg + dot + octseg + dot + octseg + dot + octseg + "$"

if not re.match(pat, txt, re.VERBOSE):
   raise ValueError

# Definitely DON'T DO THIS.
pat = r"""^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\.
          ([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$"""
##-----------------------------
fname = os.path.basename(path)

# DON'T DO THIS.
fname = re.sub("^.*/", "", path)
##-----------------------------
import os
# Terminal width: take the "co#<n>" entry of $TERMCAP when there is one,
# otherwise fall back to 80 columns.
try:
    tc = os.environ["TERMCAP"]
except KeyError:
    cols = 80
else:
    # search(), not match(): the entry can sit anywhere in the string.
    found = re.search(r":co#(\d+):", tc)
    cols = int(found.group(1)) if found else 80
##-----------------------------
# (not quite equivalent to the Perl version)
name = os.path.basename(sys.argv[0])

# DON'T DO THIS.
name = re.sub("^.*/", "", sys.argv[0])
##-----------------------------
if sys.platform != "linux":
    raise SystemExit("This isn't Linux")
##-----------------------------
txt = re.sub(r"\n\s+", " ", txt)

# In many cases you could just use:
txt = txt.replace("\n", " ")
##-----------------------------
nums = re.findall(r"\d+\.?\d*|\.\d+", txt)
##-----------------------------
# If the words are clearly delimited just use:
capwords = [word for word in txt.split() if word.isupper()]

# Otherwise
capwords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.isupper()]

# (probably) DON'T DO THIS. 
capwords = re.findall(r"(\b[^\Wa-z0-9_]+\b)", txt)
##-----------------------------
# If the words are clearly delimited just use:
lowords = [word for word in txt.split() if word.islower()]

# Otherwise
lowords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.islower()]

# (probably) DON'T DO THIS. 
lowords = re.findall(r"(\b[^\WA-Z0-9_]+\b)", txt)
##-----------------------------
# If the words are clearly delimited just use:
icwords = [word for word in txt.split() if word.istitle()]

# Otherwise
# findall() yields the captured strings, which is what istitle() needs.
icwords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.istitle()]

# DON'T DO THIS. 
icwords = re.findall(r"(\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b)", txt)
##-----------------------------
# DON'T DO THIS - use HTMLParser, etc.
links = re.findall(r"""<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>""", txt)
##-----------------------------
names = txt.split()
if len(names) == 3:
    initial = names[1][0]
else:
    initial = ""

# DON'T DO THIS. 
pat = "^\S+\s+(\S)\S*\s+\S"
try:
    initial = re.match(pat, txt).group(1)
except AttributeError:
    initial = ""
##-----------------------------
# Raw replacement string: in a plain literal "\1" is the control character
# chr(1), not a reference to group 1.
txt = re.sub('"([^"]*)"', r"``\1''", txt)
##-----------------------------
sentences = [elem[0] for elem in re.findall(r"(.*?[!?.])(  |\Z)", s)]
##-----------------------------
import time
dt = time.strptime(txt, "%Y-%m-%d")

# DON'T DO THIS.
year, month, day = re.match(r"(\d{4})-(\d\d)-(\d\d)", txt).groups()
##-----------------------------
pat = r"""
      ^
      (?:
       1 \s (?: \d\d\d \s)?            # 1, or 1 and area code
       |                               # ... or ...
       \(\d\d\d\) \s                   # area code with parens
       |                               # ... or ...
       (?: \+\d\d?\d? \s)?             # optional +country code
       \d\d\d ([\s\-])                 # and area code
      )
      \d\d\d (\s|\1)                   # prefix (and area code separator)
      \d\d\d\d                         # exchange
        $
      """
re.match(pat, txt, re.VERBOSE)
##-----------------------------
re.match(r"\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b", txt, re.IGNORECASE)
##-----------------------------
for line in file(fname, "Ur"):          #Universal newlines
    process(line)

# DON'T DO THIS
lines = [re.sub(r"^([^\012\015]*)(\012\015?|\015\012?)", "", line) 
         for line in file(fname)]
##-----------------------------

Prev	Home	Next
Hashes		File Access