6. Pattern Matching

Introduction

-- for info about the various regexp libraries, see http://haskell.org/haskellwiki/Regular_expressions
--
-- [Oct 2008] Text.Regex.Posix/Text.Regex.Base provides =~ which is quite powerful
-- but it misses simple things like subRegex and splitRegex so in many cases
-- Text.Regex must be used
--
-- with Text.Regex.Posix:
import Text.Regex.Posix
import Text.Regex.Base

meadow = [ "sheeps are ovine", "Sheeps are ovine"
         , "Fine bovines demand fine toreadors."
         , "Muskoxen are polar ovibovine species."
         , "Grooviness went out of fashion decades ago." ]

t1 = filter (=~ "sheep") meadow
-- ["sheeps are ovine"]

t2 = filter (=~ "\\<ovines?") meadow
-- ["sheeps are ovine","Sheeps are ovine"]

-- below require "import Data.Bits" for .|. :
ignoreCase = makeRegexOpts (defaultCompOpt .|. compIgnoreCase) defaultExecOpt
t3 = filter (matchTest $ ignoreCase "sheep") meadow
-- ["sheeps are ovine","Sheeps are ovine"]


-- the same with Text.Regex:
import Data.Maybe
import Text.Regex

t1 = filter (isJust . matchRegex (mkRegex "sheep")) meadow
-- ["sheeps are ovine"]

t2 = filter (isJust . matchRegex (mkRegex "\\<ovines?")) meadow
-- ["sheeps are ovine","Sheeps are ovine"]

ignoreCase s = mkRegexWithOpts s True False
t3 = filter (isJust . matchRegex (ignoreCase "sheep")) meadow
-- ["sheeps are ovine","Sheeps are ovine"]

-- note that Data.ByteString.Char8 has a few interesting functions,
-- but usually regexps are better
import qualified Data.ByteString.Char8 as B

t = filter is_sheep meadow
    where is_sheep s = B.isInfixOf (B.pack "sheep") (B.pack s)
-- ["sheeps are ovine"]


s1 = subRegex (mkRegex "o+") "good food" "e"
-- ged fed 

-- the following helper functions are useful
subRegexWith re new s =
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ (subRegexWith re new after)
subRegexOnceWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ after

s2 = subRegexOnceWith (mkRegex "o+") (\_ -> "e") "good food" 
-- ged food


digits = "123456789";
l = digits =~ "..." :: [String] -- nb: in List context, calls matchAll
-- ["123","456","789"]

-- for overlapping matches, one must do it by hand:
allOverlappingMatches re s = 
    case matchRegexAll re s of
      Nothing -> []
      Just (before, matched, after, _) -> matched : allOverlappingMatches re (tail matched ++ after)

l = allOverlappingMatches (mkRegex "...") digits
-- ["123","234","345","456","567","678","789"]


string = "And little lambs eat ivy"
(before, matched, after) = string =~ "l[^s]*s" :: (String, String, String)
s = printf "(%s) (%s) (%s)" before matched after :: String
-- "(And ) (little lambs) ( eat ivy)"

Copying and Substituting Simultaneously

-- copying is the default in pure languages :)
import Text.Regex
import System.Environment
import System.FilePath.Posix

prog_ = subRegex (mkRegex ".*/") "/etc/foo" ""
-- "foo"
-- but it's much nicer to use takeFileName
prog__ = takeFileName "/etc/foo"
-- "foo"

-- as for the program basename, it is:
prog = getProgName


manpage = "/usr/man/man3/foo.1"
catpage = subRegexMatchesWith (mkRegex "man([0-9])") (\[n] -> "cat" ++ n) manpage
-- /usr/man/cat3/foo.1

subRegexMatchesWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, l) -> before ++ new l ++ after

bindirs = [ "/usr/bin", "/bin", "/usr/local/bin" ]
libdirs = map (\s -> subRegex (mkRegex "bin$") s "lib") bindirs

Matching Letters

is_pure_alphabetic = isJust . matchRegex (mkRegex "^[A-Za-z]+$")
is_pure_alphabetic_ = all isLetter -- this handles unicode (accents...)

some_words = [ "silly", "random!stuff#here", "façade", "tschÃŒÃ", "niño", "ÑÑÑÑкОй" ]
l1 = filter is_pure_alphabetic some_words -- ["silly"]
l2 = filter is_pure_alphabetic_ some_words -- all except "random!stuff#here"

Matching Words

-- "\\S+"         -- as many non-whitespace bytes as possible
-- "[A-Za-z'-]+"  -- as many letters, apostrophes, and hyphens
-- 
-- "\\<([A-Za-z]+)\\>" -- usually best
-- "\\s([A-Za-z]+)\\s" -- fails at ends or w/ punctuation

-- words is similar to splitting on "\s+"
l = words "A text   with some\tseparator\n"
-- ["A","text","with","some","separator"]
--
-- (nb: spaces at beginning or end do not change the result)

Commenting Regular Expressions

-- Commenting Regular Expressions
-- since regexps are simple strings, you can concatenate regexp pieces:
re = "#"          -- a pound sign
     ++ "(\\w+)"  -- the variable name
     ++ "#"       -- another pound sign
expand_macro vals = subAllRegexMatchesWith (mkRegex re) get_val
    where get_val [s] = fromJust (lookup s vals)
s = expand_macro [ ("foo", "42") ] "blah #foo# blah"
-- "blah 42 blah"

subAllRegexMatchesWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, l) -> before ++ new l ++ (subAllRegexMatchesWith re new after)

Finding the Nth Occurrence of a Match

fishes = "One fish two fish red fish blue fish"

-- first some very simple and useful functions:
ignoreCase s = mkRegexWithOpts s True False
allMatches re s =
    case matchRegexAll re s of
      Nothing -> []
      Just (_before, _, after, matches) -> matches : allMatches re after

-- here we go:
colors = map head $ allMatches (ignoreCase "(\\w+)\\s+fish\\>") fishes
-- ["One","two","red","blue"]
s1 = printf "The third fish is a %s one." (colors !! 2) :: String
-- The third fish is a red one.

-- another solution to skip the first 2 matches using an ad'hoc regexp:
s2 = printf "The third fish is a %s one." color :: String
    where Just (_, _, _, [_, color]) = 
              matchRegexAll (ignoreCase "(\\w+\\s+fish\\s+){2}(\\w+)\\s+fish") fishes
-- The third fish is a red one.

-- a modified subRegex to be able to handle differently the 4th match
subRegexWithEnv re f env s =
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, matches) -> before ++ matched' ++ (subRegexWithEnv re f env' after)
          where (env', matched') = f env (matched:matches)

s3 = subRegexWithEnv (ignoreCase "(\\w+)(\\s+fish\\>)") f 1 fishes
    where f n [verbatim, _, fish] = (n+1, if n == 4 then "sushi" ++ fish else verbatim)
-- One fish two fish red fish sushi fish

-- note that we can easily specialize subRegexWithEnv into subRegexWithCount:
withCountEnv f g s = f (\n val -> (n+1, g n val)) 1 s
subRegexWithCount re = withCountEnv (subRegexWithEnv re)

s3' = subRegexWithCount (ignoreCase "(\\w+)(\\s+fish\\>)") f fishes
    where f n [verbatim, _, fish] = if n == 4 then "sushi" ++ fish else verbatim
-- One fish two fish red fish sushi fish

-- adding .* at beginning of the regexp will ensure we get the last hit.
-- but we must be carefull ".*(\\w+)\\s+fish\\>" is wrong since .* will match
-- most the color, so \\< is also needed
s4 = case matchRegex (ignoreCase ".*\\<(\\w+)\\s+fish\\>") fishes of
     Just [last_fish_color] -> printf "Last fish is %s." last_fish_color
     _ -> "Failed!"
-- Last fish is blue.

Matching Multiple Lines

-- for multi line regexps, use (mkRegexWithOpts _ False _)
#!/usr/bin/runghc

-- killtags - very bad html tag killer
import System
import Text.Regex

main = do s <- getArgs >>= any_input
          putStr (strip_tags s)

any_input [] = getContents
any_input (f:_) = readFile f

strip_tags s = subRegex (mkRegexWithOpts "<[^>]*>" False True) s ""

-- testing it:
test = "<a>after a\n\
       \<bbb\n\
       \bbb>after b <c>after c\n"
s = strip_tags test
-- "after a\n<bbb\nbbb>after b after c\n"

Reading Records with a Pattern Separator

-- Read the whole file and split
l1 = fmap lines (readFile filename) -- on line ends
l2 = fmap words (readFile filename) -- on whitespace
l3 = fmap (splitRegex $ mkRegex "PATTERN") (readFile filename) -- on regexp

-- .Ch, .Se and .Ss divide chunks
f = splitRegex (mkRegex "^\\.(Ch|Se|Ss)$")

-- note that in the example above, you loose the matched line,
-- if you need it, you must create your own splitRegex:
splitRegex' :: Regex -> String -> [(String, String)]
splitRegex' re s = 
    case unfoldr' f s of
      (l, "") -> l
      (l, s) -> l ++ [(s, "")]
    where 
      -- small helper functions which tranform matchRegexAll output
      -- to the one wanted by unfoldr'
      f = fmap f' . matchRegexAll re
      f' (before, matched, after, _) = ((before, matched), after)

-- it uses this special unfoldr
-- (unfoldr only returns [b] whereas we also need the rest)
-- nb: with Control.Arrow.first which is \f (a,b) -> (f a, b)
--     we could write:  Arrow.first (a :) (unfoldr' f b')
unfoldr' :: (a -> Maybe (b, a)) -> a -> ([b], a)
unfoldr' f b  =
  case f b of
   Just (a, b') -> let (l, b'') = unfoldr' f b' in (a:l, b'')
   Nothing      -> ([], b)

unwords_keeping_spaces = splitRegex' (mkRegex "\\s+")
t = unwords_keeping_spaces "a b  c \t d"
-- [("a"," "),("b","  "),("c"," \t "),("d","")]

Extracting a Range of Lines

import Text.Regex
import Text.Regex.Posix
import Data.Maybe
import Text.Printf
import System

any_input [] = getContents
any_input (f:_) = readFile f

-- command-line to print lines 15 through 17 inclusive (see below)
main = do s <- getArgs >>= any_input
          putStr (unlines $ take 3 $ drop 14 $ lines s)

filterByRange :: (a -> Bool) -> (a -> Bool) -> [a] -> [[a]]
filterByRange _ _ [] = []
filterByRange f_beg f_end l = 
    let l' = dropWhile (not . f_beg) l in
    case break f_end l' of
      (before, match : l'') -> (before ++ [match]) : filterByRange f_beg f_end l''
      _ -> []

extract_XMP :: [String] -> [[String]]
extract_XMP = filterByRange (=~ "<XMP>") (=~ "</XMP>")

t1 = do s <- readFile "foo.html"
        return $ extract_XMP $ lines s

ignoreCase s = mkRegexWithOpts s True False
extract_XMP' = filterByRange (match "<XMP>") (match "</XMP>")
    where match re = isJust . matchRegex (ignoreCase re)

t1' = do s <- readFile "foo.html"
         return $ extract_XMP' $ lines s

mbox_to_headers = filterByRange (match "^From:?\\s") (match "^$")
    where match re = isJust . matchRegex (ignoreCase re)
mail_adresses = concatMap search_in_line
    where search_in_line line = (line =~ "[^<>(),;[:space:]]+@[^<>(),;[:space:]]+") :: [String]
          -- nb: in List context, calls matchAll.
          -- nb: \s can not be used, so instead of [^<>\s], one must do [^<>[:space:]]

t2 = do s <- readFile "mbox"
        return $ mail_adresses $ concat $ mbox_to_headers $ lines s

Matching Shell Globs as Regular Expressions

import List
import Data.Maybe

special_chars = ".()+*?[]\\{}$^|"

regex_escape_char :: Char -> String
regex_escape_char c = if elem c special_chars 
               then ['\\', c] 
               else [c]
regex_escape :: String -> String
regex_escape = concatMap regex_escape_char

glob_list = 
    [ ('*', ".*")
    , ('?', ".")
    , ('[', "[")
    , (']', "]") ]
glob_char_to_regexp c = fromMaybe (regex_escape_char c) (lookup c glob_list)
-- nb: fromMaybe will try (lookup ...) first, then fallback on (regex_escape_char c)

glob2pat = concatMap glob_char_to_regexp

pat = glob2pat "*File.*"
-- ".*File\\..*"

Speeding Up Interpolated Matches

import Data.List
import Data.Maybe
import Text.Regex

popstates = ["CO","ON","MI","WI","MN"]

is_state = isJust . matchRegex re
    where re = mkRegex $ "\\<(" ++ intercalate "|" popstates ++ ")\\>"

l = filter is_state [ "xxx", "xx CO xx", "WI", "WIxx" ]
-- ["xx CO xx","WI"]

-- this is less efficient:
is_state' line = any (\re -> isJust $ matchRegex re line) (map to_regexp popstates)
    where to_regexp c = mkRegex $ "\\<" ++ c ++ "\\>"

Testing for a Valid Pattern

import Data.Maybe
import Data.List
import Text.Regex.Base
import Text.Regex.Posix

-- one can not catch invalid patterns using Text.Regex

-- recent versions of Text.Regex.Base have makeRegexM:
is_valid_pattern :: String -> Bool
is_valid_pattern re = isJust (makeRegexM re :: Maybe Regex)
-- nb: the type Regex must be specified since matchRegexM uses abstract
--     classes and haskell can't guess which instance to use

-- or can use compile from Text.Regex.Posix.String:
t = let regexp = "(" in
    compile defaultCompOpt defaultExecOpt regexp >>=
      (\re -> case re of
                Left (_, err) -> error ("bad regexp \"" ++ regexp ++ "\": " ++ err)
                Right re -> return re)
-- *** Exception: bad regexp "(": Unmatched ( or \(

Honoring Locale Settings in Regular Expressions

-- Text.Regex and Text.Regex.Posix do not handle unicode (accents...)

Approximate Matching

import Data.List

-- Calculates the Levenshtein, or edit distance, between two strings.
-- here is a basic/slow version:
levenshtein_basic s t = distance (length s) (length t)
    where distance i 0 = i
          distance 0 j = j
          distance i j = minimum [ distance (i-1) j + 1
                                 , distance i (j-1) + 1
                                 , distance (i-1) (j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1) ]

-- a fast version based on the previous one, adding memoization
-- (note the recursive use of "d")
levenshtein s t = d
    where d = [ [distance m n | n<-[0..length t]] | m<-[0..length s] ]
          distance i 0 = i
          distance 0 j = j
          distance i j = minimum [ d!!(i-1)!!j + 1
                                 , d!!i!!(j-1) + 1
                                 , d!!(i-1)!!(j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1) ]

-- a more efficient/cryptic version.
-- (it computes the "distance" figures for each "sb" chars,
--  ie it memoizes only what is needed)
levenshtein' sa sb = foldl transform [0..length sa] sb 
    where 
      transform xs@(x:xs') c = scanl compute (x+1) (zip3 sa xs xs')
          where
            compute z (c', x, y) = minimum [y+1, z+1, x + if c == c' then 0 else 1]

-- Determines if two strings are an approximate match.
amatch percentage s t = levenshtein s t * 100 <= percentage * length s

main = do s <- readFile "/usr/share/dict/words"
          print $ filter (amatch 20 "balast") (lines s)
-- ["balant","balas","balat","ballast","belast","blast"]

Matching from Where the Last Pattern Left Off

import List
import Text.Regex
import Text.Regex.Posix
import Text.Printf

allMatches re = unfoldr (fmap adjust . matchRegexAll re)
    where adjust (_before, _, after, matches) = (matches, after)

-- special unfoldr (unfoldr only returns [b] whereas we also need the rest)
unfoldr' :: (a -> Maybe (b, a)) -> a -> ([b], a)
unfoldr' f b  =
  case f b of
   Just (a, b') -> let (l, b'') = unfoldr' f b' in (a:l, b'')
   Nothing      -> ([], b)

allMatches' re = unfoldr' (fmap adjust . matchRegexAll re)
    where adjust (_before, _, after, matches) = (matches, after)

t = allMatches (mkRegex "([0-9]+)") "3,4,5,9,120"
-- [["3"],["4"],["5"],["9"],["120"]]
-- (note that \d is not special, one must use [0-9] or [[:digit:]])

mystr = "The year 1752 lost 10 days on the 3rd of September"
t2 = 
    let (l, remain) = allMatches' (mkRegex "([0-9]+)") mystr in
    let after = remain =~ "\\S+" :: String in
    putStr $ unlines $ map (\[n] -> "Found number " ++ n) l ++ 
                        [ printf "Found %s after the last number" after ]
    
-- Found number 1752
-- Found number 10
-- Found number 3
-- Found rd after the last number

Greedy and Non-Greedy Matches

Detecting Duplicate Words

Expressing AND, OR, and NOT in a Single Pattern

Matching Multiple-Byte Characters

Matching a Valid Mail Address

Matching Abbreviations

Program: urlify

Program: tcgrep

Regular Expression Grabbag