PLEAC-Haskell
Prev		Next

6. Pattern Matching

Introduction

-- for info about the various regexp libraries, see http://haskell.org/haskellwiki/Regular_expressions
--
-- [Oct 2008] Text.Regex.Posix/Text.Regex.Base provides =~ which is quite powerful
-- but it misses simple things like subRegex and splitRegex so in many cases
-- Text.Regex must be used
--
-- with Text.Regex.Posix:
import Text.Regex.Posix
import Text.Regex.Base

meadow = [ "sheeps are ovine", "Sheeps are ovine"
         , "Fine bovines demand fine toreadors."
         , "Muskoxen are polar ovibovine species."
         , "Grooviness went out of fashion decades ago." ]

t1 = filter (=~ "sheep") meadow
-- ["sheeps are ovine"]

t2 = filter (=~ "\\<ovines?") meadow
-- ["sheeps are ovine","Sheeps are ovine"]

-- below require "import Data.Bits" for .|. :
ignoreCase = makeRegexOpts (defaultCompOpt .|. compIgnoreCase) defaultExecOpt
t3 = filter (matchTest $ ignoreCase "sheep") meadow
-- ["sheeps are ovine","Sheeps are ovine"]


-- the same with Text.Regex:
import Data.Maybe
import Text.Regex

t1 = filter (isJust . matchRegex (mkRegex "sheep")) meadow
-- ["sheeps are ovine"]

t2 = filter (isJust . matchRegex (mkRegex "\\<ovines?")) meadow
-- ["sheeps are ovine","Sheeps are ovine"]

ignoreCase s = mkRegexWithOpts s True False
t3 = filter (isJust . matchRegex (ignoreCase "sheep")) meadow
-- ["sheeps are ovine","Sheeps are ovine"]

-- note that Data.ByteString.Char8 has a few interesting functions,
-- but usually regexps are better
import qualified Data.ByteString.Char8 as B

t = filter is_sheep meadow
    where is_sheep s = B.isInfixOf (B.pack "sheep") (B.pack s)
-- ["sheeps are ovine"]


s1 = subRegex (mkRegex "o+") "good food" "e"
-- ged fed 

-- the following helper functions are useful
subRegexWith re new s =
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ (subRegexWith re new after)
subRegexOnceWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ after

s2 = subRegexOnceWith (mkRegex "o+") (\_ -> "e") "good food" 
-- ged food


digits = "123456789";
l = digits =~ "..." :: [String] -- nb: in List context, calls matchAll
-- ["123","456","789"]

-- for overlapping matches, one must do it by hand:
allOverlappingMatches re s = 
    case matchRegexAll re s of
      Nothing -> []
      Just (before, matched, after, _) -> matched : allOverlappingMatches re (tail matched ++ after)

l = allOverlappingMatches (mkRegex "...") digits
-- ["123","234","345","456","567","678","789"]


string = "And little lambs eat ivy"
(before, matched, after) = string =~ "l[^s]*s" :: (String, String, String)
s = printf "(%s) (%s) (%s)" before matched after :: String
-- "(And ) (little lambs) ( eat ivy)"

Copying and Substituting Simultaneously

-- copying is the default in pure languages :)
import Text.Regex
import System.Environment
import System.FilePath.Posix

prog_ = subRegex (mkRegex ".*/") "/etc/foo" ""
-- "foo"
-- but it's much nicer to use takeFileName
prog__ = takeFileName "/etc/foo"
-- "foo"

-- as for the program basename, it is:
prog = getProgName


manpage = "/usr/man/man3/foo.1"
catpage = subRegexMatchesWith (mkRegex "man([0-9])") (\[n] -> "cat" ++ n) manpage
-- /usr/man/cat3/foo.1

subRegexMatchesWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, l) -> before ++ new l ++ after

bindirs = [ "/usr/bin", "/bin", "/usr/local/bin" ]
libdirs = map (\s -> subRegex (mkRegex "bin$") s "lib") bindirs

Matching Letters

is_pure_alphabetic = isJust . matchRegex (mkRegex "^[A-Za-z]+$")
is_pure_alphabetic_ = all isLetter -- this handles unicode (accents...)

some_words = [ "silly", "random!stuff#here", "faÃ§ade", "tschÃŒÃ", "niÃ±o", "ÑÑÑÑÐºÐžÐ¹" ]
l1 = filter is_pure_alphabetic some_words -- ["silly"]
l2 = filter is_pure_alphabetic_ some_words -- all except "random!stuff#here"

Matching Words

-- "\\S+"         -- as many non-whitespace bytes as possible
-- "[A-Za-z'-]+"  -- as many letters, apostrophes, and hyphens
-- 
-- "\\<([A-Za-z]+)\\>" -- usually best
-- "\\s([A-Za-z]+)\\s" -- fails at ends or w/ punctuation

-- words is similar to splitting on "\s+"
l = words "A text   with some\tseparator\n"
-- ["A","text","with","some","separator"]
--
-- (nb: spaces at beginning or end do not change the result)

Commenting Regular Expressions

-- Commenting Regular Expressions
-- since regexps are simple strings, you can concatenate regexp pieces:
re = "#"          -- a pound sign
     ++ "(\\w+)"  -- the variable name
     ++ "#"       -- another pound sign
expand_macro vals = subAllRegexMatchesWith (mkRegex re) get_val
    where get_val [s] = fromJust (lookup s vals)
s = expand_macro [ ("foo", "42") ] "blah #foo# blah"
-- "blah 42 blah"

subAllRegexMatchesWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, l) -> before ++ new l ++ (subAllRegexMatchesWith re new after)

Finding the Nth Occurrence of a Match

fishes = "One fish two fish red fish blue fish"

-- first some very simple and useful functions:
ignoreCase s = mkRegexWithOpts s True False
allMatches re s =
    case matchRegexAll re s of
      Nothing -> []
      Just (_before, _, after, matches) -> matches : allMatches re after

-- here we go:
colors = map head $ allMatches (ignoreCase "(\\w+)\\s+fish\\>") fishes
-- ["One","two","red","blue"]
s1 = printf "The third fish is a %s one." (colors !! 2) :: String
-- The third fish is a red one.

-- another solution to skip the first 2 matches using an ad'hoc regexp:
s2 = printf "The third fish is a %s one." color :: String
    where Just (_, _, _, [_, color]) = 
              matchRegexAll (ignoreCase "(\\w+\\s+fish\\s+){2}(\\w+)\\s+fish") fishes
-- The third fish is a red one.

-- a modified subRegex to be able to handle differently the 4th match
subRegexWithEnv re f env s =
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, matches) -> before ++ matched' ++ (subRegexWithEnv re f env' after)
          where (env', matched') = f env (matched:matches)

s3 = subRegexWithEnv (ignoreCase "(\\w+)(\\s+fish\\>)") f 1 fishes
    where f n [verbatim, _, fish] = (n+1, if n == 4 then "sushi" ++ fish else verbatim)
-- One fish two fish red fish sushi fish

-- note that we can easily specialize subRegexWithEnv into subRegexWithCount:
withCountEnv f g s = f (\n val -> (n+1, g n val)) 1 s
subRegexWithCount re = withCountEnv (subRegexWithEnv re)

s3' = subRegexWithCount (ignoreCase "(\\w+)(\\s+fish\\>)") f fishes
    where f n [verbatim, _, fish] = if n == 4 then "sushi" ++ fish else verbatim
-- One fish two fish red fish sushi fish

-- adding .* at beginning of the regexp will ensure we get the last hit.
-- but we must be carefull ".*(\\w+)\\s+fish\\>" is wrong since .* will match
-- most the color, so \\< is also needed
s4 = case matchRegex (ignoreCase ".*\\<(\\w+)\\s+fish\\>") fishes of
     Just [last_fish_color] -> printf "Last fish is %s." last_fish_color
     _ -> "Failed!"
-- Last fish is blue.

Matching Multiple Lines

-- for multi line regexps, use (mkRegexWithOpts _ False _)
#!/usr/bin/runghc

-- killtags - very bad html tag killer
import System
import Text.Regex

main = do s <- getArgs >>= any_input
          putStr (strip_tags s)

any_input [] = getContents
any_input (f:_) = readFile f

strip_tags s = subRegex (mkRegexWithOpts "<[^>]*>" False True) s ""

-- testing it:
test = "<a>after a\n\
       \<bbb\n\
       \bbb>after b <c>after c\n"
s = strip_tags test
-- "after a\n<bbb\nbbb>after b after c\n"

Reading Records with a Pattern Separator

-- Read the whole file and split
l1 = fmap lines (readFile filename) -- on line ends
l2 = fmap words (readFile filename) -- on whitespace
l3 = fmap (splitRegex $ mkRegex "PATTERN") (readFile filename) -- on regexp

-- .Ch, .Se and .Ss divide chunks
f = splitRegex (mkRegex "^\\.(Ch|Se|Ss)$")

-- note that in the example above, you loose the matched line,
-- if you need it, you must create your own splitRegex:
splitRegex' :: Regex -> String -> [(String, String)]
splitRegex' re s = 
    case unfoldr' f s of
      (l, "") -> l
      (l, s) -> l ++ [(s, "")]
    where 
      -- small helper functions which tranform matchRegexAll output
      -- to the one wanted by unfoldr'
      f = fmap f' . matchRegexAll re
      f' (before, matched, after, _) = ((before, matched), after)

-- it uses this special unfoldr
-- (unfoldr only returns [b] whereas we also need the rest)
-- nb: with Control.Arrow.first which is \f (a,b) -> (f a, b)
--     we could write:  Arrow.first (a :) (unfoldr' f b')
unfoldr' :: (a -> Maybe (b, a)) -> a -> ([b], a)
unfoldr' f b  =
  case f b of
   Just (a, b') -> let (l, b'') = unfoldr' f b' in (a:l, b'')
   Nothing      -> ([], b)

unwords_keeping_spaces = splitRegex' (mkRegex "\\s+")
t = unwords_keeping_spaces "a b  c \t d"
-- [("a"," "),("b","  "),("c"," \t "),("d","")]

Extracting a Range of Lines

import Text.Regex
import Text.Regex.Posix
import Data.Maybe
import Text.Printf
import System

any_input [] = getContents
any_input (f:_) = readFile f

-- command-line to print lines 15 through 17 inclusive (see below)
main = do s <- getArgs >>= any_input
          putStr (unlines $ take 3 $ drop 14 $ lines s)

filterByRange :: (a -> Bool) -> (a -> Bool) -> [a] -> [[a]]
filterByRange _ _ [] = []
filterByRange f_beg f_end l = 
    let l' = dropWhile (not . f_beg) l in
    case break f_end l' of
      (before, match : l'') -> (before ++ [match]) : filterByRange f_beg f_end l''
      _ -> []

extract_XMP :: [String] -> [[String]]
extract_XMP = filterByRange (=~ "<XMP>") (=~ "</XMP>")

t1 = do s <- readFile "foo.html"
        return $ extract_XMP $ lines s

ignoreCase s = mkRegexWithOpts s True False
extract_XMP' = filterByRange (match "<XMP>") (match "</XMP>")
    where match re = isJust . matchRegex (ignoreCase re)

t1' = do s <- readFile "foo.html"
         return $ extract_XMP' $ lines s

mbox_to_headers = filterByRange (match "^From:?\\s") (match "^$")
    where match re = isJust . matchRegex (ignoreCase re)
mail_adresses = concatMap search_in_line
    where search_in_line line = (line =~ "[^<>(),;[:space:]]+@[^<>(),;[:space:]]+") :: [String]
          -- nb: in List context, calls matchAll.
          -- nb: \s can not be used, so instead of [^<>\s], one must do [^<>[:space:]]

t2 = do s <- readFile "mbox"
        return $ mail_adresses $ concat $ mbox_to_headers $ lines s

Matching Shell Globs as Regular Expressions

import List
import Data.Maybe

special_chars = ".()+*?[]\\{}$^|"

regex_escape_char :: Char -> String
regex_escape_char c = if elem c special_chars 
               then ['\\', c] 
               else [c]
regex_escape :: String -> String
regex_escape = concatMap regex_escape_char

glob_list = 
    [ ('*', ".*")
    , ('?', ".")
    , ('[', "[")
    , (']', "]") ]
glob_char_to_regexp c = fromMaybe (regex_escape_char c) (lookup c glob_list)
-- nb: fromMaybe will try (lookup ...) first, then fallback on (regex_escape_char c)

glob2pat = concatMap glob_char_to_regexp

pat = glob2pat "*File.*"
-- ".*File\\..*"

Speeding Up Interpolated Matches

import Data.List
import Data.Maybe
import Text.Regex

popstates = ["CO","ON","MI","WI","MN"]

is_state = isJust . matchRegex re
    where re = mkRegex $ "\\<(" ++ intercalate "|" popstates ++ ")\\>"

l = filter is_state [ "xxx", "xx CO xx", "WI", "WIxx" ]
-- ["xx CO xx","WI"]

-- this is less efficient:
is_state' line = any (\re -> isJust $ matchRegex re line) (map to_regexp popstates)
    where to_regexp c = mkRegex $ "\\<" ++ c ++ "\\>"

Testing for a Valid Pattern

import Data.Maybe
import Data.List
import Text.Regex.Base
import Text.Regex.Posix

-- one can not catch invalid patterns using Text.Regex

-- recent versions of Text.Regex.Base have makeRegexM:
is_valid_pattern :: String -> Bool
is_valid_pattern re = isJust (makeRegexM re :: Maybe Regex)
-- nb: the type Regex must be specified since matchRegexM uses abstract
--     classes and haskell can't guess which instance to use

-- or can use compile from Text.Regex.Posix.String:
t = let regexp = "(" in
    compile defaultCompOpt defaultExecOpt regexp >>=
      (\re -> case re of
                Left (_, err) -> error ("bad regexp \"" ++ regexp ++ "\": " ++ err)
                Right re -> return re)
-- *** Exception: bad regexp "(": Unmatched ( or \(

Honoring Locale Settings in Regular Expressions

-- Text.Regex and Text.Regex.Posix do not handle unicode (accents...)

Approximate Matching

import Data.List

-- Calculates the Levenshtein, or edit distance, between two strings.
-- here is a basic/slow version:
levenshtein_basic s t = distance (length s) (length t)
    where distance i 0 = i
          distance 0 j = j
          distance i j = minimum [ distance (i-1) j + 1
                                 , distance i (j-1) + 1
                                 , distance (i-1) (j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1) ]

-- a fast version based on the previous one, adding memoization
-- (note the recursive use of "d")
levenshtein s t = d
    where d = [ [distance m n | n<-[0..length t]] | m<-[0..length s] ]
          distance i 0 = i
          distance 0 j = j
          distance i j = minimum [ d!!(i-1)!!j + 1
                                 , d!!i!!(j-1) + 1
                                 , d!!(i-1)!!(j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1) ]

-- a more efficient/cryptic version.
-- (it computes the "distance" figures for each "sb" chars,
--  ie it memoizes only what is needed)
levenshtein' sa sb = foldl transform [0..length sa] sb 
    where 
      transform xs@(x:xs') c = scanl compute (x+1) (zip3 sa xs xs')
          where
            compute z (c', x, y) = minimum [y+1, z+1, x + if c == c' then 0 else 1]

-- Determines if two strings are an approximate match.
amatch percentage s t = levenshtein s t * 100 <= percentage * length s

main = do s <- readFile "/usr/share/dict/words"
          print $ filter (amatch 20 "balast") (lines s)
-- ["balant","balas","balat","ballast","belast","blast"]

Matching from Where the Last Pattern Left Off

import List
import Text.Regex
import Text.Regex.Posix
import Text.Printf

allMatches re = unfoldr (fmap adjust . matchRegexAll re)
    where adjust (_before, _, after, matches) = (matches, after)

-- special unfoldr (unfoldr only returns [b] whereas we also need the rest)
unfoldr' :: (a -> Maybe (b, a)) -> a -> ([b], a)
unfoldr' f b  =
  case f b of
   Just (a, b') -> let (l, b'') = unfoldr' f b' in (a:l, b'')
   Nothing      -> ([], b)

allMatches' re = unfoldr' (fmap adjust . matchRegexAll re)
    where adjust (_before, _, after, matches) = (matches, after)

t = allMatches (mkRegex "([0-9]+)") "3,4,5,9,120"
-- [["3"],["4"],["5"],["9"],["120"]]
-- (note that \d is not special, one must use [0-9] or [[:digit:]])

mystr = "The year 1752 lost 10 days on the 3rd of September"
t2 = 
    let (l, remain) = allMatches' (mkRegex "([0-9]+)") mystr in
    let after = remain =~ "\\S+" :: String in
    putStr $ unlines $ map (\[n] -> "Found number " ++ n) l ++ 
                        [ printf "Found %s after the last number" after ]
    
-- Found number 1752
-- Found number 10
-- Found number 3
-- Found rd after the last number

Prev	Home	Next
Hashes		File Access

6. Pattern Matching

Introduction

Copying and Substituting Simultaneously

Matching Letters

Matching Words

Commenting Regular Expressions

Finding the Nth Occurrence of a Match

Matching Multiple Lines

Reading Records with a Pattern Separator

Extracting a Range of Lines

Matching Shell Globs as Regular Expressions

Speeding Up Interpolated Matches

Testing for a Valid Pattern

Honoring Locale Settings in Regular Expressions

Approximate Matching

Matching from Where the Last Pattern Left Off

Greedy and Non-Greedy Matches

Detecting Duplicate Words

Expressing AND, OR, and NOT in a Single Pattern

Matching Multiple-Byte Characters

Matching a Valid Mail Address

Matching Abbreviations

Program: urlify

Program: tcgrep

Regular Expression Grabbag