-- for info about the various regexp libraries, see http://haskell.org/haskellwiki/Regular_expressions -- -- [Oct 2008] Text.Regex.Posix/Text.Regex.Base provides =~ which is quite powerful -- but it misses simple things like subRegex and splitRegex so in many cases -- Text.Regex must be used -- -- with Text.Regex.Posix: import Text.Regex.Posix import Text.Regex.Base meadow = [ "sheeps are ovine", "Sheeps are ovine" , "Fine bovines demand fine toreadors." , "Muskoxen are polar ovibovine species." , "Grooviness went out of fashion decades ago." ] t1 = filter (=~ "sheep") meadow -- ["sheeps are ovine"] t2 = filter (=~ "\\<ovines?") meadow -- ["sheeps are ovine","Sheeps are ovine"] -- below require "import Data.Bits" for .|. : ignoreCase = makeRegexOpts (defaultCompOpt .|. compIgnoreCase) defaultExecOpt t3 = filter (matchTest $ ignoreCase "sheep") meadow -- ["sheeps are ovine","Sheeps are ovine"] -- the same with Text.Regex: import Data.Maybe import Text.Regex t1 = filter (isJust . matchRegex (mkRegex "sheep")) meadow -- ["sheeps are ovine"] t2 = filter (isJust . matchRegex (mkRegex "\\<ovines?")) meadow -- ["sheeps are ovine","Sheeps are ovine"] ignoreCase s = mkRegexWithOpts s True False t3 = filter (isJust . matchRegex (ignoreCase "sheep")) meadow -- ["sheeps are ovine","Sheeps are ovine"] -- note that Data.ByteString.Char8 has a few interesting functions, -- but usually regexps are better import qualified Data.ByteString.Char8 as B t = filter is_sheep meadow where is_sheep s = B.isInfixOf (B.pack "sheep") (B.pack s) -- ["sheeps are ovine"] s1 = subRegex (mkRegex "o+") "good food" "e" -- ged fed -- the following helper functions are useful subRegexWith re new s = case matchRegexAll re s of Nothing -> s Just (before, matched, after, _) -> before ++ new matched ++ (subRegexWith re new after) subRegexOnceWith re new s = case matchRegexAll re s of Nothing -> s Just (before, matched, after, _) -> before ++ new matched ++ after s2 = subRegexOnceWith (mkRegex "o+") (\_ -> "e") "good food" -- ged food digits = "123456789"; l = digits =~ "..." :: [String] -- nb: in List context, calls matchAll -- ["123","456","789"] -- for overlapping matches, one must do it by hand: allOverlappingMatches re s = case matchRegexAll re s of Nothing -> [] Just (before, matched, after, _) -> matched : allOverlappingMatches re (tail matched ++ after) l = allOverlappingMatches (mkRegex "...") digits -- ["123","234","345","456","567","678","789"] string = "And little lambs eat ivy" (before, matched, after) = string =~ "l[^s]*s" :: (String, String, String) s = printf "(%s) (%s) (%s)" before matched after :: String -- "(And ) (little lambs) ( eat ivy)" |
-- copying is the default in pure languages :) import Text.Regex import System.Environment import System.FilePath.Posix prog_ = subRegex (mkRegex ".*/") "/etc/foo" "" -- "foo" -- but it's much nicer to use takeFileName prog__ = takeFileName "/etc/foo" -- "foo" -- as for the program basename, it is: prog = getProgName manpage = "/usr/man/man3/foo.1" catpage = subRegexMatchesWith (mkRegex "man([0-9])") (\[n] -> "cat" ++ n) manpage -- /usr/man/cat3/foo.1 subRegexMatchesWith re new s = case matchRegexAll re s of Nothing -> s Just (before, matched, after, l) -> before ++ new l ++ after bindirs = [ "/usr/bin", "/bin", "/usr/local/bin" ] libdirs = map (\s -> subRegex (mkRegex "bin$") s "lib") bindirs |
is_pure_alphabetic = isJust . matchRegex (mkRegex "^[A-Za-z]+$") is_pure_alphabetic_ = all isLetter -- this handles unicode (accents...) some_words = [ "silly", "random!stuff#here", "façade", "tschÃŒÃ", "niño", "ÑÑÑÑкОй" ] l1 = filter is_pure_alphabetic some_words -- ["silly"] l2 = filter is_pure_alphabetic_ some_words -- all except "random!stuff#here" |
-- "\\S+" -- as many non-whitespace bytes as possible -- "[A-Za-z'-]+" -- as many letters, apostrophes, and hyphens -- -- "\\<([A-Za-z]+)\\>" -- usually best -- "\\s([A-Za-z]+)\\s" -- fails at ends or w/ punctuation -- words is similar to splitting on "\s+" l = words "A text with some\tseparator\n" -- ["A","text","with","some","separator"] -- -- (nb: spaces at beginning or end do not change the result) |
-- Commenting Regular Expressions -- since regexps are simple strings, you can concatenate regexp pieces: re = "#" -- a pound sign ++ "(\\w+)" -- the variable name ++ "#" -- another pound sign expand_macro vals = subAllRegexMatchesWith (mkRegex re) get_val where get_val [s] = fromJust (lookup s vals) s = expand_macro [ ("foo", "42") ] "blah #foo# blah" -- "blah 42 blah" subAllRegexMatchesWith re new s = case matchRegexAll re s of Nothing -> s Just (before, matched, after, l) -> before ++ new l ++ (subAllRegexMatchesWith re new after) |
fishes = "One fish two fish red fish blue fish" -- first some very simple and useful functions: ignoreCase s = mkRegexWithOpts s True False allMatches re s = case matchRegexAll re s of Nothing -> [] Just (_before, _, after, matches) -> matches : allMatches re after -- here we go: colors = map head $ allMatches (ignoreCase "(\\w+)\\s+fish\\>") fishes -- ["One","two","red","blue"] s1 = printf "The third fish is a %s one." (colors !! 2) :: String -- The third fish is a red one. -- another solution to skip the first 2 matches using an ad'hoc regexp: s2 = printf "The third fish is a %s one." color :: String where Just (_, _, _, [_, color]) = matchRegexAll (ignoreCase "(\\w+\\s+fish\\s+){2}(\\w+)\\s+fish") fishes -- The third fish is a red one. -- a modified subRegex to be able to handle differently the 4th match subRegexWithEnv re f env s = case matchRegexAll re s of Nothing -> s Just (before, matched, after, matches) -> before ++ matched' ++ (subRegexWithEnv re f env' after) where (env', matched') = f env (matched:matches) s3 = subRegexWithEnv (ignoreCase "(\\w+)(\\s+fish\\>)") f 1 fishes where f n [verbatim, _, fish] = (n+1, if n == 4 then "sushi" ++ fish else verbatim) -- One fish two fish red fish sushi fish -- note that we can easily specialize subRegexWithEnv into subRegexWithCount: withCountEnv f g s = f (\n val -> (n+1, g n val)) 1 s subRegexWithCount re = withCountEnv (subRegexWithEnv re) s3' = subRegexWithCount (ignoreCase "(\\w+)(\\s+fish\\>)") f fishes where f n [verbatim, _, fish] = if n == 4 then "sushi" ++ fish else verbatim -- One fish two fish red fish sushi fish -- adding .* at beginning of the regexp will ensure we get the last hit. -- but we must be carefull ".*(\\w+)\\s+fish\\>" is wrong since .* will match -- most the color, so \\< is also needed s4 = case matchRegex (ignoreCase ".*\\<(\\w+)\\s+fish\\>") fishes of Just [last_fish_color] -> printf "Last fish is %s." last_fish_color _ -> "Failed!" -- Last fish is blue. |
-- for multi line regexps, use (mkRegexWithOpts _ False _) #!/usr/bin/runghc -- killtags - very bad html tag killer import System import Text.Regex main = do s <- getArgs >>= any_input putStr (strip_tags s) any_input [] = getContents any_input (f:_) = readFile f strip_tags s = subRegex (mkRegexWithOpts "<[^>]*>" False True) s "" -- testing it: test = "<a>after a\n\ \<bbb\n\ \bbb>after b <c>after c\n" s = strip_tags test -- "after a\n<bbb\nbbb>after b after c\n" |
-- Read the whole file and split l1 = fmap lines (readFile filename) -- on line ends l2 = fmap words (readFile filename) -- on whitespace l3 = fmap (splitRegex $ mkRegex "PATTERN") (readFile filename) -- on regexp -- .Ch, .Se and .Ss divide chunks f = splitRegex (mkRegex "^\\.(Ch|Se|Ss)$") -- note that in the example above, you loose the matched line, -- if you need it, you must create your own splitRegex: splitRegex' :: Regex -> String -> [(String, String)] splitRegex' re s = case unfoldr' f s of (l, "") -> l (l, s) -> l ++ [(s, "")] where -- small helper functions which tranform matchRegexAll output -- to the one wanted by unfoldr' f = fmap f' . matchRegexAll re f' (before, matched, after, _) = ((before, matched), after) -- it uses this special unfoldr -- (unfoldr only returns [b] whereas we also need the rest) -- nb: with Control.Arrow.first which is \f (a,b) -> (f a, b) -- we could write: Arrow.first (a :) (unfoldr' f b') unfoldr' :: (a -> Maybe (b, a)) -> a -> ([b], a) unfoldr' f b = case f b of Just (a, b') -> let (l, b'') = unfoldr' f b' in (a:l, b'') Nothing -> ([], b) unwords_keeping_spaces = splitRegex' (mkRegex "\\s+") t = unwords_keeping_spaces "a b c \t d" -- [("a"," "),("b"," "),("c"," \t "),("d","")] |
import Text.Regex import Text.Regex.Posix import Data.Maybe import Text.Printf import System any_input [] = getContents any_input (f:_) = readFile f -- command-line to print lines 15 through 17 inclusive (see below) main = do s <- getArgs >>= any_input putStr (unlines $ take 3 $ drop 14 $ lines s) filterByRange :: (a -> Bool) -> (a -> Bool) -> [a] -> [[a]] filterByRange _ _ [] = [] filterByRange f_beg f_end l = let l' = dropWhile (not . f_beg) l in case break f_end l' of (before, match : l'') -> (before ++ [match]) : filterByRange f_beg f_end l'' _ -> [] extract_XMP :: [String] -> [[String]] extract_XMP = filterByRange (=~ "<XMP>") (=~ "</XMP>") t1 = do s <- readFile "foo.html" return $ extract_XMP $ lines s ignoreCase s = mkRegexWithOpts s True False extract_XMP' = filterByRange (match "<XMP>") (match "</XMP>") where match re = isJust . matchRegex (ignoreCase re) t1' = do s <- readFile "foo.html" return $ extract_XMP' $ lines s mbox_to_headers = filterByRange (match "^From:?\\s") (match "^$") where match re = isJust . matchRegex (ignoreCase re) mail_adresses = concatMap search_in_line where search_in_line line = (line =~ "[^<>(),;[:space:]]+@[^<>(),;[:space:]]+") :: [String] -- nb: in List context, calls matchAll. -- nb: \s can not be used, so instead of [^<>\s], one must do [^<>[:space:]] t2 = do s <- readFile "mbox" return $ mail_adresses $ concat $ mbox_to_headers $ lines s |
import List import Data.Maybe special_chars = ".()+*?[]\\{}$^|" regex_escape_char :: Char -> String regex_escape_char c = if elem c special_chars then ['\\', c] else [c] regex_escape :: String -> String regex_escape = concatMap regex_escape_char glob_list = [ ('*', ".*") , ('?', ".") , ('[', "[") , (']', "]") ] glob_char_to_regexp c = fromMaybe (regex_escape_char c) (lookup c glob_list) -- nb: fromMaybe will try (lookup ...) first, then fallback on (regex_escape_char c) glob2pat = concatMap glob_char_to_regexp pat = glob2pat "*File.*" -- ".*File\\..*" |
import Data.List import Data.Maybe import Text.Regex popstates = ["CO","ON","MI","WI","MN"] is_state = isJust . matchRegex re where re = mkRegex $ "\\<(" ++ intercalate "|" popstates ++ ")\\>" l = filter is_state [ "xxx", "xx CO xx", "WI", "WIxx" ] -- ["xx CO xx","WI"] -- this is less efficient: is_state' line = any (\re -> isJust $ matchRegex re line) (map to_regexp popstates) where to_regexp c = mkRegex $ "\\<" ++ c ++ "\\>" |
import Data.Maybe import Data.List import Text.Regex.Base import Text.Regex.Posix -- one can not catch invalid patterns using Text.Regex -- recent versions of Text.Regex.Base have makeRegexM: is_valid_pattern :: String -> Bool is_valid_pattern re = isJust (makeRegexM re :: Maybe Regex) -- nb: the type Regex must be specified since matchRegexM uses abstract -- classes and haskell can't guess which instance to use -- or can use compile from Text.Regex.Posix.String: t = let regexp = "(" in compile defaultCompOpt defaultExecOpt regexp >>= (\re -> case re of Left (_, err) -> error ("bad regexp \"" ++ regexp ++ "\": " ++ err) Right re -> return re) -- *** Exception: bad regexp "(": Unmatched ( or \( |
-- Text.Regex and Text.Regex.Posix do not handle unicode (accents...) |
import Data.List -- Calculates the Levenshtein, or edit distance, between two strings. -- here is a basic/slow version: levenshtein_basic s t = distance (length s) (length t) where distance i 0 = i distance 0 j = j distance i j = minimum [ distance (i-1) j + 1 , distance i (j-1) + 1 , distance (i-1) (j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1) ] -- a fast version based on the previous one, adding memoization -- (note the recursive use of "d") levenshtein s t = d where d = [ [distance m n | n<-[0..length t]] | m<-[0..length s] ] distance i 0 = i distance 0 j = j distance i j = minimum [ d!!(i-1)!!j + 1 , d!!i!!(j-1) + 1 , d!!(i-1)!!(j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1) ] -- a more efficient/cryptic version. -- (it computes the "distance" figures for each "sb" chars, -- ie it memoizes only what is needed) levenshtein' sa sb = foldl transform [0..length sa] sb where transform xs@(x:xs') c = scanl compute (x+1) (zip3 sa xs xs') where compute z (c', x, y) = minimum [y+1, z+1, x + if c == c' then 0 else 1] -- Determines if two strings are an approximate match. amatch percentage s t = levenshtein s t * 100 <= percentage * length s main = do s <- readFile "/usr/share/dict/words" print $ filter (amatch 20 "balast") (lines s) -- ["balant","balas","balat","ballast","belast","blast"] |
import List import Text.Regex import Text.Regex.Posix import Text.Printf allMatches re = unfoldr (fmap adjust . matchRegexAll re) where adjust (_before, _, after, matches) = (matches, after) -- special unfoldr (unfoldr only returns [b] whereas we also need the rest) unfoldr' :: (a -> Maybe (b, a)) -> a -> ([b], a) unfoldr' f b = case f b of Just (a, b') -> let (l, b'') = unfoldr' f b' in (a:l, b'') Nothing -> ([], b) allMatches' re = unfoldr' (fmap adjust . matchRegexAll re) where adjust (_before, _, after, matches) = (matches, after) t = allMatches (mkRegex "([0-9]+)") "3,4,5,9,120" -- [["3"],["4"],["5"],["9"],["120"]] -- (note that \d is not special, one must use [0-9] or [[:digit:]]) mystr = "The year 1752 lost 10 days on the 3rd of September" t2 = let (l, remain) = allMatches' (mkRegex "([0-9]+)") mystr in let after = remain =~ "\\S+" :: String in putStr $ unlines $ map (\[n] -> "Found number " ++ n) l ++ [ printf "Found %s after the last number" after ] -- Found number 1752 -- Found number 10 -- Found number 3 -- Found rd after the last number |