1. Strings

Introduction

str  = "\\n"                             -- two characters, \ and an n
str2 = "Jon 'Maddog' Orwant"             -- in haskell we can do string only with ", no single quote
str3 = "\n"                              -- a "newline" character
str4 = "Jon \"Maddog\" Orwant"           -- literal double quotes
str5 = "Multiline string must use a backslash at end of line\n\
\and a backslash at beginning of line\n";
str6 = "It is a common practice\n\
       \to indent lines\n\
       \(the indentation doesn't change the value of the string)\n"

Accessing Substrings

groupNelem n = unfoldr f
    where f [] = Nothing
          f s = Just (splitAt n s)

-- split at five byte boundaries
fivers = groupNelem 5

-- chop string into individual characters:
-- nothing to do

s     = "This is what you have"
first = take 1 s                          -- "T"
start = take 2 $ drop 5 s                 -- "is"
rest  = drop 13 s                         -- "you have"
last' = last s                            -- 'e'

import Text.Regex
s = "This is what you have"

-- strings are immutable
s2 = a ++ "wasn't" ++ drop 2 b
     where (a, b) = splitAt 5 s
s3 = subRegex (mkRegex " is ") s " wasn't " -- "This wasn't what you have"

-- substitute "at" for "is", restricted to first five characters
s4 = a' ++ b
    where (a, b) = splitAt 5 s
          a' = subRegex (mkRegex "is") a "at"
-- do it another way
import Text.Regex
import qualified Control.Arrow as Arrow
f = uncurry (++) . Arrow.first subst . splitAt 5
    where subst s = subRegex (mkRegex "is") s "at"

-- unpack chars using parsec
import Text.ParserCombinators.Parsec
fromRight (Right v) = v
fromRight v = error ("fromRight: " ++ show v)
parseStr parser s = fromRight $ parse parser "" s
nchars n = count n anyToken

a = "To be or not to be"
b = parseStr (nchars 6 >> nchars 6) a -- "or not"

-- not same example as perl's unpack with "X5" which means "back up a byte"
-- here is an example using lookAhead where we start again at some point
[b3,c3] = parseStr (sequence [ lookAhead p1, p2 ]) a -- ["or","be"]
          where p1 = nchars 6 >> nchars 2
                p2 = nchars 3 >> nchars 2


import List
cut2fmt l = zipWith (-) l (1:l)
applyfmt fmt s = l ++ [s']
    where (s', l) = mapAccumL f s fmt
          f s n = (s',subs)
              where (subs,s') = splitAt n s

fmt = cut2fmt [8,14,20,26,30] -- [7,6,6,6,4]
l = applyfmt fmt "12345678912345678901234567890123456789" 
-- 1234567 891234 567890 123456 7890 123456789

Establishing a Default Value

-- boolean operators only work on booleans
a = b || c

-- if you have an optional value, use type Maybe
import Maybe
v1 = fromMaybe "b" $ Just "a"   -- "a"
v2 = fromMaybe "b" $ Nothing    -- "b"

-- to combine Maybe values, you can use mplus
import Control.Monad (mplus)

v1' = (Just "a") `mplus` (Just "b") -- Just "a"
v2' = Nothing `mplus` (Just "b")    -- Just "b"

-- you could also define some perlish things:
class Default_val a where
    default_val :: a
    is_default_val :: a -> Bool

instance Default_val [a] where 
    default_val = []
    is_default_val = null
instance Default_val Num where 
    default_val = 0
    is_default_val = (== 0)

a &&& b = if a.is_default_val then default_val else b
a ||| b = if a.is_default_val then b else a

foo = bar ||| "DEFAULT VALUE"

--
import Maybe
import System
argv0 = fmap listToMaybe getArgs
dir = fmap (fromMaybe "/tmp") argv0

-- or
dir' = fmap (head . (++ ["/tmp"])) getArgs

Exchanging Values Without Using Temporary Variables

-- no side effect in haskell => swap is a nonsense

Converting Between ASCII Characters and Values

import Char
i = ord 'e'                     -- 101
c = chr 101                     -- 'e'

import Text.Printf
printf "Number %d is character %c\n" 101 101

ascii_character_numbers = map ord "sample" -- [115,97,109,112,108,101]
word = map chr ascii_character_numbers

ibm = map (chr . (+ 1) . ord) "HAL" -- "IBM"

Processing a String One Character at a Time

s = "an apple a day"
msg1 = "unique chars are: " ++ sort (nub s)
msg2 = "sum is " ++ (show $ sum $ map ord $ s)


-- slowcat -----------------------
#!/usr/bin/runghc
{-# OPTIONS_GHC -fglasgow-exts #-}

import System
import System.IO
import System.Posix
import Text.Regex

any_input [] = getContents
any_input (f:_) = readFile f

main = do (time, args') <- fmap get_time getArgs
          s <- any_input args'
          hSetBuffering stdout NoBuffering
          mapM_ (\c -> putChar c >> usleep (5000 * time)) s
    where get_time (x:args)
              | (Just [d]) <- matchRegex (mkRegex "^-([0-9]+)") x = (read d, args)
          get_time args = (1, args)

Reversing a String by Word or Character

string = "Yoda said, \"can you see this?\""
allwords = words string
revwords = unwords (reverse allwords)

revwords = (unwords . reverse . words) string

-- another version of revwords which works for spaces
revwords' = (unwords . reverse . splitRegex (mkRegex " ")) string

-- yet another version using a home made version of splitRegex' which keeps the matched string
import List
import Text.Regex

-- special unfoldr (unfoldr only returns [b] whereas we also need the rest)
-- nb: we can use Control.Arrow.first which is \f (a,b) -> (f a, b)
--     and write:  Arrow.first (a :) (unfoldr' f b')
unfoldr' :: (a -> Maybe (b, a)) -> a -> ([b], a)
unfoldr' f b  =
  case f b of
   Just (a, b') -> let (l, b'') = unfoldr' f b' in (a:l, b'')
   Nothing      -> ([], b)

splitRegex' :: Regex -> String -> [(String, String)]
splitRegex' re s = 
    case unfoldr' f s of
      (l, "") -> l
      (l, s) -> l ++ [(s, "")]
    where 
      -- small helper functions which tranform matchRegexAll output
      -- to the one wanted by unfoldr'
      f = fmap f' . matchRegexAll re
      f' (before, matched, after, _) = ((before, matched), after)

words' = concatMap (\(a,b) -> [a,b]) . splitRegex' (mkRegex "\\s+")

revwords' = (concat . reverse . words') string

{-
import Test.QuickCheck
import Char

instance Arbitrary Char where
    arbitrary     = fmap chr $ choose (32,255)
    coarbitrary n = variant (ord n)

property_words' s = (concat . words') s == s
verif = quickCheck property_words'
-}
--

word = "reviver";
is_palindrome s = s == reverse s

long_palindromes = fmap (filter (\s -> s == reverse s && length s > 5) . lines) $
                      readFile "/usr/share/dict/words"

Expanding and Compressing Tabs


expand_tabs "" = ""
expand_tabs s = foldr1 ((++) . adjust) $ splitRegex (mkRegex "\t") s
    where adjust a = a ++ replicate (8 - (length a) `mod` 8) ' '

-- replace spaces with tab
unexpand = concat . map (reverse . unexp . reverse) . groupNelem 8 where
    unexp s = if head s == ' ' && length s == 8 
              then '\t' : dropWhile (== ' ') s
              else s
{- 
   Here is a property that can be given to quickCheck 
   property_expand s = (expand_tabs . unexpand) s == s
-}
                        

Expanding Variables in User Input

-- can't do eval in haskell

-- subRegex only allow a fixed string
-- subRegexOnceWith below takes a (String -> String) function to compute a result
-- (and do the substitution only once)
subRegexOnceWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ after
s = subRegexOnceWith (mkRegex "[0-9]+") (show . (* 2) . read) "I am 17 years old"

Controlling Case

import Char
s1 = map toUpper "dromedary"       -- "DROMEDARY"
s2 = map toLower s1              -- "dromedary"
s3 = toUpper (head s2) : tail s2

capitalize "" = ""
capitalize (x:xs) = toUpper x : map toLower xs

-- capitalize each word's first character, downcase the rest
s4 = map capitalize $ words "thIS is a loNG liNE"

-- randcap: filter to randomly capitalize 20% of the letters
#!/usr/bin/runghc
import System
import System.IO
import Random
import Char

any_input [] = getContents
any_input (f:_) = readFile f

rand :: Int -> Int -> IO Int
rand low high = getStdRandom (randomR (low, high))

randcap :: String -> IO String
randcap = sequence . map f where
      f c = fmap (modify_char c) $ rand(1,5)
      modify_char c n = (if n == 1 then toUpper else toLower) c

main = do s <- getArgs >>= any_input
          randcap s >>= putStr

Interpolating Functions and Expressions Within Strings

s = "I have " ++ show (n+1) ++ " guanacos."

-- multiline strings are quite ugly in haskell
-- so skipping the send_mail example

Indenting Here Documents

var = "your text\n\
      \goes here\n"

Reformatting Paragraphs

import Data.List

input = "Folding and splicing is the work of an editor,\
        \ not a mere collection of silicon and mobile electrons!"

wrap columns first_indent indent s =
        intercalate "\n" $ wgroup (tail first_indent) (words s)
        where wgroup current [] = [current]
              wgroup current (w : ws) =
                if length current + length w + 1 < columns 
                then wgroup (current ++ " " ++ w) ws 
                else current : wgroup (indent ++ w) ws


-- another version
wrap_simple columns s =
    intercalate "\n" $ map unwords $ groupWhile pred $ words s
    where pred = (<= columns) . sum . map ((+1) . length)

wrap columns first_indent indent s =
    first_indent ++ intercalate ("\n" ++ indent) (map unwords $ first : next)
    where (first, rest) = spanWhile (pred $ columns - length first_indent) (words s)
          next = groupWhile (pred $ columns - length indent) rest 
          pred width = (<= width) . sum . map ((+1) . length)

spanWhile :: ([a] -> Bool) -> [a] -> ([a],[a])
spanWhile p l = spanWhile' [] l              
    where spanWhile' seen [] = (seen, [])
          spanWhile' seen (x:xs) =
              let seen' = seen ++ [x] in
              if p seen' then spanWhile' seen' xs else (seen, x:xs)

groupWhile :: ([a] -> Bool) -> [a] -> [[a]]
groupWhile p l = case spanWhile p l of
                   ([], []) -> []
                   (l', []) -> [l']
                   (l', rest) -> l' : groupWhile p rest

Escaping Characters

import Text.Regex
import Char

subRegexWith re new s =
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ (subRegexWith re new after)

quoteMeta :: String -> String
quoteMeta = concatMap (\c -> if (isAlphaNum c) then [c] else ['\\', c])

t1 = subRegex (mkRegex $ quoteMeta "^") "foo^bar" "+" -- "foo+bar" (without the quoteMeta, it goes into a dead loop)
t2 = subRegex (mkRegex $ quoteMeta "${") "${foo}bar" "{" -- "{foo}bar"
t3 = subRegexWith (mkRegex "\\$\\{[^}]*\\}") (map toUpper) "${foo}bar}" -- "${FOO}bar}"

Trimming Blanks from the Ends of a String

trim_beg, trim_end, trim :: String -> String
trim_beg = snd . span isSpace
trim_end = reverse . trim_beg . reverse
trim = trim_end . trim_beg

trimmed = ">" ++ trim "\t       `Here' \t \t \n\n\n" ++ "<"

Parsing Comma-Separated Data

parse_csv :: String -> [String]
parse_csv s = case lex s of
        (x, "") -> [x]
        (x, xs) -> x : parse_csv xs
      where
           lex "" = ("", "")
           lex (',':xs) = ("",xs)
           lex ('"':xs) = (x++x',xs'') where (x,xs') = lexString xs
                                             (x',xs'') = lex xs'         
           lex (other:xs) = (other:x,xs') where (x,xs') = lex xs                                       

           lexString ('"':xs) = ("",xs)
           lexString ('\\':c:xs) = ('\\':c:x , xs') where (x,xs') = lexString xs
           lexString (c:xs) = (c:x,xs') where (x,xs') = lexString xs

-- the same using Parsec
import Text.ParserCombinators.Parsec

fromRight (Right v) = v
fromRight v = error ("fromRight: " ++ show v)
parseStr parser s = fromRight $ parse parser "" s
toList = fmap (\c -> [c])
many_ = fmap concat . many

parse_csv' = parseStr $ (quoted_string <|> raw_string) `sepBy` char ','
    where quoted_string = between (char '"') (char '"') (chars_until '"')
          raw_string = chars_until ','
          chars_until c = many_ (sequence [ char '\\', anyChar ] <|> toList (noneOf [c]))

test_string = "XYZZY,\"\",\"O'Reilly, Inc\",\"Wall, Larry\",\"a \\\"glub\\\" bit,\",5,\"Error, Core Dumped\""

output = concatMap format_it $ zip [0..] (parse_csv test_string)
    where format_it (line, s) = show line ++ " : " ++ s ++ "\n"

Soundex Matching

import List
import Maybe
import Char
import System.Posix.User

soundex name = (chars!!0) : concatMap show codes'
    where
      chars = map toUpper (filter isAlpha name)

      -- duplicate consecutive soundex digits are skipped
      codes = map head $ group $ map letter_to_code chars

      -- remove first, remove codes 0, add right-pad with 0
      codes' = take 3 (filter (/= 0) (tail codes) ++ [0, 0..])

letter_to_code :: Char -> Int
letter_to_code c = snd $ fromJust $ find (\(letters, _) -> c `elem` letters) letters_code
letters_code =
    [ ("AEIOUYHW", 0) , ("BFPV", 1) , ("CGJKQSXZ", 2) , ("DT", 3) , ("L", 4) , ("MN", 5) , ("R", 6) ]
          
errors = filter (\(code,name) -> code /= soundex name)
            [ ("H452", "holmes")
            , ("A355", "adomomi")
            , ("V536", "vonderlehr")
            , ("B400", "ball")
            , ("S000", "shaw")
            , ("J250", "jackson")
            , ("S545", "scanlon")
            , ("S532", "saintjohn")
            , ("K525", "kingsmith")
            , ("B331", "booth-davis")
            , ("K530", "Knuth")
            , ("K530", "Kant")
            , ("L300", "Lloyd")
            , ("L300", "Ladd")
            ]


-- one need to flush between putStr and getLine,
--   it would not be needed if we had printer a newline
-- the other solution is to disable buffering with hSetBuffering
putStr_getLine s = do putStr s
                      hFlush stdout
                      getLine

msoundex = do user <- putStr_getLine "Lookup user: "
              matching <- fmap (filter $ matches user) getAllUserEntries
              mapM_ (putStrLn . format_user) matching
    where 
      matches wanted user = soundex wanted `elem` (map soundex $ userName user : words (userGecos user))
      format_user user = userName user ++ ": " ++ userGecos user
                         

Program: fixstyle

-- download the following standalone program
module Main where

import qualified Text.Regex.PCRE.String as PCRE
import Data.Char (toLower)
import Data.List (intersperse)

transDict :: [(String,String)]
transDict =
    [("analysed"      ,"analyzed")
    ,("built-in"      ,"builtin")
    ,("chastized"     ,"chastised")
    ,("commandline"   ,"command-line")
    ,("de-allocate"   ,"deallocate")
    ,("dropin"        ,"drop-in")
    ,("hardcode"      ,"hard-code")
    ,("meta-data"     ,"metadata")
    ,("multicharacter","multi-character")
    ,("multiway"      ,"multi-way")
    ,("non-empty"     ,"nonempty")
    ,("non-profit"    ,"nonprofit")
    ,("non-trappable" ,"nontrappable")
    ,("pre-define"    ,"predefine")
    ,("preextend"     ,"pre-extend")
    ,("re-compiling"  ,"recompiling")
    ,("reenter"       ,"re-enter")
    ,("turnkey"       ,"turn-key")]

transWord :: String -> String
transWord word = case (lookup (map toLower word) transDict) of
                   Just trans -> trans
                   Nothing -> word

transDictRegex :: IO PCRE.Regex
transDictRegex = do
  compres <- PCRE.compile compopt execopt pattern
  case compres of
    Left (offset, string)
        -> error $
           "Regex pattern error" ++
           " at offset " ++ show offset ++
           " for string: " ++ string
    Right regex
        -> return regex
  where pattern = "(" ++ (concat $ intersperse "|" $ map fst transDict) ++ ")"
        compopt = PCRE.compCaseless + PCRE.compMultiline + PCRE.compUTF8
        execopt = PCRE.execBlank

matchRegex :: String -> IO (String, String, String)
matchRegex input = do
  regex <- transDictRegex
  execres <- PCRE.regexec regex input
  case execres of
    Left err -> error $ "regexec WrapError " ++ show err ++ "for: " ++ input
    Right Nothing -> return (input, [], [])
    Right (Just (head, word, tail, _)) -> return (head, word, tail)

translate :: String -> IO String
translate [] = do return []
translate input = do
  (head, word, tail) <- matchRegex input
  tailTrans <- (translate tail)
  return $ head ++ (transWord word) ++ tailTrans

main :: IO ()
main = do getContents >>= translate >>= putStr

Program: psgrep

-- download the following standalone program
module Main where

import Data.Char (isSpace)
import System.Time (ClockTime(..))
import System.Exit (ExitCode(..))
import System.Process (readProcessWithExitCode)
import Data.List (elemIndex, foldl', intercalate, words)
import qualified Data.Text as T

psFields :: [String]
psFields = ["flags","uid","pid","ppid","pri","nice","size","rss","wchan","stat",
            "tty","time","command"]


-- Accessors for ps output line fields -----------------------------------------

breakPSFields :: String -> [String]
breakPSFields line =
    breakFields line nInitCols
    where nInitCols        = (length psFields) - 1
          dropSpace        = dropWhile isSpace
          breakFields [] _ = []
          breakFields cs 0 = [T.unpack $ T.strip $ T.pack cs]
          breakFields cs n = let (h,t) = break isSpace $ dropSpace cs
                                 t'    = (breakFields t (n - 1))
                             in h:t'

type PSField = ([String], String)

getPSFields :: IO [PSField]
getPSFields = do
  (ecode, out, err) <- readProcessWithExitCode "ps" psargs []
  case ecode of
    ExitFailure eno -> error $ "ps failure: [" ++ show eno ++ "] " ++ err
    ExitSuccess -> return $ map (\l -> (breakPSFields l, l)) $ tail $ lines out
  where psargs = ["ax", "-o", intercalate "," psFields]

filterPSFields :: (PSField -> Bool) -> IO [PSField]
filterPSFields p = do getPSFields >>= (\fs -> return $ filter p fs)

printPSFields :: (PSField -> Bool) -> IO ()
printPSFields p = filterPSFields p >>= mapM_ (putStrLn . snd)


-- Query DSL -------------------------------------------------------------------

parseInt :: String -> Int
parseInt cs = read cs :: Int

parseClockTime :: String -> ClockTime
parseClockTime cs = TOD (toInteger secs) 0
                    where secs = foldl' (\a s -> (60 * a) + s) 0 $
                                 map (parseInt . T.unpack) $
                                 T.split (T.pack ":") (T.pack cs)

liftAcP :: String -> (String -> a) -> (a -> a -> Bool) -> a -> PSField -> Bool
liftAcP a p f v s = (p $ (fst s) !! i) `f` v
                    where i = case (a `elemIndex` psFields) of
                                Just n  -> n
                                Nothing -> error $ "Wrong ps field: " ++ a

flagsP   = liftAcP "flags"   parseInt
uidP     = liftAcP "uid"     parseInt
pidP     = liftAcP "pid"     parseInt
ppidP    = liftAcP "ppid"    parseInt
priP     = liftAcP "pri"     parseInt
niceP    = liftAcP "nice"    id
sizeP    = liftAcP "size"    parseInt
rssP     = liftAcP "rss"     parseInt
wchanP   = liftAcP "wchan"   id
statP    = liftAcP "stat"    id
ttyP     = liftAcP "tty"     id
timeP    = liftAcP "time"    parseClockTime
commandP = liftAcP "command" id

liftOpP o x y s = (x s) `o` (y s)

(&&?)    = liftOpP (&&)
infixr 3 &&?

(||?)    = liftOpP (||)
infixr 2 ||?


-- Example Runs ----------------------------------------------------------------

-- You will need below ghc command line wrapper before using this script from
-- the command line.

--   $ cat PSGrep.sh
--   #!/bin/sh
--   ghc -e "printPSFields \$ $@" PSGrep.hs

-- Rest is trivial...

--   $ ./PSGrep.sh 'flagsP (==) 0 &&? sizeP (<) 1024'
--   0     0  3019     1  19   0   256   504 -      Ss+  tty2     00:00:00 /sbin/getty 38400 tty2
--   0     0  3020     1  19   0   256   508 -      Ss+  tty3     00:00:00 /sbin/getty 38400 tty3
--   0     0  3022     1  19   0   256   500 -      Ss+  tty4     00:00:00 /sbin/getty 38400 tty4
--   0     0  3023     1  19   0   256   504 -      Ss+  tty5     00:00:00 /sbin/getty 38400 tty5
--   0     0  3024     1  19   0   256   504 -      Ss+  tty6     00:00:00 /sbin/getty 38400 tty6

                         
----------------------------------------------------------------------------------------------------