1. Strings

Introduction

str  = "\\n"                             -- two characters, \ and an n
str2 = "Jon 'Maddog' Orwant"             -- in haskell we can do string only with ", no single quote
str3 = "\n"                              -- a "newline" character
str4 = "Jon \"Maddog\" Orwant"           -- literal double quotes
str5 = "Multiline string must use a backslash at end of line\n\
\and a backslash at beginning of line\n";
str6 = "It is a common practice\n\
       \to indent lines\n\
       \(the indentation doesn't change the value of the string)\n"

Accessing Substrings

groupNelem n = unfoldr f
    where f [] = Nothing
          f s = Just (splitAt n s)

-- split at five byte boundaries
fivers = groupNelem 5

-- chop string into individual characters:
-- nothing to do

s     = "This is what you have"
first = take 1 s                          -- "T"
start = take 2 $ drop 5 s                 -- "is"
rest  = drop 13 s                         -- "you have"
last' = last s                            -- 'e'

import Text.Regex
s = "This is what you have"

-- strings are immutable
s2 = a ++ "wasn't" ++ drop 2 b
     where (a, b) = splitAt 5 s
s3 = subRegex (mkRegex " is ") s " wasn't " -- "This wasn't what you have"

-- substitute "at" for "is", restricted to first five characters
s4 = a' ++ b
    where (a, b) = splitAt 5 s
          a' = subRegex (mkRegex "is") a "at"
-- do it another way
import Text.Regex
import qualified Control.Arrow as Arrow
f = uncurry (++) . Arrow.first subst . splitAt 5
    where subst s = subRegex (mkRegex "is") s "at"

-- unpack chars using parsec
import Text.ParserCombinators.Parsec
fromRight (Right v) = v
fromRight v = error ("fromRight: " ++ show v)
parseStr parser s = fromRight $ parse parser "" s
nchars n = count n anyToken

a = "To be or not to be"
b = parseStr (nchars 6 >> nchars 6) a -- "or not"

-- not same example as perl's unpack with "X5" which means "back up a byte"
-- here is an example using lookAhead where we start again at some point
[b3,c3] = parseStr (sequence [ lookAhead p1, p2 ]) a -- ["or","be"]
          where p1 = nchars 6 >> nchars 2
                p2 = nchars 3 >> nchars 2


import List
cut2fmt l = zipWith (-) l (1:l)
applyfmt fmt s = l ++ [s']
    where (s', l) = mapAccumL f s fmt
          f s n = (s',subs)
              where (subs,s') = splitAt n s

fmt = cut2fmt [8,14,20,26,30] -- [7,6,6,6,4]
l = applyfmt fmt "12345678912345678901234567890123456789" 
-- 1234567 891234 567890 123456 7890 123456789

Establishing a Default Value

-- boolean operators only work on booleans
a = b || c

-- if you have an optional value, use type Maybe
import Maybe
v1 = fromMaybe "b" $ Just "a"   -- "a"
v2 = fromMaybe "b" $ Nothing    -- "b"

-- to combine Maybe values, you can use mplus
import Control.Monad (mplus)

v1' = (Just "a") `mplus` (Just "b") -- Just "a"
v2' = Nothing `mplus` (Just "b")    -- Just "b"

-- you could also define some perlish things:
class Default_val a where
    default_val :: a
    is_default_val :: a -> Bool

instance Default_val [a] where 
    default_val = []
    is_default_val = null
instance Default_val Num where 
    default_val = 0
    is_default_val = (== 0)

a &&& b = if a.is_default_val then default_val else b
a ||| b = if a.is_default_val then b else a

foo = bar ||| "DEFAULT VALUE"

--
import Maybe
import System
argv0 = fmap listToMaybe getArgs
dir = fmap (fromMaybe "/tmp") argv0

-- or
dir' = fmap (head . (++ ["/tmp"])) getArgs

Exchanging Values Without Using Temporary Variables

-- no side effect in haskell => swap is a nonsense

Converting Between ASCII Characters and Values

import Char
i = ord 'e'                     -- 101
c = chr 101                     -- 'e'

import Text.Printf
printf "Number %d is character %c\n" 101 101

ascii_character_numbers = map ord "sample" -- [115,97,109,112,108,101]
word = map chr ascii_character_numbers

ibm = map (chr . (+ 1) . ord) "HAL" -- "IBM"

Processing a String One Character at a Time

s = "an apple a day"
msg1 = "unique chars are: " ++ sort (nub s)
msg2 = "sum is " ++ (show $ sum $ map ord $ s)


-- slowcat -----------------------
#!/usr/bin/runghc
{-# OPTIONS_GHC -fglasgow-exts #-}

import System
import System.IO
import System.Posix
import Text.Regex

any_input [] = getContents
any_input (f:_) = readFile f

main = do (time, args') <- fmap get_time getArgs
          s <- any_input args'
          hSetBuffering stdout NoBuffering
          mapM_ (\c -> putChar c >> usleep (5000 * time)) s
    where get_time (x:args)
              | (Just [d]) <- matchRegex (mkRegex "^-([0-9]+)") x = (read d, args)
          get_time args = (1, args)

Reversing a String by Word or Character

string = "Yoda said, \"can you see this?\""
allwords = words string
revwords = unwords (reverse allwords)

revwords = (unwords . reverse . words) string

-- another version of revwords which works for spaces
revwords' = (unwords . reverse . splitRegex (mkRegex " ")) string

-- yet another version using a home made version of splitRegex' which keeps the matched string
import List
import Text.Regex
import qualified Control.Arrow as Arrow

unfoldr' f b  =
  case f b of
   Just (a, b') -> Arrow.first (a :) (unfoldr' f b')
   Nothing      -> ([], b)

splitRegex' re s = 
    case unfoldr' f s of
      (l, "") -> l
      (l, s) -> l ++ [(s, "")]
    where f = fmap f' . matchRegexAll re
          f' (before, matched, after, _) = ((before, matched), after)

words' = concatMap (\(a,b) -> [a,b]) . splitRegex' (mkRegex "[[:space:]]+")

revwords' = (concat . reverse . words') string

{-
import Test.QuickCheck
import Char

instance Arbitrary Char where
    arbitrary     = fmap chr $ choose (32,255)
    coarbitrary n = variant (ord n)

property_words' s = (concat . words') s == s
verif = quickCheck property_words'
-}
--

word = "reviver";
is_palindrome s = s == reverse s

long_palindromes = fmap (filter (\s -> s == reverse s && length s > 5) . lines) $
                      readFile "/usr/share/dict/words"

Expanding and Compressing Tabs


expand_tabs "" = ""
expand_tabs s = foldr1 ((++) . adjust) $ splitRegex (mkRegex "\t") s
    where adjust a = a ++ replicate (8 - (length a) `mod` 8) ' '

-- replace spaces with tab
unexpand = concat . map (reverse . unexp . reverse) . groupNelem 8 where
    unexp s = if head s == ' ' && length s == 8 
              then '\t' : dropWhile (== ' ') s
              else s
{- 
   Here is a property that can be given to quickCheck 
   property_expand s = (expand_tabs . unexpand) s == s
-}
                        

Expanding Variables in User Input

-- can't do eval in haskell

-- subRegex only allow a fixed string
-- subRegexOnceWith below takes a (String -> String) function to compute a result
-- (and do the substitution only once)
subRegexOnceWith re new s = 
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ after
s = subRegexOnceWith (mkRegex "[0-9]+") (show . (* 2) . read) "I am 17 years old"

Controlling Case

import Char
s1 = map toUpper "dromedary"       -- "DROMEDARY"
s2 = map toLower s1              -- "dromedary"
s3 = toUpper (head s2) : tail s2

capitalize "" = ""
capitalize (x:xs) = toUpper x : map toLower xs

-- capitalize each word's first character, downcase the rest
s4 = map capitalize $ words "thIS is a loNG liNE"

-- randcap: filter to randomly capitalize 20% of the letters
#!/usr/bin/runghc
import System
import System.IO
import Random
import Char

any_input [] = getContents
any_input (f:_) = readFile f

rand :: Int -> Int -> IO Int
rand low high = getStdRandom (randomR (low, high))

randcap :: String -> IO String
randcap = sequence . map f where
      f c = fmap (modify_char c) $ rand(1,5)
      modify_char c n = (if n == 1 then toUpper else toLower) c

main = do s <- getArgs >>= any_input
          randcap s >>= putStr

Interpolating Functions and Expressions Within Strings

s = "I have " ++ show (n+1) ++ " guanacos."

-- multiline strings are quite ugly in haskell
-- so skipping the send_mail example

Indenting Here Documents

var = "your text\n\
      \goes here\n"

Reformatting Paragraphs

import List
joinString :: String -> [String] -> String
joinString s = concat . intersperse s

input = "Folding and splicing is the work of an editor,\
        \ not a mere collection of silicon and mobile electrons!"

wrap columns first_indent indent s =
        joinString "\n" $ wgroup (tail first_indent) (words s)
        where wgroup current [] = [current]
              wgroup current (w : ws) =
                if length current + length w + 1 < columns 
                then wgroup (current ++ " " ++ w) ws 
                else current : wgroup (indent ++ w) ws


-- another version
wrap_simple columns s =
    joinString "\n" $ map unwords $ groupWhile pred $ words s
    where pred = (<= columns) . sum . map ((+1) . length)

wrap columns first_indent indent s =
    first_indent ++ joinString ("\n" ++ indent) (map unwords $ first : next)
    where (first, rest) = spanWhile (pred $ columns - length first_indent) (words s)
          next = groupWhile (pred $ columns - length indent) rest 
          pred width = (<= width) . sum . map ((+1) . length)

spanWhile :: ([a] -> Bool) -> [a] -> ([a],[a])
spanWhile p l = spanWhile' [] l              
    where spanWhile' seen [] = (seen, [])
          spanWhile' seen (x:xs) =
              let seen' = seen ++ [x] in
              if p seen' then spanWhile' seen' xs else (seen, x:xs)

groupWhile :: ([a] -> Bool) -> [a] -> [[a]]
groupWhile p l = case spanWhile p l of
                   ([], []) -> []
                   (l', []) -> [l']
                   (l', rest) -> l' : groupWhile p rest

Escaping Characters

import Text.Regex
import Char

subRegexWith re new s =
    case matchRegexAll re s of
      Nothing -> s
      Just (before, matched, after, _) -> before ++ new matched ++ (subRegexWith re new after)

quoteMeta :: String -> String
quoteMeta = concatMap (\c -> if (isAlphaNum c) then [c] else ['\\', c])

t1 = subRegex (mkRegex $ quoteMeta "^") "foo^bar" "+" -- "foo+bar" (without the quoteMeta, it goes into a dead loop)
t2 = subRegex (mkRegex $ quoteMeta "${") "${foo}bar" "{" -- "{foo}bar"
t3 = subRegexWith (mkRegex "\\$\\{[^}]*\\}") (map toUpper) "${foo}bar}" -- "${FOO}bar}"

Trimming Blanks from the Ends of a String

trim_beg, trim_end, trim :: String -> String
trim_beg = snd . span isSpace
trim_end = reverse . trim_beg . reverse
trim = trim_end . trim_beg

trimmed = ">" ++ trim "\t       `Here' \t \t \n\n\n" ++ "<"

Parsing Comma-Separated Data

parse_csv :: String -> [String]
parse_csv s = case lex s of
        (x, "") -> [x]
        (x, xs) -> x : parse_csv xs
      where
           lex "" = ("", "")
           lex (',':xs) = ("",xs)
           lex ('"':xs) = (x++x',xs'') where (x,xs') = lexString xs
                                             (x',xs'') = lex xs'         
           lex (other:xs) = (other:x,xs') where (x,xs') = lex xs                                       

           lexString ('"':xs) = ("",xs)
           lexString ('\\':c:xs) = ('\\':c:x , xs') where (x,xs') = lexString xs
           lexString (c:xs) = (c:x,xs') where (x,xs') = lexString xs

-- the same using Parsec
import Text.ParserCombinators.Parsec

fromRight (Right v) = v
fromRight v = error ("fromRight: " ++ show v)
parseStr parser s = fromRight $ parse parser "" s
toList = fmap (\c -> [c])
many_ = fmap concat . many

parse_csv' = parseStr $ (quoted_string <|> raw_string) `sepBy` char ','
    where quoted_string = between (char '"') (char '"') (chars_until '"')
          raw_string = chars_until ','
          chars_until c = many_ (sequence [ char '\\', anyChar ] <|> toList (noneOf [c]))

test_string = "XYZZY,\"\",\"O'Reilly, Inc\",\"Wall, Larry\",\"a \\\"glub\\\" bit,\",5,\"Error, Core Dumped\""

output = concatMap format_it $ zip [0..] (parse_csv test_string)
    where format_it (line, s) = show line ++ " : " ++ s ++ "\n"

Soundex Matching

import List
import Maybe
import Char
import System.Posix.User

soundex name = (chars!!0) : concatMap show codes'
    where
      chars = map toUpper (filter isAlpha name)

      -- duplicate consecutive soundex digits are skipped
      codes = map head $ group $ map letter_to_code chars

      -- remove first, remove codes 0, add right-pad with 0
      codes' = take 3 (filter (/= 0) (tail codes) ++ [0, 0..])

letter_to_code :: Char -> Int
letter_to_code c = snd $ fromJust $ find (\(letters, _) -> c `elem` letters) letters_code
letters_code =
    [ ("AEIOUYHW", 0) , ("BFPV", 1) , ("CGJKQSXZ", 2) , ("DT", 3) , ("L", 4) , ("MN", 5) , ("R", 6) ]
          
errors = filter (\(code,name) -> code /= soundex name)
            [ ("H452", "holmes")
            , ("A355", "adomomi")
            , ("V536", "vonderlehr")
            , ("B400", "ball")
            , ("S000", "shaw")
            , ("J250", "jackson")
            , ("S545", "scanlon")
            , ("S532", "saintjohn")
            , ("K525", "kingsmith")
            , ("B331", "booth-davis")
            , ("K530", "Knuth")
            , ("K530", "Kant")
            , ("L300", "Lloyd")
            , ("L300", "Ladd")
            ]


-- one need to flush between putStr and getLine,
--   it would not be needed if we had printer a newline
-- the other solution is to disable buffering with hSetBuffering
putStr_getLine s = do putStr s
                      hFlush stdout
                      getLine

msoundex = do user <- putStr_getLine "Lookup user: "
              matching <- fmap (filter $ matches user) getAllUserEntries
              mapM_ (putStrLn . format_user) matching
    where 
      matches wanted user = soundex wanted `elem` (map soundex $ userName user : words (userGecos user))
      format_user user = userName user ++ ": " ++ userGecos user
                         
-- TODO, pstyle and psgrep                       
                         
----------------------------------------------------------------------------------------------------

Program: fixstyle

Program: psgrep