1. Strings

Introduction



# Tcl's "..." corresponds to Perl's "...", while the other
# quoting construct {...} is more similar to Perl's q{...}
# operator.

# It's not necessary to quote text data in Tcl as long as
# it doesn't contain whitespace.



set string {\n}                     ;# two characters, \ and n

set string "jon 'maddog' orwant"    ;# literal single quotes

set string \n                       ;# a "newline" character

set string "jon \"crosby\" orwant"  ;# literal double quotes

set string {jon "stills" orwant}    ;# literal double quotes

set string "jon {nash} orwant"      ;# literal braces

set string {jon {young} orwant}     ;# literal braces

set a {
this is a multiline string
terminated by an unescaped and
{unnested} right brace (\})
}

Accessing Substrings



set value [string range $string $first $last]

set value [string range $string $first [expr {$first+$count-1}]]

set value [string range $string $first end]


set string [string replace $string $first $last $newstring]

set string [string replace $string $first [expr {$first+$count-1}] $newstring]

set string [string replace $string $first end $newtail]


# get a 5-byte string, skip 3, then grab 2 8-byte strings, then the rest
binary scan $data "A5 x3 A8 A8 A*" leading s1 s2 trailing


# Important note: the above was all well and good when the Cookbook was 
# written and a character and a byte were the same size.  They still
# are for some programming languages, but Tcl for one uses 16-bit
# Unicode characters to encode strings.

#   The above unpack/scan works for strings containing only character
# codes in the range 0--255, but distorts other strings by truncating
# all codes to 8 bits.

#   To avoid this, the input string can be converted to an 8-bit
# encoding before scanning:


encoding convertto utf-8 "H\u2082O is the chemical formula for water"
# => Hâ??O is the chemical formula for water


# split at five-byte boundaries (16-bit safe)
set fivers [list]
set temp [encoding convertto utf-8 $string]
while {[binary scan $temp a5a* group tail]} {
    lappend fivers $group
    set temp $tail
}
if {[string length $tail]} { lappend fivers $tail }
set fivers


# split at five-char boundaries (16-bit safe)
set fivers [regexp -all -inline {.{1,5}} $data]


# chop string into individual characters:
set chars [split $data {}]


# "This is what you have"
# +012345678901234567890  Indexing forwards (left to right)
#  098765432109876543210- Indexing from end (right to left)
# note that 0 means 10 or 20, etc. above

# end is a special value that is available in list and string 
# commands.  It is defined as the index of the last element (in 
# lists), or character (in strings).
#   Likewise, end-1 is defined as the element/character
# preceding the last, and so on.


set first [string index "This is what you have" 0]
# => T

 
set start [string range "This is what you have" 5 6]
# => is

 
set rest [string range "This is what you have" 13 end]
# => you have

 
set last [string index "This is what you have" end]
# => e

 
set end [string range "This is what you have" end-3 end]
# => have

 
set piece [string range "This is what you have" end-7 end-5]
# => you


# The general technique here is to mutate a string and then assign
# it back to the variable.  One can [replace] a segment of the string
# with another string or with an empty string (deleting the segment)
# or simply select a segment using [range].


 
set string [string replace "This is what you have" 5 6 wasn't]
# => This wasn't what you have

 
set string [string replace "This wasn't what you have" end-11 end ondrous]
# => This wasn't wondrous

 
set string [string range "This wasn't wondrous" 1 end]
# => his wasn't wondrous

 
set string [string range "his wasn't wondrous" 0 end-10]
# => his wasn'


if {[regexp $pattern [string range $string end-9 end]]} {
    return "Pattern matches in last 10 characters"
} else {
    return "Match failed"
}


# substitute "at" for "is", restricted to first five characters
regsub -all is [string range $string 0 4] at newstring
set string [string replace $string 0 4 $newstring]


regsub {(.)(.*)(.)} "make a hat" {\3\2\1} a
puts $a
# => take a ham


set b [string range "To be or not to be" 6 11]
# => or not

 
set a "To be or not to be"
set b [string range $a 6 7]
append b : [string range $a 3 4]
# => or:be


proc cut2fmt {args} {
    set positions $args
    set template  {}
    set lastpos   1
    foreach {place} $positions {
        append template "A[expr {$place-$lastpos}] "
        set lastpos $place
    }
    append template A*
    return $template
}
    
set fmt [cut2fmt 8 14 20 26 30]
# => A7 A6 A6 A6 A4 A*

Establishing a Default Value



# In Tcl, commands such as if or while require the value of the
# condition expression to be a proper boolean value.  If the 
# value is numeric, 0 is false and anything else is true.  For
# non-numeric strings, "true", "on", or "yes" is true and 
# "false", "off", or "no" is false.  Any other value for the
# condition expression raises an error.
#   The `boolean operators' return either "1" or "0".



# use $b if b has characters, else $c
if {[string length $b]} {
    set a $b
} else {
    set a $c
}

 
# use $b if b is non-zero, else $c
if {$b != 0} {
    set a $b
} else {
    set a $c
}


# set x to $y if $x has no characters
if {![string length $x]} {
    set x $y
}

 
# set x to $y if $x is zero
if {$x == 0} {
    set x $y
}


# set a to $b if b exists, else to $c
if {[info exists b]} {
    set a $b
} else {
    set a $c
}


# Perl: $dir = shift(@ARGV) || "/tmp";
set arg [lindex $argv 0]
set argv [lrange $argv 1 end]
if {[string length $arg]} {
    set dir $arg
} else {
    set dir /tmp
}


# Perl: $dir = $ARGV[0] || "/tmp";
set arg [lindex $argv 0]
if {[string length $arg]} {
    set dir $arg
} else {
    set dir /tmp
}


# Perl: $dir = defined($ARGV[0]) ? shift(@ARGV) : "/tmp";
if {[info exists argv] && [llength $argv]} {
    set dir [lindex $argv 0]
    set argv [lrange $argv 1 end]
} else {
    set dir /tmp
}

 
# Perl: $dir = @ARGV ? $ARGV[0] : "/tmp";
if {[llength $argv]} {
    set dir [lindex $argv 0]
} else {
    set dir /tmp
}


# Perl: $count{ $shell || "/bin/sh" }++;
if {![string length $shell]} {
    set shell /bin/sh
}
if {[info exist count($shell)]} {
    incr count($shell)
} else {
    set count($shell) 1
}


# The catch command intercepts errors raised.  In this
# case catch is used as an alternative idiom to 
# [info exists var].
# The pros and cons of the different idioms is 
# discussed at <URL: http://mini.net/tcl/1322.html>.



# find the user name on Unix systems; needs extension to
# check getlogin() and getpwuid()
if {![catch {string length $env(USER)}]} {
    set user $env(USER)
} elseif {![catch {string length $env(LOGIN)}]} {
    set user $env(LOGIN)
} else {
    set user "Unknown user"
}

# The most obvious way to do the above in Tcl is
set ::tcl_platform(user)

if {![string length $startingPoint]} {
    set startingPoint Greenwich
}


# if x has no elements, assign $y to it
if {[llength $x] == 0} {
    set x $y
}

# if y has elements, assign it to x, otherwise assign $z to x
if {[llength $y]} {
    set x $y
} else {
    set x $z
}

Exchanging Values Without Using Temporary Variables



# cross-assignment
foreach {b a} $args break


# cross-assignment with temp
set temp $a
set a $b
set b $temp
unset temp


foreach {alpha beta production} [list January March August] break
# move beta       to alpha,
# move production to beta,
# move alpha      to production
foreach {alpha beta production} [list $beta $production $alpha] break

Converting Between ASCII Characters and Values



set num [scan $char %c]


set char [format %c $num]

format "Number %d is character %c" 101 101
# => Number 101 is character e


set utf8data [encoding convertto utf-8 $string]
binary scan $utf8data c* codelist


set utf8data [binary format c* $codelist]
set string [encoding convertfrom utf-8 $utf8data]


proc hal2ibm {} {
    set hal HAL
    binary scan $hal c* codes
    foreach {num} $codes {
        lappend newcodes [incr num]
    }
    set ibm [binary format c* $newcodes]
}
hal2ibm
# => IBM

Processing a String One Character at a Time



set a [split $string {}]


set utf8data [encoding convertto utf-8 $string]
binary scan $utf8data c* a


# with -line, . never matches newline
foreach 1 [regexp -inline -all -line . $string] {
    # do something with $1
}


proc indChars-1 {s} {
    array set seen [list]
    set string $s
    foreach {char} [split $string {}] {
        if {[info exists seen($char)]} {
            incr seen($char)
        } else {
            set seen($char) 1
        }
    }
    puts "unique chars are: {[join [lsort [array names seen]] {}]}"
}
indChars-1 "an apple a day"
# => unique chars are: { adelnpy}


# Of course, if all you care about are which unique 
# characters appear, it's much easier:



proc uniqueChars-1 {s} {
    puts "unique chars are: {[join [lsort -unique [split $s {}]] {}]}"
}
uniqueChars-1 "an apple a day"
# => unique chars are: { adelnpy}


# simplistic checksum calculation
proc simpleChecksum {string} {
    set sum 0
    binary scan $string c* codes
    foreach {code} $codes {
        incr sum $code
    }
    return $sum
}
simpleChecksum "an apple a day"
# => 1248


# The Trf package, which is available at
# <URL: http://www.oche.de/~akupries/soft/trf/>,
# has several utilities for transforming data,
# including message digests such as CRC and MD5.



package require Trf
binary scan [crc {an apple a day}] H* checksum
set checksum
# => 325295


# slowcat - emulate a   s l o w   line printer
# usage: slowcat [-DELAY] [files ...]
proc slowcat {args} {
    set delay 1
    if {[llength $args]} {
        if {[regexp {^-([.\d]+)} [lindex $args 0] match delay]} {
            set args [lrange $args 1 end]
        }
    }
    fconfigure stdout -buffering no
    if {[llength $args]} {
        foreach {arg} $args {
            set f [open $arg]
            lappend channels $f
        }
    } else {
        set channels stdin
    }
    foreach {chan} $channels {
        while {[gets $chan line] > -1} {
            foreach {ch} [split $line {}] {
                puts -nonewline $ch
                after [expr {int(5 * $delay)}]
            }
            puts {}
        }
    }
}

Reversing a String by Word or Character



proc reverse {args} {
    set res [list]
    if {[llength $args] == 1} {
        set args [lindex $args 0]
    }
    foreach elem $args {
        set res [linsert $res 0 $elem]
    }
    return $res
}


# reverse characters
join [reverse [split $string {}]] {}


# reverse words
join [reverse [split $string]]

# reverse quoted words
join [reverse [split {Yoda said, "can you see this?"}]]
# => this?" see you "can said, Yoda


set word reviver
set is_palindrome [string equal $word [join [reverse [split $word]]]]
# => 1

Expanding and Compressing Tabs



package require textutil
namespace import ::textutil::tabify::*
tabify "...     zzz        xxx"
# => ...     zzz        xxx


untabify "...\tzzz\txxx"
# => ...        zzz        xxx


tabify2 "...     zzz     xxx"
# => ...        zzz     xxx


untabify2 "...\tzzz\txxx"
# => ...     zzz     xxx

Expanding Variables in User Input



set debt 100
subst "You owe $debt to me."
# => You owe 100 to me.


set debt 100
proc writeIt {string} {
    uplevel subst [list $string]
}
# braces prevent immediate substitution
writeIt {You owe $debt to me.}
# => You owe 100 to me.


foreach {rows cols} {24 80} break
set text {I am $rows high and $cols long}
subst $text
# => I am 24 high and 80 long


set string "I am 17 years old"
regsub -all {(\d+)} $string {[expr {\1*2}]} string
subst $string
# => I am 34 years old


# expand variables in $text, but put an error message in
# if the variable isn't defined.
proc expandOrError-1 {@text} {
    upvar ${@text} text
    while {[regexp {\$(\w+)} $text match var]} {
        if {[uplevel info exists $var]} {
            regsub \\$match $text [uplevel set $var] text
        } else {
            regsub \\$match $text "\[NO VARIABLE: $var\]" text
        }
    }
    set text
}


# Tcl allows commands to embedded in text data as well as variables. 
# If the string is taken from user input, this may be a security
# hazard.  The solution is to let a "safe interpreter" (which has a
# reduced set of commands by default) interpret the text data.  In
# case the interpreted script text contains illegal commands the
# interpreter raises an error.



proc safeExpand-1 {string} {
    set si [interp create -safe]
    set res [uplevel $si eval [list subst [list $string]]]
    interp delete $si
    set res
}
safeExpand-1 {[exec rm foo.bar]}
# => invalid command name "exec"


# It is also possible to further reduce the command set of an
# interpreter, or to add new commands, or to change the meaning
# of commands (i.e. exec would perform *some* system commands
# but not all, etc).

# But I digress...

Controlling Case



set little "bo peep"
set big [string toupper $little]
# => BO PEEP


set big "BO PEEP"
set little [string tolower $big]
# => bo peep


set little "bo peep"
set title [string totitle $little]
# => Bo peep


set little "bo peep"
set big [string toupper $little 0]
# => Bo peep


set big "BO PEEP"
set little [string tolower $big 0]
# => bO PEEP


# convert case within a string
set name {kirk}
set string "Ship's Captain: [string totitle $name]."
# => Ship's Captain: Kirk.


# capitalize each word's first character, downcase the rest
set text "thIS is a loNG liNE"
set pos 0
while {[regexp -indices -start $pos {(\w+)} $text where]} {
    foreach {first last} $where break
    set text [string totitle $text $first $last]
    set pos $last
    incr pos
}
puts $text
# => This Is A Long Line

# capitalize each word's first character, downcase the rest
# (another solution)
foreach word "thIS is a loNG liNE" {
    lappend words [string totitle $word]
}
puts $words
# => This Is A Long Line


# case insensitive string comparison
string equal -nocase foo Foo
# => 1


# randcap: filter to randomly capitalize 20% of the letters
set text {
001:001 In the beginning God created the heaven and the earth.
001:002 And the earth was without form, and void; and darkness was
        upon the face of the deep. And the spirit of God moved upon
        the face of the waters.
001:003 And God said, let there be light: and there was light.
}
set pos 0
while {[regexp -indices -start $pos {(\w)} $text where]} {
    foreach {first last} $where break
    if {rand()<=0.2} {
        set text [string toupper $text $first]
    } else {
        set text [string tolower $text $first]
    }
    set pos $last
    incr pos
}
puts $text
# => 
# =>         001:001 iN The begInNing god crEaTed tHe HeAven And thE earTh.
# => 
# =>         001:002 and tHe earth was wiThout form, aNd void; and darknESs Was
# =>                 upOn tHe faCe OF the deep. and the sPirIt Of goD moved upOn
# =>                 the fACE oF the wATers.
# => 
# =>         001:003 AnD goD said, lEt there be light: aND there wAs LighT.
# =>         

Interpolating Functions and Expressions Within Strings



# Interpolating functions and expressions within strings
set var1 Tool
proc func {s} {string totitle $s}
set var2 Language
set answer "$var1 [func command] $var2"
# => Tool Command Language


set n 5
set phrase "I have [expr {$n + 1}] guanacos."
# => I have 6 guanacos.


set rec foo:bar:baz
interp alias {} some_cmd {} join
some_cmd "What you want is [llength [split $rec :]] items"
# => What you want is 3 items


set text {
To: $naughty
From: Your Bank
Cc: [getManagerList $naughty]
Date: [clock format [clock seconds]] (today)
Dear $naughty,
Today, you bounced check number [expr {500 + int(rand()*100)}] to us.
Your account is now closed.
Sincerely,
the management
}
if {![sendMail $text $target]} {
    error "Couldn't send mail"
}

Indenting Here Documents



# all in one
regsub -line -all {^\s+} {
    your text
    goes here
} {} var
format %s \n$var
# => 
# => your text
# => goes here


# or with two steps
set var {
    your text
    goes here
}
regsub -line -all {^\s+} $var {} var
format %s \n$var
# => 
# => your text
# => goes here

# one more time
regsub -line -all {^\s+} {
    The five varieties of camelids
    are the familiar camel, his friends
    the llama and the alpaca, and the
    rather less well-known guanaco
    and vicuña.
} {} definition
# => 6


proc fix {string} {
    regsub -line -all {^\s+} $string {} string
    return $string
}

fix {
    My stuff goes here
}
# => My stuff goes here


# the end-of-string right brace doesn't have to be flush left
regsub -line -all {^\s+} {
    ...we will have peace, when you and all your works have
    perished--and the works of your dark master to whom you would
    deliver us. You are a liar, Saruman, and a corrupter of men's
    hearts.  --Theoden in /usr/src/perl/taint.c
                         } {} quote       ;# <-- looki looki
# move attribution to line of its own
regsub {\s+--} $quote \n-- quote
format %s \n$quote
# => 
# => ...we will have peace, when you and all your works have
# => perished--and the works of your dark master to whom you would
# => deliver us. You are a liar, Saruman, and a corrupter of men's
# => hearts.
# => --Theoden in /usr/src/perl/taint.c

proc rememberTheMain {} {
    dequote {
        @@@ int
        @@@ runops() {
        @@@     SAVEI32(runlevel);
        @@@     runlevel++;
        @@@     while ( op = (*op->op_ppaddr)() ) ;
        @@@     TAINT_NOT;
        @@@     return 0;
        @@@ }
    }
    # add more code here if you want
}
 
proc roadGoesEverOn {} {
    dequote {
       Now far ahead the Road has gone,
          And I must follow, if I can,
       Pursuing it with eager feet,
          Until it joins some larger way
       Where many paths and errands meet.
          And whither then? I cannot say.
                --Bilbo in /usr/src/perl/pp_ctl.c
    }
}
proc quotemeta {string} {
    regsub -all {(\W)} $string {\\\1} string
    return $string
}
proc dequote {text} {
    if {[regexp -line {^\s*(?:([^\w\s]+)(\s*).*\n)(?:\s*\1\2?.*\n)+(?:\s*)$} $text m 1 2]} {
        set white $2
        set leader [quotemeta $1]
    } else {
        regexp -line {^\n?(\s+)} $text m white
        set leader {}
    }
    regsub -line -all ^\\s*?$leader\(?:$white\) $text {} text
    return [string trimright $text]\n
}

Reformatting Paragraphs



# The tcllib 1.0 textutil module can adjust text
# to a specified line length, justify left, right,
# and plain, and fill lines to the right margin.
# However, it can't add indentation.



# A naive implementation of wrap.  Arguments:
# text  the text
# col   the line length (default 72)
# lead  first line indentation (def empty string)
# follow        indentation for following lines (def empty string)
proc wrap {text {col 72} {lead {}} {follow {}}} {
    set newtext {}
    set text $lead[string trimleft $text]
    set upto $col
    while {![string is wordchar [string index $text $upto]]} {incr upto -1}
    set upto [string wordstart $text $upto]
    if {$upto == 0} {
        set upto [string wordend $text $upto]
    } else {
        incr upto -1
    }
    append newtext [string range $text 0 $upto]\n
    set text [string replace $text 0 $upto]
    while {[string length $text]} {
        set text $follow[string trimleft $text]
        if {[string length $text] > $col} {
            set upto $col
            while {![string is wordchar [string index $text $upto]]} {incr upto -1}
            set upto [string wordstart $text $upto]
            if {$upto == 0} {
                set upto [string wordend $text $upto]
            } else {
                incr upto -1
            }
            append newtext [string range $text 0 $upto]\n
            set text [string replace $text 0 $upto]
        } else {
            append newtext $text
            set text {}
        }       
    }
    return $newtext
}

set input {"Folding and splicing is the work of an editor,"
      "not a mere collection of silicon"
      "and"
      "mobile electrons!"}
append res \n [string repeat 0123456789 2] \n
append res [wrap [join $input] 20 {    } {  }] \n
# => 
# => 01234567890123456789
# =>     Folding and 
# =>   splicing is the 
# =>   work of an 
# =>   editor, not a 
# =>   mere collection 
# =>   of silicon and 
# =>   mobile electrons!

Escaping Characters



# backslash
regsub -all (\[$charlist]) $var {\\\1} var


# double
regsub -all (\[$charlist]) $var {\1\1} var


set string {Mom said, "Don't do that."}
regsub -all {(['"])} $string {\\\1} string
puts $string
# => Mom said, \"Don\'t do that.\"

set string {Mom said, "Don't do that."}
regsub -all {(['"])} $string {\1\1} string
puts $string
# => Mom said, ""Don''t do that.""

set string {Mom said, "Don't do that."}
regsub -all {([^A-Z])} $string {\\\1} string    ;# or: ([^[:upper:]])
puts $string
# => M\o\m\ \s\a\i\d\,\ \"D\o\n\'\t\ \d\o\ \t\h\a\t\.\"

regsub -all {([^[:alnum:]])} "is a test!" {\\\1} string
puts "this $string"
# => this is\ a\ test\!

Trimming Blanks from the Ends of a String



string trim "\n\t Tcl \t\n"
# => Tcl


set string {
    foo bar
    baz
    
}
set res [list]
foreach {s} [split $string \n] {
    lappend res [string trim $s]
}
string trim [join $res]
# => foo bar baz


# The [gets] (get string) command always strips off the EOL
# sequence, be it CR, LF, or CRLF (configurable for the stream).

# Anyway, if you have a string that *might* have one or more
# \n characters at the end, and in case it does, you want to
# remove them:



string trimright "foo bar\n\n" \n
# => foo bar

Parsing Comma-Separated Data



# csv is a part of the standard ActiveTcl distribution
package require csv
set line {XYZZY,,"O'Reilly, Inc","Wall, Larry","a ""glug"" bit",5,"Error, Core Dumped"}
set fields [::csv::split $line]
set res {}
for {set i 0} {$i < [llength $fields]} {incr i} {
    append res \n "$i : [lindex $fields $i]"
}
puts $res
# => 
# => 0 : XYZZY
# => 1 : 
# => 2 : O'Reilly, Inc
# => 3 : Wall, Larry
# => 4 : a "glug" bit
# => 5 : 5
# => 6 : Error, Core Dumped

Soundex Matching

Program: fixstyle



# fixstyle - switch one set of strings to another set
#   usage: <scriptname> [-v] [files ...]
array set ::data {
    analysed         analyzed
    built-in         builtin
    chastized        chastised
    commandline      command-line
    de-allocate      deallocate
    dropin           drop-in
    hardcode         hard-code
    meta-data        metadata
    multicharacter   multi-character
    multiway         multi-way
    non-empty        nonempty
    non-profit       nonprofit
    non-trappable    nontrappable
    pre-define       predefine
    preextend        pre-extend
    re-compiling     recompiling
    reenter          re-enter
    turnkey          turn-key
}
 
set testtext {
    Yesterday we analysed the efficiency of the 
    built-in thingummies and were considerably 
    chastized by the results.  It seems that 
    commandline invocation forced the 
    whatchamacallit to de-allocate dropin 
    maguffins.  First, we tested instead to 
    hardcode meta-data -- especially when in 
    multicharacter and multiway format
    (obviously only for non-empty data sets).  
    However, that turned out to be a non-profit 
    improvement.  Dr Egnarts then demonstrated 
    using non-trappable signals in pre-define 
    mode to preextend save rates.  When 
    re-compiling we saw the application reenter 
    acceptable ratings on turnkey operations.
}
# verbose or non-verbose?
if {[llength $argv] && [string equal [lindex $argv 0] -v]} {
    set ::verbose yes
    set argv [lrange $argv 1 end]
} else {
    set ::verbose no
}
# prepare text to be read
set text {}
if {[string match *test [info script]]} {
    # if we're running a test:
    set text $testtext
} else {
    # Try to assemble text from input.  Do we have arguments?
    if {[info exists argv]} {
        # Yes; try to open each and read contents:
        foreach {fn} [lrange $argv 0 end] {
            if {![catch {open $fn} chan]} {
                append text [read $chan]
                close $chan
            }
        }
    }
    if {![string length $text]} {
        # we still don't have any text; try standard input
        # (inform user first)
        if {[tell stdin] == -1} {
            puts stderr "[info script]: Reading from stdin"
        }
        set text [read stdin]
    }
}
 
proc fixstyle {text} {
    global data verbose
    set newtext [list]
    foreach w $text {
        if {[catch {set word $data($w)}]} {
            set word $w
        } else {
            if {$verbose} {
                puts stderr "$w => $word"
            }
        }
        lappend newtext $word
    }
    return $newtext
}
 
fixstyle $text
# => Yesterday we analyzed the efficiency of the builtin thingummies and were considerably chastised by the results. It seems that command-line invocation forced the whatchamacallit to deallocate drop-in maguffins. First, we tested instead to hard-code metadata -- especially when in multi-character and multi-way format (obviously only for nonempty data sets). However, that turned out to be a nonprofit improvement. Dr Egnarts then demonstrated using nontrappable signals in predefine mode to pre-extend save rates. When recompiling we saw the application re-enter acceptable ratings on turn-key operations.

Program: psgrep