6. Pattern Matching

Introduction

#-----------------------------
regexp $pattern $string
regsub $pattern $string $replacement
#-----------------------------
regexp sheep $meadow            # True if $meadow contains "sheep"

#-----------------------------
regsub old $meadow new meadow   # Replace "old" with "new" in $meadow

#-----------------------------
if [regexp -nocase {\bovines?\b} $meadow ] {
    puts -nonewline {Here be sheep!}
}

#-----------------------------
set string {good food}
set string [regsub {o*} $string e] # regsub with out replacement var returns result.

#-----------------------------
foreach i [regexp -all -inline {\d+}] {
    puts "Found number $i"
}

#-----------------------------
set numbers [regexp -all -inline {\d+}]

#-----------------------------
set digits 123456789
set nonlap [regexp -inline -all {\d\d\d} $digits]
#no direct way for overlap since the regex behavior of /g|-all in tcl is differnt from perl.
set yeslap {}
for {set i 0} {$i < [string length $digits]} {incr i} {
     set match [regexp -inline {\d\d\d} [string range $digits $i end]]
     if {[string length $match]} {
        lappend yeslap $match
     }
}

#-----------------------------
# no direct pre and post match vars in tcl.
set string {And little lambs eat ivy}
regexp -indices  -- {l.*s} $string idxs
set start [lindex $idxs 0]
set stop [lindex $idxs 1]
puts "([string range $string 0 $start-1]) ([string range $string $start $stop]) ([string range $string $stop+1 end])"

Copying and Substituting Simultaneously

#-----------------------------
set dst $src
regsub this $dst that dst

#-----------------------------
regsub this $src that dst

#-----------------------------
# strip to basename
regsub ^.*/ $::argv0 {} progname

#-----------------------------
# it is easier to do it this way than the next.
package require struct::list
::struct::list map $words {string totitle}

# using regex.
set capword [gregsub {(\w+)} $words {
      r {return [string totitle $r]}
}]

#-----------------------------
# /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1
regsub {man(?=\d)} $manpage cat catpage

#-----------------------------
set bindirs {/usr/bin /bin /usr/local/bin}
set libdirs [string map {bin lib} $bindirs]
puts $libdirs
# /usr/lib /lib /usr/local/lib

#-----------------------------
regsub -all x $a y b # copy changed string to b
set b [regsub -all x $a y a] # change a, count goes in b

Matching Letters

#-----------------------------
# matching letters
if [regexp {^[A-Za-z]+$} $var] {
    #may be better to user [[:alpha:]]+$
}

#-----------------------------
if [regexp {^[[:alpha:]]+$} $var] {
    puts "var is purely alphabetic"
}

#-----------------------------
readlines $data {
    {line} {
        if {[regexp {^[[:alpha:]]+$} $line]} {
            puts -nonewline "$line: alphabetic"
        } else {
            puts -nonewline "$line: line noice"
        }
    }
}
#__END__
#silly
#façade
#coöperate
#niño
#Renée
#Molière
#hæmoglobin
#naïve
#tschüß
#random!stuff#here

Matching Words

#-----------------------------
# matching words
{\S+} # as many non-whitespace bytes as possible
{[A-Za-z'-]+} # as many letters apostrophes and hyphens

#-----------------------------
{\y([A-Za-z]+)\y} # usually best
{\s([A-Za-z]+)\s} # fails at ends or w/ punctuation

Commenting Regular Expressions

#-----------------------------
package require Tclx
set str {www.tcl.tk}

set re {(?x)                # allow formatting
    (                       # capture group
        (?:                 # grouping parens
            (?! [-_] )      # lookahead for neither - nor _
            [\w] +          # hostname component
            \.              # add domain dot
        )+                  # repeat
        [A-Za-z]            # next must be letter
        [\w-]+              # now trailing domain part
    )
}

puts [gregsub $re $str {
        {host} {
            return "$host \[[host_info addresses $host]\]"
        }
    }]

#-----------------------------
set re {(?x)       # replace
    \#             # a pound
    (\w+)          # varname
    \#             # another pound
}

puts [gregsub $re $str {
        {var} {
            return [uplevel 2 "set $var"]
        }
    }]

Finding the Nth Occurrence of a Match

#-----------------------------
# finding Nth occurence of a match
set pond  "One fish two fish red fish blue fish"
set want 3
set count 0
gregsub {(?i)(\w+)\s+fish} $pond {
    {c} {
        variable want
        variable count
        incr count
        if {$want == $count} {
            puts "The third fish is a $c one"
        }
    }
}

#-----------------------------
set fishes [regexp -all -inline -- {(?i)\w+(?=\s+fish)} $pond ]
puts "The third fish is a [lindex $fishes 2] one."

#-----------------------------
{(?i)(?:\w+\s+fish\s+){2}(\w+)\s+fish}

#-----------------------------
set count 0
gregsub {(?i)(\w+)\s+fish} $pond {
    {c} {
        uplevel 2 {incr count} #or what eveer you want to do.
    }
}

#-----------------------------
set count [regsub -all -- {PAT} $string {} {}]

#-----------------------------
set count [expr [llength [regexp -all -- {PAT} $string]] + 1]

#-----------------------------
# no overlapping matches.
# @@INCOMPLETE@@

#-----------------------------
set colors [regexp -all -inline -- {(?i)\w+(?=\s+fish)} $pond ]
set color [lindex $colors 2]

# with out temporary.
set color [lindex [regexp -all -inline -- {(?i)\w+(?=\s+fish)} $pond ] 2]
puts "The third fish in the pond is $color"

#-----------------------------
set evens {}
foreach {a b} [regexp -all -inline -- {(?i)\w+(?=\s+fish)} $pond ] {
    lappend evens $b
}
puts "The even numbered fish are $evens"

#-----------------------------
# hard to do sushi.
#-----------------------------
set pond  "One fish two fish red fish blue fish swim here"
set color [lindex [regexp -all -inline -- {(?i)\w+(?=\s+fish)} $pond ] end]
puts "Last fish is $color"

# last fish is blue
#-----------------------------
set re {(?x)
    A           # find some pattern A
    (?!         # mustn't be able to find
        .*      # something
        A       # and A
    )
    $           # thru end of str
}

#-----------------------------
set pond  "One fish two fish red fish blue fish swim here"
if [regexp -- {(?x)
        \y(\w+)\s+fish\y
        (?!.*\yfish\y)
    } $pond all one] {
    puts "Last fish is $one"
} else {
    puts "Failed."
}
# last fish is blue

Matching Multiple Lines

#-----------------------------
argf-iter {
    line {
        puts [regsub -all -- {<.*>} $line {}]
    }
}

#-----------------------------
# headerfy: change certain chapter headers to html
set re {(?xn)
    \A              # start of record
    (
        ^Chapter    # title
        \s+
        \d+         # decimal number
        \s+
        :
        .*
     )$
}

set options(CR) "\n"
argf-iter {
    para {
        variable re
        puts -nonewline [regsub -all -- $re $para {<H1>\1</H1>}]
    }
}
array unset options(CR)

#-----------------------------
set options(CR) "\n\n"
argf-iter {
    para {
        gregsub {(?w)^START(.*?)^END} $para {
            {chunk} {
                puts -nonewline "chunk in $::argv has $chunk"
            }
        }
    }
}

Reading Records with a Pattern Separator

#-----------------------------
# reading records with a pattern separator
set chunks [split [regsub -all -- {pattern} [read -nonewline $fd] "\0"] "\0"]

#-----------------------------
set chunks [split [regsub -all -- {(?n)^\.(Ch|Se|Ss)$} [read -nonewline $fd] "\0"] "\0"]
set len [llength $chunks]
puts "I read $len chunks"

Extracting a Range of Lines

#-----------------------------
# tcl does not have regexp range operators

#-----------------------------
set fd [open $argv]
set data [split [read $fd] "\n"]

regrange {BEGIN PATTERN} .. {ENDPATTERN} $data {
    {line} {
        puts ">$line"
    }
}

set fd [open $argv]
set data [split [read $fd] "\n"]
foreach line [lrange $data $first_line_no $last_line_no-1] {
    puts $line
}

#-----------------------------
set fd [open $argv]
set data [split [read $fd] "\n"]

regrange {BEGIN PATTERN} ... {ENDPATTERN} $data {
    {line} {
        puts ">$line"
    }
}

set fd [open $argv]
set data [split [read $fd] "\n"]
foreach line [lrange $data $first_line_no-1 $last_line_no] {
    puts $line
}

#-----------------------------
set fd [open $argv]
set data [split [read $fd] "\n"]
puts [lrange $data 15-1 17-1] # prints lines 15 .. 17 as it is indexed by 0.

#-----------------------------
# the perl logic is not directly portable due to absence of range operators.
set in_header {}
regrange {} .. {^$} $data {
    {line} {
        variable in_header
        lappend in_header $line
    }
}

set in_body {}
regrange {^$} .. {$-^} $data { # $-^ will not match any thing thus leaving an open end.
    {line} {
        variable in_body
        lappend in_body $line
    }
}

#-----------------------------
set fd [open $argv]
set data [split [read $fd] "\n"]
array set seen {}
regrange {(?i)^From:?\s} .. {^$} $data {
    line {
        variable seen
        set ids [regexp -inline -all {[^<>(),;\s]+\@[^<>(),;\s]+} $line]
        foreach id $ids {
            if {![info exists seen($id)]} {
                puts $id
                set seen($id) 0
            } else {
                incr seen($id)
            }
        }
    }
}

Matching Shell Globs as Regular Expressions

#-----------------------------
proc glob2pat globstr {
    # note - we dont need to do this, we already have 'glob' command.
    # escapes the chars '\' '.' '+' '^' '$' '{' '}' '(' ')'
    set patmap {
       "\\" "\\\\"
        {.} {\.}
        {+} {\+}
        {^} {\^}
        {$} {\$}
        "{" "\{"
        "}" "\}"
        {(} {\(}
        {)} {\)}

        * .*
        ? .
        [ [
        ] ]
    }
    # using a bre to avoid other regexp rules
    return [append {} (?b)^ [string map $patmap [join $globstr]] $]
}

Speeding Up Interpolated Matches

#-----------------------------
# tcl caches compiled regexp if it is assigned to a variable (and even if it is not,
# but that is restricted to last 30) so /o in perl is not necessary here.
set pattern {blue}
argf-iter {
    line {
        variable pattern
        if [regexp  -- $pattern $line] {
            # do something.
        }
    }
}

#-----------------------------
set popstates {CO ON MI WI MN}
while {[gets $fd line] >= 0} {
    foreach state $popstates {
        if [regexp  -- $state $line] {
            puts $line
            break
        }
    }
}

#-----------------------------
# using argf-iter
set popstates {CO ON MI WI MN}
argf-iter {
    line {
        variable popstates
        foreach state $popstates {
            if [regexp  -- $state $line] {
                puts -nonewline $line
                break
            }
        }
    }
}

#-----------------------------
set popstates {CO ON MI WI MN}
set pre {while {[gets $fd line]>= 0}}
set code {}
foreach state $popstates {
    append code [subst -nocommands {
    if [regexp  -- $state [set line]] {
        puts [set line]
    }
    }]
}

eval [lappend pre $code]

#-----------------------------
package require struct::list

set fd [open $argv]
set sw_pre {[switch -regexp {$line}}
set code {}
append code [::struct::list map $popstates {apply {
        {state} {
            return "$state {return 1}"
        }
    }}]
lappend code {default {return 0}}

set tmp {}
set myproc [append tmp $sw_pre { } [list [join $code]] {]}]
while {[gets $fd line] >= 0} {
    if [subst [subst -nocommands $myproc]] {
        puts $line
    }
}

#-----------------------------
proc build_exp words {
    # return a list of lambdas that can be applied to a line to get a
    # result string containing matching results.
    return [::struct::list map $words {apply
        {{word} {
            return "line {return \[regexp $word \$line\]}"
        }}
    }]
}

proc func {var f} {
    return [apply $f $var]
}

proc + {a b} {return [expr ($a + $b)]}
proc * {a b} {return [expr ($a * $b)]}

proc build_match_func {func init words} {
    #return an applicable lambda.
    return "line {return \[::struct::list fold \[::struct::list map \[build_exp {$words}\] \[list func \$line\]\] $init $func\]}"
}

set match_any [build_match_func + 0 $words]
set match_all [build_match_func * 1 $words]

while {[gets $fd line] >= 0} {
    if [apply $match_all $line] {
        puts $line
    }
}

#-----------------------------
# we cache all regex in tcl. so there is no difference here.
set popstates {CO ON MI WI MN}
while {[gets $fd line] >= 0} {
    foreach state $popstates {
        if [regexp  -- $state $line] {
            puts $line
            break
        }
    }
}

Testing for a Valid Pattern

#-----------------------------
chan configure stdout -buffering none
while {![eof stdin]} {
    if {[catch {
        puts -nonewline "Pattern? "
        gets stdin pat
        regexp $pat {}
    } err]} {
        puts "Invalid pattern"
    }
}

#-----------------------------
proc is_valid_pattern pat {
    return [expr ![catch {regexp $pat {}} err]]
}

#-----------------------------
set rexp [lindex $argv 0]
if [catch "regexp $rexp {}" err] {
    puts "Bad Pattern $rexp: $::argv0"
    exit -1
}
set fd [open [lindex $argv 1]]
foreach para [split [regsub -all -- "\n\n" [read $fd] "\0"] "\0"] {
    if [regexp $rexp $para] {
        puts $para
    }
}
close $fd

#-----------------------------
set safe [interp create -safe]
if [$safe eval {regexp $pat $line}] {
    do_something
}

Honoring Locale Settings in Regular Expressions

#-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@

Approximate Matching

#-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@

Matching from Where the Last Pattern Left Off

#-----------------------------
# perl does not support \G switch
# so we are left with:

gregsub {(\d+)} $str {
        {match} {
            puts "Found $match"
        }
    }

#-----------------------------
set n [gregsub {^( )} $str {
        {match} {
            return 0
        }
    }]

#-----------------------------
gregsub {^,?(\d+)} $str {
        {match} {
            puts "Found number $match"
        }
    }

#-----------------------------
# tcl does not have /c modifier either.
proc gmatch {exp str block} {
    set start 0
    while 1 {
        if {[regexp -indices -start $start -- $exp $str idx]} {
            set start [expr [lindex $idx 1] + 1]
            apply $block [string range $str {expand}$idx]
        } else break
    }
    return $start
}

set str "The year 1752 lost 10 days on the 3rd of September"
set e [gmatch {\d+} $str {
        {match} {
            puts $match
        }
    }]

if [regexp -indices -start $e -- {\S+} $str idx] {
    puts "Found [string range $str {expand}$idx] after last number"
}

#-----------------------------
# use the [lindex $idx end] as the pos for next regexp match..

Greedy and Non-Greedy Matches

#-----------------------------
# try removing tags very badly
regsub -all -- {<.*>} $line {} line

#-----------------------------
# non greedy but still bad.
regsub -all -- {<.*?>} $line {} line

#-----------------------------
# stil wrong
set txt "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me
too!</i></b>"
regexp -all -inline -- {(?x) <b><i>(.*?)</i></b> } $txt

#-----------------------------
{(?x)BEGIN((?:(?!BEGIN).)*)END}

#-----------------------------
{(?x) <b><i>(  (?:  (?!</b>|</i>). )* )</i></b> }

#-----------------------------
{(?x) <b><i>(  (?:  (?!</[bi]>). )* )</i></b> }

#-----------------------------
{(?x)
    <b><i>
    [^<]*       #stuff not possibly bad and not possibly end
    (?:
        (?!  </?[ib]>  ) #what we cant have
        <
        [^>]*
    ) *
    </i></b>
}

Detecting Duplicate Words

#-----------------------------
# no easy way to do this.
# a difference in the tcl regex implementation means that if I say \1+, it immediatly
# changes the definition of \1 do not know if this behavior is correct.
# but it means that unlike the perl version, we print the dup words multiple times.
# if they are repeated more than 2 times .
# using a non capturing gropu (?:xx\1xx) did not help.

set fd [open $argv]
set p 0
foreach para [split [regsub -all -- "\n\n" [read -nonewline $fd] "\0"] "\0"] {
    incr p
    set start 0
    while 1 {
        set re {\y(\S+)\y(\s+\1\y)}
        if {[regexp -indices -start $start -- $re $para all one two]} {
            puts "dup word '[string range $para {expand}$one]' at paragraph $p"
            set start [expr [lindex $all end] + 1]
        } else break
    }
}

#-----------------------------
set a nobody
set b bodysnatcher
if [regexp -- {^(\w+)(\w+) \2(\w+)$} "$a $b" all 1 2 3] {
    puts "$2 overlaps in $1-$2-$3"
}

#-----------------------------
{^(\w+?)(\w+) \2(\w+)$}

#-----------------------------
# prime factors
set arg 180
set cap [string repeat o $arg]
while {[regexp -- {^(oo+?)\1+$} $cap all one]} {
    puts -nonewline [string length $one]
    regsub -all $one $cap o cap
}
puts [string length $cap]

#-----------------------------
# diophantine
set cap [string repeat o 281]
if {[regexp -- {(o*)\1{11}(o*)\2{14}(o*)\3{15}$} [string repeat o 281]
all 1 2 3]} {
    puts "One solution is x=[string length $1] y=[string length $2]
z=[string length $3]"
} else {
    puts "No match"
}

# One solution is x=17 y=3 z=2
#-----------------------------
{^(o+)\1{11}(o+)\2{14}(o+)\3{15}$}  => One solution is x=17 y=3 z=2
{^(o*?)\1{11}(o*)\2{14}(o*)\3{15}$} => One solution is x=0 y=7 z=11
{^(o+?)\1{11}(o*)\2{14}(o*)\3{15}$} => One solution is x=1 y=3 z=14

Expressing AND, OR, and NOT in a Single Pattern

#-----------------------------
# alpha | beta
{alpha|beta}

#-----------------------------
# alpha & beta
{(?=.*alpha)(?=.*beta)}

#-----------------------------
# alpha beta | beta alpha
{alpha.*beta|beta.*alpha}

#-----------------------------
# !beta
{^(?:(?!beta).)*$}

#-----------------------------
# !bad but good
{(?=(?:(?!BAD).)*$)GOOD}
# we dont have an operator like =~ or !~ in tcl so no prefered way.

#-----------------------------
if {[expr [regexp {pat1} $string] && [regexp {pat2} $string]]} {
    something
}

#-----------------------------
if {[expr [regexp {pat1} $string] || [regexp {pat2} $string]]} {
    something
}

#-----------------------------
# mini grep
set pat [::struct::list shift argv]
argf-iter {
    line {
        variable pat
        if [regexp $pat $line] {
            puts -nonewline $line
        }
    }
}

#-----------------------------
regexp {(?=.*bell)(?=.*lab)} "labelled"
[expr {[regexp {} bell] && [regexp {} lab]}]

#-----------------------------
if [regexp {(?xw)
    ^               # start
    (?=             # lookahead
        .*
        bell
    )
    (?=
        .*
        lab
    )
} $murray_hill] {
    puts "Looks like Bell Labs might be in Murray Hill!"
}

#-----------------------------
regexp {(?:^.*bell.*lab)|(?:^.*lab.*bell)} labelled

#-----------------------------
set brand labelled
if [regexp {(?xw)
        (?:                 # non-capturing grouper
            ^ .*?           # any amount of stuff at the front
              bell          # look for a bell
              .*?           # followed by any amount of anything
              lab           # look for a lab
          )                 # end grouper
    |                       # otherwise, try the other direction
        (?:                 # non-capturing grouper
            ^ .*?           # any amount of stuff at the front
              lab           # look for a lab
              .*?           # followed by any amount of anything
              bell          # followed by a bell
          )                 # end grouper
} $brand] {
    puts "Our brand has bell and lab separate."
}

#-----------------------------
regexp {(?w)^(?:(?!waldo).)*$} $map

#-----------------------------
if [regexp {(?xw)
        ^                   # start of string
        (?:                 # non-capturing grouper
            (?!             # look ahead negation
                waldo       # is he ahead of us now?
            )               # is so, the negation failed
            .               # any character (cuzza /s)
        ) *                 # repeat that grouping 0 or more
        $                   # through the end of the string
} $map] {
    puts "There's no waldo here!"
}

#-----------------------------
{(?x)
    ^                       # anchored to the start
    (?!                     # zero-width look-ahead assertion
        .*                  # any amount of anything (faster than .*?)
        ttyp                # the string you don't want to find
    )                       # end look-ahead negation; rewind to start
    .*                      # any amount of anything (faster than .*?)
    tchrist                 # now try to find Tom
}

Matching Multiple-Byte Characters

#-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@

Matching a Valid Mail Address

#-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@

Matching Abbreviations

#-----------------------------
set ans [gets stdin]
set safe [interp create -safe]
if [$safe eval {regexp -nocase SEND $ans}] {
    puts {Action is send}
} elseif [$safe eval {regexp -nocase STOP $ans}] {
    puts {Action is stop}
} elseif [$safe eval {regexp -nocase START $ans}] {
    puts {Action is start}
} elseif [$safe eval {regexp -nocase ABORT $ans}] {
    puts {Action is abort}
} elseif [$safe eval {regexp -nocase LIST $ans}] {
    puts {Action is list}
} elseif [$safe eval {regexp -nocase EDIT $ans}] {
    puts {Action is edit}
}

#-----------------------------
set ans [gets stdin]
set safe [interp create -safe]
proc smatch {exp data} {
    variable safe
    return [$safe eval [list regexp -nocase $exp $data]]
}

set actions {SEND STOP START ABORT LIST EDIT}
foreach act $actions {
    if [smatch $act $ans] {
        puts "Action is [string tolower $act]"
    }
}

#-----------------------------
set errors 0
argf-iter {
    cmd {
        variable errors
        switch -regexp $cmd {
            edit invoke_editor
            send deliver_message
            list {$pager $file}
            abort {
                puts {see you}
                exit
            }
            default {
                puts "unknown command $cmd"
                incr errors
            }
        }
    }
}

Program: urlify

#-----------------------------
set urls {(http|telnet|gopher|file|wais|ftp)}
set ltrs {\w}
set gunk {/#~:.?+=&%@!\-}
set punc {.:?\-}
set any  "$ltrs$gunk$punc"
argf-iter {
    line {
        variable urls
        variable ltrs
        variable gunk
        variable punc
        variable any
        #puts [subst -nocommands (?x)
        regsub -all [subst -nocommands {(?x)
          \\y
          (
           $urls     :
           [$any] +?
          )
          (?=
           [$punc]*
           [^$any]
           |
           $
          )
         }] $line {<A HREF="\1">\1</A>} line
        puts $line
    }
}

Program: tcgrep

#-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@

Regular Expression Grabbag

#-----------------------------
{(?i)^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$}

#-----------------------------
regsub {(\S+)(\s+)(\S+)} $str {\3\2\1} str

#-----------------------------
{(\w+)\s*=\s*(.*)\s*$}             # keyword is $1, value is $2

#-----------------------------
{.{80,}}
#-----------------------------
{(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)}

#-----------------------------
regsub -all {/usr/bin} $str {/usr/local/bin} str

#-----------------------------
gregsub {%([0-9A-Fa-f][0-9A-Fa-f])} $str {
    {match} {
        return [format %x $match]
    }
}

#-----------------------------
regsub -all {(?x)
    /\*                    # Match the opening delimiter
    .*?                    # Match a minimal number of characters
    \*/                    # Match the closing delimiter
} $str {}

#-----------------------------
regsub {^\s+} $str {} str
regsub {\s+$} $str {} str

# but really, in Ruby we'd just do:
string trim $str

#-----------------------------
regsub -all {\\n} $str "\n" str

#-----------------------------
regsub -all {^.*::} $str {} str

#-----------------------------
{(?x)^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\.
    ([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$}

#-----------------------------
regsub {.*$/|} $str {} str

#-----------------------------
set cols { }
if [info exists env::(TERMCAP)] {
    set cols $::env(TERMCAP)
}
if [regexp {:co#(\d+):} $cols all one] {
    set cols $one
} {
    set cols 80
}

#-----------------------------
set name [regsub -all { /\S+/|} "$::argv0 $argv" { }]

#-----------------------------
if {![regexp -nocase {linux} $tcl_platform(os)]} {
    error "This isn't Linux"
}

#-----------------------------
regsub -all {\n\s+} $str {} str

#-----------------------------
regexp -all {\d+\.?\d*|\.\d+} $line nums

#-----------------------------
# @@INCOMPLETE@@
# no direct translation for \W
# regexp -all {\y[^\Wa-z0-9_]+\y} $line capword

#-----------------------------
# regexp -all {\y[^\WA-Z0-9_]+\y} $line lowords

#-----------------------------
# regexp -all {\y[^\Wa-z0-9_][^\WA-Z0-9_]*\y} $line icwords

#-----------------------------
regexp -all {<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>} $line links

#-----------------------------
set initial {}
regexp {^\S+\s+(\S)\S*\s+\S} $line all initial

#-----------------------------
# @@INCOMPLETE@@
#-----------------------------
set sentences {}
foreach para [split [regsub -all -- "\n\n" [read -nonewline $fd] "\0"] "\0"] {
    regsub -all -- "\n" $para { } para
    regsub -all -- { {3,}} $para {  } para
    lappend sentences [regexp -all -inline {\S.*?[!?.](?=  |\Z)}]
}

#-----------------------------
{(\d{4})-(\d\d)-(\d\d)}            # YYYY in $1, MM in $2, DD in $3

#-----------------------------
# @@INCOMPLETE@@
#-----------------------------
{(?i)\yoh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\y}

#-----------------------------
# @@INCOMPLETE@@