1. Strings

Introduction

#-----------------------------
mystr = "\n"   # a newline character
mystr = r"\n"  # two characters, \ and n
#-----------------------------
mystr = "Jon 'Maddog' Orwant"  # literal single quote inside double quotes
mystr = 'Jon "Maddog" Orwant'  # literal double quote inside single quotes
#-----------------------------
mystr = 'Jon \'Maddog\' Orwant'  # escaped single quote
mystr = "Jon \"Maddog\" Orwant"  # escaped double quote
#-----------------------------
mystr = """
This is a multiline string literal
enclosed in triple double quotes.
"""
mystr = '''
And this is a multiline string literal
enclosed in triple single quotes.
'''
#-----------------------------

Accessing Substrings

#-----------------------------

# get a 5-char string, skip 3, then grab 2 8-char strings, then the rest
# Note that struct.unpack cannot use * for an unknown length.
# See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/65224
import struct
(lead, s1, s2), tail = struct.unpack("5s 3x 8s 8s", data[:24]), data[24:]

# split at five-char boundaries (a trailing partial group is dropped)
# struct.unpack needs a buffer of exactly the size the format describes,
# so trim data to a whole number of groups first
groups = len(data) // 5
fivers = struct.unpack("5s" * groups, data[:groups * 5])
# the same split with plain slicing (gives a list instead of a tuple)
fivers = [data[i*5:i*5+5] for i in range(groups)]

# chop string into individual characters
chars = list(data)
#-----------------------------
mystr = "This is what you have"
#       +012345678901234567890  Indexing forwards  (left to right)
#        109876543210987654321- Indexing backwards (right to left)
#         note that 0 means 10 or 20, etc. above

first = mystr[0]                            # "T"
start = mystr[5:7]                          # "is"
rest = mystr[13:]                           # "you have"
last = mystr[-1]                            # "e"
end = mystr[-4:]                            # "have"
piece = mystr[-8:-5]                        # "you"
#-----------------------------
# Python strings are immutable.
# In general, you should just do piecemeal reallocation:
mystr = "This is what you have"
mystr = mystr[:5] + "wasn't" + mystr[7:]

# Or replace and reallocate
mystr = "This is what you have"
mystr = mystr.replace(" is ", " wasn't ")

# DON'T DO THIS: In-place modification could be done using character arrays
import array
mystr = array.array("c", "This is what you have")
mystr[5:7] = array.array("c", "wasn't")
# mystr is now array('c', "This wasn't what you have")

# DON'T DO THIS: It could also be done using MutableString 
from UserString import MutableString
mystr = MutableString("This is what you have")
mystr[-12:] = "ondrous"
# mystr is now "This is wondrous"
#-----------------------------
# you can test simple substrings with "in" (for regex matching see ch.6):
if txt in mystr[-10:]:
    print "'%s' found in last 10 characters"%txt

# Or use the startswith() and endswith() string methods:
if mystr.startswith(txt):
    print "%s starts with %s."%(mystr, txt)
if mystr.endswith(txt):
    print "%s ends with %s."%(mystr, txt)

#-----------------------------

Establishing a Default Value

#-----------------------------
# Introductory Note: quite a bit of this section is not terribly Pythonic
# as names must be set before being used. For instance, unless myvar has 
# been previously defined, these next lines will all raise NameError:
myvar = myvar or some_default
myvar2 = myvar or some_default
myvar |= some_default          # bitwise-or, not logical-or - for demo

# The standard way of setting a default is often:
myvar = default_value
if some_condition:
    pass                     # code which may set myvar to something else

# if myvar is returned from a function and may be empty/None, then use:
myvar = somefunc()
if not myvar:
    myvar = default_value

# If you want a default value that can be overridden by the person calling 
# your code, you can often wrap it in a function with a named parameter:
def myfunc(myvar="a"):
    """Return myvar with "b" appended; myvar is "a" unless the caller passes one."""
    suffix = "b"
    return myvar + suffix
print myfunc(), myfunc("c")
#=> ab cb

# Note, though, that this won't work for mutable objects such as lists or
# dicts that are mutated in the function as the object is only created once 
# and repeated calls to the same function will return the same object.  This
# can be desired behaviour however - see section 10.3, for instance.
def myfunc(myvar=[]):
    """Append "x" to myvar and return that same list.

    Demonstrates the mutable-default pitfall: the default list is built once,
    when the def statement runs, so calls that omit myvar keep growing it.
    """
    shared = myvar          # no copy is made; this is the caller's/default list
    shared.append("x")
    return shared
print myfunc(), myfunc()
#=> ['x'] ['x', 'x']

# You need to do:
def myfunc(myvar=None):
    """Append "x" to myvar and return it; a fresh list is used when myvar is None."""
    if myvar is not None:
        target = myvar
    else:
        # build the list per call so nothing is shared between calls
        target = []
    target.append("x")
    return target
print myfunc(), myfunc()
#=> ['x'] ['x']

#=== Perl Equivalencies start here
# use b if b is true, otherwise use c
a = b or c

# as that is a little tricksy, the following may be preferred:
if b:
    a = b
else:
    a = c

# set x to y unless x is already true
if not x:
    x = y
#-----------------------------
# use b if b is defined, else c
try:
    a = b
except NameError:
    a = c
#-----------------------------
foo = bar or "DEFAULT VALUE"
#-----------------------------
# To get a user (for both UNIX and Windows), use:
import getpass
user = getpass.getuser()

# DON'T DO THIS: find the user name on Unix systems 
import os
user = os.environ.get("USER")
if user is None:
    user = os.environ.get("LOGNAME")
#-----------------------------
if not starting_point:
    starting_point = "Greenwich"
#-----------------------------
if not a:         # copy only if empty
    a = b

if b:             # assign b if nonempty, else c
    a = b
else:
    a = c
#-----------------------------

Exchanging Values Without Using Temporary Variables

#-----------------------------
v1, v2 = v2, v1
#-----------------------------
# DON'T DO THIS:
temp = a
a = b
b = temp
#-----------------------------
a = "alpha"
b = "omega"
a, b = b, a   # the first shall be last -- and versa vice 
#-----------------------------
alpha, beta, production = "January March August".split()
alpha, beta, production = beta, production, alpha
#-----------------------------

Converting Between ASCII Characters and Values

#-----------------------------
num = ord(char)
char = chr(num)
#-----------------------------
char = "%c" % num
print "Number %d is character %c" % (num, num)
print "Number %(n)d is character %(n)c" % {"n": num}
print "Number %(num)d is character %(num)c" % locals()
#=> Number 101 is character e
#-----------------------------
ascii_character_numbers = [ord(c) for c in "sample"]
print ascii_character_numbers
#=> [115, 97, 109, 112, 108, 101]

word = "".join([chr(n) for n in ascii_character_numbers])
word = "".join([chr(n) for n in [115, 97, 109, 112, 108, 101]])
print word
#=> sample
#-----------------------------
hal = "HAL"
ibm = "".join([chr(ord(c)+1) for c in hal]) # add one to each ASCII value
print ibm   
#=> IBM
#-----------------------------

Processing a String One Character at a Time

#-----------------------------
mylist = list(mystr)
#-----------------------------
for char in mystr:
    pass # do something with char
#-----------------------------
mystr = "an apple a day"
uniq = sorted(set(mystr))
print "unique chars are: '%s'" % "".join(uniq)
#=> unique chars are: ' adelnpy'
#-----------------------------
ascvals = [ord(c) for c in mystr]
print "total is %s for '%s'."%(sum(ascvals), mystr)
#=> total is 1248 for 'an apple a day'.
#-----------------------------
# sysv checksum
def checksum(myfile):
    """Return the sum of all character codes in myfile, modulo 2**16 - 1.

    myfile is any iterable of strings: an open file, fileinput.input(), a
    list of lines, or a plain string (which iterates one character at a time).
    An empty iterable gives 0.
    """
    total = sum([ord(c) for line in myfile for c in line])
    # % binds tighter than -, so the modulus needs its own parentheses
    return total % (2**16 - 1)

import fileinput
print(checksum(fileinput.input()))    # data from sys.stdin

# Using a function means any iterable can be checksummed:
print(checksum(open("C:/test.txt")))  # data from file
print(checksum("sometext"))           # data from string
#-----------------------------
#!/usr/bin/python
# slowcat - emulate a   s l o w  line printer
# usage: slowcat [-DELAY] [files ...]
import sys, select
import re
import fileinput
DELAY = 1
# Treat the first argument as a delay only when there is one and it looks
# like "-<digits>"; anything else is left for fileinput to open as a file.
if len(sys.argv) > 1 and re.match(r"^-\d+$", sys.argv[1]):
    DELAY = -int(sys.argv[1])   # "-5" parses as -5; negate it to get 5
    del sys.argv[1]             # hide the option from fileinput
for ln in fileinput.input():
    for c in ln:
        sys.stdout.write(c)
        sys.stdout.flush()      # push each character out immediately
        # select() with no descriptors is used here purely as a short sleep
        select.select([], [], [], 0.005 * DELAY)
#-----------------------------

Reversing a String by Word or Character

#-----------------------------
# 2.3+ only
revchars = mystr[::-1]  # extended slice - step is -1
revwords = " ".join(mystr.split(" ")[::-1])

# pre 2.3 version:
mylist = list(mystr)
mylist.reverse()
revbytes = "".join(mylist)

mylist = mystr.split()
mylist.reverse()
revwords = ' '.join(mylist)

# Alternative version using reversed():
revchars = "".join(reversed(mystr))
revwords = " ".join(reversed(mystr.split(" ")))

# reversed() makes an iterator, which means that the reversal
# happens as it is consumed.  This means that "print reversed(mystr)" is not
# the same as mystr[::-1].  Standard usage is:
for char in reversed(mystr):
   pass  # ... do something
#-----------------------------
# 2.3+ only
word = "reviver"
is_palindrome = (word == word[::-1])
#-----------------------------
# Generator version
def get_palindromes(fname):
    """Yield each palindrome longer than five characters found in file fname.

    The file is read one word per line; trailing whitespace is stripped
    before the test.  The file is closed when the generator finishes or is
    discarded.
    """
    with open(fname) as wordfile:
        for line in wordfile:
            word = line.rstrip()
            # a palindrome reads the same reversed (extended slice, step -1)
            if len(word) > 5 and word == word[::-1]:
                yield word
long_palindromes = list(get_palindromes("/usr/share/dict/words"))

# Simpler old-style version using 2.2 string reversal
def rev_string(mystr):
    """Return the characters of mystr joined in reverse order."""
    chars = list(mystr)
    left, right = 0, len(chars) - 1
    # swap the outermost pair and work towards the middle
    while left < right:
        chars[left], chars[right] = chars[right], chars[left]
        left += 1
        right -= 1
    return "".join(chars)

long_palindromes=[]
for line in open("/usr/share/dict/words"):
    word = line.rstrip()
    if len(word) > 5 and word == rev_string(word):
        long_palindromes.append(word)
print long_palindromes
#-----------------------------

Expanding and Compressing Tabs

#-----------------------------
mystr.expandtabs()
mystr.expandtabs(4)
#-----------------------------

Expanding Variables in User Input

#-----------------------------
# %(name)s pulls each value out of the mapping on the right-hand side
text = "I am %(rows)s high and %(cols)s long" % {"rows": 24, "cols": 80}
print(text)
#=> I am 24 high and 80 long

rows, cols = 24, 80
# locals() is a mapping of the current namespace, so rows and cols are found
text = "I am %(rows)s high and %(cols)s long" % locals()
print(text)
#=> I am 24 high and 80 long
#-----------------------------
import re
print re.sub("\d+", lambda i: str(2 * int(i.group(0))), "I am 17 years old")
#=> I am 34 years old
#-----------------------------
# expand variables in text, but put an error message in
# if the variable isn't defined
class SafeDict(dict):
    """dict whose item lookup returns a "[No Variable: key]" marker instead
    of raising KeyError, for use as the mapping in "%(name)s" formatting."""
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            # build the message only for missing keys; the one-element tuple
            # keeps a tuple key from being unpacked as several % arguments
            return "[No Variable: %s]" % (key,)
    
hi = "Hello"
text = "%(hi)s and %(bye)s!"%SafeDict(locals())
print text
#=> Hello and [No Variable: bye]!

#If you don't need a particular error message, just use the Template class:
from string import Template
x = Template("$hi and $bye!")
hi = "Hello"
print x.safe_substitute(locals())
#=> Hello and $bye!
print x.substitute(locals()) # will throw a KeyError

#-----------------------------

Controlling Case

#-----------------------------
mystr = "bo peep".upper()  # BO PEEP
mystr = mystr.lower()      # bo peep
mystr = mystr.capitalize() # Bo peep
#-----------------------------
beast = "python"
caprest = beast.capitalize().swapcase() # pYTHON
#-----------------------------
print "thIS is a loNG liNE".title()
#=> This Is A Long Line
#-----------------------------
if a.upper() == b.upper():
    print "a and b are the same"
#-----------------------------
import random
def randcase_one(letter):
    """Return letter lower-cased four times out of five, upper-cased otherwise."""
    # randint() includes both endpoints: 0..4 is five equally likely values,
    # and only 0 is false
    if random.randint(0, 4):   # True on 1, 2, 3, 4
        return letter.lower()
    else:
        return letter.upper()

def randcase(myfile):
    """Yield each line of myfile, without its newline, with every character
    passed through randcase_one()."""
    for line in myfile:
        # drop the line terminator only if it is there: a blind line[:-1]
        # would eat the last real character of an unterminated final line
        if line.endswith("\n"):
            line = line[:-1]
        yield "".join([randcase_one(letter) for letter in line])

for line in randcase(myfile):
    print line
#-----------------------------

Interpolating Functions and Expressions Within Strings

#-----------------------------
"I have %d guanacos." % (n + 1)
print "I have", n+1, "guanacos."
#-----------------------------
#Python templates disallow in-string calculations (see PEP 292)
from string import Template

email_template = Template("""\
To: $address
From: Your Bank
CC: $cc_number
Date: $date

Dear $name,

Today you bounced check number $checknum to us.
Your account is now closed.

Sincerely,
the management
""")

import random
import datetime

person = {"address":"Joe@somewhere.com",
          "name": "Joe",
          "cc_number" : 1234567890,
          "checknum" : 500+random.randint(0,99)}

print email_template.substitute(person, date=datetime.date.today())
#-----------------------------

Indenting Here Documents

#-----------------------------
# indenting here documents
#
# in python multiline strings can be used as here documents
var = """
      your text
      goes here
      """

# using regular expressions
import re
re_leading_blanks = re.compile("^\s+",re.MULTILINE)
var1 = re_leading_blanks.sub("",var)[:-1]

# using string methods 
# split into lines, use every line except first and last, left strip and rejoin.
var2 = "\n".join([line.lstrip() for line in var.split("\n")[1:-1]])

poem = """
       Here's your poem:
       Now far ahead the Road has gone,
          And I must follow, if I can,
       Pursuing it with eager feet,
          Until it joins some larger way
       Where many paths and errand meet.
          And whither then? I cannot say.
               --Bilbo in /usr/src/perl/pp_ctl.c  
       """

import textwrap
print textwrap.dedent(poem)[1:-1]
#-----------------------------
    

Reformatting Paragraphs

#-----------------------------
from textwrap import wrap 
# wrap() returns a list of lines; the two indents are prefixed to the first
# line and to every following line respectively
output = wrap(para,
              initial_indent=leadtab,
              subsequent_indent=nexttab)
#-----------------------------
#!/usr/bin/env python
# wrapdemo - show how textwrap works

txt = """\
Folding and splicing is the work of an editor,
not a mere collection of silicon
and
mobile electrons!
"""

from textwrap import TextWrapper

wrapper = TextWrapper(width=20,
                      initial_indent=" "*4,
                      subsequent_indent=" "*2)

print "0123456789" * 2
print wrapper.fill(txt)

#-----------------------------
"""Expected result:

01234567890123456789
    Folding and
  splicing is the
  work of an editor,
  not a mere
  collection of
  silicon and mobile
  electrons!
"""

#-----------------------------
# merge multiple lines into one, then wrap one long line

from textwrap import fill
import fileinput

print fill("".join(fileinput.input()))

#-----------------------------
# Term::ReadKey::GetTerminalSize() isn't in the Perl standard library. 
# It isn't in the Python standard library either. Michael Hudson's 
# recipe from python-list #530228 is shown here.
# (http://aspn.activestate.com/ASPN/Mail/Message/python-list/530228)
# Be aware that this will work on Unix but not on Windows.

from termwrap import wrap
import struct, fcntl
def getheightwidth():
    """Return (height, width) as reported by the TIOCGWINSZ ioctl on fd 0.

    Unix only (needs fcntl and termios).  Whatever fcntl.ioctl raises is
    propagated - presumably the case when standard input is not a terminal.
    """
    # function-scope import: the ioctl constants live in the termios module
    import termios
    # the ioctl fills an 8-byte buffer that is read back as four shorts;
    # only the first two values are used
    winsize = fcntl.ioctl(0, termios.TIOCGWINSZ,
                          struct.pack("hhhh", 0, 0, 0, 0))
    height, width = struct.unpack("hhhh", winsize)[0:2]
    return height, width

# PERL <>, $/, $\ emulation
import fileinput
import re

_, width = getheightwidth()
for para in re.split(r"\n{2,}", "".join(fileinput.input())):
    print fill(para, width)

Escaping Characters

#-----------------------------
mystr = '''Mom said, "Don't do that."'''  #"
# put a backslash in front of every character the pattern matches
re.sub("['\"]", lambda i: "\\" + i.group(0), mystr)
re.sub("[A-Z]", lambda i: "\\" + i.group(0), mystr)
# raw string so that \W reaches the regex engine untouched
re.sub(r"\W", lambda i: "\\" + i.group(0), "is a test!")
# re.escape() is the closest counterpart of Perl's quotemeta

Trimming Blanks from the Ends of a String

#-----------------------------
mystr = mystr.lstrip() # left
mystr = mystr.rstrip() # right
mystr = mystr.strip()  # both ends

Parsing Comma-Separated Data

#-----------------------------
import csv
def parse_csv(line):
    """Split one line of comma-separated text into a list of field strings.

    The line is parsed by csv.reader with a backslash as escape character,
    so double-quoted fields may hold commas and backslash-escaped quotes.
    """
    reader = csv.reader([line], escapechar='\\')
    # the next() builtin works on any iterator, unlike the .next() method
    return next(reader)

line = '''XYZZY,"","O'Reilly, Inc","Wall, Larry","a \\"glug\\" bit,",5,"Error, Core Dumped,",''' #"

fields = parse_csv(line)

for i, field in enumerate(fields):
    print "%d : %s" % (i, field)

# pre-2.3 version of parse_csv
import re
def parse_csv(text):
    """Split text into comma-separated fields using a regular expression.

    A field is either double-quoted (it may then contain commas and
    backslash-escaped characters, which are kept as written) or a run of
    non-comma characters.  A trailing comma adds a final empty field.
    Empty text gives an empty list.
    """
    # alternatives: quoted field | unquoted field | lone comma (empty field)
    pattern = re.compile(r'''"([^"\\]*(?:\\.[^"\\]*)*)",?|([^,]+),?|,''')
    # findall returns one (quoted, unquoted) pair per field; at most one of
    # the two is non-empty, so joining the pair yields the field text
    mylist = ["".join(elem) for elem in pattern.findall(text)]
    if text.endswith(","):      # safe for empty text, unlike text[-1]
        mylist.append("")
    return mylist

# csv.reader is meant to work for many lines, something like:
# (NB: in Python default, quotechar is *not* escaped by backslash,
#      but doubled instead. That's what Excel does.)
# "some" is a placeholder - substitute the name of a registered dialect
for fields in csv.reader(lines, dialect="some"):
    for num, field in enumerate(fields):
        print("%s : %s" % (num, field))
#-----------------------------

Soundex Matching

#-----------------------------
def soundex(name, len=4):
    """ soundex module conforming to Knuth's algorithm
        implementation 2000-12-24 by Gregory Jorgensen
        public domain

        Return the soundex code of name, padded with '0' or cut to exactly
        len characters.  Only the letters A-Z (after upper-casing) take
        part; every other character, accented letters included, is skipped.
        A name without any usable letter yields len zeros.

        The parameter is called len for backward compatibility, so the
        builtin len() is shadowed inside this function.

        NOTE(review): H and W are coded 0 like the vowels, so they separate
        equal digits ("Ashcraft" gives A226) - confirm this is intended.
    """

    # digits holds the soundex values for the alphabet A..Z
    digits = '01230120022455012623010202'
    sndx = ''
    fc = ''

    # translate alpha chars in name to soundex digits
    for c in name.upper():
        # isalpha() is also true for letters outside A-Z, which would index
        # past the end of digits, so test the range explicitly
        if 'A' <= c <= 'Z':
            if not fc:
                fc = c   # remember first letter
            d = digits[ord(c)-ord('A')]
            # duplicate consecutive soundex digits are skipped
            if not sndx or (d != sndx[-1]):
                sndx += d

    # replace first digit with first alpha character
    sndx = fc + sndx[1:]

    # remove all 0s from the soundex code
    sndx = sndx.replace('0','')

    # return soundex code padded to len characters
    return (sndx + (len * '0'))[:len]

user = raw_input("Lookup user: ")
if user == "":
    raise SystemExit

# print every passwd entry whose fifth field contains a word that sounds
# like the name typed in
name_code = soundex(user)
for line in open("/etc/passwd"):
    line = line.split(":")
    if len(line) < 5:
        continue    # no fifth field on this line
    for piece in line[4].split():
        if name_code == soundex(piece):
            print("%s: %s\n" % (line[0], line[4]))
#-----------------------------

Program: fixstyle

#-----------------------------
import sys, fileinput, re

# table of "unwanted => preferred" spellings, one pair per line
data = """\
analysed        => analyzed
built-in        => builtin
chastized       => chastised
commandline     => command-line
de-allocate     => deallocate
dropin          => drop-in
hardcode        => hard-code
meta-data       => metadata
multicharacter  => multi-character
multiway        => multi-way
non-empty       => nonempty
non-profit      => nonprofit
non-trappable   => nontrappable
pre-define      => predefine
preextend       => pre-extend
re-compiling    => recompiling
reenter         => re-enter
turnkey         => turn-key
"""
# mydict maps each unwanted spelling to its replacement
mydict = {}
for line in data.split("\n"):
    if not line.strip():
        continue
    k, v = [word.strip() for word in line.split("=>")]
    mydict[k] = v
# one alternation matching any unwanted spelling; group 1 is the word found
pattern_text = "(" + "|".join([re.escape(word) for word in mydict.keys()]) + ")"
pattern = re.compile(pattern_text)

args = sys.argv[1:]
verbose = 0
if args and args[0] == "-v":
    verbose = 1
    args = args[1:]

if not args:
    sys.stderr.write("%s: Reading from stdin\n" % sys.argv[0])

def _replacement(match):
    # Return the preferred spelling for one match; with -v also report the
    # change on stderr (stdout is the file being rewritten).
    old = match.group(1)
    new = mydict[old]
    if verbose:
        sys.stderr.write("%s => %s at %s line %d.\n"
                         % (old, new, fileinput.filename(),
                            fileinput.filelineno()))
    return new

# inplace=1 redirects stdout into the file being read; the untouched
# original is kept with a ".orig" suffix
for line in fileinput.input(args, inplace=1, backup=".orig"):
    sys.stdout.write(pattern.sub(_replacement, line))
#-----------------------------

Program: psgrep

#-----------------------------
#!/usr/bin/python
# psgrep - print selected lines of ps output by
#          compiling user queries into code.
#
# examples :
# psgrep "uid<10"
import sys, os, re

class PsLineMatch:
    """Test lines of "ps wwaxl" output against criteria compiled into code.

    Usage: set_query() once with the criteria, then for every ps line call
    new_line() followed by is_desirable().  str() of the object gives the
    most recent line.
    """
    # each field from the PS header
    fieldnames = ("flags","uid","pid","ppid","pri","nice","size", \
                  "rss","wchan","stat","tty","time","command")
    numeric_fields = ("flags","uid","pid","ppid","pri","nice","size","rss")
    def __init__(self):
        # field name -> value for the line most recently given to new_line()
        self._fields = {}

    def new_line(self, ln):
        """Store ln and split it into the named fields.

        Raises ValueError when a numeric column does not hold an integer.
        """
        self._ln = ln.rstrip()
        # ps header for option "wwaxl" (different than in the perl code)
        #
        #   F   UID   PID  PPID PRI  NI   VSZ  RSS WCHAN  STAT TTY        TIME COMMAND
        # 004     0     1     0  15   0   448  236 schedu S    ?          0:07 init
        # .   .     .     .     .   .   .     .    .      .    .    .          .
        #
        # because only the last entry might contain blanks, splitting
        # is safe
        data = self._ln.split(None,12)
        for fn, elem in zip(self.fieldnames, data):
            if fn in self.numeric_fields:  # make numbers integer
                self._fields[fn] = int(elem)
            else:
                self._fields[fn] = elem

    def set_query(self, args):
        """Compile the criteria in args into one expression.

        Each criterion is a field name, a run of the characters = < > and a
        value with nothing in between, e.g. "uid==500" or "command==init".
        Criteria on numeric fields are used verbatim; for other fields the
        value is turned into a string literal.  All criteria are and-ed.

        Prints a message and raises SystemExit for a criterion that does
        not have that shape.
        """
        conds=[]
        m = re.compile(r"(\w+)([=<>]+)(.+)")
        for a in args:
            parsed = m.match(a)
            if parsed is None:
                sys.stdout.write("can't understand query \"%s\"" % (a) + "\n")
                raise SystemExit
            (field,op,val) = parsed.groups()
            if field in self.numeric_fields:
                conds.append(a)
            else:
                # %r quotes and escapes val, so embedded quotes are safe
                conds.append("%s%s%r" % (field,op,val))
        # NOTE(security): the criteria become Python code that is eval()ed
        # for every line - only accept them from a trusted user
        self._desirable = compile("(("+")and(".join(conds)+"))", "<string>","eval")

    def is_desirable(self):
        """Evaluate the compiled criteria with the current fields as names."""
        return eval(self._desirable, {}, self._fields)

    def __str__(self):
        # to allow "print".
        return self._ln

if len(sys.argv)<=1:
    # the criteria are compiled as Python code, so say so in the usage text
    print("""usage: %s criterion ...
    Each criterion is a Python expression involving: 
    %s
    All criteria must be met for a line to be printed.""" \
    % (sys.argv[0], " ".join(PsLineMatch().fieldnames)))
    raise SystemExit

psln = PsLineMatch()
psln.set_query(sys.argv[1:])
p = os.popen("ps wwaxl")
try:
    print(p.readline()[:-1])       # emit header line
    for ln in p.readlines():
        psln.new_line(ln)
        if psln.is_desirable():
            print(psln)
finally:
    p.close()                      # close the pipe even if a line fails

# alternatively one could consider every argument being a string and
# support wildcards: "uid==500" "command~^wm" by means of re, but this
# does not show dynamic python code generation, although re.compile
# also precompiles.
#-----------------------------