8. File Contents

Introduction

//----------------------------------------------------------------------------------
datafile = new File('Pleac/data/pleac8_0.txt') // change on your system

// print the length of every line (eachLine hands over lines without terminators)
datafile.eachLine{ line -> print line.size() }

// whole file as a list of lines ...
lines = datafile.readLines()

// ... or as a single string
wholeTextFile = datafile.text

// on command line Groovy use -a auto split pattern instead of record separator
// default pattern is /\s/
// groovy -a -e 'println "First word is ${split[0][1]}"'

// (additional examples to original cookbook to illustrate -a)
// Print processes owned by root:
// ps aux|groovy -ane "if(split[0][1] =~ 'root')println split[0][10..-1]"

// Print all logins from /etc/passwd that are not commented:
// groovy -a':' -ne "if(!(split[0][1] =~ /^#/))println split[0][1]" /etc/passwd

// Add the first and the penultimate column of a file:
// groovy -ape "split[0][1].toInteger()+split[0][-2].toInteger()" accounts.txt

// no BEGIN and END in Groovy (has been proposed, may be added soon)

// NOTE: this replaces the contents of datafile.
// A PrintWriter is used on purpose: print() on a plain OutputStream resolves
// to Groovy's generic Object.print(), which writes to stdout, not the stream.
datafile.withPrintWriter{ writer ->
    writer.print "one" + "two" + "three"    // "onetwothree" -> file
    println "Baa baa black sheep."          // sent to $stdout
}

// use streams or channels for advanced file handling
int size = datafile.size()
buffer = ByteBuffer.allocate(size) // for large files, use some block size, e.g. 4096
channel = new FileInputStream(datafile).channel
try {
    println "Number of bytes read was: ${channel.read(buffer)}" // -1 = EOF
} finally {
    channel.close() // release the file handle before the variable is reused
}

channel = new FileOutputStream(File.createTempFile("pleac8", ".junk")).channel
try {
    size = channel.size()
    channel.truncate(size) // shrinks file (in our case to same size)

    pos = channel.position()
    println "I'm $pos bytes from the start of datafile"
    channel.position(pos)  // move to pos (in our case unchanged)
    channel.position(0)    // move to start of file
    channel.position(size) // move to end of file
} finally {
    channel.close()
}

// no sysread and syswrite are available but dataInput/output streams
// can be used to achieve similar functionality, see 8.15.
//----------------------------------------------------------------------------------

Reading Lines with Continuation Characters

//----------------------------------------------------------------------------------
testfile = new File('Pleac/data/pleac8_1.txt') // change on your system
// contents of testfile:
// DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) \
//         $(TEXINFOS) $(INFOS) $(MANS) $(DATA)
// DEP_DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) \
//         $(TEXINFOS) $(INFO_DEPS) $(MANS) $(DATA) \
//         $(EXTRA_DIST)

// Join physical lines ending in a backslash into one logical line.
regex = /\\$/          // a backslash as the very last character
continuing = false     // did the previous physical line end with a backslash?
lines = []
testfile.eachLine{ line ->
    stripped = line.replaceAll(regex,'')
    if (!continuing) {
        // start of a new logical line
        lines << stripped
    } else {
        // glue onto the logical line still being built
        lines[-1] = lines[-1] + stripped
    }
    continuing = (line =~ regex)
}
println(lines.join('\n'))
// =>
// DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS)         $(TEXINFOS) $(INFOS) $(MANS) $(DATA)
// DEP_DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS)         $(TEXINFOS) $(INFO_DEPS) $(MANS) $(DATA)         $(EXTRA_DIST)

// to remove hidden spaces after the backslash (but keep the backslash):
// Returns line with any whitespace that trails a final backslash removed.
// The lookbehind keeps the backslash itself; lines without a trailing
// backslash come back unchanged.
def trimtail(line) {
    return line.replaceAll(/(?<=\\)\s*$/, '')
}
b = /\\/  // backslash
// [input, expected] pairs
[
    ["abc  $b",   "abc  $b"],
    ["abc  ",     "abc  "],
    ["abc  $b  ", "abc  $b"]
].each{ input, expected ->
    assert expected == trimtail(input)
}
//----------------------------------------------------------------------------------

Counting Lines (or Paragraphs or Records) in a File

//----------------------------------------------------------------------------------
// unixScript:
// execute() starts the program directly, without a shell, so a '<' in the
// command string would reach wc as a literal argument. Run the redirect
// through sh -c and pass the path as $1 so no quoting of the name is needed.
filename = testfile.path
println (['sh', '-c', 'wc -l < "$1"', 'sh', filename].execute().text)

// for small files which fit in memory
println testfile.readLines().size()

// streaming approach (lines and paras)
lines = 0; paras = 1
testfile.eachLine{ lines++; if (it =~ /^$/) paras++ }
println "Found $lines lines and $paras paras."
// note: counts blank line at end as start of next empty para

// with a StreamTokenizer
// withReader closes the underlying reader once the tokens are consumed
testfile.withReader{ reader ->
    st = new StreamTokenizer(reader)
    while (st.nextToken() != StreamTokenizer.TT_EOF) {}
    println st.lineno()
}
//----------------------------------------------------------------------------------

Processing Every Word in a File

//----------------------------------------------------------------------------------
// general pattern
// Splits every line of 'file' on runs of non-word characters and calls
// processWord once for each non-empty word, in file order.
//   file        - the File to read
//   processWord - closure taking a single word
def processWordsInFile(file, processWord) {
    // read the file that was passed in, not the script-level 'testfile'
    file.splitEachLine(/\W+/) { matched ->
        // a line starting with a separator yields a leading empty token; skip it
        matched.each{ w -> if (w) processWord(w) }
    }
}

testfile = new File('Pleac/src/pleac8.groovy')  // change path on your system

// count words
count = 0
processWordsInFile(testfile) { count += 1 }
println count

// (variation to Perl example)
// with a StreamTokenizer (counting words and numbers in Pleac chapter 8 source file)
words = 0; numbers = 0
st = new StreamTokenizer(testfile.newReader())
st.slashSlashComments(true) // ignore words and numbers in comments
for (int tok = st.nextToken(); tok != StreamTokenizer.TT_EOF; tok = st.nextToken()) {
    // nextToken() returns the same value it stores in st.ttype
    switch (tok) {
        case StreamTokenizer.TT_WORD:
            words++
            break
        case StreamTokenizer.TT_NUMBER:
            numbers++
            break
    }
}
println "Found $words words and $numbers numbers."


// word frequency count (case-insensitive)
seen = [:]
processWordsInFile(testfile) { word ->
    w = word.toLowerCase()
    seen[w] = seen.containsKey(w) ? seen[w] + 1 : 1
}
// output map in a descending numeric sort of its values
def byCountDescending = seen.entrySet().sort { a,b -> b.value <=> a.value }
for (e in byCountDescending) {
    printf("%5d %s\n", [e.value, e.key] )
}
// =>
//    25 pleac
//    22 line
//    20 file
//    19 println
//    19 lines
//    13 testfile
//    ...
//----------------------------------------------------------------------------------

Reading a File Backwards by Line or Paragraph

//----------------------------------------------------------------------------------
// print the file last line first
for (text in testfile.readLines().reverse()) {
    println text
}

lines = testfile.readLines()
// normally one would iterate in reverse directly, but you can use
// a numerical index if you want
for (idx in (lines.size() - 1)..0) {
    println lines[idx]
}

// Paragraph-based processing could be done as in 8.2.

// A streaming-based solution could use random file access
// and have a sliding buffer working from the back of the
// file to the front.
//----------------------------------------------------------------------------------

Trailing a Growing File

//----------------------------------------------------------------------------------
logfile = new File('Pleac/data/sampleLog.txt')
// logTailingScript:
// Polls logfile forever and echoes every newly appended line prefixed with '##'.
sampleInterval = 2000 // 2000 millis = 2 secs
file = new RandomAccessFile( logfile, "r" )
filePointer = 0 // set to logfile.size() to begin tailing from the end of the file
while( true ) {
    // Compare the length of the file to the file pointer
    long fileLength = logfile.size()
    if( fileLength < filePointer ) {
        // Log file must have been rotated or deleted;
        System.err.println "${new Date()}: Reopening $logfile"
        // close the stale handle first, otherwise every rotation leaks a descriptor
        file.close()
        file = new RandomAccessFile( logfile, "r" )
        filePointer = 0
    }
    if( fileLength > filePointer ) {
        // There is data to read
        file.seek( filePointer )
        while( (line = file.readLine()) != null ) {
            println '##' + line
        }
        // remember how far we got so the next pass starts after it
        filePointer = file.filePointer
    }
    // Sleep for the specified interval
    Thread.sleep( sampleInterval )
}
//----------------------------------------------------------------------------------

Picking a Random Line from a File

//----------------------------------------------------------------------------------
//testfile = new File('/usr/share/fortune/humorists')

// small files:
// small files: load everything, then index with a random position
random = new Random()
lines = testfile.readLines()
println lines.get(random.nextInt(lines.size()))

// streamed alternative
// the n-th line replaces the current pick with probability 1/n, so only
// one line is held at a time
count = 0
def adage
testfile.eachLine{ line ->
    count += 1
    if (random.nextInt(count) == 0) adage = line
}
println adage
//----------------------------------------------------------------------------------

Randomizing All Lines

//----------------------------------------------------------------------------------
// non-streamed solution (like Perl and Ruby)
lines = testfile.readLines()
Collections.shuffle(lines)              // reorders the list in place
def shuffledText = lines.join('\n')
println shuffledText
//----------------------------------------------------------------------------------

Reading a Particular Line in a File

//----------------------------------------------------------------------------------
desiredLine = 235
// for small files
lines = testfile.readLines()
println "Line $desiredLine: ${lines[desiredLine-1]}"

// streaming solution
// withReader closes the reader even though we stop reading early
count = 0
def line
testfile.withReader{ reader ->
    while ((line = reader.readLine())!= null) {
        if (++count == desiredLine) break
    }
}
// line is null here when the file has fewer than desiredLine lines
println "Line $desiredLine: $line"
//----------------------------------------------------------------------------------

Processing Variable-Length Text Fields

//----------------------------------------------------------------------------------
// Groovy has no trailing regex flags (/.../i is Perl syntax); the inline
// (?i) flag makes the section-marker pattern case-insensitive instead.
println testfile.text.split(/(?i)@@pleac@@_8./).size()
// => 23 (21 sections .0 .. .20 plus before .0 plus line above)
//----------------------------------------------------------------------------------

Removing the Last Line of a File

//----------------------------------------------------------------------------------
// Truncates logfile so that its last line is dropped.
file = new RandomAccessFile( logfile, "rw" )
try {
    long previous = 0, lastpos = 0
    boolean sawLine = false
    while( (line = file.readLine()) != null ) {
        sawLine = true
        previous = lastpos          // offset where the line just read started
        lastpos = file.filePointer  // offset where the next line would start
    }
    // test "did we read anything" rather than "previous != 0": for a
    // one-line file the last line starts at offset 0 and must still be removed
    if (sawLine) file.setLength(previous)
} finally {
    file.close()
}
//----------------------------------------------------------------------------------

Processing Binary Files

//----------------------------------------------------------------------------------
// Java's streams are binary at the lowest level if not processed with
// higher level stream mechanisms or readers/writers. Some additions
// to the Perl cookbook which illustrate the basics.

// Print first ten bytes of a binary file:
// Prints up to the first ten bytes of the named file as unsigned decimal
// values separated by spaces, followed by a newline. Stops at end of file
// for shorter files instead of printing -1.
//   filename - path of the file to dump
def dumpStart(filename) {
    // withInputStream closes the stream when the closure returns
    new File(filename).withInputStream{ input ->
        int value
        for (int i = 0; i < 10 && (value = input.read()) != -1; i++) {
            print value + ' '
        }
    }
    println()
}
def rtJarPath = System.getProperty('java.home') + '/lib/rt.jar'
dumpStart(rtJarPath)
// => 80 75 3 4 10 0 0 0 0 0 (note first two bytes = PK - you might recognize this
// as the starting sequence of a zip file)
dumpStart('Pleac/classes/pleac8.class') // after running groovyc compiler in src directory
// => 202 254 186 190 0 0 0 47 2 20 (starting bytes in HEX: CAFEBABE)

// create a 20-byte file holding the values 0..19, then echo it back
binfile = new File('Pleac/data/temp.bin')
binfile.withOutputStream{ stream ->
    20.times{ value -> stream.write(value) }
}
binfile.eachByte{ value -> print value + ' ' }
println()
// => 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
//----------------------------------------------------------------------------------

Using Random-Access I/O

//----------------------------------------------------------------------------------
// let's treat binfile as having 5 records of size 4 and print out the 3rd record
recsize = 4
recno = 2 // index starts at 0
address = recsize * recno   // byte offset of the wanted record
randomaccess = new RandomAccessFile(binfile, 'r')
randomaccess.seek(address)
for (i in 0..<recsize) {
    print randomaccess.read() + ' '
}
println()  // => 8 9 10 11
randomaccess.close()
//----------------------------------------------------------------------------------

Updating a Random-Access File

//----------------------------------------------------------------------------------
// let's take the example from 8.12 but replace the 3rd record with
// 90 - the original value in the file
// this is an alternative example to the Perl cookbook which is cross platform
// see chapter 1 regarding un/pack which could be combined with below
// to achieve the full functionality of the original 8.13
recsize = 4
recno = 2 // index starts at 0
address = recsize * recno
randomaccess = new RandomAccessFile(binfile, 'rw')
// read the record ...
randomaccess.seek(address)
bytes = (0..<recsize).collect{ randomaccess.read() }
// ... then rewind and overwrite each byte with 90 minus its old value
randomaccess.seek(address)
for (original in bytes) {
    randomaccess.write(90 - original)
}
randomaccess.close()
binfile.eachByte{ value -> print value + ' ' }
println()
// => 0 1 2 3 4 5 6 7 82 81 80 79 12 13 14 15 16 17 18 19
//----------------------------------------------------------------------------------

Reading a String from a Binary File

//----------------------------------------------------------------------------------
// reading a String would involve looping and collecting the read bytes

// simple bgets
// this is similar to the revised 8.13 but would look for the terminating 0

// simplistic strings functionality
// echo only the bytes in the printable ASCII range 32..126
binfile.eachByte{ b ->
    int code = b
    if (code >= 32 && code <= 126) print ((char)b)
}
println() // => RQPO
//----------------------------------------------------------------------------------

Reading Fixed-Length Records

//----------------------------------------------------------------------------------
// You could combine the byte-level reading/writing mechanisms shown
// in 8.11 - 8.12 and combine that with the un/pack functionality from
// Chapter 1 to achieve the desired functionality. A more Java and Groovy
// friendly way to do this would be to use the Scattering and Gathering
// stream operations of channels for byte-oriented record fields or
// data-oriented records. Alternatively, the dataInput/output stream
// capabilities for data-oriented records. Finally, the
// objectInput/output stream capabilities could be used for object types.
// Note, these examples mix reading and writing even though the original
// Perl example was just about reading.


// fixed-length byte-oriented records using channels
// typical approach used with low-level protocols or file formats
import java.nio.*
// start from scratch
binfile.delete()
binfile.createNewFile()
buf1 = ByteBuffer.wrap([10,11,12,13] as byte[]) // simulate 4 byte field
buf2 = ByteBuffer.wrap([44,45] as byte[])       // 2 byte field
buf3 = ByteBuffer.wrap('Hello'.getBytes())      // String
records = [buf1, buf2, buf3] as ByteBuffer[]
channel = new FileOutputStream(binfile).getChannel()
channel.write(records) // gathering write: the buffers are written in array order
channel.close()
binfile.eachByte{ value -> print value + ' ' }
println()
// => 10 11 12 13 44 45 72 101 108 108 111
// A scattering read (ScatteringByteChannel.read(ByteBuffer[]), which FileChannel
// implements) would convert this back into an array of byte buffers


// data-oriented streams using channels
// start from scratch
binfile.delete()
binfile.createNewFile()
buf = ByteBuffer.allocate(24)   // 3 + 8 + 5 + 8 bytes
now = System.currentTimeMillis()
buf.put('PI='.bytes)
buf.putDouble(Math.PI)
buf.put('Date='.bytes)
buf.putLong(now)
buf.flip() // readies for writing: set length and point back to start
channel = new FileOutputStream(binfile).getChannel()
channel.write(buf)
channel.close()
// now read it back in
channel = new FileInputStream(binfile).getChannel()
buf = ByteBuffer.allocate(24)
channel.read(buf)
buf.flip()
// the fields must be consumed in the order they were written
for (i in 0..<3) { print ((char)buf.get()) }
println (buf.getDouble())
for (i in 0..<5) { print ((char)buf.get()) }
println (new Date(buf.getLong()))
channel.close()
// =>
// PI=3.141592653589793
// Date=Sat Jan 13 00:14:50 EST 2007

// object-oriented streams
// start from scratch
binfile.delete()
binfile.createNewFile()
// minimal serializable bean used for the round trip
class Person implements Serializable {
    def name
    def age
}
binfile.withObjectOutputStream{ oos ->
    def bernie = new Person(name:'Bernie',age:16)
    oos.writeObject(bernie)
    def letters = [1:'a', 2:'b']
    oos.writeObject(letters)
    oos.writeObject(new Date())
}
// now read it back in, in the same order the objects were written
binfile.withObjectInputStream{ ois ->
    person = ois.readObject()
    println "$person.name is $person.age"
    2.times{ println ois.readObject() }
}
// =>
// Bernie is 16
// [1:"a", 2:"b"]
// Sat Jan 13 00:22:13 EST 2007
//----------------------------------------------------------------------------------

Reading Configuration Files

//----------------------------------------------------------------------------------
// use built-in Java property class
// suppose you have the following file:
// # set your database settings here
// server=localhost
// url=jdbc:derby:derbyDB;create=true
// user.name=me
// user.password=secret
props = new Properties()
propsfile=new File('Pleac/data/plain.properties')
// withInputStream closes the stream after loading; Properties.load() does not
propsfile.withInputStream{ input -> props.load(input) }
props.list(System.out)
// =>
// -- listing properties --
// user.name=me
// user.password=secret
// url=jdbc:derby:derbyDB;create=true
// server=localhost

// There are also provisions for writing properties file.

// (additional example to Perl)
// You can also read and write xml properties files.
new File('Pleac/data/props.xml').withOutputStream{ os ->
    props.storeToXML(os, "Database Settings")
}
// =>
// <?xml version="1.0" encoding="UTF-8"?>
// <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
// <properties>
// <comment>Database Settings</comment>
// <entry key="user.password">secret</entry>
// <entry key="user.name">me</entry>
// <entry key="url">jdbc:derby:derbyDB;create=true</entry>
// <entry key="server">localhost</entry>
// </properties>
//----------------------------------------------------------------------------------

Testing a File for Trustworthiness

//----------------------------------------------------------------------------------
// The File class provides canRead(), canWrite() and canExecute() (JDK6) methods
// for finding out about security information specific to the user. JSR 203
// (expected in Java 7) provides access to additional security related attributes.

// Another useful package to use when wondering about the trustworthiness of a
// file is the java.security package. It contains many classes. Just one is
// MessageDigest. This would allow you to create a strong checksum of a file.
// Your program could refuse to operate if a file it was accessing didn't have the
// checksum it was expecting - an indication that it may have been tampered with.

// (additional info)
// While getting file-based security permissions correct is important, it isn't the
// only mechanism to use for security when using Java based systems. Java provides
// policy files and an authorization and authentication API which lets you secure
// any resources (not just files) at various levels of granularity with various
// security mechanisms.
// Security policies may be universal, apply to a particular codebase, or
// using JAAS apply to individuals. Some indicative policy statements:
// grant {
//     permission java.net.SocketPermission "*", "connect";
//     permission java.io.FilePermission "C:\\users\\cathy\\foo.bat", "read";
// };
// grant codebase "file:./*", Principal ExamplePrincipal "Secret" {
//     permission java.io.FilePermission "dummy.txt", "read";
// };
//----------------------------------------------------------------------------------

Program: tailwtmp

//----------------------------------------------------------------------------------
// general purpose utility methods
// Reads the next 'size' bytes from buf, decodes them with the platform
// default charset and returns the result with leading/trailing whitespace
// and control characters (including NUL padding) trimmed.
//   buf  - a ByteBuffer positioned at the start of the field
//   size - field width in bytes
def getString(buf,size){
    // one bulk get instead of growing a list a byte at a time; the local
    // array also avoids overwriting a script-level variable named 'b'
    byte[] raw = new byte[size]
    buf.get(raw)
    new String(raw).trim()
}
// Reads 'size' bytes from buf as an unsigned little-endian integer
// (first byte is least significant) and returns it as a long.
//   buf  - a ByteBuffer positioned at the start of the field
//   size - number of bytes to read
def getInt(buf,size) {
    // normally in Java we would just use methods like getLong()
    // to read a long but wish to ignore platform issues here
    long val = 0
    for (n in 0..<size) {
        // widen to long BEFORE shifting: an int shift overflows to a negative
        // value for a top byte >= 0x80 and wraps the shift count beyond 4 bytes
        long unsigned = ((int)buf.get() & 0xFF)
        val += unsigned << (n * 8)
    }
    return val
}
// Reads a 4-byte seconds value from buf via getInt and returns it as a Date.
def getDate(buf) {
    def seconds = getInt(buf,4)
    new Date(seconds * 1000) // Java uses millis
}

// specific utility method (wtmp file from ubuntu 6.10)
// Prints every complete wtmp record found in 'file' starting at byte offset
// 'origpos' and returns the offset just past the last complete record, so
// the caller can resume from there on the next poll.
//   file    - the wtmp File
//   origpos - byte offset to start reading from
def processWtmpRecords(file, origpos) {
    // locals (def) keep this method from overwriting the script-level
    // 'channel', 'recsize' and 'buf' variables
    def recsize = 4 + 4 + 32 + 4 + 32 + 256 + 8 + 4 + 40
    def newpos = origpos
    def raf = new RandomAccessFile(file, 'r')
    try {
        def channel = raf.channel
        channel.position(origpos)
        def buf = ByteBuffer.allocate(recsize)
        def count
        while ((count = channel.read(buf)) != -1) {
            // a short read means a partially written record; leave it for next time
            if (count != recsize) break
            buf.flip()
            print getInt(buf,4) + ' '         // type
            print getInt(buf,4) + ' '         // pid
            print getString(buf,32) + ' '     // line
            print getString(buf,4) + ' '      // inittab
            print getString(buf,32) + ' '     // user
            print getString(buf,256) + ' '    // hostname
            buf.position(buf.position() + 8)  // skip
            println "${getDate(buf)} "        // time
            buf.clear()
            newpos = channel.position()
        }
    } finally {
        // this method runs once per poll, so an unclosed file would leak a
        // descriptor on every call
        raf.close()
    }
    return newpos
}

wtmp = new File('Pleac/data/wtmp')
// wtmpTailingScript:
// polls wtmp forever and prints records appended since the last poll
sampleInterval = 2000 // 2000 millis = 2 secs
filePointer = wtmp.size() // begin tailing from the end of the file
while(true) {
    long currentLength = wtmp.size()
    // the file grew past the point we have processed: there is data to read
    if( filePointer < currentLength ) {
        filePointer = processWtmpRecords(wtmp, filePointer)
    }
    // wait for the next sample
    Thread.sleep( sampleInterval )
}
//----------------------------------------------------------------------------------

Program: tctee

//----------------------------------------------------------------------------------
// contains most of the functionality of the original (not guaranteed to be perfect)
// -i ignores errors, e.g. if one target is write protected, the others will work
// -u writes files in unbuffered mode (ignore for '|')
// -n not to stdout
// -a all files are in append mode
// '>>file1' turn on append for individual file
// '|wc' or '|grep x' etc sends output to forked process (only one at any time)
// Fans each line out to a list of targets. A target is any object with
// write(byte[]) and close() methods; null entries are skipped.
class MultiStream {
    private targets
    private ignoreErrors

    // targets - list of write targets
    // ignore  - when true, a target that fails is dropped and the others carry on
    MultiStream(List targets, ignore) {
        this.targets = targets
        ignoreErrors = ignore
    }

    // Writes content followed by the platform line separator to every target.
    // The caller passes lines with their terminator already stripped, so
    // without the separator all output would run together on one line.
    def println(String content) {
        def data = (content + System.getProperty('line.separator')).bytes
        targets.each{
            try {
                it?.write(data)
            } catch (Exception ex) {
                if (!ignoreErrors) throw ex
                // '-=' builds a new list, so the list being iterated is untouched
                targets -= it
                // a broken target may also fail to close; that must not
                // defeat the ignore-errors mode
                try { it?.close() } catch (Exception ignored) { }
            }
        }
    }

    // Closes every remaining target.
    def close() { targets.each{ it?.close() } }
}

// One output destination for tee: a file (optionally in append mode), a
// forked process fed through a pipe, or an existing OutputStream.
class TeeTarget {
    private filename
    private stream
    private p

    // name     - 'file', '>>file' (append to this file) or '|command'
    // append   - append mode for plain file names
    // buffered - wrap file output in a BufferedOutputStream
    // ignore   - when true, a file that cannot be opened yields an inert target
    TeeTarget(String name, append, buffered, ignore) {
        if (name.startsWith('>>')) {
            createFileStream(name[2..-1],true,buffered,ignore)
        } else if (name.startsWith('|')) {
            createProcessReader(name[1..-1])
        } else {
            createFileStream(name,append,buffered,ignore)
        }
    }

    TeeTarget(OutputStream stream) { this.stream = stream }

    // Both are no-ops when no stream could be opened.
    def write(bytes) { stream?.write(bytes) }
    def close() { stream?.close() }

    private createFileStream(name, append, buffered, ignore) {
        filename = name
        def fos
        try {
            fos = new FileOutputStream(name, append)
        } catch (Exception ex) {
            if (ignore) return
            // without -i the failure must surface here; swallowing it would
            // wrap a null stream and fail later with an unrelated error
            throw ex
        }
        if (!buffered) stream = fos
        else stream = new BufferedOutputStream(fos)
    }
    private createWriter(os) {new PrintWriter(new BufferedOutputStream(os))}
    private createReader(is) {new BufferedReader(new InputStreamReader(is))}

    // Starts a thread that copies lines from br to pw until br reaches end
    // of stream, then flushes and closes pw.
    private createPiperThread(br, pw) {
        Thread.start{
            def next
            while((next = br.readLine())!=null) {
                pw.println(next)
            }
            pw.flush(); pw.close()
        }
    }

    // Runs 'name' as an external process: bytes written to this target are
    // piped to the process's stdin, and the process's stdout is copied to
    // System.out.
    private createProcessReader(name) {
        def readFromStream = new PipedInputStream()
        def r1 = createReader(readFromStream)
        stream = new BufferedOutputStream(new PipedOutputStream(readFromStream))
        p = Runtime.runtime.exec(name)
        def w1 = createWriter(p.outputStream)
        createPiperThread(r1, w1)
        def w2 = createWriter(System.out)
        def r2 = createReader(p.inputStream)
        createPiperThread(r2, w2)
    }
}

// Parse the command line: options start with '-', everything else is a target.
targets = []
append = false; ignore = false; includeStdout = true; buffer = true
(0..<args.size()).each{
    arg = args[it]
    if (arg.startsWith('-')) {
        switch (arg) {
            case '-a': append = true; break
            case '-i': ignore = true; break
            case '-n': includeStdout = false; break
            case '-u': buffer = false; break
            default:
                println "usage: tee [-ainu] [filenames] ..."
                System.exit(1)
        }
    } else targets += arg
}
targets = targets.collect{ new TeeTarget(it, append, buffer, ignore) }
if (includeStdout) targets += new TeeTarget(System.out)
def tee = new MultiStream(targets, ignore)
// eachLine runs until end of input; a truthiness test on the line would
// stop at the first blank line, because an empty string is false in Groovy
System.in.eachLine{ text -> tee.println(text) }
tee.close()
//----------------------------------------------------------------------------------

Program: laston

//----------------------------------------------------------------------------------
// most of the functionality - uses an explicit uid - ran on ubuntu 6.10 on intel
// Looks up the fixed-size lastlog record for one uid and prints it.
lastlog = new File('Pleac/data/lastlog')
uid = 1000
recsize = 4 + 32 + 256   // time + line + host
buf = ByteBuffer.allocate(recsize)
channel = new RandomAccessFile(lastlog, 'r').channel
try {
    // records are indexed by uid
    channel.position(uid * recsize)
    channel.read(buf)
} finally {
    channel.close() // the record is in buf; release the file handle
}
buf.flip()
date = getDate(buf)
line = getString(buf,32)
host = getString(buf,256)
println "User with uid $uid last logged on $date from ${host?host:'unknown'} on $line"
// => User with uid 1000 last logged on Sat Jan 13 09:09:35 EST 2007 from unknown on :0
//----------------------------------------------------------------------------------