crosstab

Richard Suchenwirth 2007-02-02 - Just to prove that I'm not always writing very short procs and scripts, here's a utility to cross-tabulate data, in which each line stands for one "case" characterized by attributes in fields separated by a specific character, e.g. CSV. For example, this little data file (saved as test.csv)

 John;M;soccer
 Jane;F;tennis
 Tom;M;football
 Dick;M;soccer
 Harry;M;tennis
 Mary;F;baseball
 Jeff;M;baseball
 Jane;F;tennis

can yield the following tabulation:

 $ crosstab.tcl test.csv -c 2,3

 1\2                    F           M       Total
 ------------------------------------------------
 baseball               1           1           2
 football               0           1           1
 soccer                 0           2           2
 tennis                 2           1           3

 Total                  3           5           8

which might make the point that tennis is more popular with females than with males, etc.

A later addition is that you can also specify a third column as "z axis", where for all values of z a table like the above is produced. The script is also a demonstration of the set usage ...; proc main ...; ...; main pattern that I usually follow.


#!/usr/bin/env tclsh
set usage {$Id: 17641,v 1.7 2007-02-05 19:00:22 jcw Exp $
    usage: crosstab.tcl infile ?-c x,y? ?-sep '\t'? -w 12
    Reads the infile (or stdin if infile is "-") for cases
    Prints on stdout a cross-tabulation for the specified columns.
    -c x,y : columns to use (1...), default is 1,2
    -sep x : column separator, default is ";"
    -w   n : output column width, default is 12
}
if {[llength $argv] == 0} {puts stderr $usage; exit 1}

proc main argv {
     getopt argv -sep sep  ";" 
     getopt argv -c   cols 1,2
     getopt argv -w w 12

    set infile [lindex $argv 0]
    if {$infile eq "-"} {
        set f stdin
    } elseif {[file exists $infile]} {
        set f [open $infile]
    } else {puts stderr "no such file $infile\n$::usage"; exit 1}

    foreach {x y z} [split $cols ,] break
    incr x -1; incr y -1
    if {$z ne ""} {incr z -1}
    while {[gets $f line] >= 0} {
        set fields [split $line $sep]
        set key [lindex $fields $x],[lindex $fields $y]
        if {$z ne ""} {append key ,[lindex $fields $z]}
        inc N($key)
    }
    if {$f ne "stdout"} {close $f}

    set xs [get_values N 0]
    set ys [get_values N 1]
    if {$z ne ""} {set zs [get_values N 2]} else {set zs {{}}}
    foreach zv $zs {
        puts $zv
        set header [format %-${w}s $x\\$y]
        foreach xv [concat $xs Total] {append header [format %${w}s $xv]}
        puts $header
        puts [string repeat - [string length $header]]
        foreach yv [concat $ys Total] {
            set line [format %-${w}s $yv]
            set sum 0
            foreach xv $xs {
                set key $xv,$yv
                if {$z ne ""} {append key ,$zv}
                set n [get N($key)]
                append line [format %${w}d $n]
                incr sum  $n
                set key $xv,Total
                if {$z ne ""} {append key ,$zv}
                inc N($key) $n
            }
            append line [format %${w}d $sum]
            if {$yv eq "Total"} {puts ""}
            puts $line
        }
        puts ""
    }
}
#-- retrieve a variable value, if existing, else return 0
proc get _var {
    upvar 1 $_var var
    if {[info exists var]} {set var} else {return 0}
}
#-- enumerate values used in array keys at one position (comma-separated)
proc get_values {_arr pos} {
    upvar 1 $_arr arr
    set values {}
    foreach i [array names arr] {
        lappend values [lindex [split $i ,] $pos]
    }
    lsort -unique $values
}
#-- option retriever, see [getopt] page
proc getopt {_argv name {_var ""} {default ""}} {
    upvar 1 $_argv argv $_var var
    set pos [lsearch -regexp $argv ^$name]
    if {$pos>=0} {
        set to $pos
        if {$_var ne ""} {
            set var [lindex $argv [incr to]]
        }
        set argv [lreplace $argv $pos $to]
        return 1
    } else {
        if {[llength [info level 0]] == 5} {set var $default}
        return 0
    }
}
#-- From Tcl 8.5, [incr] will auto-initialize, so this workaround will no longer be needed
proc inc {_var {amount 1}} {
    upvar 1 $_var var
    if ![info exists var] {set var 0}
    incr var $amount
}

main $argv

See also crosstab again for a more functional rewrite.