Reading XLSX file (Excel OpenXML) using vfs::zip and tdom

ALX 2018-02-05

package require Tcl 8.6
package require vfs::zip
package require tdom 0.9.0-

array set cells {}

proc Cell { name data } {
  global cells

  set row {}
  set tmp {}
  foreach char [string toupper [split $name {}]] {
    switch -glob -- $char {
      [A-Z] { 
        binary scan $char cu val
        lappend tmp [incr val -64]
      }
      [0-9] { append row $char }
    }
  }
  set column 0
  set idx [llength $tmp]
  foreach num [lreverse $tmp] {
    set column [expr {int($column + $num * pow(26,[incr idx -1]))}]
  }
  incr row -1
  incr column -1
  set cells($row,$column) $data
}

set sheet 1
set xlsxFile {MyExcelFile.xlsx}

set mnt [vfs::zip::Mount $xlsxFile xlsx]

if {![catch {open xlsx/xl/sharedStrings.xml r} fd]} {
  if {![catch {dom parse [read $fd]} doc]} {
    set root [$doc documentElement]
    set idx -1
    foreach node [$root selectNodes -namespaces [list X [$root namespaceURI]] {/X:sst/X:si/X:t/text()}] {
      set sharedStrings([incr idx]) [$node nodeValue]
    }
    $doc delete
  }
  close $fd
}

if {![catch {open xlsx/xl/worksheets/sheet${sheet}.xml r} fd]} {
  if {![catch {dom parse [read $fd]} doc]} {
    set root [$doc documentElement]
    foreach cell [$root selectNodes -namespaces [list X [$root namespaceURI]] {/X:worksheet/X:sheetData/X:row/X:c}] {
      if {[$cell hasAttribute t]} {
        set type [$cell getAttribute t]
      } else {
        set type n
      }
      set value {}
      switch -- $type {
        n - b - d {
          if {[set node [$cell selectNodes -namespaces [list X [$cell namespaceURI]] X:v/text()]] ne {}} {
            set value [$node nodeValue]
          } else {
            continue
          }
        }
        s {
          if {[set node [$cell selectNodes -namespaces [list X [$cell namespaceURI]] X:v/text()]] ne {}} {
            set index [$node nodeValue]
            if {[info exists sharedStrings($index)]} {
              set value $sharedStrings($index)
            }
          } else {
            continue
          }
        }
        str {
        }
        inlineStr {
          if {[set node [$cell selectNodes -namespaces [list X [$cell namespaceURI]] X:is/text()]] ne {}} {
            set value [$node nodeValue]
          } else {
            continue
          }
        }
        e {
        }
      }
      Cell [$cell getAttribute r] $value
    }
    $doc delete
  }
  close $fd
}

vfs::zip::Unmount $mnt xlsx

array unset -nocomplain sharedStrings

foreach cell [lsort -dictionary [array names cells]] {
  puts "cell $cell = $cells($cell)"
}

SEE ALSO

WEB: Standard ECMA-376 Office Open XML File Formats [L1 ]

Some Notes

beware 23/09/2019: It's necessary to specify unicode encoding before reading the sharedstrings xml file.

If one of the shared strings contains formatting, then there might not be a t node, but rather a set of r nodes. The above code then gets every shared string after the offending string wrong, as the index is off.