Version 17 of Parsing csv file

Updated 2011-03-19 15:22:47 by pi

gwlester 15-Mar-2011

The CSV package in TclLib does handle multiline quoted fields:

% package require csv
0.7.2
% set line  {a,"b1
b2",c}
a,"b1
b2",c
% set list [::csv::split $line]
a {b1
b2} c
% 

Line csv parser can't do proper handling of multiline quoted field.

Michael Heca's solution:

 proc csvParseFileCmd { file cmd {sep {,}} {quot {'}} } {

    # prepare regular expresions
    set rQuotedText [subst -nobackslashes -nocommands {(?:[^${quot}]|${quot}${quot})*}]
    set rQ  [subst -nobackslashes -nocommands {^[ \t]*${quot}}]
    set rQN [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)${quot}[ \t]*${sep}(.*)$}]
    set rQE [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)${quot}[ \t]*$}]
    set rQO [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)$}]
    set rON [subst -nobackslashes -nocommands {^($rQuotedText)${quot}[ \t]*${sep}(.*)$}]
    set rOE [subst -nobackslashes -nocommands {^($rQuotedText)${quot}[ \t]*$}]
    set rOX [subst -nobackslashes -nocommands {^$rQuotedText${quot}[^${quot}]}]
    set rUN [subst -nobackslashes -nocommands {^([^${sep}]*)${sep}(.*)$}]
    set qMap [list "${quot}${quot}" $quot]

    # parse whole file
    gets $file line
    while { $line != {} } {
        set list {}
        # parse one record
        while { $line != {} } {
            if { [regexp $rQ $line] } {
                # handle quoted fields
                if { [regexp $rQN $line match field rest] } {
                    # quoted with next
                    set field [string map $qMap $field]
                    set line $rest
                } elseif { [regexp $rQE $line match field rest] } {
                    # last quoted 
                    set field [string map $qMap $field]
                    set line {}
                } elseif { [regexp $rQO $line match field] } {
                    # open quoted
                    set field [string map $qMap $field]
                    gets $file line
                    while { 1 } {
                        if { [string first $quot $line] == -1 } {
                            # speed next line
                            append field $line
                            gets $file line
                        } elseif { [regexp $rON $line match next rest] } {
                            # end with next fields
                            append field "\n[string map $qMap $next]"
                            set line $rest
                            break
                        } elseif { [regexp $rOE $line match next] } {
                            # end
                            append field "\n[string map $qMap $next]"
                            set line {}
                            break
                        } elseif { [regexp $rOX $line] } {
                            error "Invalid CSV syntax.\nline=$line\nfield=$field"
                        } else {
                            # next line
                            append field "\n[string map $qMap $next]"
                            gets $file line
                        }
                    }
                } else {
                    error "Invalid CSV syntax.\nline=$line"
                }
            } else {
                # handle unquoted field
                if { [regexp $rUN $line match field rest] } {
                    # unquoted with next
                    set line $rest
                } else {
                    # unquoted last
                    set field $line
                    set line {}
                }
                # tream leading and trailing spaces
                set field [string trim $field]
            }
            # append parset field to record list
            lappend list $field
        }
        # call command
        eval $cmd [list $list]
        # get next record line
        gets $file line
    }
 }

File is read by line. For each csv record is called proc cmd with list of parsed fields as params.

Test example:

 proc csvDump { list } {
    global out
    puts $out [string map { \n "#" } [join $list |]]
 }

 proc test {} {
    global out
    set out [open test.out w]
    fconfigure $out -encoding iso8859-2
    set fd [open test.csv]
    fconfigure $fd -encoding cp1250
    csvParseFileCmd $fd csvDump , '
    close $fd
    close $out
 }

 test

Tested with 118MB csv file. On Athlon XP 1600/512MB RAM, Debian Woody/Tcl 8.4 take about 3 min 20 sec.


Maybe this is too simple but this has worked for me. If the line ends in a quoted field there will be an odd number of " 's. I'm deliberately not keeping the embedded line ends so it may not suit others' needs. The returned line can be parsed with tcllib's csv parser - VPT

 proc buildLine {ch} {

      set cnt 0
      set line {}
      while {![eof $ch] } {
          gets $ch linePart
          set cnt [expr $cnt + [regexp -all {"} $linePart]]
          append line $linePart
          if {$cnt % 2 == 0} break
      }
      return $line
 }

pi31415

Here is a brute force emitter and parser based on a script I wrote for a recent hobby project. csv::formatCsv expects a list of lists and produces CSV. csv::parseText expects CSV and produces a list of lists.

 # Code to emit or parse csv
 # See also: http://en.wikipedia.org/wiki/CSV_application_support
 # Uses Unicode.  For ASCII: http://billposer.org/Software/uni2ascii.html

 namespace eval csv {
     # leave eol \n and use translation to write other formats
     set equo {""}
     set eol "\n"
     set quo {"}
     set sep ","
 }

 proc csv::encodeLine {fields} {
     set field undefined
     set parts [list]
     set retval undefined
     foreach {field} $fields {
         set field [regsub -all $csv::quo $field $csv::equo]
         if {[regexp "\[$csv::sep\\s\]" $field]} {
             set field "$csv::quo$field$csv::quo"
         }
         lappend parts $field
     }
     set retval [join [list [join $parts $csv::sep] $csv::eol] ""]
     return $retval
 }

 proc csv::formatCsv {data} {
     set line undefined
     set lines [list]
     set retval undefined
     foreach {line} $data {
         lappend lines [encodeLine $line]
     }
     set retval [join $lines ""]
     return $retval
 }

 proc csv::parseText {text} {
     set c undefined
     set chars [list]
     set fields [list]
     set i undefined
     set parts [list]
     set quoted false
     set retval [list]
     set text [string map [list "\r" ""] $text]
     if {![regexp "$csv::eol\$" $text]} {
         append text $csv::eol
     }

     # Parse the content character by character
     for {set i 0} {$i < [string length $text]} {incr i} {
         set c [string index $text $i]
         if {$c eq $csv::quo} {
             set quoted [expr {!$quoted}]
         }

         if {$c eq $csv::eol && !$quoted} {
             # End of line
             lappend parts $chars

             foreach {chars} $parts {
                 if {[llength $chars] > 0} {
                     if {[lindex $chars 0] eq $csv::quo} {
                         # Remove enclosure delimiters
                         set chars [lrange $chars 1 "end-1"]
                     } elseif {[join [lrange $chars 0 1] ""] eq "=$csv::quo"} {
                         # Remove formula sigil for CSV_application_support
                         set chars [lrange $chars 2 "end-1"]
                     }
                 }

                 # dequote quotes
                 lappend fields [regsub -all \
                     $csv::equo              \
                     [join $chars ""]        \
                     $csv::quo               \
                 ]
             }

             lappend retval $fields
             set chars [list]
             set fields [list]
             set parts [list]
         } elseif {$c eq $csv::sep && !$quoted} {
             # End of field
             lappend parts $chars
             set chars [list]
         } else {
             # Inside the field
             lappend chars $c
         }
     }
     return $retval
 }

 set data [list                     \
     [list row row row "your boat"] \
     [list gently down the stream]  \
 ]
 puts "original data:\n$data\n"

 set text [csv::formatCsv $data]
 puts "text:\n$text"

 set newdata [csv::parseText $text]
 puts "newdata:\n$newdata"

See also csv

See also LogParser