Version 7 of Parsing csv file

Updated 2006-02-11 15:16:03

A line-based CSV parser cannot properly handle quoted fields that span multiple lines.

Michael Heca's solution:

 # csvParseFileCmd --
 #
 #   Parse CSV data from an open channel, one record at a time, and invoke
 #   a callback for every record. Quoted fields may span several lines.
 #
 # Arguments:
 #   file  Channel opened for reading; it is consumed up to end of file.
 #   cmd   Command prefix; it is evaluated with one extra argument, the
 #         list of fields of the record just parsed.
 #   sep   Field separator (default ",").
 #   quot  Quote character (default "'"). Inside a quoted field a doubled
 #         quote stands for one literal quote.
 #
 #   NOTE: sep and quot are inserted into the regular expressions as-is,
 #   so characters that are special in regular expressions are not
 #   supported as separator or quote.
 #
 # Results:
 #   Returns an empty string. Unquoted fields are trimmed of surrounding
 #   whitespace; quoted fields keep their content, with line breaks inside
 #   them stored as "\n". Completely empty lines between records are
 #   skipped. Raises an error on malformed quoting, including a quoted
 #   field that is still open at end of file.
 proc csvParseFileCmd { file cmd {sep {,}} {quot {'}} } {

    # prepare regular expressions
    #   rQuotedText  body of a quoted field (non-quotes or doubled quotes)
    #   rQ           line starts with a quoted field
    #   rQN / rQE    complete quoted field followed by separator / end of line
    #   rQO          quoted field opened but not closed on this line
    #   rON / rOE    continuation line closing the field, followed by
    #                separator / end of line
    #   rOX          closing quote followed by something unexpected
    #   rUN          unquoted field followed by separator
    set rQuotedText [subst -nobackslashes -nocommands {(?:[^${quot}]|${quot}${quot})*}]
    set rQ  [subst -nobackslashes -nocommands {^[ \t]*${quot}}]
    set rQN [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)${quot}[ \t]*${sep}(.*)$}]
    set rQE [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)${quot}[ \t]*$}]
    set rQO [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)$}]
    set rON [subst -nobackslashes -nocommands {^($rQuotedText)${quot}[ \t]*${sep}(.*)$}]
    set rOE [subst -nobackslashes -nocommands {^($rQuotedText)${quot}[ \t]*$}]
    set rOX [subst -nobackslashes -nocommands {^$rQuotedText${quot}[^${quot}]}]
    set rUN [subst -nobackslashes -nocommands {^([^${sep}]*)${sep}(.*)$}]
    set qMap [list "${quot}${quot}" $quot]

    # parse whole file; the return value of gets (not the line content)
    # signals end of file, so blank lines do not stop the parser
    while { [gets $file line] >= 0 } {
        if { [string length $line] == 0 } {
            # skip empty lines between records
            continue
        }
        set list {}
        # more is true while another field is expected in this record; it
        # stays true after a separator even when nothing follows it, so a
        # trailing separator yields a final empty field
        set more 1
        # parse one record
        while { $more } {
            if { [regexp $rQ $line] } {
                # handle quoted fields
                if { [regexp $rQN $line match field rest] } {
                    # quoted with next
                    set field [string map $qMap $field]
                    set line $rest
                } elseif { [regexp $rQE $line match field] } {
                    # last quoted
                    set field [string map $qMap $field]
                    set more 0
                } elseif { [regexp $rQO $line match field] } {
                    # open quoted: keep reading lines until the closing quote
                    set field [string map $qMap $field]
                    while { 1 } {
                        if { [gets $file line] < 0 } {
                            # end of file inside a quoted field; without
                            # this check the loop would never terminate
                            error "Invalid CSV syntax.\nUnterminated quoted field at end of file.\nfield=$field"
                        }
                        if { [string first $quot $line] == -1 } {
                            # fast path: no quote on this line at all
                            append field "\n$line"
                        } elseif { [regexp $rON $line match next rest] } {
                            # end with next fields
                            append field "\n[string map $qMap $next]"
                            set line $rest
                            break
                        } elseif { [regexp $rOE $line match next] } {
                            # end
                            append field "\n[string map $qMap $next]"
                            set more 0
                            break
                        } elseif { [regexp $rOX $line] } {
                            error "Invalid CSV syntax.\nline=$line\nfield=$field"
                        } else {
                            # line holds only doubled quotes: it belongs
                            # entirely to the field, which stays open
                            append field "\n[string map $qMap $line]"
                        }
                    }
                } else {
                    error "Invalid CSV syntax.\nline=$line"
                }
            } else {
                # handle unquoted field
                if { [regexp $rUN $line match field rest] } {
                    # unquoted with next
                    set line $rest
                } else {
                    # unquoted last
                    set field $line
                    set more 0
                }
                # trim leading and trailing spaces
                set field [string trim $field]
            }
            # append parsed field to record list
            lappend list $field
        }
        # call command
        eval $cmd [list $list]
    }
    return
 }

The file is read line by line. For each CSV record, the command cmd is called with the list of parsed fields as its argument.

Test example:

 # csvDump --
 #   Write one record to the channel stored in the global variable out:
 #   fields are joined with "|" and every newline is shown as "#".
 proc csvDump { list } {
    set record [join $list |]
    puts $::out [string map [list "\n" "#"] $record]
 }

 # test --
 #   Example driver: reads test.csv (cp1250) with csvParseFileCmd and lets
 #   csvDump write every record to test.out (iso8859-2) through the global
 #   channel variable out.
 proc test {} {
    global out

    # output channel first; csvDump writes to it via the global variable
    set out [open test.out w]
    fconfigure $out -encoding iso8859-2

    # input channel, parsed with "," as separator and "'" as quote
    set src [open test.csv r]
    fconfigure $src -encoding cp1250
    csvParseFileCmd $src csvDump {,} {'}

    close $src
    close $out
 }

 # Run the example: parse test.csv and write the dump to test.out.
 test

Tested with a 118 MB CSV file. On an Athlon XP 1600 with 512 MB RAM running Debian Woody and Tcl 8.4, it takes about 3 min 20 sec.


See also csv

See also LogParser


Maybe this is too simple, but it has worked for me. If a line ends inside a quoted field, it contains an odd number of double quotes ("), so further lines are appended until the count is even. I'm deliberately not keeping the embedded line ends, so it may not suit others' needs. The returned line can be parsed with tcllib's csv parser - VPT

 # buildLine --
 #   Read physical lines from channel ch and concatenate them, without the
 #   line ends, until the accumulated text holds an even number of double
 #   quotes or the channel reports end of file. Returns the joined text.
 proc buildLine {ch} {
      set quoteCount 0
      set record {}
      while {1} {
          if {[eof $ch]} break
          gets $ch chunk
          append record $chunk
          # an odd running total means a quoted field is still open
          incr quoteCount [regexp -all {"} $chunk]
          if {($quoteCount & 1) == 0} break
      }
      return $record
 }

Category File