Version 0 of Parsing csv file

Updated 2002-12-15 11:56:02

A line-by-line CSV parser cannot properly handle quoted fields that span multiple lines.

Michael Heca's solution:

 # csvParseFileCmd --
 #
 #   Read CSV records from an open channel and invoke a callback once per
 #   record.  Quoted fields may span several physical lines; a doubled
 #   quote character inside a quoted field stands for one literal quote.
 #   Unquoted fields are trimmed of leading and trailing whitespace.
 #   Completely empty lines between records are skipped.
 #
 # Arguments:
 #   file  Channel opened for reading.
 #   cmd   Command prefix; it is evaluated with the list of fields of the
 #         record appended as a single argument.
 #   sep   Field separator (default: ,).
 #   quot  Quote character (default: ').
 #
 # Results:
 #   Returns an empty string.  Raises an error "Invalid CSV syntax..." when
 #   a quoted field is malformed or is still open at end of file.
 proc csvParseFileCmd { file cmd {sep {,}} {quot {'}} } {
    # Escape regexp metacharacters so that separators or quotes such as
    # "|" or "." are matched literally instead of breaking the patterns.
    regsub -all {[^[:alnum:]_]} $sep  {\\&} sepRe
    regsub -all {[^[:alnum:]_]} $quot {\\&} quotRe
    # Prepare regular expressions.
    #   rQuotedText  body of a quoted field (doubled quotes allowed)
    #   rQ           line starts with a quoted field
    #   rQN / rQE    quoted field followed by a separator / ending the line
    #   rQO          quoted field left open at the end of the line
    #   rON / rOE    continuation line closing the field, then separator / end
    #   rOX          continuation line with a closing quote followed by junk
    #   rUN          unquoted field followed by a separator
    set rQuotedText [subst -nobackslashes -nocommands {(?:[^${quotRe}]|${quotRe}${quotRe})*}]
    set rQ  [subst -nobackslashes -nocommands {^[ \t]*${quotRe}}]
    set rQN [subst -nobackslashes -nocommands {^[ \t]*${quotRe}($rQuotedText)${quotRe}[ \t]*${sepRe}(.*)$}]
    set rQE [subst -nobackslashes -nocommands {^[ \t]*${quotRe}($rQuotedText)${quotRe}[ \t]*$}]
    set rQO [subst -nobackslashes -nocommands {^[ \t]*${quotRe}($rQuotedText)$}]
    set rON [subst -nobackslashes -nocommands {^($rQuotedText)${quotRe}[ \t]*${sepRe}(.*)$}]
    set rOE [subst -nobackslashes -nocommands {^($rQuotedText)${quotRe}[ \t]*$}]
    set rOX [subst -nobackslashes -nocommands {^$rQuotedText${quotRe}[^${quotRe}]}]
    set rUN [subst -nobackslashes -nocommands {^([^${sepRe}]*)${sepRe}(.*)$}]
    set qMap [list "${quot}${quot}" $quot]
    # Parse the whole file; gets returns a negative count at end of file.
    while { [gets $file line] >= 0 } {
        if { [string length $line] == 0 } {
            # blank line between records: nothing to report
            continue
        }
        set list {}
        # "more" stays true while another field is expected.  It is needed
        # because a record ending in a separator leaves an empty remainder
        # that still denotes one (empty) trailing field.
        set more 1
        while { $more } {
            if { [regexp $rQ $line] } {
                # handle quoted fields
                if { [regexp $rQN $line match field rest] } {
                    # quoted, followed by further fields
                    set field [string map $qMap $field]
                    set line $rest
                } elseif { [regexp $rQE $line match field] } {
                    # quoted, last field of the record
                    set field [string map $qMap $field]
                    set more 0
                } elseif { [regexp $rQO $line match field] } {
                    # quoted field continues on the following line(s)
                    set field [string map $qMap $field]
                    while { 1 } {
                        if { [gets $file line] < 0 } {
                            error "Invalid CSV syntax: unterminated quoted field at end of file.\nfield=$field"
                        }
                        if { [string first $quot $line] == -1 } {
                            # fast path: no quote at all, whole line belongs to the field
                            append field "\n$line"
                        } elseif { [regexp $rON $line match next rest] } {
                            # field closed, further fields follow
                            append field "\n[string map $qMap $next]"
                            set line $rest
                            break
                        } elseif { [regexp $rOE $line match next] } {
                            # field closed at the end of the record
                            append field "\n[string map $qMap $next]"
                            set more 0
                            break
                        } elseif { [regexp $rOX $line] } {
                            error "Invalid CSV syntax.\nline=$line\nfield=$field"
                        } else {
                            # only doubled quotes on this line: field is still open
                            append field "\n[string map $qMap $line]"
                        }
                    }
                } else {
                    # closing quote followed by something other than a separator
                    error "Invalid CSV syntax.\nline=$line"
                }
            } else {
                # handle unquoted field
                if { [regexp $rUN $line match field rest] } {
                    # unquoted, followed by further fields
                    set line $rest
                } else {
                    # unquoted, last field of the record
                    set field $line
                    set more 0
                }
                # trim leading and trailing whitespace
                set field [string trim $field]
            }
            # append the parsed field to the record list
            lappend list $field
        }
        # hand the finished record to the callback
        eval $cmd [list $list]
    }
    return
 }

Test example:

 # csvDump --
 #
 #   Callback for csvParseFileCmd: writes one record as a single line to the
 #   channel stored in the global variable "out".  Fields are joined with
 #   "|" and every newline inside the record is shown as "#".
 proc csvDump { list } {
    upvar #0 out channel
    set record [join $list |]
    puts $channel [string map [list "\n" "#"] $record]
 }

 # test --
 #
 #   Example driver: parses test.csv (read as cp1250) with "," as separator
 #   and "'" as quote, and writes every record through csvDump to test.out
 #   (written as iso8859-2).  Both channels are closed even when opening
 #   the input or parsing fails; the original error is then re-raised.
 proc test {} {
    global out
    set out [open test.out w]
    set fd {}
    set failed [catch {
        fconfigure $out -encoding iso8859-2
        set fd [open test.csv]
        fconfigure $fd -encoding cp1250
        csvParseFileCmd $fd csvDump , '
    } msg]
    if { $failed } {
        # save the error state before close can overwrite it
        set savedInfo $::errorInfo
        set savedCode $::errorCode
    }
    if { [string length $fd] > 0 } {
        close $fd
    }
    close $out
    if { $failed } {
        error $msg $savedInfo $savedCode
    }
 }

 # Run the example: reads test.csv and writes the dumped records to test.out.
 test

See also csv