[gwlester] 15-Mar-2011
The CSV package in TclLib does handle multiline quoted fields:
======
% package require csv
0.7.2
% set line {a,"b1
b2",c}
a,"b1
b2",c
% set list [::csv::split $line]
a {b1
b2} c
%
======
----
[pi31415] 18-Mar-2011
Some tcllib examples don't handle multiline quoted fields:
$ cd examples/csv
$ cat foo.csv
a,"b1
b2",c
$ ./csv2html foo.csv
Title
Title
----
Line csv parser can't do proper handling of multiline quoted field.
[Michael Heca]'s solution:
proc csvParseFileCmd { file cmd {sep {,}} {quot {'}} } {
# prepare regular expresions
set rQuotedText [subst -nobackslashes -nocommands {(?:[^${quot}]|${quot}${quot})*}]
set rQ [subst -nobackslashes -nocommands {^[ \t]*${quot}}]
set rQN [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)${quot}[ \t]*${sep}(.*)$}]
set rQE [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)${quot}[ \t]*$}]
set rQO [subst -nobackslashes -nocommands {^[ \t]*${quot}($rQuotedText)$}]
set rON [subst -nobackslashes -nocommands {^($rQuotedText)${quot}[ \t]*${sep}(.*)$}]
set rOE [subst -nobackslashes -nocommands {^($rQuotedText)${quot}[ \t]*$}]
set rOX [subst -nobackslashes -nocommands {^$rQuotedText${quot}[^${quot}]}]
set rUN [subst -nobackslashes -nocommands {^([^${sep}]*)${sep}(.*)$}]
set qMap [list "${quot}${quot}" $quot]
# parse whole file
gets $file line
while { $line != {} } {
set list {}
# parse one record
while { $line != {} } {
if { [regexp $rQ $line] } {
# handle quoted fields
if { [regexp $rQN $line match field rest] } {
# quoted with next
set field [string map $qMap $field]
set line $rest
} elseif { [regexp $rQE $line match field rest] } {
# last quoted
set field [string map $qMap $field]
set line {}
} elseif { [regexp $rQO $line match field] } {
# open quoted
set field [string map $qMap $field]
gets $file line
while { 1 } {
if { [string first $quot $line] == -1 } {
# speed next line
append field $line
gets $file line
} elseif { [regexp $rON $line match next rest] } {
# end with next fields
append field "\n[string map $qMap $next]"
set line $rest
break
} elseif { [regexp $rOE $line match next] } {
# end
append field "\n[string map $qMap $next]"
set line {}
break
} elseif { [regexp $rOX $line] } {
error "Invalid CSV syntax.\nline=$line\nfield=$field"
} else {
# next line
append field "\n[string map $qMap $next]"
gets $file line
}
}
} else {
error "Invalid CSV syntax.\nline=$line"
}
} else {
# handle unquoted field
if { [regexp $rUN $line match field rest] } {
# unquoted with next
set line $rest
} else {
# unquoted last
set field $line
set line {}
}
# tream leading and trailing spaces
set field [string trim $field]
}
# append parset field to record list
lappend list $field
}
# call command
eval $cmd [list $list]
# get next record line
gets $file line
}
}
File is read by line. For each csv record is called proc cmd with list of parsed fields as params.
Test example:
proc csvDump { list } {
global out
puts $out [string map { \n "#" } [join $list |]]
}
proc test {} {
global out
set out [open test.out w]
fconfigure $out -encoding iso8859-2
set fd [open test.csv]
fconfigure $fd -encoding cp1250
csvParseFileCmd $fd csvDump , '
close $fd
close $out
}
test
Tested with 118MB csv file. On Athlon XP 1600/512MB RAM, Debian Woody/Tcl 8.4 take about 3 min 20 sec.
----
Maybe this is too simple but this has worked for me. If the line ends in a quoted field there will be an odd number of " 's. I'm deliberately not keeping the embedded line ends so it may not suit others' needs. The returned line can be parsed with tcllib's csv parser - VPT
proc buildLine {ch} {
set cnt 0
set line {}
while {![eof $ch] } {
gets $ch linePart
set cnt [expr $cnt + [regexp -all {"} $linePart]]
append line $linePart
if {$cnt % 2 == 0} break
}
return $line
}
----
[pi31415]
Here is a brute force emitter and parser based on a script I wrote for a recent hobby project. csv::formatCsv expects a list of lists and produces CSV. csv::parseText expects CSV and produces a list of lists.
# Code to emit or parse csv
# See also: http://en.wikipedia.org/wiki/CSV_application_support
# Uses Unicode. For ASCII: http://billposer.org/Software/uni2ascii.html
namespace eval csv {
# leave eol \n and use translation to write other formats
set equo {""}
set eol "\n"
set quo {"}
set sep ","
}
proc csv::encodeLine {fields} {
set field undefined
set parts [list]
set retval undefined
foreach {field} $fields {
set field [regsub -all $csv::quo $field $csv::equo]
if {[regexp "\[$csv::sep\\s\]" $field]} {
set field "$csv::quo$field$csv::quo"
}
lappend parts $field
}
set retval [join [list [join $parts $csv::sep] $csv::eol] ""]
return $retval
}
proc csv::formatCsv {data} {
set line undefined
set lines [list]
set retval undefined
foreach {line} $data {
lappend lines [encodeLine $line]
}
set retval [join $lines ""]
return $retval
}
proc csv::parseText {text} {
set c undefined
set chars [list]
set fields [list]
set i undefined
set parts [list]
set quoted false
set retval [list]
set text [string map [list "\r" ""] $text]
if {![regexp "$csv::eol\$" $text]} {
append text $csv::eol
}
# Parse the content character by character
for {set i 0} {$i < [string length $text]} {incr i} {
set c [string index $text $i]
if {$c eq $csv::quo} {
set quoted [expr {!$quoted}]
}
if {$c eq $csv::eol && !$quoted} {
# End of line
lappend parts $chars
foreach {chars} $parts {
if {[llength $chars] > 0} {
if {[lindex $chars 0] eq $csv::quo} {
# Remove enclosure delimiters
set chars [lrange $chars 1 "end-1"]
} elseif {[join [lrange $chars 0 1] ""] eq "=$csv::quo"} {
# Remove formula sigil for CSV_application_support
set chars [lrange $chars 2 "end-1"]
}
}
# dequote quotes
lappend fields [regsub -all \
$csv::equo \
[join $chars ""] \
$csv::quo \
]
}
lappend retval $fields
set chars [list]
set fields [list]
set parts [list]
} elseif {$c eq $csv::sep && !$quoted} {
# End of field
lappend parts $chars
set chars [list]
} else {
# Inside the field
lappend chars $c
}
}
return $retval
}
set data [list \
[list row row row "your boat"] \
[list gently down the stream] \
]
puts "original data:\n$data\n"
set text [csv::formatCsv $data]
puts "text:\n$text"
set newdata [csv::parseText $text]
puts "newdata:\n$newdata"
----
See also [csv]
See also [LogParser]
<> File | Parsing