A very simple package i wrote to parse SOIF Objects (RFC 2655, [L1 ]) as used by the harvest search engine [L2 ].
Drop me a note, if you find it useful:
# Parse SOIF Objects as defined by RFC 2655 # # (c) 2002 Michael Schlenker <[email protected]> # # # # License: Use under the same license as the tcl core. # # # uses uri package from tcllib package require Tcl 8.2 package require uri package provide SOIF 0.1 namespace eval SOIF { set version 0.1 set notalnumregexp {[^a-zA-Z0-9\-_]} set identifier {([a-zA-Z0-9\-_]+)\{([0-9]+)\}(:\t)} } # -- SOIF::parse # # Description: Parse a SOIF object into a # list of values. # # Input: single SOIF object # # Output: list of the form # "TEMPLATE-TYPE URL ATTRIBUTE-VALUELIST" # # proc SOIF::parse {obj} { variable notalnumregexp variable identifier # check for @ symbol set k [string first @ $obj] if {$k == -1} { error "No SOIF Object" } # check for template type set l [string first \{ $obj ] if {$l == -1} { error "No SOIF Template Type" } set template_type "" set template_type [string trim \ [string range $obj [expr {$k+1}] [expr {$l-1}]]] # validate, that it is alphanumeric if {[regexp $notalnumregexp $template_type]} { error "Template Type \"$template_type\" not valid \ alphanumeric template-type." } # check for URL set m $l set url_candidate "" while {[string length $url_candidate]==0} { # the rfc is unclear how to identify # the url, trying this set n [string first "\n" $obj $m] set url_candidate [string trim \ [string range $obj [expr {$m+1}] $n]] set m [expr {$n+1}] } # handle the special case that no url is given if {![string equal $url_candidate "-"]} { # check if this is a URL here, # this should throw an error # if no valid url is found if {[catch {uri::split $url_candidate} clist] == 1} { error "URL \"$url_candidate\" not of \ known type." } } set url $url_candidate set attvalue "" # header is done, now parse attribute value pairs set start $n while {[regexp -indices -start $start --\ $identifier $obj -> id length delimeter]} { set id_text [string range $obj \ [lindex $id 0] [lindex $id 1]] set length [string range $obj \ [lindex $length 0] [lindex $length 1]] set offset [expr [lindex $delimeter 1] +1] set value [string range \ $obj $offset [expr {$offset+$length-1}]] lappend attvalue $id_text $value set start [expr {$offset+$length}] } # all identifiers and values have been parsed # check for closing \} if {![regexp -indices -start $start -- {\}} $obj -> dummy]} { error "Missing close brace on obj" } set result [list $template_type $url $attvalue] return $result } # -- SOIF::readObjectFromFile # # Description: Reads a SOIF Object from File # (only one object per file should be used) # # Input: Filename # # Output: SOIF Object # proc SOIF::readObjectFromFile { filename } { if {![file exists $filename]} { error "No file \"${filename}\" exists." } if {[catch {open $filename r} fid]} { error "Opening \"$filename\" failed." } # set the translation to binary, # as SOIF can contain arbitrary data fconfigure $fid -translation binary set obj [read $fid] if {[catch {close $fid}]} { error "Closing \"$filename\" with \ channel ID \"$fid\" failed." } return $obj } # -- SOIF::writeObjectToFile # # Description: Writes the string rep of a SOIF Object to Disk # The string rep should be built # with SOIF::create. # # Input: Filename # SOIF-Object # # Output: -- # proc SOIF::writeObjectToFile { filename obj } { if {![file exists $filename]} { error "File \"$filename\" exists, cannot write" } if {[catch {open $filename w+} fid]} { error "File \"$filename\" could not be \ opened for writing." } # set the translation to binary, # as SOIF can contain arbitrary data fconfigure $fid -translation binary puts -nonewline $fid $obj if {[catch {close $fid}]} { error "Closing \"$filename\" with channel \ ID \"$fid\" failed." } } # -- SOIF::create # # Description: Creates a string rep from the parts # of a SOIF object. # # Bugs: Does not check, if the data it gets # is well formed. # # Input: Template-Type # URL # Identifier-Value List # # Output: SOIF String rep # proc SOIF::create {template-type url attvaluelist} { set obj "" append obj "@${template-type} \{ " append obj $url append obj "\r\n" foreach {attribute value} $attlist { set length [string length $value] set identifier "$attribute\{$length\}:\t" append obj $identifier $value # prettyprinting with extra newlines append obj "\r\n" } append obj "\}\n" return $obj }