xtt: XML <--> Text Tag translator

EKB I wrote this long before I knew about this wiki. It converts tags in a text canvas to and from XML (so you can write up an XML file, then load it into a text widget). It needs some work (such as entities and handling nested tags), but can do a few tricks, so I thought it was worth sharing.

The main file, xtt.tcl:

    package require dom 2.0
    package require struct
    
    # xtt: The XML <--> Text Tag Translator
    # ver 1.0
    #
    # Copyright (c) 2003 Eric Kemp-Benedict
    # All Rights Reserved
    #
    # This code is freely distributable, but is provided as-is with
    # no waranty expressed or implied.
    #
    # Send comments to [email protected]. If you make improvements,
    # please send them to me. I will give you credit and distribute the
    # improved code. Thanks!
    
    # == Description ==
    #
    # xtt offers an interface between a tk text widget's tags and
    # an XML document's tags.
    #
    # NOTE: It's pretty basic. In particular, it doesn't (yet) handle
    # nested tags or entities.
    #
    # To use it, just "source" xtt.tcl in your tcl script. Note
    # that xtt requires the dom and struct packages. Struct is
    # part of tcllib.
    #
    # Example:
    #
    # Step 1: Associate XML codes with text widget tags
    #
    #  set xtt::tagArray(i) italic      ;# "i" is the XML tag, "italic" is the text widget tag
    #  set xtt::tagArray(b) bold
    #  set xtt::tagArray(bi) boldital
    #  set xtt::tagArray(sup) super
    #  set xtt::tagArray(sub) sub
    #
    # Step 2: Specify a tag for a paragraph element (defaults to p, so this is optional)
    #
    #  set xtt::paraElem para
    #
    # Step 3: Start translating!
    #
    # From XML -> TextWidget
    #  xtt::XMLtoText .myTextWidget $parsedXMLdoc
    #
    # From Text Widget -> XML
    #  set XMLoutput [xtt::TextToXML .t]
    #
    # From Text Widget -> DOMnode
    #  set DOMnode [xtt::TextToDOM .t]
    #
    
    namespace eval xtt {
    
       #############################################
       ##
       ## Interface
       ##
       #############################################
    
       variable tagArray
       variable paraElem "p"
    
       proc XMLtoText {w DOMnode} {
          variable tagArray
          variable paraElem
    
          set paralist [dom::element getElementsByTagName $DOMnode $paraElem]
          foreach p [set $paralist] {
             xtt::expandNodes $w $p
             $w insert end "\n"
          }
       }
    
       proc TextToXML {w} {
          # NOTE: The stack is for future flexibility. At the moment nested tags are not
          # processed. In future versions I expect to process nested tags and that will
          # be easier with a stack.
    
          struct::stack tagStack
    
          set dump [$w dump -tag -text 1.0 end]
          set length [llength $dump]
    
          set retval "<p>"
    
          for {set i 0} {$i < $length} {incr i} {
             switch [lindex $dump $i] {
                text {
                   incr i
                   set retval $retval[lindex $dump $i]
                }
                tagon {
                   incr i
                   tagStack push [xtt::getTagCode [lindex $dump $i]]
                   set retval $retval<[tagStack peek]>
                }
                tagoff {
                   incr i
                   set retval $retval</[tagStack pop]>
                }
             }
          }
    
          set retval $retval</p>
    
          # Replace all newlines with "</p><p>"
          regsub -all -- "\\n" $retval "</p><p>" retval
    
          # Strip multiple newlines at the end
          regsub -- "(<p></p>)+$" $retval "" retval
    
          tagStack destroy
    
          return $retval
       }
    
       proc TextToDOM {w} {
          # Wrap the XML in a fake "document"
          set XMLtext "<document>[TextToXML $w]</document>"
    
          # Return the first child (which is all the contents)
          return [dom::node cget [dom::parse $XMLtext] -firstChild]
       }
    
       #############################################
       ##
       ## Supporting routines
       ##
       #############################################
    
       proc expandNodes {w paraNode} {
          variable tagArray
          variable paraElem
    
          set childList [dom::node children $paraNode]
          foreach child $childList {
             if {[dom::node cget $child -nodeType] != "textNode"} {
                # Recursively call expandNodes, to nest tags
                # Nested tags follow formatting rules for Tk text widget tags
                xtt::expandNodes $w $child
             }
             set type [dom::node cget [dom::node parent $child] -nodeName]
             set val [stripNewlines [dom::node cget $child -nodeValue]]
             if {$type == $paraElem} {
                $w insert end $val
             } else {
                $w insert end $val $tagArray($type)
             }
          }
    }
    
       proc stripNewlines {text} {
    
          # Loop through and remove any newlines from text. Replace with a space if adjacent characters are not spaces,
          #   or if not at beginning or end of string.
          while {[string first "\n" $text] != -1} {
             set newlinePos [string first "\n" $text]
             set charBefore [expr $newlinePos - 1]
             set charAfter [expr $newlinePos + 1]
             set alreadySpace false
             if {$newlinePos == 0 || $newlinePos == [expr [string length $text] - 1]} {set alreadySpace true}
             if {$newlinePos != 0} {
                if {[string range $text $charBefore $charBefore] == " "} {set alreadySpace true}
             }
             if {$newlinePos != [expr [string length $text] - 1]} {
                if {[string range $text $charAfter $charAfter] == " "} {set alreadySpace true}
             }
             if {$alreadySpace} {
                set replaceText ""
             } else {
                set replaceText " "
             }
             set text [string replace $text $newlinePos $newlinePos $replaceText]
          }
          return $text
       }
    
       proc getTagCode {code} {
          variable tagArray
    
          foreach name [array names tagArray] {
             if {$tagArray($name) == $code} {return $name}
          }
    
          error "Tag code does not exist"
       }
    
    
    }

A demo script:

    source "xtt.tcl"
    
    ##
    ## Set up the text widget
    ##
    set font(normal) "Times 12"
    set font(ital) "$font(normal) italic"
    set font(bold) "$font(normal) bold"
    set font(boldital) "$font(normal) bold italic"
    set font(small) "Times 8"
    
    text .t -font $font(normal) -wrap word -spacing3 18p -spacing2 6p -width 70 -height 10
    
    # Add the ".proc" window to look at the processed XML
    text .proc -font $font(normal) -wrap word -width 70 -height 10
    
    pack .t -fill both -expand yes -side top
    pack .proc -fill both -expand yes
    
    .t tag config italic -font $font(ital)
    .t tag config bold -font $font(bold)
    .t tag config boldital -font $font(boldital)
    .t tag config super -offset 6 -font $font(small)
    .t tag config sub -offset -6 -font $font(small)
    
    ##
    ## Load the xml source
    ##
    set xmlFile [open "TestDoc.xml" r]
    set document [read $xmlFile]
    close $xmlFile
    
    ##
    ## Process the xml source
    ##
    # Move from "document" down to the main node
    set parsedDoc [dom::node cget [dom::parse $document] -firstChild]
    
    ########################################################
    ##
    ## This is the interface between text widget and XML
    ##
    ########################################################
    
    ##
    ## Associate XML codes with text widget tags
    ##
    set xtt::tagArray(i) italic
    set xtt::tagArray(b) bold
    set xtt::tagArray(bi) boldital
    set xtt::tagArray(sup) super
    set xtt::tagArray(sub) sub
    
    xtt::XMLtoText .t $parsedDoc
    
    .proc insert end [xtt::TextToXML .t]
    
    ########################################################
    ##
    ## End of interface
    ##
    ########################################################

The sample file, TestDoc.xml used by the demo script:

    <body>
    <p>
    This is text, <i>this is italicized</i>, this is normal. Here's a subscript: CO<sub>2</sub>. The rest of the paragraph is pretty long,
    allowing it to be wrapped in the window. It just keeps going and going and there isn't much you can do about it. What would you do about it, anyway? Just make sure it wraps
    properly and also that any newlines in the XML file are properly
    stripped out before putting them in the text widget. Only text marked
    off with paragraph tags should receive newlines.
    </p>
    <p>
    This is another paragraph, with <b>bold</b> text in it. Later in this paragraph I will add some other
    special text, but first I want a long enough run of text that there may be some wrapping. Otherwise,
    I'm curious to see what a superscript<sup>1</sup> might look like.
    </p>
    <p>Unfortunately, xtt doesn't (yet) handle nested tags, so I have to
    make up a new tag to do <bi>bold italic</bi>. To have some bold text inside
    an italicized block, I have to do this: <i>This is </i><bi>so</bi> <i>italic!</i></p>
    </body>
if you prefer working with tdom replace these rows:
in xtt.tcl

line 1: package require tdom
line 70: set paralist [$DOMnode getElementsByTagName $paraElem]
line 71: foreach p $paralist {
line 125: return [dom parse $XMLtext]
line 138: set childList [$paraNode childNodes]
line 140: if {[$child nodeType] != "TEXT_NODE"} {
line 145: set type [[$child parentNode] nodeName]
line 146: set val [stripNewlines [$child nodeValue]]


in test.tcl
    set parsedDoc [dom  parse $document]