I asked some questions about regular expressions on the Tclers Chat, and those who helped me with my answers suggested I post the solutions to a new page, so I did: see Advanced Regular Expression Examples. In addition, they thought I should post my entire code, so I have, below. buchs
#!/usr/bin/tclsh
# Convert Wiki Content from the OddMuse (http://www.oddmuse.org) wiki syntax to
# Confluence (http://www.atlassian.com/software/confluence/) wiki syntax.
# The result is a series of .txt files that need to be imported via Confluence.
# Kevin Buchs, Mayo Clinic, 2008

# Specify the location of the OddMuse storage and the directory where you want
# output files written.
set gbl(wikisource) //sppdgfs/local/doc/oddmuse-sppdg/page/*
set gbl(work) /users/buchs/wiki_import

# Replaces every space inside a well-formed [[...]] span with an underscore.
#
# Internal links might have spaces between the words, e.g. [[My Node]], and
# these are mapped by OddMuse to underscores for the sake of the file names -
# which are how the pages are referenced.
#
# The text is walked once using the match indices, so only the inside of each
# link is touched; an unterminated "[[" can never cause text outside a link to
# be rewritten.
#
#   text    - wiki text to scan
#   returns - the text with the spaces inside [[...]] replaced by "_"
proc underscore_link_spaces {text} {
    set result ""
    set cursor 0
    foreach span [regexp -all -inline -indices {\[\[[^][]+\]\]} $text] {
        foreach {first last} $span break
        # copy the untouched text in front of the link, then the fixed link
        append result [string range $text $cursor [expr {$first - 1}]]
        append result [string map {{ } _} [string range $text $first $last]]
        set cursor [expr {$last + 1}]
    }
    append result [string range $text $cursor end]
    return $result
}

# This converts the contents of each file found in OddMuse.
#
#   thebody - page text in OddMuse markup
#   returns - the same text rewritten in Confluence wiki markup
#
# The conversion is an ordered series of regsub passes; the order matters
# (e.g. bold before italics, code lines before indented text).
# NOTE: in Tcl regular expressions the first quantifier of a pattern decides
# whether the whole match prefers the longest or the shortest text, so be
# careful when editing the patterns below.
proc convert_body {thebody} {

    # double-bracketed expressions with URLs
    regsub -all {\[\[(http://.*?) (.*?)\]\]} $thebody {[\2|\1]} thebody

    # double-bracketed expressions without URLs:
    # first take the spaces out of the page names ...
    set thebody [underscore_link_spaces $thebody]
    # ... then make the double brackets single brackets
    regsub -all {\[\[(.*?)\]\]} $thebody {[\1]} thebody

    # various single-bracketed expressions with URLs
    regsub -all {\[(http://[^] ]*?)\]} $thebody {[note|\1]} thebody
    regsub -all {\[(http://[^ ]*?) ([^]]*?)\]} $thebody {[\2|\1]} thebody

    # images - the file name capture must be greedy, otherwise it is allowed
    # to match the empty string and the name ends up in the discarded group
    regsub -all {\[image:([^] ]*)[ ]*([^]]*?)\]} $thebody {!\1!} thebody

    # look for CamelCase words and force them to links - but check that they
    # are not already links (preceded by "[") or escaped (preceded by "!")
    regsub -all {([^A-Za-z\[!])([A-Z]+?[a-z]+?[A-Z]+?.*?)(\s)} $thebody {\1[\2]\3} thebody

    # look for escaped CamelCase words, remove the escape.
    # NOTE(review): this also strips the leading "!" of an image whose file
    # name is CamelCase (e.g. !MyPicture.png!) - confirm whether that occurs.
    regsub -all {(!)([A-Z]+?[a-z]+?[A-Z])} $thebody {\2} thebody

    # Headings - start lowest to highest. Confluence headings are "hN. text".
    regsub -all {====== *([^=\n]+?) *======} $thebody {h6. \1} thebody
    regsub -all {===== *([^=\n]+?) *=====} $thebody {h5. \1} thebody
    regsub -all {==== *([^=\n]+?) *====} $thebody {h4. \1} thebody
    regsub -all {=== *([^=\n]+?) *===} $thebody {h3. \1} thebody
    regsub -all {== *([^=\n]+?) *==} $thebody {h2. \1} thebody
    regsub -all {= *([^=\n]+?) *=} $thebody {h1. \1} thebody

    # Lists - nothing to do for bullets or numbered. For definition lists there
    # is no good analog. Just approximate by bolding the term with a colon and
    # two spaces following. (The term capture is greedy so that the trailing
    # " *" really consumes the original spaces.)
    regsub -all -lineanchor {^;([^:]+:) *} $thebody {*\1*  } thebody

    # Bold and Italics - need to include newlines. Bold must be done first.
    regsub -all {'''(.+?)'''} $thebody {*\1*} thebody
    regsub -all {''(.+?)''} $thebody {_\1_} thebody

    # Tables
    regsub -all {\|\|} $thebody {|} thebody

    # Code lines - a run of lines starting with a space is wrapped in a
    # {noformat} macro, with the macro tags on lines of their own
    regsub -all -lineanchor {((^ +[^\n]+\n)+)} $thebody "{noformat}\n\\1{noformat}\n" thebody

    # Indented Text (leading colon) - just make 2 spaces - do this after
    # handling code lines
    regsub -all -lineanchor {^:} $thebody {  } thebody

    # Horizontal lines - four dashes on a line alone - same in confluence.

    # HTML Tags that are supported
    regsub -all {</?(em|i)>} $thebody {_} thebody
    regsub -all {</?(strong|b)>} $thebody {*} thebody
    regsub -all {</?u>} $thebody {+} thebody

    # Next fixed width format. The braces are written unescaped: regsub copies
    # a backslash in front of a brace straight into the output.
    regsub -all {<tt>(.+?)</tt>} $thebody {{{\1}}} thebody

    # The special tags nowiki, code and pre are different in OddMuse but
    # will/can all be translated as the same {{}} construct in Confluence
    # because Confluence respects inserted line breaks anyway.
    regsub -all {<nowiki>(.+?)</nowiki>} $thebody {{{\1}}} thebody
    regsub -all {<code>(.+?)</code>} $thebody {{{\1}}} thebody
    regsub -all {<pre>(.+?)</pre>} $thebody {{{\1}}} thebody

    return $thebody
}

# Steps through the files of OddMuse, converts the content via a call to
# convert_body and then writes the results to a similarly named file with a
# .txt extension.
proc ExtractionFromOddmuseFiles {} {
    # Walks every directory matched by gbl(wikisource), reads each *.db page
    # file in it, extracts the "text" field, converts it with convert_body and
    # writes the result to gbl(work)/<page name>.txt.
    # Takes no arguments; progress and problems are reported on stdout.
    global gbl

    # The following is Control-^ (ASCII 0x1E) times two; it is the delimiter
    # of choice for OddMuse database files.
    # NOTE(review): the literal control characters were lost from the posted
    # script; the value is rebuilt from the description above - confirm it
    # against a real .db file.
    set doohicky "\u001e\u001e"

    # Make sure the output directory exists before anything is written.
    file mkdir $gbl(work)

    # Step through the directories
    foreach dir1 [glob $gbl(wikisource)] {

        # And each file in each directory. -nocomplain: a directory without
        # any .db file is skipped instead of aborting the whole scan.
        foreach filename [glob -nocomplain -directory $dir1 -- *.db] {

            set fp [open $filename r]
            set data [read $fp]
            close $fp

            # The page body is the value of the "text" field, which is framed
            # by the delimiter followed by the digit 3.
            if { ! [regexp "${doohicky}3text${doohicky}3\(.*?\)${doohicky}3" $data match thebody] } {
                puts "file: \"$filename\" had a format\n which didn't match my expectations"
            } else {
                set thebody [convert_body $thebody]

                set ofilename [file join $gbl(work) \
                    "[file rootname [file tail $filename]].txt"]
                set fp [open $ofilename w]
                puts $fp $thebody
                close $fp
            }
        }
    }

    puts "Scan complete"
}

# MAIN LINE
ExtractionFromOddmuseFiles
# exit