I asked some questions about regular expressions on the Tclers Chat, and those who helped me suggested I post the solutions on a new page, which I did at [Advanced Regular Expression Examples]. They also thought I should post my entire code, so it is included below. [buchs]
----
======
#!/usr/bin/tclsh
# Convert Wiki Content from the OddMuse (http://www.oddmuse.org) wiki syntax to
# Confluence (http://www.atlassian.com/software/confluence/) wiki syntax.
# The result is a series of .txt files that need to be imported into Confluence.
# Kevin Buchs, Mayo Clinic, 2008
# Script-wide settings, kept in the global array "gbl":
#   wikisource - glob pattern matching the OddMuse page storage directories
#   work       - directory that receives the generated output files
array set gbl {
    wikisource //sppdgfs/local/doc/oddmuse-sppdg/page/*
    work       /users/buchs/wiki_import
}
# Converts the text of one OddMuse page to Confluence wiki markup.
#
# Arguments:
#   thebody - page text in OddMuse syntax
# Returns:
#   the text after all of the rewrites below have been applied, in order.
#   The order matters: later patterns rely on what earlier ones produced.
proc convert_body {thebody} {
    # double-bracketed expressions with URLs: [[url text]] -> [text|url]
    regsub -all {\[\[(http://.*?) (.*?)\]\]} $thebody {[\2|\1]} thebody

    # double-bracketed expressions without URLs
    # internal links might have spaces between the words, e.g. [[My Node]] and these
    # are mapped by OddMuse to underscores for the sake of the file names - which are
    # how the pages are referenced. So, we need to take those spaces out first.
    # Each pass replaces one space per link, so repeat until nothing changes.
    # Someone have a cleaner way to do this without the repetition?
    while { [regsub -all {(\[\[[^] ]+) (.+?\]\])} $thebody {\1_\2} thebody] > 0 } {}
    # then make the double brackets single brackets
    regsub -all {\[\[(.*?)\]\]} $thebody {[\1]} thebody

    # various single-bracketed expressions with URLs
    regsub -all {\[(http://[^] ]*?)\]} $thebody {[note|\1]} thebody
    regsub -all {\[(http://[^ ]*?) ([^]]*?)\]} $thebody {[\2|\1]} thebody
    # [image:name optional-text] -> !name!  (the second pass differs only in
    # greediness and is kept from the original)
    regsub -all {\[image:([^] ]*?)[ ]*([^]]*?)\]} $thebody {!\1!} thebody
    regsub -all {\[image:([^] ]*)[ ]*([^]]*?)\]} $thebody {!\1!} thebody

    # look for CamelCase words and force them to links - but check that they are not already links
    # old version: regsub -all {(\s)([A-Z]+?[a-z]+?[A-Z]+?.*?)(\s)} $thebody {\1[\2]\3} thebody
    # The bracket expression was missing its closing "]", which made the
    # pattern fail to compile.
    # NOTE(review): "[" and "!" are non-letters, so already-bracketed or
    # "!"-escaped words can still match here - confirm the intended guard.
    regsub -all {([^A-Za-z])([A-Z]+?[a-z]+?[A-Z]+?.*?)(\s)} $thebody {\1[\2]\3} thebody
    # look for escaped CamelCase words, remove the escape.
    regsub -all {(!)([A-Z]+?[a-z]+?[A-Z])} $thebody {\2} thebody

    # Headings - longest marker first so that "==" does not eat part of "===".
    # Confluence heading syntax is "hN. text".
    regsub -all {====== *([^=\n]+?) *======} $thebody {h6. \1} thebody
    regsub -all {===== *([^=\n]+?) *=====} $thebody {h5. \1} thebody
    regsub -all {==== *([^=\n]+?) *====} $thebody {h4. \1} thebody
    regsub -all {=== *([^=\n]+?) *===} $thebody {h3. \1} thebody
    regsub -all {== *([^=\n]+?) *==} $thebody {h2. \1} thebody
    regsub -all {= *([^=\n]+?) *=} $thebody {h1. \1} thebody

    # Lists - nothing to do for bullets or numbered. For definition lists there
    # is no good analog. Just approximate by bolding the term with a colon and
    # two spaces following
    regsub -all -lineanchor {^;([^:]+?:) *} $thebody {*\1*  } thebody

    # Bold and Italics - "." also matches newlines here, so spans may wrap.
    # Bold must go first: ''' would otherwise be consumed by the '' pattern.
    regsub -all {'''(.+?)'''} $thebody {*\1*} thebody
    # Italics map to _..._, the same marker used for <i>/<em> below.
    regsub -all {''(.+?)''} $thebody {_\1_} thebody

    # Tables
    regsub -all {\|\|} $thebody {|} thebody

    # Code lines - each run of lines starting with a space is wrapped in a
    # {noformat} block.
    regsub -all -lineanchor {((^ +[^\n]+\n)+)} $thebody "{noformat}\n\\1{noformat}\n" thebody

    # Indented Text (leading colon) - just make 2 spaces - do this after handling code lines
    regsub -all -lineanchor {^:} $thebody {  } thebody

    # Horizontal lines - four dashes on a line alone - same in confluence.

    # HTML Tags that are supported (opening and closing tag map to the same marker)
    regsub -all {</?(em|i)>} $thebody {_} thebody
    regsub -all {</?(strong|b)>} $thebody {*} thebody
    regsub -all {</?u>} $thebody {+} thebody

    # regsub copies a backslash verbatim unless it precedes a digit, "&" or
    # another backslash, so the {{...}} replacement is written with plain
    # braces rather than \{ and \}.
    set monospace {{{\1}}}

    # Next fixed width format
    regsub -all {<tt>(.+?)</tt>} $thebody $monospace thebody

    # The special tags nowiki, code and pre are different in OddMuse but will/can all be translated as
    # the same {{}} construct in Confluence because Confluence respects inserted line breaks anyway.
    regsub -all {<nowiki>(.+?)</nowiki>} $thebody $monospace thebody
    regsub -all {<code>(.+?)</code>} $thebody $monospace thebody
    regsub -all {<pre>(.+?)</pre>} $thebody $monospace thebody

    return $thebody
}

# Steps through the files of OddMuse, converts the content via a call to
# convert_body and then writes the results to a similarly named file with a
# .txt extension.
#
# Reads gbl(wikisource) (glob pattern for the page directories) and gbl(work)
# (output directory). Prints a message for every file whose content does not
# match the expected layout, and "Scan complete" at the end.
proc ExtractionFromOddmuseFiles {} {
    global gbl

    # The delimiter of choice for OddMuse database files is Control-^ times
    # two. It is written with escapes so the control characters survive
    # copy and paste.
    # NOTE(review): value restored from the description above - confirm
    # against a real .db file.
    set doohicky "\x1e\x1e"

    # The page text sits between "<delim>3text<delim>3" and the next "<delim>3".
    set textPattern "${doohicky}3text${doohicky}3(.*?)${doohicky}3"

    # Step through the directories
    foreach dir1 [glob $gbl(wikisource)] {
        # And each file in each directory. -nocomplain: a directory without
        # any .db file is skipped instead of aborting the whole scan.
        foreach filename [glob -nocomplain -directory $dir1 -- *.db] {
            set fp [open $filename r]
            set data [read $fp]
            close $fp

            if { ! [regexp $textPattern $data match thebody] } {
                puts "file: \"$filename\" had a format\n which didn't match my expectations"
            } else {
                set thebody [convert_body $thebody]
                set ofilename \
                    "${gbl(work)}/[file rootname [file tail $filename]].txt"
                set fp [open $ofilename w]
                puts $fp $thebody
                close $fp
            }
        }
    }
    puts "Scan complete"
}

# MAIN LINE
ExtractionFromOddmuseFiles
# exit
======
----
!!!!!!
%| [Category String Processing] |%
!!!!!!