Textile to HTML

jdp 2009-11-15 01:42:49 UTC Here's some code for converting Textile [L1 ]. At the moment it's reasonably reliable (I think there are a few cases which cause grief, but none come to mind), but table support is nonexistent. It doesn't generate a full document, just the part that would go between the <body> tags. Enjoy.

Oh, and this should be run inside a safe interpreter and be modified so that you pass in the markup instead of the filename. It uses subst for some of the parsing which has the mixed blessing side effect of being able to do cool things like get formatted timestamps... or execute arbitrary code at the privilege level of the server.

Warn and log are there to explain problems with the markup, mostly for debugging purposes, they probably ought to be redefined to do something else.

Almost forgot, many thanks to rewt in the chatroom for helping with the regex for the inline styles.

proc warn {args} {
        set dataout [open report.log a]
        puts $dataout "WARNING: [join $args " "]"
        puts stdout "WARNING: [join $args " "]"
        close $dataout
}

proc log {args} {
        set dataout [open report.log a]
        puts $dataout "       : [join $args " "]"
        puts stdout "       : [join $args " "]"
        close $dataout
}

proc parse {filename} {
        set datain [open $filename]
        set doctext [read $datain]
        close $datain

        regsub -all -- {\n{2,}} $doctext "\uFFFF" doctext
        set text [split $doctext \uFFFF]

        set out [list]
        set premode 0
        
        foreach line $text {        
                if {[regexp {<pre>.*</pre>} $line]} {
                        if {[catch {lappend out [subst $line]} e i]} {
                                warn $e
                                log $i
                                log $line
                        }
                        continue
                } elseif {[regexp {<pre>} $line]} {
                        if {[catch {lappend out [subst $line]} e i]} {
                                warn $e
                                log $i
                                log $line
                        }
                        set premode 1
                        continue
                } elseif {[regexp {</pre>} $line]} {
                        if {[catch {lappend out [subst $line]} e i]} {
                                warn $e
                                log $i
                                log $line
                        }
                        set premode 0
                        continue
                } elseif {$premode} {
                        if {[catch {lappend out [subst $line]} e i]} {
                        warn $e
                        log $i
                        log $line
                }
                        continue
                }
        
                if {[regexp {^h([1-6])(.*?)\. (.*?)$} $line match level style text]} {
                        set line "<h$level[style $style]>$text</h$level>"
                } elseif {[regexp {^bq(.*?)\. (.*?)$} $line match style text]} {
                        set line "<blockquote[style $style]>$text</blockquote>"
                } elseif {[regexp {^fn(\d)(.*?)\. (.*?)$} $line match number style text]} {
                        set line "<p id=\"fn$number\"[style $style]><sup>$number</sup> $text</p>"
                } elseif {[regexp {^#.*? .*$} $line match style]} {
                        set line "<ol>\n$match\n</ol>"
                } elseif {[regexp {^\* .*$} $line match]} {
                        set line "<ul>\n$match\n</ul>"
                #} elseif {[regexp {^(\|.*?)+\|} $line match]} {
                #        set line "<table>\n$match\n</table>"
                } elseif {[regexp {^p(.*?)\. (.*?)$} $line match style text]} {
                        set line "<p[style $style]>$text</p>"
                } else {
                        set line "<p>$line</p>"
                }
                
                regsub -all -line -- {^#(.*?) (.*?)(\n)} $line {<li[style "\1"]>\2</li>\3} line
                regsub -all -line -- {^\*(.*?) (.*?)(\n)} $line {<li[style "\1"]>\2</li>\3} line
                
                regsub -all -- {"([^\"]*?)(?:\((.*?)\))??":([[:graph:]]+)([[:space:];#%&\*\{\}\\<>\?\+]|$)} $line {<a href="\3" title="\2">\1</a>\4} line
                
                regsub -all -- {\!([[:graph:]]+)(?:\((.*?)\))?\!:([[:graph:]]+)([[:space:]]|$)} $line {<a href="\3"><img src="\1" alt="\2" /></a>\4} line
                regsub -all -- {\!([[:graph:]]+)(?:\((.*?)\))?\!} $line {<img src="\1" alt="\2" />} line
                
                regsub -all -- {([^()])\m__([[({].*?[})\]])??(\S.*?)__\M([^()])} $line {\1<i[style "\2"]>\3</i>\4} line
                regsub -all -- {([^()])\m_([[({].*?[})\]])??(\S.*?)_\M([^()])} $line {\1<em[style "\2"]>\3</em>\4} line
                
                regsub -all -- {\*\*\m([[({].*?[})\]])??(\S.*?)\M\*\*} $line {<b[style "\1"]>\2</b>} line
                regsub -all -- {\*\m([[({].*?[})\]])??(\S.*?)\M\*} $line {<strong[style "\1"]>\2</strong>} line
                
                regsub -all -- {\?\?\m([[({].*?[})\]])??(\S.*?)\M\?\?} $line {<cite[style "\1"]>\2</cite>} line
                
                regsub -all -- {-\m([[({].*?[})\]])??(\S.*?)\M-} $line {<del[style "\1"]>\2</del>} line
                regsub -all -- {\+\m([[({].*?[})\]])??(\S.*?)\M\+} $line {<ins[style "\1"]>\2</ins>} line

                regsub -all -- {\^\m([[({].*?[})\]])??(\S.*?)\M\^} $line {<sup[style "\1"]>\2</sup>} line
                regsub -all -- {~\m([[({].*?[})\]])??(\S.*?)\M~} $line {<sub[style "\1"]>\2</sub>} line
                
                regsub -all -- {%\m([[({].*?[})\]])??(\S.*?)\M%} $line {<span[style "\1"]>\2</span>} line
                
                regsub -all -- {[[:graph:]]+?\((.+?)\)} $line {<acronym title="\2">\1</acronym>} line
                regsub -all -- {([[:graph:]])\[(\d+?)\]} $line {\1<sup><a href="#fn\2">\2</a></sup>} line
                regsub -all -- {([[:alnum:]_])\n} $line {\1<br />\n} line
                
                if {[catch {lappend out [subst $line]} e i]} {
                        warn $e
                        log $i
                        log $line
                }
        }
        
        return [join $out \n\n]
}

proc style {attrs} {
        set result [list]
        set stylelist [list]
        if {[regexp {\(#([[:alnum:]_]+?)\)} $attrs match id]} {
                lappend result " id=\"$id\""
        }
        if {[regexp {\(([[:alnum:]_]+?)\)} $attrs match class]} {
                lappend result " class=\"$class\""
        }
        if {[regexp {\(([[:alnum:]_]+?)#([[:alnum:]_]+?)\)} $attrs match class id]} {
                lappend result "class=\"$class\""
                lappend result " id=\"$id\""
        }
        if {[regexp {\{(.+?)\}} $attrs match style]} {
                lappend stylelist $style
        }
        if {[regexp {\[([[:alpha:]]+?)\]} $attrs match lang]} {
                lappend result " lang=\"$lang\""
        }
        
        if {[regexp {^<>} $attrs]} {
                lappend stylelist "text-align:justify"
        } elseif {[regexp {^>} $attrs]} {
                lappend stylelist "text-align:right"
        } elseif {[regexp {^<} $attrs]} {
                lappend stylelist "text-align:left"
        } elseif {[regexp {^=} $attrs]} {
                lappend stylelist "text-align:center"
        } elseif {[regexp {^(\(*?)(\)*?)(?![[:alnum:]_])} $attrs match left right]} {
                if {[string length $left] > 0} {
                        lappend stylelist "padding-left:[string length $left]em"
                }
                if {[string length $right] > 0} {
                        lappend stylelist "padding-right:[string length $right]em"
                }
        }
        if {[llength $stylelist] > 0} {
                lappend result " style=\"[join $stylelist ;]\""
        }
        
        return [join $result ""]
}

jdp - 2009-11-16 15:37:00

After a chat with mjanssen in the chatroom, I think I'll have a go at implementing this without subst soon... this is very unsafe code.

jnc - 2009-12-06

Any more work on implementing this without subst?

jdp - 2010-03-13 19:55:02

Oops, I forgot about this code entirely. I'm currently busy with school; when I get around to fixing it, I'll post here and talk it up in the chatroom.