XML Shallow Parsing with Regular Expressions

if 0 { Brian Theado - A paper with the same title as this page can be found at [L1 ]. The Appendix of the paper includes sample implementations in Perl, JavaScript and Flex/Lex. The Appendix also includes an interactive demo (apparently using the JavaScript implementation). Here is a translation of the Perl code into Tcl. The translation mostly involved escaping the opening square brackets and enclosing some variable names in {}.

}

 # REX/Perl 1.0 
 # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
 # Technical Report TR 1998-17, School of Computing Science, Simon Fraser 
 # University, November, 1998.
 # Copyright (c) 1998, Robert D. Cameron.
 # The following code may be freely used and distributed provided that
 # this copyright and citation notice remains intact and that modifications
 # or additions are clearly identified.
 #
 # 06Apr03 Brian Theado - Direct translation from Perl to Tcl 

 # Building blocks for the shallow-parsing pattern XML_SPE.  Each variable
 # holds a regular-expression fragment as a plain string, and later fragments
 # interpolate earlier ones, so the order of these set commands matters.
 # Inside the double-quoted strings an opening square bracket is
 # backslash-escaped to stop Tcl command substitution, and a doubled
 # backslash produces a single backslash in the resulting pattern.
 # Every group is written as (?:...) so the final pattern has no capturing
 # subexpressions.

 # A run of one or more characters other than "<".
 set TextSE "\[^<]+"
 # Zero or more non-hyphen characters followed by a single hyphen.
 set UntilHyphen "\[^-]*-"
 # Text up to and including the first pair of adjacent hyphens.
 set Until2Hyphens "${UntilHyphen}(?:\[^-]$UntilHyphen)*-"
 # What follows the "--" that opens a comment: text through the next "--",
 # then an optional ">".
 set CommentCE "${Until2Hyphens}>?"
 # Text up to and including the first run of two or more "]" characters.
 set UntilRSBs "\[^\\]]*](?:\[^\\]]+])*]+"
 # Text through the first run of two or more "]" that is directly followed
 # by ">".
 set CDATA_CE "${UntilRSBs}(?:\[^\\]>]$UntilRSBs)*>"
 # One or more whitespace characters: space, newline, tab, carriage return.
 set S "\[ \\n\\t\\r]+"
 # First character of a name: ASCII letter, "_", ":", or any character
 # outside the range \x00-\x7F.
 set NameStrt "\[A-Za-z_:]|\[^\\x00-\\x7F]"
 # Subsequent character of a name: as NameStrt plus digits, "." and "-".
 set NameChar "\[A-Za-z0-9_:.-]|\[^\\x00-\\x7F]"
 # A name: one NameStrt character followed by any number of NameChar.
 set Name "(?:$NameStrt)(?:$NameChar)*"
 # A double-quoted or single-quoted string with no embedded closing quote.
 set QuoteSE "\"\[^\"]*\"|'\[^']*'"
 # Whitespace and a name, followed by any number of whitespace-separated
 # names or quoted strings.
 set DT_IdentSE "$S${Name}(?:${S}(?:${Name}|$QuoteSE))*"
 # Any mix of quoted strings and characters other than "]", quotes, ">" and
 # "<", terminated by ">".
 set MarkupDeclCE "(?:\[^\\]\"'><]+|$QuoteSE)*>"
 # Exactly one whitespace character.
 set S1 "\[\\n\\r\\t ]"
 # Zero or more characters other than "?" followed by one or more "?".
 set UntilQMs "\[^?]*\\?+"
 # End of a processing instruction after its name: either "?>" at once, or
 # one whitespace character and then text through the first "?>".
 set PI_Tail "\\?>|$S1${UntilQMs}(?:\[^>?]$UntilQMs)*>"
 # One item allowed between the square brackets in DocTypeCE: a complete
 # "<!--" comment, another "<!" declaration ended by ">", a complete
 # processing instruction, a "%Name;" reference, or whitespace.
 set DT_ItemSE  "<(?:!(?:--${Until2Hyphens}>|\[^-]$MarkupDeclCE)|\\?${Name}(?:$PI_Tail))|%$Name;|$S"
 # What follows the keyword DOCTYPE: the identifier part, then optionally a
 # square-bracketed list of DT_ItemSE items, then an optional ">".
 set DocTypeCE "${DT_IdentSE}(?:$S)?(?:\\\[(?:$DT_ItemSE)*](?:$S)?)?>?"
 # What may follow "<!": a comment, a CDATA section, or a DOCTYPE
 # declaration.  The part after each introducer is optional.
 set DeclCE "--(?:$CommentCE)?|\\\[CDATA\\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"
 # What may follow "<?": a name and an optional PI_Tail.
 set PI_CE "${Name}(?:$PI_Tail)?"
 # What may follow "</": a name, optional whitespace, and an optional ">".
 set EndTagCE "${Name}(?:$S)?>?"
 # A quoted attribute value that contains neither "<" nor its own quote
 # character.
 set AttValSE "\"\[^<\"]*\"|'\[^<']*'"
 # What may follow "<" in an element tag: a name, any number of
 # name=value attributes, optional whitespace, an optional "/" and an
 # optional ">".
 set ElemTagCE "${Name}(?:$S${Name}(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?"
 # Any markup item: "<" followed by one of the four alternatives above.
 # Each alternative is optional, so a lone "<" (or "<!", "<?", "</") still
 # matches as an item of its own.
 set MarkupSPE "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"
 # The complete pattern: either a text run or a markup item.
 set XML_SPE "$TextSE|$MarkupSPE"


 # ShallowParse --
 #
 #   Break an XML string into a flat list of text runs and markup items by
 #   collecting every successive match of the global pattern XML_SPE.
 #
 # Arguments:
 #   xml   The XML text to split.
 #
 # Results:
 #   Returns the list of matched substrings in the order they occur in xml.
 proc ShallowParse {xml} {
    global XML_SPE
    set pieces [regexp -all -inline $XML_SPE $xml]
    return $pieces
 }

if 0 {

Example use:

 % set xml {<html>
 <head>
 <title>XML Shallow Parsing with Regular Expressions</title>
 <meta http-equiv="Pragma" content="no-cache"></meta>
 <meta http-equiv="Expire" content="Mon, 04 Dec 1999 21:29:02 GMT"></meta>
 <link rel="stylesheet" href="https://wiki.tcl-lang.org/wikit.css"
 type="text/css"></link>
 <base href="https://wiki.tcl-lang.org/">
 </head>}

 % ShallowParse $xml
 <html> {
 } <head> {
 } <title> {XML Shallow Parsing with Regular Expressions} </title> {
 } {<meta http-equiv="Pragma" content="no-cache">} </meta> {
 } {<meta http-equiv="Expire" content="Mon, 04 Dec 1999 21:29:02 GMT">} </meta> {
 } {<link rel="stylesheet" href="https://wiki.tcl-lang.org/wikit.css"
 type="text/css">} </link> {
 } {<base href="https://wiki.tcl-lang.org/">} {
 } </head>

}

if 0 {


}