Version 0 of XML Shallow Parsing with Regular Expressions

Updated 2003-04-06 22:48:54

if 0 { Brian Theado - A paper with the same title as this page can be found at [L1 ]. The appendix of the paper includes sample implementations in Perl, JavaScript and Flex/Lex, as well as an interactive demo (apparently using the JavaScript implementation). Here is a translation of the Perl code into Tcl. The translation mostly involved escaping the square brackets (so that Tcl does not treat them as command substitution) and enclosing some variable names in {} (so that a following parenthesis is not parsed as an array index). }

 # REX/Perl 1.0 
 # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
 # Technical Report TR 1998-17, School of Computing Science, Simon Fraser 
 # University, November, 1998.
 # Copyright (c) 1998, Robert D. Cameron.
 # The following code may be freely used and distributed provided that
 # this copyright and citation notice remains intact and that modifications
 # or additions are clearly identified.
 #
 # 06Apr03 Brian Theado - Direct translation from Perl to Tcl 

 # Building blocks of the shallow-parsing expression XML_SPE.  Every fragment
 # is an ordinary string that is spliced into later fragments, so the
 # definitions must stay in this order.
 #
 # Quoting notes: "\[" stops Tcl from starting a command substitution, "\\"
 # produces a single backslash for the regexp engine, and the ${Var} form is
 # used wherever the next character is "(" so that Tcl does not read
 # "$Var(...)" as an array element.
 #
 # Fragments whose names end in CE match the remainder of a construct whose
 # opening delimiter has already been consumed; they are always used inside
 # optional groups and their closing delimiter is usually optional too, so
 # truncated markup still produces a token.

 # A run of one or more characters other than "<".
 set TextSE "\[^<]+"
 # Everything up to and including the next "-".
 set UntilHyphen "\[^-]*-"
 # Everything up to and including the first "--".
 set Until2Hyphens "${UntilHyphen}(?:\[^-]$UntilHyphen)*-"
 # Rest of a comment after "<!--": through "--", then an optional ">".
 set CommentCE "${Until2Hyphens}>?"
 # Everything up to and including the next run of two or more "]".
 set UntilRSBs "\[^\\]]*](?:\[^\\]]+])*]+"
 # Rest of a CDATA section after "<![CDATA[": through the first "]]>".
 set CDATA_CE "${UntilRSBs}(?:\[^\\]>]$UntilRSBs)*>"
 # One or more whitespace characters (space, newline, tab, carriage return).
 set S "\[ \\n\\t\\r]+"
 # First character of a name: ASCII letter, "_", ":" or any character
 # outside the range \x00-\x7F.
 set NameStrt "\[A-Za-z_:]|\[^\\x00-\\x7F]"
 # Subsequent name character: as NameStrt plus digits, "." and "-".
 set NameChar "\[A-Za-z0-9_:.-]|\[^\\x00-\\x7F]"
 # A complete name; the alternations above are wrapped in non-capturing groups.
 set Name "(?:$NameStrt)(?:$NameChar)*"
 # A double-quoted or single-quoted string.
 set QuoteSE "\"\[^\"]*\"|'\[^']*'"
 # What follows the DOCTYPE keyword: whitespace, a name, then any number of
 # whitespace-separated names or quoted strings.
 set DT_IdentSE "$S${Name}(?:${S}(?:${Name}|$QuoteSE))*"
 # Rest of a markup declaration: any mix of quoted strings and characters
 # other than ] " ' > <, terminated by ">".
 set MarkupDeclCE "(?:\[^\\]\"'><]+|$QuoteSE)*>"
 # Exactly one whitespace character.
 set S1 "\[\\n\\r\\t ]"
 # Everything up to and including the next run of one or more "?".
 set UntilQMs "\[^?]*\\?+"
 # Rest of a processing instruction after its name: either "?>" at once, or
 # one whitespace character followed by everything through the first "?>".
 set PI_Tail "\\?>|$S1${UntilQMs}(?:\[^>?]$UntilQMs)*>"
 # One item inside the bracketed part of a DOCTYPE declaration: a complete
 # comment, another "<!...>" declaration, a processing instruction, a
 # "%Name;" reference, or whitespace.
 set DT_ItemSE  "<(?:!(?:--${Until2Hyphens}>|\[^-]$MarkupDeclCE)|\\?${Name}(?:$PI_Tail))|%$Name;|$S"
 # Rest of a DOCTYPE declaration after the keyword: identifier part, an
 # optional bracketed "[...]" list of items, and an optional closing ">".
 set DocTypeCE "${DT_IdentSE}(?:$S)?(?:\\\[(?:$DT_ItemSE)*](?:$S)?)?>?"
 # What may follow "<!": a comment ("--"), a CDATA section ("[CDATA[") or a
 # DOCTYPE declaration, each with an optional remainder.
 set DeclCE "--(?:$CommentCE)?|\\\[CDATA\\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"
 # What may follow "<?": the target name and an optional tail.
 set PI_CE "${Name}(?:$PI_Tail)?"
 # What may follow "</": the tag name, optional whitespace, optional ">".
 set EndTagCE "${Name}(?:$S)?>?"
 # A quoted attribute value that contains no "<".
 set AttValSE "\"\[^<\"]*\"|'\[^<']*'"
 # What may follow "<" in a start or empty-element tag: the tag name, any
 # number of name=value attributes, optional whitespace, optional "/",
 # optional ">".
 set ElemTagCE "${Name}(?:$S${Name}(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?"
 # Any markup token.  Everything after the leading "<" is optional, so a
 # lone "<" still matches.
 set MarkupSPE "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"
 # The complete shallow-parsing expression: a text run or a markup token.
 set XML_SPE "$TextSE|$MarkupSPE"


 # ShallowParse --
 #
 #   Split a document into the sequence of substrings matched by the
 #   global pattern XML_SPE.
 #
 # Arguments:
 #   xml   The document text to tokenize.
 #
 # Results:
 #   A list containing, in document order, every successive match of
 #   XML_SPE found in xml (an empty list when nothing matches).
 proc ShallowParse {xml} {
    global XML_SPE
    # -all repeats the match across the whole input; -inline makes regexp
    # return the matched strings instead of a match count.
    set tokens [regexp -all -inline $XML_SPE $xml]
    return $tokens
 }