DG found this RE on http://www.foad.org/~abigail/Perl/url2.html and rewrote it for Tcl. Holy RE, batman! It isn't perfect, doesn't match the latest URI spec [L1 ], and needs a good test suite for verification, but watch this page!
Just added IPv6 support. ${hsegment} has a problem picking-up closing parens, which is a legal character, but in the following context it is not: "See my website (http://www.example.com/ ) for the answer to life, the universe and everything"
...Stop the press... A sharp knife clued me in to the joke.. The proc is really a joke. And the joke was on me. Here's a dumb RE, just as I needed it:
(?:[[:alpha:]]?)(?:\w){2,7}:(?://?)(?:[^[:space:]>"]*)
I'll leave the rest in case someone is building an over the top URI parser and wants to turn off all the non-reporting atoms :) Maybe some of the bits are useful. Have a nice day!
AK: Tcllib's uri module has a similar set of regular expressions as well. Might benefit from the RE's given here too.
proc makeURI_RE {} { set nz_digit {[[:digit:]]} set nz_digits "(?:$nz_digit+)" set digits {(?:\d+)} set space {(?:%20)} set nl {(?:%0[Aa])} set dot {\.} set plus {\+} set qm {\?} set ast {\*} set hex {[a-fA-F\d]} set alpha {[[:alpha:]]} set alphas "(?:${alpha}+)" set alphanum {[[:alnum:]]} set xalphanum "(?:${alphanum}|%(?:3\\d|\[46\]$hex|\[57\]\[Aa\\d\]))" set alphanums "(?:${alphanum}+)" set escape "(?:%$hex\{2\})" set safe {[$\-_.+]} set extra {[~!*'(),]} set unwise {[{}|\\^[\]`]} set punctuation {[[:punct:]]} set reserved {[;/?:@&=]} set uchar "(?:${alphanum}|${safe}|${extra}|${escape})" set xchar "(?:${alphanum}|${safe}|${extra}|${reserved}|${escape})" # URL schemeparts for ip based protocols: set user "(?:(?:${uchar}|\[\;?&=\])*)" set password "(?:(?:${uchar}|\[\;?&=\])*)" set hostnumber "(?:${digits}(?:${dot}${digits}){3})" set toplabel "(?:${alpha}(?:(?:${alphanum}|-)*${alphanum})?)" set domainlabel "(?:${alphanum}(?:(?:${alphanum}|-)*${alphanum})?)" # only dotted-4 allowed (maybe I'm being strict) set IPv4address {(?:(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)(?:\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3})} # IPv6, all forms. set IPv6full {(?:(?:[[:xdigit:]]{1,4}:){7}[[:xdigit:]]{1,4})} set IPv6hexcomp {(?:(?:(?:[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4})*)?)::((?:[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4})*)?))} set IPv6hex4dec {(?:(?:(?:[[:xdigit:]]{1,4}:){6,6})(25[0-5]|2[0-4]\d|[0-1]?\d?\d)(\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3})} set IPv6hex4deccomp {(?:(?:(?:[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4})*)?)::(?:(?:[[:xdigit:]]{1,4}:)*)(25[0-5]|2[0-4]\d|[0-1]?\d?\d)(\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3})} set IPv6address "(?:${IPv6full}|${IPv6hexcomp}|${IPv6hex4dec}|${IPv6hex4deccomp})" set hostname "(?:(?:${domainlabel}${dot})*${toplabel})" set host "(?:${hostname}|${IPv4address}|(?:\\\[${IPv6address}\\\]))" set hostport "(?:${host}(?::${digits})?)" set login "(?:(?:${user}(?::${password})?@)?${hostport})" # The predefined schemes: # FTP (see also RFC959) set fsegment "(?:(?:${uchar}|\[?:@&=\])*)" set fpath "(?:${fsegment}(?:/${fsegment})*)" set ftpurl "(?:ftp://${login}(?:/${fpath}(?:;type=\[AIDaid\])?)?)" # FILE set fileurl "(?:file://(?:${host}|localhost)?/${fpath})" # HTTP http://www.ietf.org/rfc/rfc2616.txt set hsegment "(?:(?:${uchar}|\[\;:@&=\])*)" set search "(?:(?:${uchar}|\[\;:@&=\])*)" set hpath "(?:${hsegment}(?:/${hsegment})*)" set httpurl "(?:http(?:s?)://${hostport}(?:/${hpath}(?:${qm}${search})?)?)" # GOPHER (see also RFC1436) set gopher_plus "(?:${xchar}*)" set selector "(?:${xchar}*)" set gtype ${xchar} set gopherurl [join [list "(?:gopher://${hostport}(?:/${gtype}(?:${selector}(?:%09${search}" \ "(?:%09${gopher_plus})?)?)?)?)" ] ""] # MAILTO (see also RFC822) set encoded822addr "(?:$xchar+)" set mailtourl "(?:mailto:$encoded822addr)" # NEWS (see also RFC1036) set article "(?:(?:${uchar}|\[\;/?:&=\])+@${host})" set group "(?:${alpha}(?:${alphanum}|\[_.+-\])*)" set grouppart "(?:${article}|${group}|${ast})" set newsurl "(?:news:${grouppart})" # NNTP (see also RFC977) set nntpurl "(?:nntp://${hostport}/${group}(?:/${digits})?)" # TELNET set telneturl "(?:telnet://${login}/?)" # WAIS (see also RFC1625) set wpath "(?:${uchar}*)" set wtype "(?:${uchar}*)" set database "(?:${uchar}*)" set waisdoc "(?:wais://${hostport}/${database}/${wtype}/${wpath})" set waisindex "(?:wais://${hostport}/${database}${qm}${search})" set waisdatabase "(?:wais://${hostport}/${database})" # my $waisurl = "(?:${waisdatabase}|${waisindex}|${waisdoc})"; # Speed up: the 3 types share a common prefix. set waisurl "(?:wais://${hostport}/${database}(?:(?:/${wtype}/${wpath})|${qm}${search})?)" # PROSPERO set fieldvalue "(?:(?:${uchar}|\[?:@&\])*)" set fieldname "(?:(?:${uchar}|\[?:@&\])*)" set fieldspec "(?:;${fieldname}=${fieldvalue})" set psegment "(?:(?:${uchar}|\[?:@&=\])*)" set ppath "(?:${psegment}(?:/${psegment})*)" set prosperourl "(?:prospero://${hostport}/${ppath}(?:${fieldspec})*)" # LDAP (see also RFC1959) # First. import stuff from RFC 1779 (Distinguished Names). # We've modified things a bit. set dn_separator "(?:\[\;,\])" set dn_optional_space "(?:${nl}?${space}*)" set dn_spaced_separator "(?:${dn_optional_space}${dn_separator}${dn_optional_space})" set dn_oid "(?:${digits}(?:${dot}${digits})*)" set dn_keychar "(?:${xalphanum}|${space})" set dn_key "(?:${dn_keychar}+|(?:OID|oid)${dot}${dn_oid})" set dn_string "(?:${uchar}*)" set dn_attribute "(?:(?:${dn_key}${dn_optional_space}=${dn_optional_space})?${dn_string})" set dn_name_component "(?:${dn_attribute}(?:${dn_optional_space}${plus}${dn_optional_space}${dn_attribute})*)" set dn_name [join [list "(?:${dn_name_component}(?:${dn_spaced_separator}${dn_name_component})*" \ "${dn_spaced_separator}?)" ] ""] # RFC 1558 defines the filter syntax, but that requires a PDA to recognize. # Since that's too powerful for Perl's REs, we allow any char between the # parenthesis (which have to be there.) set ldap_filter "(?:\(${xchar}+\))" # This is from RFC 1777. It defines an attributetype as an 'OCTET STRING', # whatever that is. set ldap_attr_type "(?:${uchar}+)" # Now we are at the grammar of RFC 1959. set ldap_attr_list "(?:${ldap_attr_type}(?:,${ldap_attr_type})*)" set ldap_attrs "(?:${ldap_attr_list}?)" set ldap_scope "(?:base|one|sub)" set ldapurl [join [list "(?:ldap://(?:${hostport})?/${dn_name}(?:${qm}${ldap_attrs}" \ "(?:${qm}${ldap_scope}(?:${qm}${ldap_filter})?)?)?)" ] ""] # RFC 2056 defines the format of URLs for the Z39.50 protocol. set z_database "(?:${uchar}+)" set z_docid "(?:${uchar}+)" set z_elementset "(?:${uchar}+)" set z_recordsyntax "(?:${uchar}+)" set z_scheme "(?:z39${dot}50\[rs\])" set z39_50url [join [list "(?:${z_scheme}://${hostport}(?:/(?:${z_database}(?:${plus}" \ "${z_database})*(?:${qm}${z_docid})?)?(?:\;esn=${z_elementset})?" \ "(?:\;rs=${z_recordsyntax}(?:${plus}${z_recordsyntax})*)?))" ] ""] # RFC 2111 defines the format for cid/mid URLs. set url_addr_spec "(?:(?:${uchar}|\[;?:@&=\])*)" set message_id $url_addr_spec set content_id $url_addr_spec set cidurl "(?:cid:${content_id})" set midurl "(?:mid:${message_id}(?:/${content_id})?)" # RFC 2122 defines the Vemmi URLs. set vemmi_attr "(?:(?:${uchar}|\[/?:@&\])*)" set vemmi_value "(?:(?:${uchar}|\[/?:@&\])*)" set vemmi_service "(?:(?:${uchar}|\[/?:@&=\])*)" set vemmi_param "(?:\;${vemmi_attr}=${vemmi_value})" set vemmiurl "(?:vemmi://${hostport}(?:/${vemmi_service}(?:${vemmi_param}*))?)" # RFC 2192 for IMAP URLs. # Import from RFC 2060. # set imap4_astring "" # set imap4_search_key "" # set imap4_section_text "" set imap4_nz_number $nz_digits; set achar "(?:${uchar}|\[&=~\])" set bchar "(?:${uchar}|\[&=~:@/\])" set enc_auth_type "(?:${achar}+)" set enc_list_mbox "(?:${bchar}+)" set enc_mailbox "(?:${bchar}+)" set enc_search "(?:${bchar}+)" set enc_section "(?:${bchar}+)" set enc_user "(?:${achar}+)" set i_auth "(?:\;\[Aa\]\[Uu\]\[Tt\]\[Hh\]=(?:${ast}|${enc_auth_type}))"; set i_list_type "(?:\[Ll\](?:\[Ii\]\[Ss\]\[Tt\]|\[Ss\]\[Uu\]\[Bb\]))"; set i_mailboxlist "(?:${enc_list_mbox}?\;\[Tt\]\[Yy\]\[Pp\]\[Ee\]=${i_list_type})"; set i_uidvalidity [join [list "(?:\;\[Uu\]\[Ii\]\[Dd\]\[Vv\]\[Aa\]\[Ll\]\[Ii\]\[Dd\]\[Ii\]" \ "\[Tt\]\[Yy\]=${imap4_nz_number})" ] ""] set i_messagelist "(?:${enc_mailbox}(?:${qm}${enc_search})?(?:${i_uidvalidity})?)" set i_section "(?:/\;\[Ss\]\[Ee\]\[Cc\]\[Tt\]\[Ii\]\[Oo\]\[Nn\]=${enc_section})" set i_uid "(?:/\;\[Uu\]\[Ii\]\[Dd\]=${imap4_nz_number})" set i_messagepart "(?:${enc_mailbox}(?:${i_uidvalidity})?${i_uid}(?:${i_section})?)" set i_command "(?:${i_mailboxlist}|${i_messagelist}|${i_messagepart})" set i_userauth "(?:(?:${enc_user}(?:${i_auth})?)|(?:${i_auth}(?:${enc_user})?))" set i_server "(?:(?:${i_userauth}@)?${hostport})" set imapurl "(?:imap://${i_server}/(?:$i_command)?)" # RFC 2224 for NFS. set nfs_mark {[$-_.!~*'(),]} set nfs_unreserved "(?:${alphanum}|${nfs_mark})" set nfs_pchar "(?:${nfs_unreserved}|${escape}|\[:@&=+\])" set nfs_segment "(?:${nfs_pchar}*)" set nfs_path_segs "(?:${nfs_segment}(?:/${nfs_segment})*)" set nfs_url_path "(?:/?${nfs_path_segs})" set nfs_rel_path "(?:${nfs_path_segs}?)" set nfs_abs_path "(?:/${nfs_rel_path})" set nfs_net_path "(?://${hostport}(?:${nfs_abs_path})?)" set nfs_rel_url "(?:${nfs_net_path}|${nfs_abs_path}|${nfs_rel_path})" set nfsurl "(?:nfs:${nfs_rel_url})" # Combining all the different URL formats into a single regex. return [join [list $httpurl $ftpurl $newsurl $nntpurl \ $telneturl $gopherurl $waisurl $mailtourl \ $fileurl $prosperourl $ldapurl $z39_50url \ $cidurl $midurl $vemmiurl $imapurl $nfsurl] |] }