URI detector for arbitrary text as a regular expression

DG found this RE on http://www.foad.org/~abigail/Perl/url2.html and rewrote it for Tcl. Holy RE, batman! It isn't perfect, doesn't match the latest URI spec [L1 ], and needs a good test suite for verification, but watch this page!


Just added IPv6 support. ${hsegment} has a problem: it picks up a closing parenthesis, which is a legal URI character but is not part of the URI in a context such as: "See my website (http://www.example.com/ ) for the answer to life, the universe and everything"


...Stop the press... A sharp knife clued me in to the joke.. The proc is really a joke. And the joke was on me. Here's a dumb RE, just as I needed it:

 (?:[[:alpha:]]?)(?:\w){2,7}:(?://?)(?:[^[:space:]>"]*)

I'll leave the rest in case someone is building an over the top URI parser and wants to turn off all the non-reporting atoms :) Maybe some of the bits are useful. Have a nice day!

AK: Tcllib's uri module has a similar set of regular expressions as well. It might benefit from the REs given here too.


 # makeURI_RE --
 #
 #     Build one (large) Tcl advanced regular expression that recognises
 #     URLs of the classic schemes: http(s), ftp, news, nntp, telnet,
 #     gopher, wais, mailto, file, prospero, ldap, z39.50, cid, mid,
 #     vemmi, imap and nfs.
 #
 # Arguments:
 #     None.
 #
 # Results:
 #     Returns the regular expression as a string. It is an alternation
 #     of one non-reporting group per scheme and is NOT anchored; wrap it
 #     as ^(?:...)$ to validate a whole string. Every atom is
 #     non-reporting, so a match yields only the overall match.
 #
 # Notes:
 #     Fragments are assembled bottom-up from small named pieces. Pieces
 #     written in braces are literal RE text; pieces written in double
 #     quotes rely on Tcl substitution, so RE metacharacters that Tcl
 #     would also interpret are backslash-escaped there.
 proc makeURI_RE {} {

     # ---- Lexical building blocks -------------------------------------

     # A non-zero digit, and a number without a leading zero
     # (RFC 2060 nz_number).
     set nz_digit        {[1-9]}
     set nz_digits       "(?:${nz_digit}\\d*)"
     set digits          {(?:\d+)}
     set space           {(?:%20)}
     set nl              {(?:%0[Aa])}
     set dot             {\.}
     set plus            {\+}
     set qm              {\?}
     set ast             {\*}
     set hex             {[a-fA-F\d]}
     set alpha           {[[:alpha:]]}
     set alphas          "(?:${alpha}+)"
     set alphanum        {[[:alnum:]]}
     set xalphanum       "(?:${alphanum}|%(?:3\\d|\[46\]$hex|\[57\]\[Aa\\d\]))"
     set alphanums       "(?:${alphanum}+)"
     set escape          "(?:%$hex\{2\})"
     set safe            {[$\-_.+]}
     set extra           {[~!*'(),]}
     set unwise          {[{}|\\^[\]`]}
     set punctuation     {[[:punct:]]}
     set reserved        {[;/?:@&=]}
     set uchar           "(?:${alphanum}|${safe}|${extra}|${escape})"
     set xchar           "(?:${alphanum}|${safe}|${extra}|${reserved}|${escape})"

     # ---- URL scheme parts for IP based protocols ---------------------

     set user            "(?:(?:${uchar}|\[\;?&=\])*)"
     set password        "(?:(?:${uchar}|\[\;?&=\])*)"
     set hostnumber      "(?:${digits}(?:${dot}${digits}){3})"
     set toplabel        "(?:${alpha}(?:(?:${alphanum}|-)*${alphanum})?)"
     set domainlabel     "(?:${alphanum}(?:(?:${alphanum}|-)*${alphanum})?)"

     # Only dotted-4 allowed (maybe that is strict). One decimal octet in
     # the range 0..255, then exactly four of them separated by dots.
     set octet           {(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)}
     set IPv4address     "(?:${octet}(?:${dot}${octet}){3})"

     # IPv6, all forms. A group of one to four hex digits, and a
     # colon-separated run of such groups.
     set h16             {(?:[[:xdigit:]]{1,4})}
     set h16seq          "(?:${h16}(?::${h16})*)"
     #   full:        eight groups
     #   hexcomp:     "::" compression with optional runs on either side
     #   hex4dec:     six groups followed by a dotted-4 tail
     #   hex4deccomp: compressed form followed by a dotted-4 tail
     set IPv6full        "(?:(?:${h16}:){7}${h16})"
     set IPv6hexcomp     "(?:${h16seq}?::${h16seq}?)"
     set IPv6hex4dec     "(?:(?:${h16}:){6}${IPv4address})"
     set IPv6hex4deccomp "(?:${h16seq}?::(?:${h16}:)*${IPv4address})"
     set IPv6address     "(?:${IPv6full}|${IPv6hexcomp}|${IPv6hex4dec}|${IPv6hex4deccomp})"

     set hostname        "(?:(?:${domainlabel}${dot})*${toplabel})"
     # An IPv6 literal must be enclosed in square brackets.
     set host            "(?:${hostname}|${IPv4address}|(?:\\\[${IPv6address}\\\]))"
     set hostport        "(?:${host}(?::${digits})?)"
     set login           "(?:(?:${user}(?::${password})?@)?${hostport})"

     # ---- The predefined schemes --------------------------------------

     # FTP (see also RFC959)
     set fsegment        "(?:(?:${uchar}|\[?:@&=\])*)"
     set fpath           "(?:${fsegment}(?:/${fsegment})*)"
     set ftpurl          "(?:ftp://${login}(?:/${fpath}(?:;type=\[AIDaid\])?)?)"

     # FILE
     set fileurl         "(?:file://(?:${host}|localhost)?/${fpath})"

     # HTTP  http://www.ietf.org/rfc/rfc2616.txt
     set hsegment        "(?:(?:${uchar}|\[\;:@&=\])*)"
     set search          "(?:(?:${uchar}|\[\;:@&=\])*)"
     set hpath           "(?:${hsegment}(?:/${hsegment})*)"
     set httpurl         "(?:http(?:s?)://${hostport}(?:/${hpath}(?:${qm}${search})?)?)"

     # GOPHER (see also RFC1436)
     set gopher_plus     "(?:${xchar}*)"
     set selector        "(?:${xchar}*)"
     set gtype           ${xchar}
     # Long patterns are joined from pieces because a backslash-newline
     # inside a quoted string would insert a space into the RE.
     set gopherurl       [join [list "(?:gopher://${hostport}(?:/${gtype}(?:${selector}(?:%09${search}" \
                         "(?:%09${gopher_plus})?)?)?)?)" ] ""]

     # MAILTO (see also RFC822)
     set encoded822addr  "(?:$xchar+)"
     set mailtourl       "(?:mailto:$encoded822addr)"

     # NEWS (see also RFC1036)
     set article         "(?:(?:${uchar}|\[\;/?:&=\])+@${host})"
     set group           "(?:${alpha}(?:${alphanum}|\[_.+-\])*)"
     set grouppart       "(?:${article}|${group}|${ast})"
     set newsurl         "(?:news:${grouppart})"

     # NNTP (see also RFC977)
     set nntpurl         "(?:nntp://${hostport}/${group}(?:/${digits})?)"

     # TELNET
     set telneturl       "(?:telnet://${login}/?)"

     # WAIS (see also RFC1625)
     set wpath           "(?:${uchar}*)"
     set wtype           "(?:${uchar}*)"
     set database        "(?:${uchar}*)"
     set waisdoc         "(?:wais://${hostport}/${database}/${wtype}/${wpath})"
     set waisindex       "(?:wais://${hostport}/${database}${qm}${search})"
     set waisdatabase    "(?:wais://${hostport}/${database})"
     # The straightforward form would be the alternation of waisdatabase,
     # waisindex and waisdoc. Speed up: the 3 types share a common prefix.
     set waisurl         "(?:wais://${hostport}/${database}(?:(?:/${wtype}/${wpath})|${qm}${search})?)"

     # PROSPERO
     set fieldvalue      "(?:(?:${uchar}|\[?:@&\])*)"
     set fieldname       "(?:(?:${uchar}|\[?:@&\])*)"
     set fieldspec       "(?:;${fieldname}=${fieldvalue})"
     set psegment        "(?:(?:${uchar}|\[?:@&=\])*)"
     set ppath           "(?:${psegment}(?:/${psegment})*)"
     set prosperourl     "(?:prospero://${hostport}/${ppath}(?:${fieldspec})*)"

     # LDAP (see also RFC1959)
     # First, import stuff from RFC 1779 (Distinguished Names).
     # We've modified things a bit.
     set dn_separator    "(?:\[\;,\])"
     set dn_optional_space "(?:${nl}?${space}*)"
     set dn_spaced_separator "(?:${dn_optional_space}${dn_separator}${dn_optional_space})"
     set dn_oid          "(?:${digits}(?:${dot}${digits})*)"
     set dn_keychar      "(?:${xalphanum}|${space})"
     set dn_key          "(?:${dn_keychar}+|(?:OID|oid)${dot}${dn_oid})"
     set dn_string       "(?:${uchar}*)"
     set dn_attribute    "(?:(?:${dn_key}${dn_optional_space}=${dn_optional_space})?${dn_string})"
     set dn_name_component "(?:${dn_attribute}(?:${dn_optional_space}${plus}${dn_optional_space}${dn_attribute})*)"
     set dn_name         [join [list "(?:${dn_name_component}(?:${dn_spaced_separator}${dn_name_component})*" \
                         "${dn_spaced_separator}?)" ] ""]

     # RFC 1558 defines the filter syntax, but that requires a PDA to
     # recognize. Since that's too powerful for a regular expression, we
     # allow any char between the parentheses (which have to be there).
     # The backslashes are doubled so that the RE engine sees escaped,
     # literal parentheses; a single backslash would be consumed by Tcl
     # and leave a plain grouping parenthesis behind.
     set ldap_filter     "(?:\\(${xchar}+\\))"

     # This is from RFC 1777. It defines an attributetype as an
     # 'OCTET STRING', whatever that is.
     set ldap_attr_type  "(?:${uchar}+)"

     # Now we are at the grammar of RFC 1959.
     set ldap_attr_list  "(?:${ldap_attr_type}(?:,${ldap_attr_type})*)"
     set ldap_attrs      "(?:${ldap_attr_list}?)"

     set ldap_scope      "(?:base|one|sub)"
     set ldapurl         [join [list "(?:ldap://(?:${hostport})?/${dn_name}(?:${qm}${ldap_attrs}" \
                         "(?:${qm}${ldap_scope}(?:${qm}${ldap_filter})?)?)?)" ] ""]

     # RFC 2056 defines the format of URLs for the Z39.50 protocol.
     set z_database      "(?:${uchar}+)"
     set z_docid         "(?:${uchar}+)"
     set z_elementset    "(?:${uchar}+)"
     set z_recordsyntax  "(?:${uchar}+)"
     set z_scheme        "(?:z39${dot}50\[rs\])"
     set z39_50url       [join [list "(?:${z_scheme}://${hostport}(?:/(?:${z_database}(?:${plus}" \
                         "${z_database})*(?:${qm}${z_docid})?)?(?:\;esn=${z_elementset})?" \
                         "(?:\;rs=${z_recordsyntax}(?:${plus}${z_recordsyntax})*)?))" ] ""]

     # RFC 2111 defines the format for cid/mid URLs.
     set url_addr_spec   "(?:(?:${uchar}|\[;?:@&=\])*)"
     set message_id      $url_addr_spec
     set content_id      $url_addr_spec
     set cidurl          "(?:cid:${content_id})"
     set midurl          "(?:mid:${message_id}(?:/${content_id})?)"

     # RFC 2122 defines the Vemmi URLs.
     set vemmi_attr      "(?:(?:${uchar}|\[/?:@&\])*)"
     set vemmi_value     "(?:(?:${uchar}|\[/?:@&\])*)"
     set vemmi_service   "(?:(?:${uchar}|\[/?:@&=\])*)"
     set vemmi_param     "(?:\;${vemmi_attr}=${vemmi_value})"
     set vemmiurl        "(?:vemmi://${hostport}(?:/${vemmi_service}(?:${vemmi_param}*))?)"

     # RFC 2192 for IMAP URLs.
     # Import from RFC 2060.
     # set imap4_astring    ""
     # set imap4_search_key ""
     # set imap4_section_text ""
     set imap4_nz_number $nz_digits
     set achar           "(?:${uchar}|\[&=~\])"
     set bchar           "(?:${uchar}|\[&=~:@/\])"
     set enc_auth_type   "(?:${achar}+)"
     set enc_list_mbox   "(?:${bchar}+)"
     set enc_mailbox     "(?:${bchar}+)"
     set enc_search      "(?:${bchar}+)"
     set enc_section     "(?:${bchar}+)"
     set enc_user        "(?:${achar}+)"
     # Keywords are spelled as per-letter classes to match them
     # case-insensitively without switching the whole RE to -nocase.
     set i_auth          "(?:\;\[Aa\]\[Uu\]\[Tt\]\[Hh\]=(?:${ast}|${enc_auth_type}))"
     set i_list_type     "(?:\[Ll\](?:\[Ii\]\[Ss\]\[Tt\]|\[Ss\]\[Uu\]\[Bb\]))"
     set i_mailboxlist   "(?:${enc_list_mbox}?\;\[Tt\]\[Yy\]\[Pp\]\[Ee\]=${i_list_type})"
     set i_uidvalidity   [join [list "(?:\;\[Uu\]\[Ii\]\[Dd\]\[Vv\]\[Aa\]\[Ll\]\[Ii\]\[Dd\]\[Ii\]" \
                         "\[Tt\]\[Yy\]=${imap4_nz_number})" ] ""]
     set i_messagelist   "(?:${enc_mailbox}(?:${qm}${enc_search})?(?:${i_uidvalidity})?)"
     set i_section       "(?:/\;\[Ss\]\[Ee\]\[Cc\]\[Tt\]\[Ii\]\[Oo\]\[Nn\]=${enc_section})"
     set i_uid           "(?:/\;\[Uu\]\[Ii\]\[Dd\]=${imap4_nz_number})"
     set i_messagepart   "(?:${enc_mailbox}(?:${i_uidvalidity})?${i_uid}(?:${i_section})?)"
     set i_command       "(?:${i_mailboxlist}|${i_messagelist}|${i_messagepart})"
     set i_userauth      "(?:(?:${enc_user}(?:${i_auth})?)|(?:${i_auth}(?:${enc_user})?))"
     set i_server        "(?:(?:${i_userauth}@)?${hostport})"
     set imapurl         "(?:imap://${i_server}/(?:$i_command)?)"

     # RFC 2224 for NFS.
     # The hyphen is escaped so that it is a literal member of the class;
     # unescaped, "$-_" would be a range covering "/", "<", ">" and more.
     set nfs_mark        {[$\-_.!~*'(),]}
     set nfs_unreserved  "(?:${alphanum}|${nfs_mark})"
     set nfs_pchar       "(?:${nfs_unreserved}|${escape}|\[:@&=+\])"
     set nfs_segment     "(?:${nfs_pchar}*)"
     set nfs_path_segs   "(?:${nfs_segment}(?:/${nfs_segment})*)"
     set nfs_url_path    "(?:/?${nfs_path_segs})"
     set nfs_rel_path    "(?:${nfs_path_segs}?)"
     set nfs_abs_path    "(?:/${nfs_rel_path})"
     set nfs_net_path    "(?://${hostport}(?:${nfs_abs_path})?)"
     set nfs_rel_url     "(?:${nfs_net_path}|${nfs_abs_path}|${nfs_rel_path})"
     set nfsurl          "(?:nfs:${nfs_rel_url})"

     # Combining all the different URL formats into a single regex.
     return [join [list $httpurl $ftpurl $newsurl $nntpurl \
                 $telneturl $gopherurl $waisurl $mailtourl \
                 $fileurl $prosperourl $ldapurl $z39_50url \
                 $cidurl $midurl $vemmiurl $imapurl $nfsurl] |]
 }