Version 0 of Parsing RFC2822 dates and times

Updated 2009-08-01 21:31:48 by kbk

KBK 2009-08-01 -- Emiliano Gavilán raised a question on the Tcl'ers Chat about parsing RFC2822-formatted dates and times (such as the ones that appear in mail headers). His question made me realize that I had never Wikified the code that I use for the purpose. It's a wrapper around [clock scan] that matches the format from a limited set, and then uses clock scan to extract the data.

if {![package vsatisfies [info tclversion] 8.5]} {
    package require newclock
}

namespace eval rfc2822 {

    variable datepats {}
    
}

# AddDatePat --
#
#       Internal procedure that adds a date pattern to the pattern list
#
# Parameters:
#       wpat - Regexp pattern that matches the weekday
#       wgrp - Format group that matches the weekday
#       ypat - Regexp pattern that matches the year
#       ygrp - Format group that matches the year
#       spat - Regexp pattern that matches the seconds of the minute
#       sgrp - Format group that matches the seconds of the minute
#       zpat - Regexp pattern that matches the time zone
#       zgrp - Format group that matches the time zone
#
# Results:
#       None
#
# Side effects:
#       Adds a complete regexp and a complete [clock scan] pattern to
#       'datepats'

proc rfc2822::AddDatePat { wpat wgrp ypat ygrp spat sgrp zpat zgrp } {
        
        variable datepats
        set regexp {^[[:space:]]*}
        set pat {}
        append regexp $wpat {\d\d?[[:space:]]+[[:alpha:]]+[[:space:]]+} $ypat
        append pat $wgrp {%d %b } $ygrp
        append regexp {[[:space:]]+\d\d?:\d\d} $spat
        append pat { %H:%M} $sgrp
        append regexp $zpat
        append pat $zgrp
        append regexp {[[:space:]]*$}
        lappend datepats $regexp $pat
        return
    }
    
# InitDatePats --
#
#       Internal rocedure that initializes the set of date patterns allowed in
#       an RFC2822 date
#
# Parameters:
#       permissible - 1 if erroneous (but common) time zones are to be
#                     allowed, 0 if they are to be rejected
#
# Results:
#       None.
#
# Side effects:

proc rfc2822::InitDatePats { permissible } {
        
    # Produce formats for the observed variants of ISO2822 dates.  Permissible
    # variants come first in the list; impermissible ones come later.
    
    # The year may be two digits, or four. Four digit year is done first.
    
    foreach ypat {{\d\d\d\d} {\d\d}} ygrp {%Y %y} {
        
        # The seconds of the minute may be provided, or omitted.
        
        foreach spat {{:\d\d} {}} sgrp {:%S {}} {
            
            # The weekday may be provided or omitted. It is common but
            # impermissible to omit the comma after the weekday name.
            
            foreach wpat {
                {(?:Mon|T(?:ue|hu)|Wed|Fri|S(?:at|un)),[[:space:]]+}
                {(?:Mon|T(?:ue|hu)|Wed|Fri|S(?:at|un))[[:space:]]+}
                {}
            } wgrp {
                {%a, }
                {%a }
                {}
            } wperm {
                1
                0
                1
            } {
                
                # Time zone is defined as +/- hhmm, or as a named time zone.
                # Other common but buggy formats are GMT+-hh:mm, a time
                # zone name in quotation marks, and complete omission of
                # the time zone.
                
                foreach zpat {
                    {[[:space:]]+(?:[-+]\d\d\d\d|[[:alpha:]]+)} 
                    {[[:space:]]+GMT[-+]\d\d:\d\d}
                    {[[:space:]]+"[[:alpha:]]+"}
                    {}
                } zgrp {
                    { %Z}
                    { GMT%Z}
                    { "%Z"}
                    {}
                } zperm {
                    1
                    0
                    0
                    0
                } {
                    if { ($zperm && $wperm) == $permissible } {
                        AddDatePat $wpat $wgrp $ypat $ygrp \
                            $spat $sgrp $zpat $zgrp
                    }
                }
            }
        }
    }
    return
}

# Initialize the date patterns

namespace eval rfc2822 {
    InitDatePats 1
    InitDatePats 0
    rename AddDatePat {}
    rename InitDatePats {}
}

# rfc2822::parseDate --
#
#       Parses a date expressed in RFC2822 format
#
# Parameters:
#       date - The date to parse
#
# Results:
#       Returns the date expressed in seconds from the Epoch, or throws
#       an error if the date could not be parsed.

proc rfc2822::parseDate { date } {
    variable datepats

    # Strip comments and excess whitespace from the date field

    regsub -all -expanded {
        \(              # open parenthesis
        (:?
              [^()[.\.]]   # character other than ()\
              |\\.         # or backslash escape
        )*              # any number of times
        \)              # close paren
    } $date {} date
    set date [string trim $date]

    # Match the patterns in order of preference, returning the first success

    foreach {regexp pat} $datepats {
        if { [regexp -nocase $regexp $date] } {
            return [clock scan $date -format $pat]
        }
    }

    return -code error -errorcode {RFC2822 BADDATE} \
        "expected an RFC2822 date, got \"$date\""

}

# Usage example

if {![info exists ::argv0] || [info script] ne $::argv0} return
puts [clock format \
          [rfc2822::parseDate {Mon(day), 23 Aug(ust) 2004 01:23:45 UT}]]

See also: Parsing ISO8601 dates and times


Category Date and Time Category Package Category Parsing