[KBK] 2009-08-01 -- Emiliano Gavilán raised a question on the Tcl'ers Chat about parsing RFC2822-formatted dates and times (such as the ones that appear in mail headers). His question made me realize that I had never Wikified the code that I use for the purpose. It's a wrapper around [[clock scan]] that matches the format from a limited set, and then uses [clock scan] to extract the data. ====== if {![package vsatisfies [info tclversion] 8.5]} { package require newclock } namespace eval rfc2822 { namespace export parseDate variable datepats {} } # AddDatePat -- # # Internal procedure that adds a date pattern to the pattern list # # Parameters: # wpat - Regexp pattern that matches the weekday # wgrp - Format group that matches the weekday # ypat - Regexp pattern that matches the year # ygrp - Format group that matches the year # mdpat - Regexp pattern that matches month and day # mdgrp - Format group that matches month and day # spat - Regexp pattern that matches the seconds of the minute # sgrp - Format group that matches the seconds of the minute # zpat - Regexp pattern that matches the time zone # zgrp - Format group that matches the time zone # # Results: # None # # Side effects: # Adds a complete regexp and a complete [clock scan] pattern to # 'datepats' proc rfc2822::AddDatePat { wpat wgrp ypat ygrp mdpat mdgrp spat sgrp zpat zgrp } { variable datepats set regexp {^[[:space:]]*} set pat {} append regexp $wpat $mdpat {[[:space:]]+} $ypat append pat $wgrp $mdgrp $ygrp append regexp {[[:space:]]+\d\d?:\d\d} $spat append pat { %H:%M} $sgrp append regexp $zpat append pat $zgrp append regexp {[[:space:]]*$} lappend datepats $regexp $pat return } # InitDatePats -- # # Internal rocedure that initializes the set of date patterns allowed in # an RFC2822 date # # Parameters: # permissible - 1 if erroneous (but common) time zones are to be # allowed, 0 if they are to be rejected # # Results: # None. # # Side effects: proc rfc2822::InitDatePats { permissible } { # Produce formats for the observed variants of ISO2822 dates. Permissible # variants come first in the list; impermissible ones come later. # The month and day may be "%b %d" or "%d %b" foreach mdpat {{[[:alpha:]]+[[:space:]]+\d\d?} {\d\d?[[:space:]]+[[:alpha:]]+}} \ mdgrp {{%b %d} {%d %b}} \ mdperm {0 1} { # The year may be two digits, or four. Four digit year is done # first. foreach ypat {{\d\d\d\d} {\d\d}} ygrp {%Y %y} { # The seconds of the minute may be provided, or omitted. foreach spat {{:\d\d} {}} sgrp {:%S {}} { # The weekday may be provided or omitted. It is common but # impermissible to omit the comma after the weekday name. foreach wpat { {(?:Mon|T(?:ue|hu)|Wed|Fri|S(?:at|un)),[[:space:]]+} {(?:Mon|T(?:ue|hu)|Wed|Fri|S(?:at|un))[[:space:]]+} {} } wgrp { {%a, } {%a } {} } wperm { 1 0 1 } { # Time zone is defined as +/- hhmm, or as a # named time zone. Other common but buggy # formats are GMT+-hh:mm, a time zone name in # quotation marks, and complete omission of # the time zone. foreach zpat { {[[:space:]]+(?:[-+]\d\d\d\d|[[:alpha:]]+)} {[[:space:]]+GMT[-+]\d\d:?\d\d} {[[:space:]]+"[[:alpha:]]+"} {} } zgrp { { %Z} { GMT%Z} { "%Z"} {} } zperm { 1 0 0 0 } { if { ($zperm && $wperm && $mdperm) == $permissible } { AddDatePat $wpat $wgrp $ypat $ygrp \ $mdpat $mdgrp \ $spat $sgrp $zpat $zgrp } } } } } } return } # Initialize the date patterns namespace eval rfc2822 { InitDatePats 1 InitDatePats 0 rename AddDatePat {} rename InitDatePats {} puts [join $datepats \n] } # rfc2822::parseDate -- # # Parses a date expressed in RFC2822 format # # Parameters: # date - The date to parse # # Results: # Returns the date expressed in seconds from the Epoch, or throws # an error if the date could not be parsed. proc rfc2822::parseDate { date } { variable datepats # Strip comments and excess whitespace from the date field regsub -all -expanded { \( # open parenthesis (:? [^()[.\.]] # character other than ()\ |\\. # or backslash escape )* # any number of times \) # close paren } $date {} date set date [string trim $date] # Match the patterns in order of preference, returning the first success foreach {regexp pat} $datepats { if { [regexp -nocase $regexp $date] } { return [clock scan $date -format $pat] } } return -code error -errorcode {RFC2822 BADDATE} \ "expected an RFC2822 date, got \"$date\"" } # Usage example if {![info exists ::argv0] || [info script] ne $::argv0} return puts [clock format \ [rfc2822::parseDate {Mon(day), 23 Aug(ust) 2004 01:23:45 UT}]] puts [clock format \ [rfc2822::parseDate "Tue, Jul 21 2009 19:37:47 GMT-0400"]] ====== ---- See also: [Parsing ISO8601 dates and times] ---- %|[Category Date and Time] |[Category Package] |[Category Parsing]|%