BibTeX parser

NEM 3Mar2005: Here's a very simple parser for the BibTeX bibliography database format. It's rather braindead at the moment, and is really not much more than a tokeniser. To use it, call bibtex::parse with the contents of a database, and you'll get back a list of entries. Each entry is itself a list of three elements: type (e.g. article, thesis, etc), key (whatever key it was given in the database), and a final list of key/value pairs (in dict form) representing the record. It doesn't handle BibTeXs (or TeXs) insane file format fully, and doesn't handle @string macros etc. There may well be further restrictions on what it can cope with. It suffices for what I need it for though -- a quick and dirty hack to dump a bibtex db to a website. Anyway, enjoy!

NEM Version 0.2 -- fixed some bugs. Now successfully handles unescaped @ signs in records (not technically allowed, if I remember correctly, but often used in email addresses), by requiring that the @ be preceeded by at least one whitespace character (not the case in email addresses, or \@). It also now should handle @string macros.

NEM Version 0.3 -- changed to a SAX-style callback parser interface. You now have to pass options for callbacks which are invoked as each record etc is parsed. No documentation, and no test-suite as yet, so it may still be buggy. Started adding in the beginnings of options to handle async parsing (i.e. using the event loop), and parsing from a channel (instead of in memory), both of which will be useful for large databases. These options aren't implemented yet, though. This is likely the last version which will appear on the wiki: needs to move to a proper CVS repository soon, I think.

AK Version 0.3 has been added to Tcllib (CVS Head). Some bugs were fixed, making this version 0.4. Testsuite added. Documentation added. NOTE. documentation and code are not in sync yet. Package is not installed yet because of these rough edges.

# bibtex.tcl --
#
#      A basic parser for BibTeX bibliography databases.
#
# Copyright (c) 2005 Neil Madden.
# License: Tcl/BSD style.

package require Tcl 8.4
package provide bibtex 0.3

# A rough grammar (case-insensitive):
#
# Database        ::= (Junk '@' Entry)*
# Junk                ::= .*?
# Entry        ::= Record
#                |   Comment
#                |   String
#                |   Preamble
# Comment        ::= "comment" [^\n]* \n                -- ignored
# String        ::= "string" '{' Field* '}'
# Preamble        ::= "preamble" '{' .* '}'         -- (balanced)
# Record        ::= Type '{' Key ',' Field* '}'
#                |   Type '(' Key ',' Field* ')' -- not handled
# Type                ::= Name
# Key                ::= Name
# Field        ::= Name '=' Value
# Name                ::= [^\s\"#%'(){}]*
# Value        ::= [0-9]+
#                |   '"' ([^'"']|\\'"')* '"'
#                |   '{' .* '}'                         -- (balanced)

namespace eval bibtex {
    variable id 0
    variable data
    array set data { }
 
    # bibtex::parse --
    #
    #        Parse a bibtex file.
    #
    # parse ?options? ?bibtex?
    # where options can be:
    #        -recordcommand cmd        -- callback for each record
    #        -preamblecommand cmd        -- callback for @preamble blocks
    #        -stringcommand cmd        -- callback for @string macros
    #        -commentcommand cmd        -- callback for @comment blocks
    #        -progresscommand cmd        -- callback to indicate progress of parse
    proc parse {args} {
        variable data
        variable id
 
        # Argument processing
        if {[llength $args] < 1} {
            set err "[lindex [info level 0] 0] ?options? ?bibtex?"
            return -code error "wrong # args: should be \"$err\""
        }
        set token bibtex[incr id]
        array set options {
            -async                0
            -blocksize                1024
        }
        set options(-stringcommand) [list [namespace current]::addStrings $token]
        if {[llength $args] % 2 == 1} {
            set data($token,buffer) [lindex $args end]
            set data($token,eof) 1
            array set options [lrange $args 0 end-1]
        } else {
            set data($token,buffer) ""
            set data($token,eof) 0
            array set options [lrange $args 0 end]
            if {![info exists options(-channel)]} {
                cancel $token
                return -code error "no channel and no data given"
            }
            if {$options(-async)} {
                fileevent $options(-channel) readable \
                    [list [namespace current]::ReadChan $token]]
            } else {
                # Snarf it all up in one go for now
                set data($token,buffer) [read $options(-channel)]
                set data($token,eof) 1
            }
        }
        foreach {k v} [array get options] { set data($token,$k) $v }
        # String mappings
        set data($token,strings) { }
        if {$options(-async)} {
            cancel $token
            error "not implemented"
        } else {
            ParseRecords $token
        }
    }

    # Cleanup a parser, cancelling any callbacks etc.
    proc cancel {token} {
        variable data
        if {[info exists $data($token,channel)]} {
            fileevent $data($token,channel) readable {}
        }
        foreach key [array names data $name,*] {
            unset data($key)
        }
    }

    # bibtex::addStrings --
    #
    #        Add strings to the map for a particular parser. All strings are
    #        expanded at parse time.
    proc addStrings {token strings} {
        variable data
        eval [list lappend data($token,strings)] $strings
    }

    # Private utility routines =================================
    proc Callback {token type args} {
        variable data
        if {[info exists data($token,-${type}command)]} {
            if {$data($token,-async)} {
                after 0 $data($token,-${type}command) $args
            } else {
                eval $data($token,-${type}command) $args
            }
        }
    }

    proc ReadChan {token} {
        variable data
        set chan $data($token,-channel)
        append data($token,buffer) [read $chan]
        if {[eof $chan]} {
            set data($token,eof) 1
        }
    }

    proc ParseRecords {token} {
        variable data
        set bibtex $data($token,buffer)
 
        # Run through each block
        set db [regexp -all -inline {((?:[^@]|\S@|[^\n][\r\t\f ]*@)*)\s?@} $bibtex]
        set total [expr {([llength $db]-2)/2}]
        set step [expr {double($total) / 100.0}]
        set istep [expr {$step > 1 ? int($step) : 1}]
        set count 0
        foreach {_ block} [lrange $db 2 end] {
            if {([incr count] % $istep) == 0} {
                Callback $token progress [expr {int($count / $step)}]
            }
            if {[regexp -nocase {\s*comment([^\n])*\n(.*)} $block \
                -> cmnt rest]} {
                # Are @comments blocks, or just 1 line?
                # Does anyone care?
                Callback $token comment $cmnt
            } elseif {[regexp -nocase {\s*string[^\{]*\{(.*)\}[^\}]*} \
                $block -> rest]} {
                # string macro defs
                Callback $token string [ParseBlock $rest]
            } elseif {[regexp -nocase {\s*preamble[^\{]*\{(.*)\}[^\}]*} \
                $block -> rest]} {
                Callback $token preamble $rest
            } elseif {[regexp {([^\{]+)\{([^,]*),(.*)\}[^\}]*} $block -> \
                type key rest]} {
                # Do any @string mappings (these are case insensitive)
                set rest [string map -nocase $data($token,strings) $rest]
                Callback $token record [Tidy $type] [string trim $key] \
                    [ParseBlock $rest]
            } else {
                puts stderr "Skipping: $block"
            }
        }
    }

    proc Tidy {str} {
        string tolower [string trim $str]
    }

    proc ParseBlock {block} {
        set ret [list]
        set index 0
        while {1} {
            if {[regexp -start $index -indices -- {(\S+)[^=]*=(.*)} $block -> \
                key rest]} {

                foreach {ks ke} $key { break }
                set k [Tidy [string range $block $ks $ke]]
                foreach {rs re} $rest { break }
                foreach {v index} \
                    [ParseBibString $rs [string range $block $rs $re]] \
                    { break }
                lappend ret $k $v
            } else { break }
        }
        return $ret
    }

    proc ParseBibString {index str} {
        set count 0
        set retstr ""
        set escape 0
        foreach char [split $str ""] {
            incr index
            if {$escape} {
                set escape 0
            } else {
                if {$char eq "\{"} {
                    incr count
                    continue
                } elseif {$char eq "\}"} {
                    incr count -1
                    if {$count < 0} { incr index -1; break }
                    continue
                } elseif {$char eq ","} {
                    if {$count == 0} { break }
                } elseif {$char eq "\\"} {
                    set escape 1; continue
                } elseif {$char eq "\""} { continue }
            }
            append retstr $char
        }
        regsub -all {\s+} $retstr { } retstr
        return [list [string trim $retstr] $index]
    }
}
proc readfile file {
    set fd [open $file]
    set cn [read $fd]
    close $fd
    return $cn
}
proc progress {percent} {
     set str [format "Processing: \[%3d%%\]" $percent]
     puts -nonewline "\r$str"; flush stdout
}
proc count {type key data} {
     global count total
     if {[info exists count($type)]} {
         incr count($type)
     } else {
         set count($type) 1
     }
     incr total
}
array set count { }
set total 0
puts -nonewline "Processing: \[  0%\]"; flush stdout
bibtex::parse \
    -recordcommand count \
    -progresscommand progress [readfile [lindex $argv 0]]
puts ""
puts "Summary ======"
puts "Total: $total"
parray count

schlenk Well spoken, the file format is just insane...

Lars H: I agree, and (AFAICT) the only documentation of the BibTeX .bib format is "by example". A staggering contrast to the very comprehensive BNF grammars given for the syntax of TeX.