Version 3 of Parsing PDF

Updated 2007-10-21 19:35:01 by kpv

Keith Vetter 2007-10-20 : I've been playing with PDF and generating it on the fly. My first effort resulted in Text2PDF, but I needed more formatting control so the next step was to delve into the file format [L1 ]. PDF can be a VERY complex format with support for incremental updates and generations. I find that the only way for me to understand a new technology is to get my hands dirty, so I decided to write a parser to pull apart a PDF file.

So here is code to parse a PDF file. It understands the lexemes comprising the file, and it understands the document structure and the page tree data structure. It lets you see, but does NOT understand, the low-level stuff which makes up pages or defines fonts, etc.

If anybody else wants to delve into the PDF deeper, this code would make a good starting point.

The code here contains both the PDF parsing code plus two different ways to run it, both a command line interface and a GUI interface. In either case, it displays a PDF's catalog and root of the page layout tree, plus the meta-data for a specified page.


Andreas otto 2007-10-21 : other software solution pdflib

KPV: Note, pdflib is a commercial, non-TCL (ANSI C) product


 ##+##########################################################################
 #
 # Parsing PDF -- decodes PDF document
 # by Keith Vetter. October 2007
 #
 # http://partners.adobe.com/asn/tech/pdf/specifications.jsp
 # http://www.adobe.com/devnet/pdf/pdfs/PDFReference13.pdf


 proc ParsePDF {} {
    global D CATALOG PAGES PAGE CONTENT

    unset -nocomplain D
    unset -nocomplain CATALOG
    unset -nocomplain PAGES
    unset -nocomplain PAGE
    unset -nocomplain CONTENT

    CheckValidPDF
    GetTrailer
    ReadXREF
    ReadObject $D(root) CATALOG
    ReadObject $CATALOG(/Pages) PAGES
 }

 proc ParsePDF.File {fname} {
    set fin [open $fname r]
    fconfigure $fin -translation binary
    SetText [read $fin]
    close $fin

    ParsePDF
 }
 ##+##########################################################################
 #
 # CheckValidPDF -- does some simple sanity checks then reads in the
 # cross-reference location.
 #
 proc CheckValidPDF {} {
    global data D

    if {[string range $data 0 4] ne "%PDF-"} { error "Mal-formed PDF 1" }
    set tail [string range $data end-1024 end] ; list
    if {! [regexp {%%EOF[\r\n]*$} $tail]} { error "Mal-formed PDF 2" }
    set n [regexp {[\n\r]+startxref[\n\r]+(\d+)} $tail => fpos]
    if {! $n} { error "Mal-formed PDF 3" }
    set D(xref,fpos) $fpos
 }
 ##+##########################################################################
 #
 # GetTrailer -- reads key info out of the trailer section.
 # Not sure this is correct with incrementally updated pdf files.
 #
 proc GetTrailer {} {
    global data D

    # Is this the best way to locate the trailer??? especially with
    # multiple trailers with incremental updates
    set offset [string first trailer $data $D(xref,fpos)]
    SetPosition $offset

    set token [GetToken]                        ;# Trailer keyword
    set token [GetToken]                        ;# Start of dict
    set dict [ReadDict]
    unset -nocomplain tmp
    array set tmp $dict

    if {! [info exists tmp(/Size)]} { error "trailer missing /Size" }
    if {! [info exists tmp(/Root)]} { error "trailer missing /Root" }
    set D(xref,cnt) $tmp(/Size)
    set D(root) $tmp(/Root)
    if {[info exists tmp(/Prev)]} { set D(prev) $tmp(/Prev) }
 }
 ##+##########################################################################
 #
 # ReadXREF -- reads in the main and possibly prev XREF section.
 # NB. can there be more than 2?
 #
 proc ReadXREF {} {
    global D

    set offsets $D(xref,fpos)
    if {[info exists D(prev)]} { lappend offsets $D(prev) }
    foreach offset $offsets {
        SetPosition $offset
        GetToken
        set token [GetToken]
        foreach {start cnt} [split $token " "] break

        for {set i 0} {$i < $cnt} {incr i} {
            set id [expr {$start + $i}]
            set token [GetToken]
            scan $token "%ld %d %s" offset generation inuse
            if {$inuse eq "f"} continue
            set D(xref,$id,offset) $offset
        }
    }
 }
 ##+##########################################################################
 #
 # ReadObject -- assumes an object is just a dictionary, stores the
 # dictionary into specified array
 #
 proc ReadObject {objID arrName} {
    upvar 1 $arrName arr
    unset -nocomplain arr

    PositionAtObject $objID
    set token [GetToken]
    if {$token ne "<<"} { error "Object $objID not a dictionary" }
    set dict [ReadDict]
    array set arr $dict
 }
 proc ShowObject {id title} {
    ReadObject $id tmp
    myParray tmp $title
 }
 ##+##########################################################################
 #
 # GetPage -- Searches and extracts a given document page
 #
 proc GetPage {num} {
    global PAGE CONTENT

    set id [FindPage $num]
    if {$id eq ""} {return ""}                  ;# No page
    ReadObject $id PAGE

    # Content can be quite complex, just try for the simple case
    if {[catch {ReadObject $PAGE(/Contents) CONTENT}]} {return $id}
    if {! [string is integer -strict $CONTENT(/Length)]} { ;# Deref length
        PositionAtObject $CONTENT(/Length)
        set len [GetToken]
        append CONTENT(/Length) " => $len"
    }
    return $id
 }
 ##+##########################################################################
 #
 # FindPage -- walks the page tree to find a given page
 #
 proc FindPage {pageNum} {
    global PAGES

    if {$pageNum < 1 || $pageNum > $PAGES(/Count)} {
        return ""
        error "Page number too big"
    }

    set thisPage 1
    set kids [eval concat $PAGES(/Kids)]
    while {1} {
        foreach {id gen R} $kids {
            ReadObject $id child
            if {$child(/Type) eq "/Page"} {
                if {$thisPage == $pageNum} { return $id }
                incr thisPage
                continue
            }

            set lastPage [expr {$thisPage + $child(/Count)}]
            if {$pageNum < $lastPage} {         ;# Go down a level
                set kids [eval concat $child(/Kids)]
                break
            } else {
                set thisPage $lastPage
            }
        }
    }
 }
 ##+##########################################################################
 #
 # PositionAtObject -- "seeks" to position of a direct or indirect reference
 #
 proc PositionAtObject {ref} {
    set id [lindex $ref 0]
    SetPosition $::D(xref,$id,offset)
    set token [GetToken]                        ;# Object define
 }
 ##+##########################################################################
 #
 # myParray -- specialized version of parray pretty printing an array
 #
 proc myParray {arrName {title ""} {pattern *}} {
    upvar 1 $arrName arr

    if {$title eq ""} { set title $arrName }
    puts $title

    set max 0
    foreach name [array names arr $pattern] {
        if {[string length $name] > $max} {
            set max [string length $name]
        }
    }
    incr max 1
    if {[info exists arr(/Type)]} {
        puts [format "  %-*s = %s" $max /Type $arr(/Type)]
    }

    foreach name [lsort -dictionary [array names arr $pattern]] {
        if {$name eq "/Type"} continue
        puts [format "  %-*s = %s" $max $name $arr($name)]
    }
    puts ""
 }
 ##+##########################################################################
 #
 # ReadArray -- returns a list of everything up to
 # the matching closing array character.
 #
 # NB. can be nested w/ arrays and dictionaries
 #
 proc ReadArray {} {
    set arr {}
    while {1} {
        set token [GetToken]
        if {$token eq "]"} break
        if {$token eq "\["} {
            set token [ReadArray]
        } elseif {$token eq "<<"} {
            set token [ReadDict]
        }
        lappend arr $token
    }
    return $arr
 }
 ##+##########################################################################
 #
 # ReadDict -- returns a list of everything up to
 # the matching closing dictionary character.
 #
 # NB. can be nested w/ arrays and dictionaries
 #
 proc ReadDict {} {
    set dict {}
    while {1} {
        set name [GetToken]
        if {$name eq ">>"} break

        set value [GetToken]
        if {$value eq "<<"} {
            set value [ReadDict]
        } elseif {$value eq "\["} {
            set value [ReadArray]
        }
        lappend dict $name $value
    }
    return $dict
 }
 ##+##########################################################################
 #
 # GetToken -- breaks input stream into PDF tokens, worrying about
 # strings, boolean, hex sequences, names, arrays, dictionaries and
 # atoms.
 #
 proc GetToken {} {
    set eotoken "()<>\[]{}/%"

    # Past white space and comments
    set inComment false
    while {1} {
        set ch [GetChar]
        if {$inComment} {
            if {$ch eq "\n" || $ch eq "\r"} {
                set inComment false
            }
        } elseif {$ch eq "%"} {
            set inComment true
        } elseif {! [string is space $ch]} break
    }

    set start $::D(pos)
    while {1} {
        if {$ch eq "(" || $ch eq "\{"} {        ;# In a string or boolean
            set backslash false
            set match [expr {$ch eq "(" ? ")" : "\}"}]
            while {1} {
                set ch [GetChar]
                if {$ch eq "\\"} {
                    set backslash true
                } elseif {! $backslash && $ch eq $match} {
                    break
                } else {
                    set backslash false
                }
            }
            break
        }
        if {$ch eq "\[" || $ch eq "]"} break
        if {$ch eq "<"} {                       ;# Hex or dictionary
            set ch [GetChar]
            if {$ch eq "<"} break               ;# Dictionary
            while {1} {
                set ch [GetChar]
                if {$ch eq ">"} break
            }
            break
        }
        if {$ch eq ">"} {
            GetChar
            break
        }

        set name [expr {$ch eq "/"}]
        while {1} {
            set ch [GetChar]
            if {$ch eq "\n" || $ch eq "\r"} break
            if {$name && [string is space $ch]} break
            if {[string first $ch $eotoken] != -1} break
        }
        UnGetChar
        break
    }
    set ::D(token,start) $start
    set ::D(token,end) $::D(pos)
    return [string range $::data $start $::D(pos)]
 }
 proc GetChar {} {
    set ch [string index $::data [incr ::D(pos)]]
    return $ch
 }
 proc UnGetChar {} {
    incr ::D(pos) -1
 }
 proc SetPosition {pos} {
    set ::D(pos) [expr {$pos-1}]
 }
 proc SetText {txt} {
    set ::data $txt
    set ::D(pos) -1
 }
 ################################################################
 #
 # DEMO CODE
 #

 proc NoGUI {} {
    global argc argv
    global CATALOG PAGES PAGE CONTENT

    if {$argc == 0 || $argc > 2} {
        puts stderr "usage: parsePDF <pdf file> ?page number?"
        exit
    } elseif {$argc == 1} {
        lappend argv 1
    }
    foreach {fname pageNum} $argv break

    puts "Parsing [file tail $fname]\n"
    ParsePDF.File $fname

    myParray CATALOG Catalog
    myParray PAGES "Page Tree"
    GetPage $pageNum
    myParray PAGE "Page $pageNum Metadata"
    myParray CONTENT "Page $pageNum Contents"
 }

 if {! [info exists tk_version]} {
    NoGUI
    exit
 }

 ################################################################
 #
 # GUI DEMO CODE
 #

 if {! [catch {package require xtile}]} {
    namespace import -force ::ttk::button
 }

 set S(pageNum) 1
 set S(havePDF) 0
 set S(fname) ""

 proc DoDisplay {} {
    global S

    wm title . "Parsing PDF"
    image create photo ::img::question -data {
        R0lGODlhCQAMALMAAODf49PS1oeHiURDRQAAAJOSlVtbXG1sbre2uczLz3t7fa6usf//////////
        /////yH5BAEAAAAALAAAAAAJAAwAAwQmEAQxyCggGxwKwcAxZABBkChhoBlCIGwywKyysNmJ79lx
        7D4eKwIAOw==
    }
    image create bitmap ::img::dots -data {
        #define dots_width 16
        #define dots_height 9
        static char dots_bits = {
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x84, 0x10, 0xce, 0x39, 0x84, 0x10}}
    image create bitmap ::img::star -data {
        #define plus_width  11
        #define plus_height 9
        static char plus_bits[] = {
            0x00,0x00, 0x24,0x01, 0xa8,0x00, 0x70,0x00, 0xfc,0x01,
            0x70,0x00, 0xa8,0x00, 0x24,0x01, 0x00,0x00 }}

    frame .file -bd 2 -relief ridge
    frame .fcat -bd 2 -relief ridge
    frame .fpages -bd 2 -relief ridge
    frame .fpage -bd 2 -relief ridge

    # File area
    label .file.l -text "PDF File"
    entry .file.fname -textvariable S(fname)
    button .file.choose -image ::img::dots -command ChooseFile
    button .file.go -text Parse -command Go
    button .file.about -image ::img::question -command About

    grid .file.l .file.fname .file.choose -sticky ns -pady 5
    grid .file.go - - -pady 5
    place .file.about -anchor se -relx 1 -rely 1 -x -5 -y -5

    # Catalog and data area
    label .fcat.title -text "Catalog"
    scrollbar .fcat.sb -orient vertical -command {.fcat.catalog yview}
    listbox .fcat.catalog -listvariable S(catalog) -yscroll {.fcat.sb set}
    label .fpages.title -text "Page Tree Root"
    scrollbar .fpages.sb -orient vertical -command {.fpages.pages yview}
    listbox .fpages.pages -listvariable S(pages) -yscroll {.fpages.sb set}

    frame .fpage.btns
    label .fpage.l -text "Page"
    entry .fpage.num -width 3 -textvariable S(pageNum) -justify center \
        -validate key -vcmd {string is integer -strict %P}
    button .fpage.go -image ::img::star -state disabled -command GoPage
    scrollbar .fpage.sb -orient vertical -command {.fpage.page yview}
    listbox .fpage.page -listvariable S(page) -yscroll {.fpage.sb set}

    grid .file - - -sticky ew
    grid .fcat .fpages .fpage -sticky news
    grid columnconfigure . {0 1 2} -weight 1
    grid rowconfigure . {1} -weight 1

    pack .fcat.title -side top -fill x
    pack .fcat.sb -side right -fill y
    pack .fcat.catalog -side top -fill both -expand 1
    pack .fpages.title -side top -fill x
    pack .fpages.sb -side right -fill y
    pack .fpages.pages -side top -fill both -expand 1

    pack .fpage.btns -side top
    pack .fpage.l .fpage.num .fpage.go -in .fpage.btns -side left
    pack .fpage.sb -side right -fill y
    pack .fpage.page -side top -fill both -expand 1
    #grid x .fpage.l .fpage.num .fpage.go x
    #grid configure .fpage.go -padx 5 -sticky ns
    #grid .fpage.page - - - - .fpage.sb -sticky news
    #grid columnconfigure .fpage {0 4} -weight 1
    #grid rowconfigure .fpage {1} -weight 1

    foreach var {fname pageNum havePDF} {
        foreach t [trace info variable S($var)] {
            eval trace remove variable S($var) $t
        }
        trace variable S($var) w Tracer
    }
    set pdfs [glob -nocomplain *.pdf]
    set S(fname) [lindex $pdfs [expr {int(rand()*[llength $pdfs])}]]
 }
 proc Tracer {var1 var2 op} {
    global S

    if {$var1 ne "S"} return
    if {$var2 eq "fname"} {
        .file.go config -state \
            [expr {[file isfile $S(fname)] ? "normal" : "disabled"}]
    } elseif {$var2 eq "pageNum" || $var2 eq "havePDF"} {
        set how disabled
        if {$S(havePDF) && [string is integer -strict $S(pageNum)]} {
            set how normal
        }
        .fpage.go config -state $how
    }
 }
 proc ChooseFile {} {
    global S

    set fname [tk_getOpenFile -parent . -defaultextension ".pdf" \
                   -initialfile $S(fname) -filetypes {{{PDF Files} .pdf}} \
                   -title "PDF Parser Open File"]
    if {$fname eq ""} return
    set S(fname) $fname
 }
 proc ArrayToListBox {arrName var} {
    global S
    upvar 1 $arrName arr

    set S($var) {}
    set max 0
    foreach name [array names arr] {
        if {[string length $name] > $max} {
            set max [string length $name]
        }
    }
    incr max 1
    if {[info exists arr(/Type)]} {
        lappend S($var) [format "  %-*s = %s" $max /Type $arr(/Type)]
    }

    foreach name [lsort -dictionary [array names arr]] {
        if {$name eq "/Type"} continue
        lappend S($var) [format "  %-*s = %s" $max $name $arr($name)]
    }
 }
 proc Go {} {
    global S CATALOG PAGES

    set S(catalog) [set S(pages) [set S(page) {}]]
    if {! [file isfile $S(fname)]} return

    ParsePDF.File $S(fname)
    ArrayToListBox CATALOG catalog
    ArrayToListBox PAGES pages
    set S(havePDF) 1
    GoPage
 }
 proc About {} {
    set txt "Parsing PDF\nby Keith Vetter, Oct 2007\n\n"
    append txt "This program demos code that can parse PDF files. The\n"
    append txt "code displays a pdf's document structure including its\n"
    append txt "catalog and the root of its page tree. It will also show\n"
    append txt "the page object for any page.\n\n"

    append txt "For more info on PDF format see\n"
    append txt "http://partners.adobe.com/asn/tech/pdf/specifications.jsp"

    tk_messageBox -message $txt
 }
 proc GoPage {} {
    global S PAGES PAGE

    if {! $S(havePDF)} return
    if {! [string is integer -strict $S(pageNum)]} return
    if {$S(pageNum) < 1} {
        set S(page) {{Page number too small}}
        return
    } elseif {$S(pageNum) > $PAGES(/Count)} {
        set a [expr {$PAGES(/Count) == 1 ? "page" : "pages"}]
        set S(page) [list [list Only $PAGES(/Count) $a]]
        return
    }

    set id [GetPage $S(pageNum)]
    if {$id eq ""} {
        set S(page) "{bad page number}"
    } else {
        ArrayToListBox PAGE page
        set S(page) [concat [list "Object id: $id"] {---------} $S(page)]
    }

 }
 DoDisplay
 return

Category Printing