EpubDump

Keith Vetter 2014-03-14 : Along with the recently posted epubCreator, here's another tool for working with epubs . This one opens up an epub and extracts all the metadata associated with it.

If you run it using tclsh, it will print the results to the terminal. If you run it using wish or specify "--tk" it will display the results in a window along with the book's cover image.

##+##########################################################################
#
# epubDump.tsh -- dumps out metadata in an epub file
# by Keith Vetter 2014-03-14
#

package require Tcl 8.6
package require vfs::zip
package require tdom

set epubMount /__epub

set opf_namespaces {ns "http://www.idpf.org/2007/opf"
    xsi "http://www.w3.org/2001/XMLSchema-instance"
    opf "http://www.idpf.org/2007/opf"
    dcterms "http://purl.org/dc/terms/"
    calibre "http://calibre.kovidgoyal.net/2009/metadata"
    dc "http://purl.org/dc/elements/1.1/"}
set meta_namespaces {mm urn:oasis:names:tc:opendocument:xmlns:container}

proc DumpEpub {epubFile} {
    set mnt [::vfs::zip::Mount [file normalize $epubFile] $::epubMount]
    try {
        DumpEpub2 $epubFile
    } finally {
        ::vfs::zip::Unmount $mnt $::epubMount
    }
}

proc DumpEpub2 {epubFile} {
    set opf_file [FindOPF]
    set opf_dir [file dirname $opf_file]
    set opf [ReadAllData [file join $::epubMount $opf_file]] ; list
    
    dom parse $opf doc
    $doc selectNodesNamespaces $::opf_namespaces
    set root [$doc documentElement]

    set keys [list epub title author "epub version"]
    set values {}
    lappend values $epubFile
    lappend values [[[$root selectNodes //dc:title] firstChild] nodeValue]
    set authors {}
    foreach node [$root selectNodes //dc:creator] {
        lappend authors [[$node firstChild] nodeValue]
    }
    lappend values [join $authors ", "]
    lappend values [$root getAttribute version]
    
    for {set metaNode [[$root selectNodes //ns:metadata] firstChild]} \
        {$metaNode ne ""} \
        {set metaNode [$metaNode nextSibling]} {
        set nodeName [$metaNode nodeName]
        if {$nodeName in {dc:title dc:creator}} continue
        
        if {$nodeName eq "meta"} {
            if {[$metaNode hasAttribute name]} {
                set name [$metaNode getAttribute name]
                set content [$metaNode getAttribute content]
                lappend keys "meta/$name"
                lappend values $content
            }
        } elseif {[string match "dc:*" $nodeName]} {
            set nodeName [string range $nodeName 3 end]
            lappend keys $nodeName
            set value [[$metaNode firstChild] nodeValue]
            if {$nodeName eq "identifier" && [$metaNode hasAttribute opf:scheme]} {
                set scheme [$metaNode getAttribute opf:scheme]
                lappend values "$scheme $value"
            } else {
                lappend values $value
            }
        }
    }

    # Cover image
    lappend keys "cover image"
    set value ""
    set coverName ""
    while {1} {
        # Epub 3.0 way of specifying the cover image
        set coverNode [$root selectNodes {//ns:item[@properties="cover-image"]}]
        if {$coverNode ne ""} {
            set href [$coverNode getAttribute href]
            set coverName [file join $opf_dir $href]
            set value $coverName
            break
        }

        # Epub 2.0 way of specifying the cover image
        set coverNode [$root selectNodes {//ns:meta[@name="cover"]}]
        if {$coverNode ne ""} {
            set coverId [$coverNode getAttribute content]
            set coverItemNode [$root selectNodes //ns:item\[@id=\"$coverId\"\]]
            set href [$coverItemNode getAttribute href]
            set coverName [file join $opf_dir $href]
            set value $coverName
            break
        }

        # non-standard ways of specifying cover image
        set guessCover [$root selectNodes {
            //ns:item[translate(@href,"MSRCOVE","msrcove")="msrcover.jpg"
                      and starts-with(@media-type,"image/")]}]
        if {$guessCover ne ""} {
            set href [$guessCover getAttribute href]
            set coverName [file join $opf_dir $href]
            set value "? $coverName  ?"
            break
        }
        set guess [$root selectNodes {//ns:item[contains(translate(@href,"COVER","cover"),"cover")
                                                and starts-with(@media-type,"image/")]}]
        if {$guess eq ""} {
            set guess [$root selectNodes {//ns:item[contains(@href,"_msr_cvi_r")
                                                    and starts-with(@media-type,"image/")]}]
        }            
        if {[llength $guess] > 0} {
            set href [[lindex $guess 0] getAttribute href]
            set coverName [file join $opf_dir $href]
            set value "?? $coverName  ??"
            break
        }
        # No luck finding cover image
        break
    }
    lappend values $value
    
    # Spine items
    lappend keys "spine items"
    lappend values [llength [[$root selectNodes //ns:spine] childNodes]]
    
    unset doc
    ShowResult $keys $values $coverName
}
proc ShowResult {keys values coverName} {
    if {[info exists ::tk_version]} {
        set results [PrettyPrint $keys $values]
        ShowCover [file tail [lindex $values 0]] $coverName $results
    } else {
        puts [PrettyPrint $keys $values]
        puts ""
    }
}

proc PrettyPrint {keys values} {
    set max 0
    foreach key $keys { set max [expr {max($max,[string length $key])}] }
    set max [expr {min($max,15)}]
    set maxValue [expr {55-$max}]
    set result "[file tail [lindex $values 0]]\n\n"
    foreach key $keys value $values {
        if {[string length $value] > $maxValue} { set value "[string range $value 0 $maxValue-3]..." }
        append result [format "%-${max}s : %s\n" $key $value]
    }
    return $result
}
proc ReadAllData {fname} {
    set fin [open $fname r]
    set data [read $fin] ;list
    close $fin
    return $data
}
proc FindOPF {} {
    set container [ReadAllData [file join $::epubMount META-INF container.xml]] ; list
    dom parse $container doc
    set root [$doc documentElement]
    set opf_file [[$root selectNodes -namespace $::meta_namespaces //mm:rootfile] \
                      getAttribute full-path]
    unset doc
    return $opf_file
}
proc ShowCover {epubName coverName result} {
    set x [expr {[lindex [concat [.c bbox all] x x -5 x] 2] + 5}]
    set item [.c create text $x 0 -anchor nw -text [string trim $result] -font textFont]
    lassign [.c bbox $item] x0 y0 x1 y1
    incr y1 5

    if {$coverName ne ""} {
        set iname [image create photo -file [file join $::epubMount $coverName]]
        
        if {[image width $iname] < ($x1-$x0)} {
            set xmid [expr {($x0+$x1)/2}]
            set item [.c create image $xmid $y1 -anchor n -image $iname]
        } else {
            set item [.c create image $x $y1 -anchor nw -image $iname]
        }
        .c create rect [.c bbox $item] -fill {} -outline black -width 1
    }
    
    lassign [.c bbox all] . . x1 y1
    .c lower [.c create rect $x 0 $x1 $y1 -tag a -fill white -outline white]

    # Must fit this entry
    .c config -width [expr {max($x1-$x,[.c cget -width])}]
    .c config -height [expr {max($y1,[.c cget -height])}]
    .c config -scrollregion [.c bbox all]
}
proc DoDisplay {} {
    if {! [info exists ::tk_version]} return
    package require Img
    wm deiconify .
    wm title . epubDump
    ::ttk::scrollbar .sb_x -command [list .c xview] -orient horizontal
    canvas .c -xscrollcommand [list .sb_x set] -highlightthickness 0 -bg red
    pack .sb_x -side bottom -fill x
    pack .c -side top -fill both -expand 1

    bind .c <2> [bind Text <2>]         ;# Enable dragging w/ <2>
    bind .c <B2-Motion>         [bind Text <B2-Motion>]

    if {"textFont" ni [font names]} {
        font create textFont -family Courier -size 8 -weight bold
    }
}    

################################################################
set epubFile ~/Downloads/On_Basilisk_Station.epub
if {"--tk" in $argv} { package require Tk }
DoDisplay
if {$tcl_interactive} return
foreach epubFile $argv {
    if {$epubFile eq "--tk"} continue
    DumpEpub $epubFile
}
return