[Keith Vetter] 2018-08-31 - This package provides a scripted interface to some of the data on the fan-fiction website https://archiveofourown.org/%|%AO3 (Archive of Our Own)%|%.

I've been wanting this data for a better reading experience while reading the stories on AO3, such as having all parts of the story together, being available while off-line and a better book reading interface than a web browser. I wanted to gather all the parts and create an epub out of them, and then use a book reading app.

An official API for AO3 data has been on the https://archiveofourown.org/admin_posts/295%|%roadmap%|% for years but it's not out yet. There's a python package, https://pypi.org/project/ao3/%|%ao3%|%, that provides an interface using BeautifulSoup to scrape the web pages.

So I decided to create my own AO3 scripting interface. Fortunately, the AO3's story web pages have a very consistent format making web scraping a fragile, but viable option. It uses [tdom] and its [xpath] interface to extract data from the AO3 web pages.

Documentation is provided in the package.

----
see also: [EpubCreator] -- tool to create an epub from html pages
----

======
namespace eval ::AO3 {
    # This package provides a scripted interface to the stories on AO3 (Archive of Our Own).
    # This is NOT an official API but rather scrapes the web site for the data.
    # It is inspired by the python ao3 package at https://pypi.org/project/ao3/.
    # by Keith Vetter 2018-08-29
    #
    # Sample usage:
    #   set ao3 [::AO3::New 258626]
    #   puts "Title: [$ao3 title]"
    #   puts "Author: [$ao3 author]"
    #   puts "Words: [$ao3 words]"
    #   set storyHtml [$ao3 story]
    #   $ao3 cleanup
    #
    # API Documentation
    # =================
    #  set ao3 [::AO3::New $story_id]
    #     Creates interface object for parsing Archive of Our Own stories
    #
    #  $ao3 cleanup
    #     Frees all the resources associated with this AO3 object
    #
    #  $ao3 title
    #     Returns the title of the story
    #
    #  $ao3 author
    #     Returns the author of the story
    #
    #  $ao3 summary
    #     Returns an html summary of the story
    #
    #  $ao3 story
    #     Returns the html of the story
    #
    #  $ao3 chapter ## html|summary|count
    #     For multi-chapter stories, extract html or summary for a specified chapter
    #
    #  $ao3 additional_tags
    #     Returns a list of additional tags for the story
    #
    #  $ao3 bookmarks
    #     Returns a count of the number of bookmarks for the story
    #
    #  $ao3 category
    #     Returns a list of categories for the story
    #
    #  $ao3 chapters
    #     Returns how many chapters written and planned in the story, e.g. 5/15
    #
    #  $ao3 characters
    #     Returns a list of characters in the story
    #
    #  $ao3 comments
    #     Returns a count of the number of comments for the story
    #
    #  $ao3 fandoms
    #     Returns a list of the fandoms this story is in
    #
    #  $ao3 hits
    #     Returns a count of the number of hits for this story
    #
    #  $ao3 kudos
    #     Returns a count of the number of kudos for this story
    #
    #  $ao3 kudos_left_by
    #     Returns a list of all users who left kudos for this story
    #
    #  $ao3 language
    #     Returns the language the story is written in
    #
    #  $ao3 published
    #     Returns the date the story was published
    #
    #  $ao3 rating
    #     Returns a list of the ratings for this story
    #
    #  $ao3 relationships
    #     Returns a list of relationships in this story
    #
    #  $ao3 warnings
    #     Returns a list of warnings for this story
    #
    #  $ao3 words
    #     Returns a count of words in this story
    #
    #  $ao3 html
    #     Returns the raw html for the story
    #
    #  $ao3 id
    #     Returns this story's id
    #
    #  $ao3 json
    #     Returns most of the metadata about this story wrapped in a json object
    #
    #  $ao3 save fname
    #     Writes the raw html for the story to the file fname
    #
    #  $ao3 url
    #     Returns the url to this story's page on Archive of Our Own

    package require tdom
    package require http
    package require tls
    # NOTE(review): only -tls1 is requested explicitly here; whether newer
    # protocol versions and SNI are enabled depends on the installed tls
    # package's defaults -- confirm against the server's requirements.
    http::register https 443 [list ::tls::socket -tls1 1]

    # Set to "on" to make _assert actually evaluate its script.
    variable assertions off

    proc New {id {verbose 0} {rawHtml ""}} {
        # Creates a new instance of our AO3 object for the requested story
        #
        #   id      - AO3 work id
        #   verbose - 0 only ALWAYS messages, 1 adds INFO, 2 adds DEBUG (see _LOG)
        #   rawHtml - "" to download, "cache" to prefer a local "$id.html",
        #             the name of an existing file, or the html text itself
        #
        # Returns the name of a namespace ensemble command for the story.
        set me [_uniqueName]
        set rawHtml [_getHtml $id $verbose $rawHtml]

        # Parse first: if the html cannot be parsed we bail out before any
        # per-object state has been created.
        set dom [::dom parse -html $rawHtml]

        # The raw html lives in a namespace variable named after the object
        variable $me [dict create html $rawHtml]

        set this [dict create id $id dom $dom me $me verbose $verbose]
        set commandMap [_buildCommandMap $this]
        namespace ensemble create -command $me -map $commandMap
        return $me
    }

    # metadata stats that all have similar format in the file
    # Each entry is either "name" or {commandName keyword}.
    variable properties {
        rating category {fandoms fandom} {relationships relationship}
        {characters character} {additional_tags freeform} language
        words comments kudos hits published bookmarks chapters
    }
    # Generate one accessor proc per property.  Running the loop inside a
    # lambda keeps its working variables local instead of creating (or
    # overwriting) namespace/global variables.
    apply [list {properties} {
        foreach property $properties {
            # Single-word entries use the same word as both proc name and keyword
            lassign [concat $property $property] func keyword
            set body [format {return [_lookupStat $this %s]} $keyword]
            proc [namespace current]::$func {this} $body
        }
    } [namespace current]] $properties

    proc _LOG {this level message} {
        # Prints message to stderr if level (ALWAYS, INFO or DEBUG) is within
        # the object's verbosity; unknown levels are silently dropped.
        set lvl [lsearch -exact {ALWAYS INFO DEBUG} $level]
        if {$lvl == -1 || $lvl > [dict get $this verbose]} return
        puts stderr "[string index $level 0]: $message"
    }

    proc _getHtml {id verbose rawHtml} {
        # Either download the story's html or read it from file or cache
        #
        # Returns the html text; raises an error if the result does not
        # look like an html document.
        if {$rawHtml eq "cache"} {
            # Use "$id.html" if it exists, otherwise fall through to download
            set rawHtml ""
            if {[file exists "$id.html"]} {
                set rawHtml "$id.html"
            }
        }
        if {$rawHtml eq ""} {
            set rawHtml [_downloadStory [dict create id $id verbose $verbose]]
        } elseif {[file exists $rawHtml]} {
            _LOG [dict create verbose $verbose] INFO "reading html from file $rawHtml"
            set rawHtml [::tDOM::xmlReadFile $rawHtml]
        }

        # Sanity check.  (Searching for an empty needle with [string first]
        # always yields -1, which rejected every document.)
        if {![regexp -nocase {<html[\s>]} $rawHtml]} {
            error "ERROR: looks like bad html: '[string range $rawHtml 0 50]...'"
        }
        return $rawHtml
    }

    proc _downloadStory {this} {
        # Downloads html from AO3
        #
        # Requests the full work in a single page and raises an error on
        # any response code other than 200.
        set id [dict get $this id]
        set url "https://archiveofourown.org/works/$id?view_full_work=true&view_adult=true"
        _LOG $this INFO "downloading $url"

        set token [::http::geturl $url]
        set ncode [::http::ncode $token]
        set html [::http::data $token]
        _LOG $this DEBUG "download done: $ncode [string length $html] bytes"
        ::http::cleanup $token

        if {$ncode != 200} {
            error "ERROR: download failed: $ncode url: $url"
        }
        return $html
    }

    proc _uniqueName {} {
        # Find an unused name for our new namespace ensemble
        #
        # Starts counting at the number of existing objects and moves up
        # until a free "_objN" command name is found.
        set existing [info commands [namespace current]::_obj*]
        for {set cnt [llength $existing]} {1} {incr cnt} {
            set me "[namespace current]::_obj$cnt"
            if {$me ni $existing} break
        }
        return $me
    }

    proc _buildCommandMap {this} {
        # Creates ensemble mapping from command to function
        #
        # Every command in this namespace except New and the "_" prefixed
        # private ones becomes a subcommand that receives $this as its
        # first argument.
        set commandMap {}
        set cmds [lmap cmd [info commands [namespace current]::*] {namespace tail $cmd}]
        foreach cmd $cmds {
            if {$cmd eq "New" || [string match "_*" $cmd]} continue
            lappend commandMap $cmd [list $cmd $this]
        }
        return $commandMap
    }

    proc _assert {script expected {emsg ""}} {
        # Simple assertion mechanism with lazy evaluation
        #
        # script is only evaluated (in the caller's scope) when
        # ::AO3::assertions is "on".
        if {$::AO3::assertions ne "on"} return
        set actual [uplevel 1 $script]
        if {$actual == $expected} return
        if {$emsg eq ""} { set emsg "$actual != $expected"}
        error $emsg
    }

    proc _FindAllInDom {this tag attribute value} {
        # Uses xpath to search the dom for tag/attribute/value triplet
        # If attribute is "id" we do an exact match, otherwise use contains()
        #
        # Returns the list of matching nodes.  The value is interpolated
        # into a single-quoted xpath literal, so it must not contain "'".
        set dom [dict get $this dom]
        if {$attribute eq "id"} {
            set xpath "//$tag\[@$attribute='$value'\]"
        } else {
            set xpath "//$tag\[contains(@$attribute,'$value')\]"
        }
        _LOG $this DEBUG "xpath: $xpath"
        set nodes [$dom selectNodes $xpath]
        return $nodes
    }

    proc _innerHtml {html} {
        # Peels off the outer most tag from the html
        #
        # Drops everything up to and including the first ">" (the opening
        # tag) and then the trailing closing tag, along with the
        # whitespace next to them.
        regsub {^.*?>\s*} $html "" html
        regsub {\s*</[^>]*>\s*$} $html "" html
        return $html
    }
}

proc ::AO3::id {this} {
    # Returns this story's id
    return [dict get $this id]
}
proc ::AO3::url {this} {
    # Returns the url to this story's page on Archive of Our Own
    return "https://archiveofourown.org/works/[dict get $this id]"
}
proc ::AO3::this {this} {
    # Returns the object's internal state dictionary
    return $this
}
proc ::AO3::html {this} {
    # Returns the raw html, stored in the namespace variable named after the object
    return [dict get [set [dict get $this me]] html]
}
proc ::AO3::cleanup {this} {
    # Frees all the resources associated with this AO3 object
    unset -nocomplain [dict get $this me] ;# Delete the raw html
    [dict get $this dom] delete           ;# Delete the dom
    rename [dict get $this me] {}         ;# Delete the ensemble object
}
proc ::AO3::save {this fname} {
    # Writes the raw html for the story to the file fname
    _LOG $this INFO "saving html to $fname"
    set fout [open $fname w]
    try {
        # NOTE(review): written as utf-8 so a later read through
        # ::tDOM::xmlReadFile sees the same text -- assumes that reader
        # defaults to utf-8 when no encoding is declared; confirm.
        fconfigure $fout -encoding utf-8
        puts -nonewline $fout [::AO3::html $this]
    } finally {
        # Always release the channel, even if the write fails
        close $fout
    }
}
proc ::AO3::title {this} {
    # The title of the work is stored in an
# [author_summary_html] ##
# [username1] # [username2] # ... #
#