Aug. 2009 by [rmax]
[http://mikrocontroller.net/%|%mikrocontroller.net%|%] is a popular German forum for microcontroller hobbyists working with controllers like [http://atmel.com/%|%AVR%|%] or [http://www.microchip.com%|%PIC%|%].
It allows users to subscribe to discussion threads and receive a notification email whenever something new has been posted. Unfortunately, these emails contain only a link to the new posting, not the posted text itself.
This script can be used as a filter in a [http://procmail.org/%|%procmail%|%] rule to replace the notification body with the actual text of the new posting. It uses the Tcl core's [http] package to fetch the discussion page and the [tdom] package to parse the HTML.
----
package require http
package require tdom
# Write everything this script prints to stdout as UTF-8
fconfigure stdout -encoding utf-8
# Debugging aid: recursively write a DOM subtree to stdout.
#
# node  - node command at the root of the subtree to print
# space - prefix put in front of every line printed for this node; one
#         blank is appended for attributes and for each level of children
#
# A node named "Protocol" is skipped together with everything below it.
proc dump {node {space {}}} {
    set tag [$node nodeName]
    if {$tag ne "Protocol"} {
        puts "${space}${tag}:[$node nodeValue]"
        foreach attrName [$node attributes] {
            puts "$space $attrName=[$node @$attrName]"
        }
        set deeper "$space "
        foreach kid [$node childNodes] {
            dump $kid $deeper
        }
    }
    return
}
# Write a string to stdout on one line: leading and trailing whitespace is
# trimmed and every inner run of whitespace becomes a single blank.
proc print {string} {
    set trimmed [string trim $string]
    regsub -all {\s+} $trimmed { } collapsed
    puts $collapsed
}
# Return the text representation of the <div> element of a specific class.
#
# node  - node command below which the <div> is searched
# class - exact value the <div>'s class attribute must have
#
# Returns the text of the matching <div>. If several elements match, their
# texts are joined with newlines; if none matches, an empty string is
# returned instead of failing on an empty node handle.
proc divclass {node class} {
    set texts {}
    foreach match [$node selectNodes [format {.//div[@class='%s']} $class]] {
        lappend texts [$match asText]
    }
    return [join $texts "\n"]
}
# Pass on the mail header: copy stdin to stdout up to the first empty line.
# gets returns 0 for an empty line and -1 at end of input, so the loop only
# continues while a non-empty line was read. Testing for "!= 0" would never
# terminate on input that ends before an empty line is seen.
while {[gets stdin line] > 0} {
    puts $line
}
# Empty line separating the header from the new body
puts ""
# Read the mail body and grab the URL from it: the part before the '#'
# becomes url, the digits after it become rel (the posting's anchor).
if {![regexp {(https?://[^\#]*)\#([0-9]+)} [read stdin] -> url rel]} {
    # Without a link there is nothing to fetch. Fail with a clear message
    # instead of tripping over the unset url variable further down.
    puts stderr "no posting link found in the mail body"
    exit 1
}
# Always fetch via plain http.
# NOTE(review): presumably because no https handler is registered with the
# http package in this script - confirm.
regsub {^https} $url {http} url
# Fetch the whole thread
set token [http::geturl $url]
set status [http::status $token]
set code [http::ncode $token]
set html [http::data $token]
http::cleanup $token
# Do not go on parsing an error page or an incomplete transfer
if {$status ne "ok" || $code != 200} {
    puts stderr "fetching $url failed: status=$status, HTTP code=$code"
    exit 1
}
# Parse the HTML and select the <div> with the new message
set dom [dom parse -html $html]
set doc [$dom documentElement]
# selectNodes returns a list of nodes; use the first match so that the
# result is always either a single node command or empty.
set div [lindex [$doc selectNodes \
    [format {//div[@class='post box gainlayout ' and .//a[@name='%s']]} $rel]] 0]
if {$div eq ""} {
    # The posting is not on the page; report it instead of failing later
    # with an "invalid command name" error on the empty node handle.
    puts stderr "posting #$rel not found in $url"
    $dom delete
    exit 1
}
# Print the subject
print [divclass $div "subject"]
# Print the author of the new message
print [divclass $div "author"]
# Print the time stamp of the new message
print [divclass $div "date"]
# Print the names of attachments, if any. The loop variable holds a node
# command, so its text has to be extracted before printing.
foreach F [$div selectNodes {.//div[@class='attachment']}] {
    print [$F asText]
}
# Print the full URL
puts "$url#$rel"
puts ""
# Print the text of the message, one text node at a time. Selecting the
# text nodes straight from $div also copes with no (or more than one)
# 'text gainlayout' element inside the posting.
foreach n [$div selectNodes {.//div[@class='text gainlayout']//text()}] {
    set data [$n data]
    # Drop the last and then the first newline of the node's text, as puts
    # terminates every chunk with a newline of its own.
    regsub {(.*)\n} $data {\1} data
    regsub {\n(.*)} $data {\1} data
    puts $data
}
# Release the parsed document
$dom delete
<<categories>> Web Scraping