ucnetgrab

Aug. 2009 by rmax

mikrocontroller.net is a popular German forum for microcontroller hobbyists working with controllers like AVR or PIC.

It allows users to subscribe to discussion threads and get a notification email when something new has been posted. Unfortunately these emails only contain a link to the new posting, but not the posted text.

This script can be used as a filter in a procmail rule to replace the notification body with the actual text of the new posting. It uses the Tcl core's http package to fetch the discussion page and the tdom package to parse the HTML.


 package require http
 package require tdom

 # Write everything this script prints to stdout as UTF-8
 fconfigure stdout -encoding utf-8

 # Debugging aid: recursively print a DOM subtree to stdout.
 #   node  - tdom node command to start from
 #   space - indentation prefix for this level (defaults to none)
 # Each node is printed as "name:value", followed by one line per
 # attribute; children are indented by two further spaces.
 # A node named "Protocol" is skipped together with its children.
 proc dump {node {space {}}} {
    set tag [$node nodeName]
    if {$tag eq "Protocol"} {
        return
    }
    puts "${space}${tag}:[$node nodeValue]"
    foreach key [$node attributes] {
        puts "$space $key=[$node @$key]"
    }
    set deeper "$space  "
    foreach kid [$node childNodes] {
        dump $kid $deeper
    }
 }

 # Write a string to stdout with surrounding whitespace removed and
 # every internal run of whitespace squeezed into a single blank.
 proc print {string} {
    set trimmed [string trim $string]
    regsub -all {\s+} $trimmed { } collapsed
    puts $collapsed
 }

 # Return the text representation of the first <div> element of a
 # specific class found below a node.
 #   node  - tdom node command to search under
 #   class - exact value of the class attribute to match
 # Returns an empty string when no such <div> exists; using the empty
 # selectNodes result as a command would otherwise fail with
 # "invalid command name". If several <div>s match, the first one wins.
 proc divclass {node class} {
    set hits [$node selectNodes [format {.//div[@class='%s']} $class]]
    if {[llength $hits] == 0} {
        return ""
    }
    return [[lindex $hits 0] asText]
 }

 # Pass on the mail header: copy lines up to the first empty line.
 # gets returns 0 for the empty separator line and -1 at end of input,
 # so testing "> 0" (instead of "!= 0") also ends the loop when stdin
 # runs out before a blank line was seen, rather than looping forever.
 while {[gets stdin line] > 0} {
    puts $line
 }
 # Re-emit the blank line that separates header and body
 puts ""

 # Read the mail body and grab the URL from it.
 # url receives everything before the '#', rel the numeric anchor after it.
 if {![regexp {(https?://[^\#]*)\#([0-9]+)} [read stdin] -> url rel]} {
    # Without a match url and rel stay unset and the script would die
    # below with an unrelated "no such variable" error.
    puts stderr "ucnetgrab: no posting URL found in the mail body"
    exit 1
 }
 # Rewrite an https link to plain http (presumably because the http
 # package cannot fetch https without extra TLS setup - confirm)
 regsub {^https} $url {http} url

 # Fetch the whole thread
 set token [http::geturl $url]
 set status [http::status $token]
 set ncode [http::ncode $token]
 set html [http::data $token]
 http::cleanup $token
 # Stop on a failed transfer or a non-200 reply instead of trying to
 # parse an error or redirect page as if it were the thread
 if {$status ne "ok" || $ncode ne "200"} {
    puts stderr "ucnetgrab: fetching $url failed (status $status, HTTP code $ncode)"
    exit 1
 }

 # Parse the HTML and select the <div> with the new message.
 # selectNodes returns a list; take its first element so that div is
 # always a single node command (or empty when nothing matched).
 set dom [dom parse -html $html]
 set doc [$dom documentElement]
 set div [lindex [$doc selectNodes \
    [format {//div[@class='post box gainlayout ' and .//a[@name='%s']]} $rel]] 0]
 if {$div eq ""} {
    # Using an empty result as a node command further down would only
    # produce an "invalid command name" error.
    puts stderr "ucnetgrab: posting $rel not found in the fetched page"
    exit 1
 }

 # Print the subject
 print [divclass $div "subject"]

 # Print the author of the new message
 print [divclass $div "author"]

 # Print the time stamp of the new message
 print [divclass $div "date"]

 # Print the names of attachments, if any.
 # selectNodes yields node commands, so each one is converted to its
 # text; printing $F itself would only show the node handle.
 foreach F [$div selectNodes {.//div[@class='attachment']}] {
    print [$F asText]
 }
 # Print the full URL
 puts "$url#$rel"
 puts ""

 # Print the text of the message, one text node per output line.
 # Iterating over the selectNodes result copes with a post that has no
 # text <div> at all (nothing is printed) instead of failing on it.
 foreach t [$div selectNodes {.//div[@class='text gainlayout']}] {
    foreach n [$t selectNodes {.//text()}] {
        set data [$n data]
        # Strip one newline at the very start and one at the very end.
        # The patterns are anchored so that newlines inside a text node
        # are kept; unanchored, the first and last newline anywhere in
        # the node were removed, gluing separate lines together.
        regsub {^\n} $data {} data
        regsub {\n$} $data {} data
        puts $data
    }
 }