Version 1 of Scraping timeentry.kforce.com

Updated 2012-12-09 08:24:37 by RLE

Summary

timeentry.kforce.com doesn't provide an API, so this web-scraping script was born

Code

#! /bin/env tclsh

package require tls
package require http
package require tdom

package require sha256

# form --
#
#   Parse the HTML body of a completed http transaction and collect the
#   <input> elements it contains.
#
# Arguments:
#   token   http token returned by http::geturl
#
# Results:
#   A dict with the keys:
#     data    the raw response body
#     html    the tDOM document command created for the body
#     fields  dict mapping each input's name (or, failing that, its id)
#             to its value attribute ("" when the attribute is absent)
#
#   NOTE(review): the document is created through an objVar local to this
#   proc; tDOM may free such documents when the variable goes out of
#   scope, so the returned html handle should be treated as possibly
#   stale -- confirm before using it.
proc form token {
    set data [http::data $token]
    dom parse -html $data html
    $html documentElement element
    $element normalize
    set fields [dict create]
    foreach input [$element getElementsByTagName input] {
        # Start every input with no name; otherwise an input that has
        # neither a name nor an id would silently reuse the previous
        # input's name and overwrite that field's value.
        unset -nocomplain name
        if {[$input hasAttribute name]} {
            set name [$input getAttribute name]
        } elseif {[$input hasAttribute id]} {
            set name [$input getAttribute id]
        }
        puts stderr "input: [$input asList]"
        # getAttribute raises an error when the attribute is missing.
        if {[catch {set value [$input getAttribute value]}]} {
            set value ""
        }
        if {[info exists name]} {
            dict set fields $name $value
        }
    }
    return [dict create data $data html $html fields $fields]
}

# cookies --
#
#   Extract the cookies set by the response behind http token t.
#
# Arguments:
#   t   http token returned by http::geturl
#
# Results:
#   A list of "name=value" strings, one per Set-Cookie header (header
#   name matched case-insensitively).  Attributes after the first ";"
#   are dropped, and cookies whose value is empty are skipped.
proc cookies {t} {
    set jar {}
    foreach {header content} [http::meta $t] {
        if {[string tolower $header] ne "set-cookie"} {
            continue
        }
        # Keep only the leading name=value pair of the header.
        set pair [lindex [split $content \;] 0]
        if {[lindex [split $pair =] 1] == ""} {
            continue
        }
        lappend jar $pair
    }
    return $jar
}

# formatCookies --
#
#   Build the header list expected by http::geturl -headers from a list
#   of "name=value" cookie strings.
#
# Results:
#   A two-element list: the header name "Cookie" and the cookies joined
#   with "; ".
proc formatCookies cookies {
    set header [join $cookies "; "]
    return [list Cookie $header]
}

# timecard_id --
#
#   Extract the timecard ID from a link node whose href points at
#   /TimeEntry.Web/Shared/Timecard.aspx?ID=<id>...
#
# Arguments:
#   node   node command that answers "getAttribute href"
#
# Results:
#   The value of the ID query parameter (everything up to the next "&"
#   or the end of the href).  Raises an error naming the offending href
#   when the link does not have the expected form.
proc timecard_id node {
    set href [$node getAttribute href]
    # Dots are escaped so they only match literal dots; the ID may be
    # followed by further parameters or be the last one.
    if {![regexp {/TimeEntry\.Web/Shared/Timecard\.aspx\?ID=([^&]+)} $href -> id]} {
        error "cannot extract timecard ID from link: $href"
    }
    return $id
}

# clean --
#
#   Replace every value that consists solely of a non-breaking space
#   (\xa0) with the empty string.
#
# Arguments:
#   dict   the dictionary to clean (passed by value)
#
# Results:
#   The cleaned dictionary; keys and their order are preserved.
proc clean dict {
    # "dict for" iterates over the value substituted here, so updating
    # the local variable inside the loop is safe.
    dict for {key val} $dict {
        if {$val eq "\xa0"} {
            dict set dict $key {}
        }
    }
    return $dict
}

# search1 --
#
#   Locate the element whose text contains pattern, climb three levels,
#   step to the first following sibling and return the text of its
#   table/tr/td cell.
#
#   The XPath is braced, so Tcl does not substitute $pattern; the
#   variable reference is left for tDOM's XPath engine to resolve.
proc search1 {node pattern} {
    return [[$node selectNodes {//*[contains(text(),$pattern)]/../../../following-sibling::*[1]/table/tr/td}] text]
}

# search2 --
#
#   Locate the element whose text contains pattern, climb three levels,
#   advance two siblings and return the text of the table/tr/td cell
#   found there.
#
#   The XPath is braced, so $pattern is left for tDOM to resolve.
proc search2 {node pattern} {
    set node [$node selectNodes {//*[contains(text(),$pattern)]/../../..}]
    # Each navigation call stores the node it reaches back into "node".
    $node nextSibling node
    $node nextSibling node
    set node [$node selectNodes {table/tr/td}]
    return [$node text]
}

# search3 --
#
#   Locate the element whose text contains pattern, climb four levels,
#   then walk: previous sibling -> first child -> next sibling -> next
#   sibling, and return the asText rendering of the table/tr/td cell
#   found there.
#
#   The XPath is braced, so $pattern is left for tDOM to resolve.
proc search3 {node pattern} {
    set node [$node selectNodes {//*[contains(text(),$pattern)]/../../../..}]
    # Each navigation call stores the node it reaches back into "node".
    $node previousSibling node
    $node firstChild node
    $node nextSibling node
    $node nextSibling node
    set node [$node selectNodes {table/tr/td}]
    return [$node asText]
}

# search4 --
#
#   Locate the element whose text contains pattern, climb four levels,
#   then walk: next sibling -> first child -> next sibling -> next
#   sibling, and return the asText rendering of the table/tr/td cell
#   found there.
#
#   The XPath is braced, so $pattern is left for tDOM to resolve.
proc search4 {node pattern} {
    set node [$node selectNodes {//*[contains(text(),$pattern)]}]
    # Every step stores the node it reaches back into "node".
    foreach step {
        parentNode parentNode parentNode parentNode
        nextSibling firstChild nextSibling nextSibling
    } {
        $node $step node
    }
    set node [$node selectNodes {table/tr/td}]
    return [$node asText]
}

# search5 --
#
#   Locate the element whose text contains pattern, go to its parent and
#   return the asText rendering of the parent's first following sibling.
#
#   The XPath is braced, so $pattern is left for tDOM to resolve.
proc search5 {node pattern} {
    return [[$node selectNodes {//*[contains(text(),$pattern)]/../following-sibling::*[1]}] asText]
}

# search6 --
#
#   Locate the element whose text contains pattern, climb four levels,
#   advance two siblings and return the asText rendering of that node.
#
#   The XPath is braced, so $pattern is left for tDOM to resolve.
proc search6 {node pattern} {
    set node [$node selectNodes {//*[contains(text(),$pattern)]/../../../..}]
    # Each call stores the sibling it reaches back into "node".
    foreach step {nextSibling nextSibling} {
        $node $step node
    }
    return [$node asText]
}

# hours --
#
#   Read the three-row hours table of a printed timecard.
#
# Arguments:
#   node      node to run the XPath query on
#   pattern   text identifying the header cell of the hours table; the
#             row is taken to be two levels above the matching element
#
# Results:
#   A list of three rows (the matched row and the two sibling rows that
#   follow it).  Each row is a list with one entry per child node, and
#   each entry is the list of text values found one level below that
#   child.
#
#   Raises an error when no element matches pattern.
proc hours {node pattern} {
    set table [list]
    # The XPath is braced, so $pattern is left for tDOM to resolve.
    set node [$node selectNodes {//*[contains(text(),$pattern)]/../..}]
    if {$node eq {}} {
        # "raise" is a Tk window command, not a way to signal errors;
        # "error" delivers the intended message.
        error "Hours should not be parsed on a timecard with no hours"
    }
    for {set i 0} {$i<3} {incr i} {
        set row [list]
        foreach child [$node childNodes] {
            set values [list]
            foreach text [$child selectNodes */text()] {
                lappend values [$text asText]
            }
            lappend row $values
        }
        lappend table $row
        # Move on to the next row; the result is stored back in "node".
        $node nextSibling node
    }
    return $table
}

# Route https:// URLs through the tls package and present a browser-like
# user agent on every request.
::http::register https 443 ::tls::socket
::http::config -useragent {Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:17.0) Gecko/20100101 Firefox/17.0}

# Prompt on stderr and read the credentials from stdin (the password is
# read like any other line).
puts stderr "enter username: "
set username [gets stdin]
puts stderr "enter password: "
set password [gets stdin]

# Login form fields carrying the credentials.
set creds [dict create]
dict set creds {ctl00$ContentPlaceHolder1$txtUserName} $username
dict set creds {ctl00$ContentPlaceHolder1$txtPassword} $password

# Step 1: fetch the login page to obtain the initial cookies and the
# form inputs already present in the page.
set token [http::geturl https://timeentry.kforce.com/TimeEntry.Web/Login.aspx]
set cookies [cookies $token]
puts stderr "\ncookies1: $cookies"
set fields [dict get [form $token] fields]
http::cleanup $token

# Inputs a browser would add when the login image button is clicked.
set other_inputs {
    {ctl00$ContentPlaceHolder1$ImageButton2.x} 0
    {ctl00$ContentPlaceHolder1$ImageButton2.y} 0
    ctl00_RoleRadMenu_ClientState {}
}

set login [dict merge $fields $other_inputs $creds]
# Log the form being posted, but never the plaintext password.
puts stderr [dict remove $login {ctl00$ContentPlaceHolder1$txtPassword}]


# Step 2: post the login form.
set token [http::geturl https://timeentry.kforce.com/TimeEntry.Web/Login.aspx \
    -headers [formatCookies $cookies] \
    -query [http::formatQuery {*}$login]]
# $cookies is a flat list of "name=value" strings, not a dict, so it
# cannot be combined with "dict merge" (that fails on an odd number of
# cookies and mis-pairs an even number).  Merge by cookie name instead,
# letting cookies from the newer response replace older ones.
set jar [dict create]
foreach cookie [concat $cookies [cookies $token]] {
    dict set jar [lindex [split $cookie =] 0] $cookie
}
set cookies [dict values $jar]
puts stderr "\ncookies2: $cookies"
http::cleanup $token

# Load the timecard-history page to pick up its cookies and form inputs.
set token [http::geturl \
    https://timeentry.kforce.com/TimeEntry.Web/Consultant/TimecardHistory.aspx \
    -headers [formatCookies $cookies]]
lappend cookies {*}[cookies $token]
puts stderr "\ncookies3: $cookies"
set fields [dict get [form $token] fields]
http::cleanup $token

# Form inputs that fire the grid's PageSize command with a value of 50.
set newfields [dict create \
    __EVENTARGUMENT {FireCommand:ctl00$ContentPlaceHolder1$grdTimecardHistory$ctl00;PageSize;50} \
    __EVENTTARGET {ctl00$ContentPlaceHolder1$grdTimecardHistory} \
    {ctl00$ContentPlaceHolder1$grdTimecardHistory$ctl00$ctl03$ctl01$PageSizeComboBox} 50 \
    ctl00_ContentPlaceHolder1_grdTimecardHistory_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState \
        {{"logEntries":[],"value":"50","text":"50","enabled":true}} \
    ctl00_ContentPlaceHolder1_historyDetailView_ClientState {}]
puts stderr "\ncookies4: $cookies"
set fields [dict merge $fields $newfields]

# Post the page's own inputs back together with the page-size request;
# the response token is consumed by the code that follows.
set token [http::geturl \
    https://timeentry.kforce.com/TimeEntry.Web/Consultant/TimecardHistory.aspx \
    -query [http::formatQuery {*}$fields] \
    -headers [formatCookies $cookies]]

# Parse the history response and build one summary dict per grid row.
set data [http::data $token]
http::cleanup $token
dom parse -html $data html
$html documentElement root
$root normalize
set timecard_history [$root getElementById ctl00_ContentPlaceHolder1_grdTimecardHistory_ctl00]
if {$timecard_history eq ""} {
    # Without this check the script dies with: invalid command name ""
    error "timecard history grid not found in the response (login may have failed)"
}
# All data rows of the grid, regardless of how many tbody sections hold them.
set timecard_history [$timecard_history selectNodes tbody/tr]

# Names given to the cells that follow the leading link cell of a row.
set columns {company assignment status notes}
set timecards [list]
foreach row $timecard_history {
    set timecard [dict create]
    # The first cell holds the link to the timecard; the rest are data.
    set fields [lassign [$row selectNodes td] id]
    dict set timecard id [timecard_id [$id selectNodes a]]
    for {set i 0} {$i<[llength $columns]} {incr i} {
        dict set timecard [lindex $columns $i] [[lindex $fields $i] text]
    }

    set timecard [clean $timecard]
    # For testing, restrict to a single timecard here, e.g.:
    #   if {[dict get $timecard id] == 972874} { ... }
    lappend timecards $timecard
}

        #https://timeentry.kforce.com/TimeEntry.Web/Shared/Timecard.aspx?ID=[dict get $timecard id]&Mode=Edit 
foreach timecard $timecards[set timecards [list]] {
    set token [http::geturl \
        https://timeentry.kforce.com/TimeEntry.Web/Shared/PrintTimecard.aspx?TimecardID=[dict get $timecard id]&ReportFormat=html \
        -headers [formatCookies $cookies]]
    set data [http::data $token]
    http::cleanup $token
    dom parse -html $data html
    $html documentElement root
    $root normalize

    dict set timecard consultant [search1 $root "Consultant Name:"]
    dict set timecard employee_id [search2 $root "Employee ID:"]
    dict set timecard client [search1 $root "Client:"]
    dict set timecard week [search2 $root "Week Ending:"]
    dict set timecard assignment [search3 $root "Assignment ID:"]
    dict set timecard fax [search2 $root "Fax Number:"]
    dict set timecard location [search4 $root "Client Location:"]
    dict set timecard title [search2 $root "Job Title:"]
    dict set timecard cost_center [search5 $root "Cost Center:"]
    dict set timecard purchase_order [search5 $root "Purchase Order:"]
    if {[dict get $timecard status] eq "Complete"} {
        dict set timecard hours [hours $root "Hours"]
    }
    dict set timecard notes [search6 $root "Special Timecard Notes"]

    set timecard [clean $timecard]

    lappend timecards $timecard

    #set val [$root selectNodes {//*[contains(text(),"key:")]}]
    #puts "path1: [$val toXPath]"
    #set val [$root selectNodes {//*[contains(text(),"val")]}]
    #puts "path2: [$val toXPath]"
}

# Write every timecard dict to a file in the current directory; the file
# is named after the SHA-256 hex digest of the dict's string form.
foreach timecard $timecards {
    set chan [open [set sig [::sha2::sha256 -hex $timecard]] w]
    puts $chan $timecard
    close $chan
}