Version 13 of hkassem

Updated 2018-06-26 08:24:49 by hkassem

Hkassem useful tool

Some Tcl functions needed for data parsing.

tools

proc aread {path} {
        # Slurp the whole file at $path and return its contents as a string
        # (system default encoding and translation).
        set ch [open $path r]
        set contents [read $ch]
        close $ch
        return $contents
}


proc uread {path} {
        # Slurp the whole file at $path, decoding it as UTF-8, and return
        # the contents as a string.
        set ch [open $path r]
        fconfigure $ch -encoding utf-8
        set contents [read $ch]
        close $ch
        return $contents
}


proc bread {path} {
        # Read the file at $path in binary mode (no EOL translation, no
        # encoding conversion) and return the raw bytes.
        set ch [open $path r]
        fconfigure $ch -translation binary
        set bytes [read $ch]
        close $ch
        return $bytes
}

proc uwrite {data name} {
        # Write $data to file $name as UTF-8, overwriting any existing
        # contents. Note: puts appends a trailing newline.
        set ch [open $name w]
        fconfigure $ch -encoding utf-8
        puts $ch $data
        close $ch
}

proc awrite {data name} {
        # Append $data (plus a trailing newline from puts) to file $name,
        # encoded as UTF-8. The file is created if it does not exist.
        set ch [open $name a]
        fconfigure $ch -encoding utf-8
        puts $ch $data
        close $ch
}

proc striphtml { text } {

     # Strip HTML markup from $text: remove scripts, stylesheets and tags,
     # then decode common character entities. Returns the plain text.
     # Fixes vs. the original: the quot pattern had a stray leading "%",
     # the amp replacement "&" meant "the whole match" in regsub (a no-op),
     # and the numeric-reference decoder used "format c" instead of "format %c".
     set text [regsub -all -nocase {<script[^>]*?>.*?</script>} $text " "]
     set text [regsub -all -nocase {<style[^>]*?>.*?</style>} $text " "]
     set text [regsub -all -nocase {<[\/\!]*?[^<>]*?>} $text " "]
     set text [regsub -all -nocase {([\r\n])[\s]+} $text "\\1"]
     set text [regsub -all -nocase {&(quot|#34);} $text "\""]
     set text [regsub -all -nocase {&(lt|#60);} $text "<"]
     set text [regsub -all -nocase {&(gt|#62);} $text ">"]
     set text [regsub -all -nocase {&(nbsp|#160);} $text " "]
     set text [regsub -all -nocase {&(iexcl|#161);} $text "\xa1"]
     set text [regsub -all -nocase {&(cent|#162);} $text "\xa2"]
     set text [regsub -all -nocase {&(pound|#163);} $text "\xa3"]
     set text [regsub -all -nocase {&(copy|#169);} $text "\xa9"]
     # &amp; is decoded LAST so that e.g. "&amp;lt;" becomes "&lt;" and is
     # not double-decoded into "<" (\& is a literal ampersand in regsub)
     set text [regsub -all -nocase {&(amp|#38);} $text {\&}]

     # and last, catch arbitrary numeric references: escape subst-sensitive
     # characters, rewrite &#NNN; into [format %c NNN], then evaluate
     set text [string map {[ \\[ ] \\] $ \\$ \\ \\\\} $text]
     set text [regsub -all -nocase {&#(\d+);} $text {[format %c \1]}]
     set text [subst $text]

     return $text

}

package require http
proc download {url} {
    # Fetch $url over HTTP and return the response body.
    # -timeout is in milliseconds (200 ms, as in the original).
    # Fixes vs. the original: use the http::data accessor instead of the
    # fragile "eval set data $$token\(body\)" trick, and release the
    # token state array with http::cleanup to avoid a memory leak.
    set token [http::geturl $url -timeout 200]
    set data [http::data $token]
    http::cleanup $token
    return $data
}

package require http
proc download2 {url} {
    # Fetch $url and save the response body to ./tmp/<name>.html as UTF-8.
    # Fixes vs. the original: the output file name used an undefined
    # variable $x (runtime error) — it is now derived from the URL tail;
    # the token is read via http::data and released with http::cleanup;
    # ./tmp is created if missing.
    set token [http::geturl $url]
    set data [http::data $token]
    http::cleanup $token
    set name [file tail $url]
    if {$name eq ""} { set name index }
    file mkdir ./tmp
    set fid [open ./tmp/$name.html w]
    fconfigure $fid -encoding utf-8
    puts $fid $data
    close $fid
}


proc download3 {url} {
    # Fetch $url and save the response body to ./tmp/<name>.html as UTF-8,
    # retrying on connection timeout.
    # Fixes vs. the original: the retry called the wrong proc ("download")
    # with an undefined variable $x; non-timeout errors fell through and
    # used an unset $token (now re-raised); the output name used $x; the
    # token is released with http::cleanup; ./tmp is created if missing.
    if {[catch {set token [http::geturl $url]} res]} {
        if {[string match "*timed out*" $res]} {
            puts stderr "$res loop ..."
            download3 $url
            return
        }
        # not a timeout: propagate the error to the caller
        return -code error $res
    }
    set data [http::data $token]
    http::cleanup $token
    set name [file tail $url]
    if {$name eq ""} { set name index }
    file mkdir ./tmp
    set fid [open ./tmp/$name.html w]
    fconfigure $fid -encoding utf-8
    puts $fid $data
    close $fid
}

Quickly parse a huge file

fforeach ("file foreach") is my implementation for speeding up file parsing line by line.
fforeach manages opening and closing the file; do not break out of it with a return inside the body.
Feel free to change the encoding: fconfigure $fforeach_fid -encoding utf-8
Here UTF-8 supports characters from all the world's scripts.
# hkassem at gmail dot com - 2016
proc fforeach {fforeach_line_ref fforeach_file_path fforeach_body} {
    # File foreach: evaluate $fforeach_body in the caller's scope once per
    # line of $fforeach_file_path, binding each line to the caller's
    # variable named by $fforeach_line_ref. Local names carry the
    # fforeach_ prefix to avoid clashing with the caller's body variables.
    # Fix vs. the original: the channel is now closed on EVERY exit path —
    # previously an error or a "return" inside the body leaked the handle.
    # break and continue inside the body work as expected.
    upvar 1 $fforeach_line_ref fforeach_line
    set fforeach_fid [open $fforeach_file_path r]
    # utf-8 covers all world chars; adjust if your data uses another encoding
    fconfigure $fforeach_fid -encoding utf-8
    while {[gets $fforeach_fid fforeach_line] >= 0} {
        # ------- FOREACH BODY ------------<
        set fforeach_code [catch {uplevel 1 $fforeach_body} fforeach_result]
        # ------END FOREACH BODY----------->
        if {$fforeach_code == 3} { break }
        if {$fforeach_code == 4} { continue }
        if {$fforeach_code != 0} {
            # error (1) or return (2): close the file, then propagate
            close $fforeach_fid
            return -code $fforeach_code $fforeach_result
        }
    }
    close $fforeach_fid
}

usage:

fforeach aLine "./mybigfile.txt" {
    # actions: do something   with the line
    puts $aLine  
}

dbohdan 2017-02-14: Note that if you have access to Tcllib, ::fileutil::foreachLine implements the same functionality.

Array sort


# hkassem at gmail dot com - 2016
proc array_sort {index val _foreach_sorting_array_ref foreachsorting_command {sort_options {-integer -decreasing}}} {
    # Iterate over an array in order sorted by VALUE, evaluating
    # $foreachsorting_command in the caller's scope for each element with
    # the caller's $index / $val variables set to the key and value.
    # _foreach_sorting_array_ref is a reference, equivalent to &array in C.
    # Generalized vs. the original: the lsort flags are no longer hard-coded;
    # the optional sort_options argument defaults to the original behavior
    # ({-integer -decreasing}) so existing callers are unaffected.
    upvar 1 $_foreach_sorting_array_ref arrref
    upvar 1 $index i
    upvar 1 $val v

    # collect {key value} pairs so lsort can order them by value (-index 1)
    set pairs [list]
    foreach {k vl} [array get arrref] {
        lappend pairs [list $k $vl]
    }

    # eval-based expansion keeps this compatible with pre-8.5 Tcl
    foreach e [eval [list lsort] $sort_options [list -index 1 $pairs]] {
        set i [lindex $e 0]
        set v [lindex $e 1]
        # ------- FOREACH BODY ------------<
        uplevel 1 $foreachsorting_command
        # ------END FOREACH BODY----------->
    }
}

usage:
set myarr(1) 20
set myarr(2) 10
set myarr(3) 30
array_sort index value myarr {
  # actions
   puts "$index $value"
}

output:
3 30
1 20
2 10

Unique data: remove duplicated elements

# hkassem at gmail dot com - 2016
 proc uniq {data} {
   # Return the elements of list $data with duplicates removed.
   # Fixes vs. the original: elements now keep their first-occurrence order
   # (array get returned them in arbitrary hash order) and the result is a
   # proper list with no trailing space. Matches the documented usage:
   #   uniq " A B A A B C"  ->  A B C
   set res [list]
   foreach e $data {
      if {![info exists seen($e)]} {
         set seen($e) 1
         lappend res $e
      }
   }
   return $res
}

usage:

% uniq " A B A A B C"
Res:
A B C

Quick grep

# hkassem at gmail dot com - 2016
proc grep {data pattern} {
     # Return the lines of $data that match regular expression $pattern,
     # joined with newlines (no trailing newline).
     # Idiom fix vs. the original: use regexp to TEST for a match instead of
     # counting regsub substitutions into a throwaway variable, and build
     # the result with lappend/join instead of append + a final regsub.
     set matched [list]
     foreach line [split $data \n] {
            if {[regexp -- $pattern $line]} {
               lappend matched $line
            }
         }
         return [join $matched \n]
}

usage:

%set data "hello grep\nfind me"
%set res [grep $data "me"]
%puts $res
find me

dbohdan 2017-02-14: It is better to use lsearch -inline instead since it's a fast native command that's a drop-in replacement:

% lsearch -inline -regexp [split $data \n] me
find me