Version 0 of strimj segmentation

Updated 2001-07-17 04:14:15

Richard Suchenwirth -- This is a birthday present I made to myself, but I'll gladly share it with all Tclworld. Given a binary BMP image scanned from a font sample book, the task is to segment it into lines, and each line into characters, and return the list of characters as strimjes (see strimj - string image routines). It's not perfect, segmentation requires full white lines between segments, and it takes some time, but the results are already quite amazing: one page of Tcl code produces mostly usable images as further input for strimj fonts - or playing OCR...

 namespace eval strimj {
    proc fromBMP2  bmp {
        #-- make a strimj from a binary BMP image file
            set             fp [open $bmp]
            fconfigure     $fp -translation binary
            set data [read $fp [file size $bmp]]
            close          $fp
            set offset 62
            binary scan $data @18ii width height
            set nbytes [expr {(($width+31)/32)*4}] ;# 4-byte aligned
            set nbits  [expr {(($width+7)/8)*8}]   ;# byte-aligned
            set res ""
            for {set y [expr {$height-1}]} {$y>=0} {incr y -1} {
                binary scan $data @[expr {$y*$nbytes+$offset}]B$nbits line
                append res [string range $line 0 [expr {$width-1}]]\n
            }
            string map {1 . 0 @} $res
    }
    proc segment si {
        #-- segment a binary text image into a list of character images
        set res {}
        foreach linesi [lseg $si] {
            foreach charsi [lseg [rotate $linesi 270]] {
                lappend res [vcrop [rot90 $charsi]]
            }
        }
        set res
    }
    proc vcrop si {
        #-- vertical crop, remove empty lines at top & bottom
        set lines [lines $si]
        set core [regexp -indices -inline @.+@ [pp2 $lines]]
        foreach {from to} [lindex $core 0] break
        join [lrange $lines $from $to] \n
    }
    proc lseg si {
        #-- segment a binary text image into a list of line images
        set lines  [lines $si]
        set ranges [regexp -all -indices -inline @+ [pp2 $lines]]
        set res {}
        foreach pair $ranges {
            foreach {from to} $pair break
            set height [expr {$to-$from+1}]
            if {$height>2 && $height<100} {
                lappend res [join [lrange $lines $from $to] \n]
            }
        }
        set res
    }
    proc pp2 lines {
        #-- vertical binary projection profile
        set res ""
        foreach line $lines {
            append res [expr {[string first @ $line]>=0? "@":"."}]
        }
        set res
    }
 }
 proc testseg {{bmpfile schwabacher2.bmp}} {
    join [strimj::segment [strimj::fromBMP2 $bmpfile]] \n\n
 }

Arts and crafts of Tcl-Tk programming