Version 2 of BlastFile

Updated 2005-02-21 10:57:41 by DDG

snittype for parsing of Blast [L1 ] output files

 package require snit 0.97
 package require oomk 
 package require md5pure

 snit::type BlastFile {
    # public options
    # BLAST resultfile 
    option -filename ""
    # metakit indexfile optional argument
    # defaults to $options(-filename).mk
    option -indexfile ""
    # number of hits stored inside the indexfile
    option -hits 10
    # private variables 
    variable db
    # the views inside the database
    variable pvPositions
    variable pvTop1
    variable pvTopX
    constructor {args} {
        $self configurelist $args
        if {$options(-filename) eq ""} {
            error "`-filename blastfile' argument is required"
        }
        if {$options(-indexfile) eq ""} {
            set options(-indexfile) $options(-filename).mk
        }
        set db [mkstorage %AUTO% $options(-indexfile)]
        if {[file size $options(-indexfile)] < 2 } {
            $self CreateIndex
        } else {
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
        }
    } 
    destructor {
        $db close
    }


    # public methods (are lowercase)
    method getQueryIds {} {
        # returns a list with query ids 
        # each item is a list with the 
        # * query id 
        # * a flag indicating if this query id has a hit 

        #[$db view top1] as pvTop1
        set res [list]
        $pvTop1 loop c {
            lappend res [array get c]
        }
        return $res
    }
    method getMatchIds {id args} {
        set arg(-md5) false
        array set arg $args
        set res [list]
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} { 
                set id [$pSel get 0 id]
            } else { 
                return [list]
            }
        }
        [$pvTopX select -exact id $id] as pSel
        $pSel loop c {
            lappend res [array get c]
        }
        return $res
    }
    method hasHit {id args} {
        # return true or false 
        # if we have BlastHits for a certain queryid
        set arg(-md5) false
        array set arg $args
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} { 
                set id [$pSel get 0 id]
            } else { 
                return false
            }
        }
        [$pvTop1 select -exact id $id] as pSel
        if {[$pSel size] > 0} { 
            set res [$pSel get 0 match]
        } else {
            set res 0
        }
        if {$res eq "0"} {
            return false
        } else {
            return true
        }
    }
    method getBlast {id args} {
        set arg(-md5) false
        array set arg $args
        set blast ""
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
        }  else {
            [$pvPositions select -exact id $id] as pSel
        }
        if {[$pSel size] > 0} { 
            set pos [$pSel get 0 pos]
        } else { 
            return "" 
        }
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else { 
            seek $infh $pos
            set flag false
            while {[gets $infh line] >= 0} {
                if {[regexp {^T?BLAST[XNP]} $line] && $flag} {
                    break
                } elseif {[regexp {^T?BLAST[XNP]} $line]} {
                    set flag true
                }
                append blast "$line\n"
            }
            close $infh
        }
        return $blast
    }
    # private methods (are uppercase)
    method CreateIndex {} {
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else {
            $db layout positions {id md5 pos}
            $db layout top1 {id match score:I evalue}
            $db layout topX {id match score:I evalue}
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
            set hit -1
            set prog [Progress %AUTO% -file $options(-filename)]
            while {[gets $infh line] >= 0} {
                if {[regexp {^T?BLAST[XNP]} $line]} {
                    set pos [tell $infh]
                    $prog progress $pos
                    incr pos [expr [string length $line] * -1 - 1]
                } elseif {[regexp {^Query= ([^\s]+)} $line -> id]} {
                    set md5 [md5pure::md5 $id]
                    $pvPositions append id $id md5 $md5 pos $pos
                } elseif {[regexp {No hits found} $line]} {
                    $pvTop1 append id $id match 0 score 0 evalue 1
                    set hit -1
                } elseif {[regexp {^Sequences producing significant alignments} $line]} { 
                    set hit 0 
                } elseif {[regexp {^>([^\s]+)(.+)} $line]} { 
                    set hit -1 ;
                } elseif {$hit >= 0 && [regexp {^([^\s]+).+\s([0-9]+)\s+([-e0-9\.]+)} $line -> matchID score evalue]} { 
                    incr hit 
                    regsub {^e} $evalue {1e} evalue
                    if {$hit == 1} {
                        $pvTop1 append id $id match $matchID score $score evalue $evalue
                    }
                    if {$hit <= $options(-hits)} {
                        $pvTopX append id $id match $matchID score $score evalue $evalue
                    }
                }


            }
            close $infh
            $db commit
         }
    }

}

 # test
 proc test {} {
    source [file join [file dirname [info script]] Progress.tcl]
    BlastFile bfp -filename E:/links/project/goblet/data/goblet-results/20050209-143332--nrprot-test_blastp_nrprot
    #puts [bfp getBlast 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]
    foreach x {0 1 2 3 4} {
        puts "ci010013000$x [bfp hasHit ci010013000$x]"
    }
    puts "27ccfed00cacd48a458a2057eec4e3c8 [bfp hasHit 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]"
    puts [bfp getQueryIds]          
    puts [bfp getMatchIds ci0100130000]
 }