BlastFile

DESCRIPTION

Snit::type for parsing of Blast [L1 ] plain text output files. Part of the coming biotcl package.

HISTORY

DDG 2005-03-10 More useful error message if the indexfile is tried to build in a non writable directory

DDG 2005-02-25 Making options readonly, they can be only set at creation time. That avoids confusing changes of the filename but not changing the indexfile

DDG 2005-02-22 Added method blastInfo and documentation.

package require snit 0.97
package require oomk
package require md5pure

snit::type BlastFile {
    # public options
    # BLAST resultfile
    option -filename -default "" -readonly true
    # metakit indexfile optional argument
    # defaults to $options(-filename).mk
    option -indexfile -default "" -readonly true
    # number of hits stored inside the indexfile
    option -hits -default 10 -readonly true

    # private variables
    variable db
    # the views inside the database
    variable pvPositions
    variable pvTop1
    variable pvTopX
    variable pvInfo
    # `BlastFile bf -filename blastfile ?-indexfile,-hits?' --
    #            constructor for the BlastFile type
    #  Arguments:
    #           `-filename blastfile' the BLAST resultfile for one or many query ids
    #           `?-indexfile? indexfile.mk' the metakit indexfile, defaults to <blastfile>.mk
    #                     Please not that you must have write privileges for the directory 
    #                     if the indexfile was not yet created
    #           `?-hits n? number of top hits to be stored inside the indexfile, default: 10
    # -----------------------------------------------
    constructor {args} {
        $self configurelist $args
        if {$options(-filename) eq ""} {
            error "`-filename blastfile' argument is required"
        }
        if {$options(-indexfile) eq ""} {
            set options(-indexfile) $options(-filename).mk
        }
        if {[file exists $options(-indexfile)] && ![file writable $options(-indexfile)]} {
            set db [mkstorage %AUTO% $options(-indexfile) -readonly]
        } elseif {![file exists $options(-indexfile)] && ![file writable [file join [file dirname $options(-indexfile)] .]]} {
            error "You can't write the indexfile into directory [file dirname $options(-indexfile)]."
        } else {
            set db [mkstorage %AUTO% $options(-indexfile)]
        }
        if {[file size $options(-indexfile)] < 2 } {
            $self CreateIndex
        } else {
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
            [$db view blastinfo] as pvInfo
        }
    }
    destructor {
        $db close
    }

    # `blast' getQueryIds --
    #          getting a list of all query ids, together with the top matches
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          a list of lists with the top match ids where each sublist consisits of key/value pairs
    #          for the match properties
    # ------------------------------------------------------------ 
    method getQueryIds {} {
        set result [list]
        $pvTop1 loop c {
            lappend result [array get c]
        }
        return $result
    }
    # `blast' getMatchIds --
    #          getting a list of top match ids for a certain query id
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          a list of lists with match ids where each sublist consisits of key/value pairs
    #          for the match properties
    # ------------------------------------------------------------ 

    method getMatchIds {id args} {
        set arg(-md5) false
        array set arg $args
        set res [list]
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} {
                set id [$pSel get 0 id]
            } else {
                return [list]
            }
        }
        [$pvTopX select -exact id $id] as pSel
        $pSel loop c {
            lappend res [array get c]
        }
        return $res
    }
    # `blast' hasHit --
    #  returns if a query id has a match against the database
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          true| false indicating if query id has a valid hit
    # ---------------------------------------------------------
    method hasHit {id args} {
        # return true or false
        # if we have BlastHits for a certain queryid
        set arg(-md5) false
        array set arg $args
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} {
                set id [$pSel get 0 id]
            } else {
                return false
            }
        }
        [$pvTop1 select -exact id $id] as pSel
        if {[$pSel size] > 0} {
            set res [$pSel get 0 match]
        } else {
            set res 0
        }
        if {$res eq "0"} {
            return false
        } else {
            return true
        }
    }
    # `blast' blastInfo --
    #          various informations about blastparameters
    #  Arguments:
    #          none
    #  Returns: 
    #          list with key/value pairs for blastparameters
    #          keys are: blasttype, database, cutoff
    # ---------------------------------------------------------
    method blastInfo {} {
        set result [list]
        $pvInfo loop c {
            lappend result [array get c]
        }
        return $result
    }
   
    # `blast' getBlast --
    #  returns the blast result for a query id
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          text for the actual blast result for a given id
    # ---------------------------------------------------------
    method getBlast {id args} {
        set arg(-md5) false
        array set arg $args
        set blast ""
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
        }  else {
            [$pvPositions select -exact id $id] as pSel
        }
        if {[$pSel size] > 0} {
            set pos [$pSel get 0 pos]
        } else {
            return ""
        }
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else {
            seek $infh $pos
            set flag false
            while {[gets $infh line] >= 0} {
                if {[regexp {^T?BLAST[XNP]} $line] && $flag} {
                    break
                } elseif {[regexp {^T?BLAST[XNP]} $line]} {
                    set flag true
                }
                append blast "$line\n"
            }
            close $infh
        }
        return $blast
    }
    # Private Methods (are uppercase)
    # CreateIndex (private method)
    #         creates a metakit database for the blastfile
    # Arguments: 
    #          none
    # Returns:
    #          nothing
    method CreateIndex {} {
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else {
            $db layout positions {id md5 pos}
            $db layout top1 {id match score:I evalue}
            $db layout topX {id match score:I evalue}
            $db layout blastinfo {key value}
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
            [$db view blastinfo] as pvInfo
            set hit -1
            set prog [Progress %AUTO% -file $options(-filename)]
            set x 0
            while {[gets $infh line] >= 0} {
                if {[regexp {^(T?BLAST[XNP]) +([\.0-9]+)} $line -> blasttype blastversion]} {
                    incr x
                    if {$x == 1} {
                        $pvInfo append key blasttype value [string tolower $blasttype]
                        $pvInfo append key blastversion value $blastversion
                    }
                    set pos [tell $infh]
                    $prog progress $pos
                    incr pos [expr [string length $line] * -1 - 1]
                } elseif {$x == 1 && [regexp {^Database: ([^\s]+)} $line -> val]} {
                     $pvInfo append key database value $val
                } elseif {$x == 1 && [regexp {^Matrix: ([^\s]+)} $line -> val]} {
                     $pvInfo append key matrix value $val
                } elseif {$x == 1 && [regexp {^Number of Sequences: ([^\s]+)} $line -> val]} {
                     $pvInfo append key sequences value $val
                } elseif {$x == 1 && [regexp {^Number of sequences better than ([^\s]+):} $line -> val]} {
                     $pvInfo append key cutoff value $val
                } elseif {$x == 1 && [regexp {^length of database: ([^\s]+)} $line -> val]} {
                     $pvInfo append key length value $val
                } elseif {[regexp {^Query= ([^\s]+)} $line -> id]} {
                    set md5 [md5pure::md5 $id]
                    $pvPositions append id $id md5 $md5 pos $pos
                } elseif {[regexp {No hits found} $line]} {
                    $pvTop1 append id $id match 0 score 0 evalue 1
                    set hit -1
                } elseif {[regexp {^Sequences producing significant alignments} $line]} {
                    set hit 0
                } elseif {[regexp {^>([^\s]+)(.+)} $line]} {
                    set hit -1 ;
                } elseif {$hit >= 0 && [regexp {^([^\s]+).+\s([0-9]+)\s+([-e0-9\.]+)} $line -> matchID score evalue]} {
                    incr hit
                    regsub {^e} $evalue {1e} evalue
                    if {$hit == 1} {
                        $pvTop1 append id $id match $matchID score $score evalue $evalue
                    }
                    if {$hit <= $options(-hits)} {
                        $pvTopX append id $id match $matchID score $score evalue $evalue
                    }
                }

            }
            close $infh
            $db commit
         }
    }
}


proc test {} {
    source [file join [file dirname [info script]] Progress.tcl]
    BlastFile bfp -filename E:/links/project/goblet/data/goblet-results/20050209-143332--nrprot-test_blastp_nrprot
    #puts [bfp getBlast 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]
    foreach x {0 1 2 3 4} {
        
        puts "ci010013000$x hasHit: [bfp hasHit ci010013000$x]"
    }
    puts "27ccfed00cacd48a458a2057eec4e3c8 hasHit: [bfp hasHit 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]"
    puts "QueryIDs: [bfp getQueryIds]"
    puts "MatchIDs for ci0100130000: [bfp getMatchIds ci0100130000]"
    puts "BlastInfo: [bfp blastInfo]"
}

schlenk Those BLAST files are nice, but how about using the XML and/or ASN.1 output of the blastall program. Makes parsing some things easier, if you produce those instead of the plaintext version. (sadly the NCBI DTD's are all totally out of date and incorrect).


DDG Never used XML and ASN.1. XML seems to me too verbose for large files. But ASN.1 should be an urgent requirement. So an option -format asn.1|text might be required. Maybe we need methods like getBlastASN1', getBlastText', where `getBlast' just forwards in dependence of the option -format.