Version 7 of BlastFile

Updated 2005-02-22 07:54:20 by DDG

DESCRIPTION

snittype for parsing of Blast [L1 ] plain text output files

HISTORY

DDG 2004-02-22 Added method blastInfo and documentation.

 package require snit 0.97
 package require oomk
 package require md5pure

 snit::type BlastFile {
    # public options
    # BLAST resultfile
    option -filename ""
    # metakit indexfile optional argument
    # defaults to $options(-filename).mk
    option -indexfile ""
    # number of hits stored inside the indexfile
    option -hits 10
    # private variables
    variable db
    # the views inside the database
    variable pvPositions
    variable pvTop1
    variable pvTopX
    variable pvInfo
    # `BlastFile bf -filename blastfile ?-indexfile,-hits?' --
    #            constructor for the BlastFile type
    #  Arguments:
    #           `-filename blastfile' the BLAST resultfile for one or many query ids
    #           `?-indexfile? indexfile.mk' the metakit indexfile, defaults to blastfile.mk
    #           `?-hits n? number of top hits to be stored inside the indexfile, default: 10
    # -----------------------------------------------
    constructor {args} {
        $self configurelist $args
        if {$options(-filename) eq ""} {
            error "`-filename blastfile' argument is required"
        }
        if {$options(-indexfile) eq ""} {
            set options(-indexfile) $options(-filename).mk
        }
        set db [mkstorage %AUTO% $options(-indexfile)]
        if {[file size $options(-indexfile)] < 2 } {
            $self CreateIndex
        } else {
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
            [$db view blastinfo] as pvInfo
        }
    }
    destructor {
        $db close
    }

    # `blast' getQueryIds --
    #          getting a list of all query ids, together with the top matches
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          a list of lists with the top match ids where each sublist consisits of key/value pairs
    #          for the match properties
    # ------------------------------------------------------------ 
    method getQueryIds {} {
        set result [list]
        $pvTop1 loop c {
            lappend result [array get c]
        }
        return $result
    }
    # `blast' getMatchIds --
    #          getting a list of top match ids for a certain query id
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          a list of lists with match ids where each sublist consisits of key/value pairs
    #          for the match properties
    # ------------------------------------------------------------ 

    method getMatchIds {id args} {
        set arg(-md5) false
        array set arg $args
        set res [list]
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} {
                set id [$pSel get 0 id]
            } else {
                return [list]
            }
        }
        [$pvTopX select -exact id $id] as pSel
        $pSel loop c {
            lappend res [array get c]
        }
        return $res
    }
    # `blast' hasHit --
    #  returns if a query id has a match against the database
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          true| false indicating if query id has a valid hit
    # ---------------------------------------------------------
    method hasHit {id args} {
        # return true or false
        # if we have BlastHits for a certain queryid
        set arg(-md5) false
        array set arg $args
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} {
                set id [$pSel get 0 id]
            } else {
                return false
            }
        }
        [$pvTop1 select -exact id $id] as pSel
        if {[$pSel size] > 0} {
            set res [$pSel get 0 match]
        } else {
            set res 0
        }
        if {$res eq "0"} {
            return false
        } else {
            return true
        }
    }
    # `blast' blastInfo --
    #          various informations about blastparameters
    #  Arguments:
    #          none
    #  Returns: 
    #          list with key/value pairs for blastparameters
    #          keys are: blasttype, database, cutoff
    # ---------------------------------------------------------
    method blastInfo {} {
        set result [list]
        $pvInfo loop c {
            lappend result [array get c]
        }
        return $result
    }

    # `blast' getBlast --
    #  returns the blast result for a query id
    #  Arguments:
    #          `id' a query id or a md5 hash of it
    #          `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash
    #  Returns: 
    #          text for the actual blast result for a given id
    # ---------------------------------------------------------
    method getBlast {id args} {
        set arg(-md5) false
        array set arg $args
        set blast ""
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
        }  else {
            [$pvPositions select -exact id $id] as pSel
        }
        if {[$pSel size] > 0} {
            set pos [$pSel get 0 pos]
        } else {
            return ""
        }
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else {
            seek $infh $pos
            set flag false
            while {[gets $infh line] >= 0} {
                if {[regexp {^T?BLAST[XNP]} $line] && $flag} {
                    break
                } elseif {[regexp {^T?BLAST[XNP]} $line]} {
                    set flag true
                }
                append blast "$line\n"
            }
            close $infh
        }
        return $blast
    }
    # Private Methods (are uppercase)
    # CreateIndex (private method)
    #         creates a metakit database for the blastfile
    # Arguments: 
    #          none
    # Returns:
    #          nothing
    method CreateIndex {} {
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else {
            $db layout positions {id md5 pos}
            $db layout top1 {id match score:I evalue}
            $db layout topX {id match score:I evalue}
            $db layout blastinfo {key value}
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
            [$db view blastinfo] as pvInfo
            set hit -1
            set prog [Progress %AUTO% -file $options(-filename)]
            set x 0
            while {[gets $infh line] >= 0} {
                if {[regexp {^(T?BLAST[XNP]) +([\.0-9]+)} $line -> blasttype blastversion]} {
                    incr x
                    if {$x == 1} {
                        $pvInfo append key blasttype value [string tolower $blasttype]
                        $pvInfo append key blastversion value $blastversion
                    }
                    set pos [tell $infh]
                    $prog progress $pos
                    incr pos [expr [string length $line] * -1 - 1]
                } elseif {$x == 1 && [regexp {^Database: ([^\s]+)} $line -> val]} {
                     $pvInfo append key database value $val
                } elseif {$x == 1 && [regexp {^Matrix: ([^\s]+)} $line -> val]} {
                     $pvInfo append key matrix value $val
                } elseif {$x == 1 && [regexp {^Number of Sequences: ([^\s]+)} $line -> val]} {
                     $pvInfo append key sequences value $val
                } elseif {$x == 1 && [regexp {^Number of sequences better than ([^\s]+):} $line -> val]} {
                     $pvInfo append key cutoff value $val
                } elseif {$x == 1 && [regexp {^length of database: ([^\s]+)} $line -> val]} {
                     $pvInfo append key length value $val
                } elseif {[regexp {^Query= ([^\s]+)} $line -> id]} {
                    set md5 [md5pure::md5 $id]
                    $pvPositions append id $id md5 $md5 pos $pos
                } elseif {[regexp {No hits found} $line]} {
                    $pvTop1 append id $id match 0 score 0 evalue 1
                    set hit -1
                } elseif {[regexp {^Sequences producing significant alignments} $line]} {
                    set hit 0
                } elseif {[regexp {^>([^\s]+)(.+)} $line]} {
                    set hit -1 ;
                } elseif {$hit >= 0 && [regexp {^([^\s]+).+\s([0-9]+)\s+([-e0-9\.]+)} $line -> matchID score evalue]} {
                    incr hit
                    regsub {^e} $evalue {1e} evalue
                    if {$hit == 1} {
                        $pvTop1 append id $id match $matchID score $score evalue $evalue
                    }
                    if {$hit <= $options(-hits)} {
                        $pvTopX append id $id match $matchID score $score evalue $evalue
                    }
                }

            }
            close $infh
            $db commit
         }
    }
 }


 proc test {} {
    source [file join [file dirname [info script]] Progress.tcl]
    BlastFile bfp -filename E:/links/project/goblet/data/goblet-results/20050209-143332--nrprot-test_blastp_nrprot
    #puts [bfp getBlast 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]
    foreach x {0 1 2 3 4} {

        puts "ci010013000$x hasHit: [bfp hasHit ci010013000$x]"
    }
    puts "27ccfed00cacd48a458a2057eec4e3c8 hasHit: [bfp hasHit 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]"
    puts "QueryIDs: [bfp getQueryIds]"
    puts "MatchIDs for ci0100130000: [bfp getMatchIds ci0100130000]"
    puts "BlastInfo: [bfp blastInfo]"
 }

schlenk Those BLAST files are nice, but how about using the XML and/or ASN.1 output of the blastall program. Makes parsing some things easier, if you produce those instead of the plaintext version. (sadly the NCBI DTD's are all totally out of date and incorrect).


DDG Never used XML and ASN.1. XML seems to me to verbose for large files. But ASN.1 should be an urgent requirement. So an option -format asn.1|text might be required. May be we need then methods like getBlastASN1', getBlastText' where `getBlast' just forwards in dependence of the option -format.