'''DESCRIPTION''' [Snit]type for parsing of Blast [http://www.ncbi.nlm.nih.gov/BLAST/] plain text output files. Part of the coming [biotcl] package. '''HISTORY''' [DDG] 2004-02-22 Added method blastInfo and documentation. package require snit 0.97 package require oomk package require md5pure snit::type BlastFile { # public options # BLAST resultfile option -filename "" # metakit indexfile optional argument # defaults to $options(-filename).mk option -indexfile "" # number of hits stored inside the indexfile option -hits 10 # private variables variable db # the views inside the database variable pvPositions variable pvTop1 variable pvTopX variable pvInfo # `BlastFile bf -filename blastfile ?-indexfile,-hits?' -- # constructor for the BlastFile type # Arguments: # `-filename blastfile' the BLAST resultfile for one or many query ids # `?-indexfile? indexfile.mk' the metakit indexfile, defaults to blastfile.mk # `?-hits n? number of top hits to be stored inside the indexfile, default: 10 # ----------------------------------------------- constructor {args} { $self configurelist $args if {$options(-filename) eq ""} { error "`-filename blastfile' argument is required" } if {$options(-indexfile) eq ""} { set options(-indexfile) $options(-filename).mk } set db [mkstorage %AUTO% $options(-indexfile)] if {[file size $options(-indexfile)] < 2 } { $self CreateIndex } else { [$db view positions] as pvPositions [$db view top1] as pvTop1 [$db view topX] as pvTopX [$db view blastinfo] as pvInfo } } destructor { $db close } # `blast' getQueryIds -- # getting a list of all query ids, together with the top matches # Arguments: # `id' a query id or a md5 hash of it # `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash # Returns: # a list of lists with the top match ids where each sublist consisits of key/value pairs # for the match properties # ------------------------------------------------------------ method getQueryIds {} { set result [list] $pvTop1 loop c { lappend result [array get c] } return $result } # `blast' getMatchIds -- # getting a list of top match ids for a certain query id # Arguments: # `id' a query id or a md5 hash of it # `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash # Returns: # a list of lists with match ids where each sublist consisits of key/value pairs # for the match properties # ------------------------------------------------------------ method getMatchIds {id args} { set arg(-md5) false array set arg $args set res [list] if {$arg(-md5)} { [$pvPositions select -exact md5 $id] as pSel if {[$pSel size] > 0} { set id [$pSel get 0 id] } else { return [list] } } [$pvTopX select -exact id $id] as pSel $pSel loop c { lappend res [array get c] } return $res } # `blast' hasHit -- # returns if a query id has a match against the database # Arguments: # `id' a query id or a md5 hash of it # `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash # Returns: # true| false indicating if query id has a valid hit # --------------------------------------------------------- method hasHit {id args} { # return true or false # if we have BlastHits for a certain queryid set arg(-md5) false array set arg $args if {$arg(-md5)} { [$pvPositions select -exact md5 $id] as pSel if {[$pSel size] > 0} { set id [$pSel get 0 id] } else { return false } } [$pvTop1 select -exact id $id] as pSel if {[$pSel size] > 0} { set res [$pSel get 0 match] } else { set res 0 } if {$res eq "0"} { return false } else { return true } } # `blast' blastInfo -- # various informations about blastparameters # Arguments: # none # Returns: # list with key/value pairs for blastparameters # keys are: blasttype, database, cutoff # --------------------------------------------------------- method blastInfo {} { set result [list] $pvInfo loop c { lappend result [array get c] } return $result } # `blast' getBlast -- # returns the blast result for a query id # Arguments: # `id' a query id or a md5 hash of it # `?-md5 true|false?' defaults to false, flag to indicate if id is a md5-hash # Returns: # text for the actual blast result for a given id # --------------------------------------------------------- method getBlast {id args} { set arg(-md5) false array set arg $args set blast "" if {$arg(-md5)} { [$pvPositions select -exact md5 $id] as pSel } else { [$pvPositions select -exact id $id] as pSel } if {[$pSel size] > 0} { set pos [$pSel get 0 pos] } else { return "" } if [catch {open $options(-filename) r} infh] { error "Cannot open $options(-filename): $infh" } else { seek $infh $pos set flag false while {[gets $infh line] >= 0} { if {[regexp {^T?BLAST[XNP]} $line] && $flag} { break } elseif {[regexp {^T?BLAST[XNP]} $line]} { set flag true } append blast "$line\n" } close $infh } return $blast } # Private Methods (are uppercase) # CreateIndex (private method) # creates a metakit database for the blastfile # Arguments: # none # Returns: # nothing method CreateIndex {} { if [catch {open $options(-filename) r} infh] { error "Cannot open $options(-filename): $infh" } else { $db layout positions {id md5 pos} $db layout top1 {id match score:I evalue} $db layout topX {id match score:I evalue} $db layout blastinfo {key value} [$db view positions] as pvPositions [$db view top1] as pvTop1 [$db view topX] as pvTopX [$db view blastinfo] as pvInfo set hit -1 set prog [Progress %AUTO% -file $options(-filename)] set x 0 while {[gets $infh line] >= 0} { if {[regexp {^(T?BLAST[XNP]) +([\.0-9]+)} $line -> blasttype blastversion]} { incr x if {$x == 1} { $pvInfo append key blasttype value [string tolower $blasttype] $pvInfo append key blastversion value $blastversion } set pos [tell $infh] $prog progress $pos incr pos [expr [string length $line] * -1 - 1] } elseif {$x == 1 && [regexp {^Database: ([^\s]+)} $line -> val]} { $pvInfo append key database value $val } elseif {$x == 1 && [regexp {^Matrix: ([^\s]+)} $line -> val]} { $pvInfo append key matrix value $val } elseif {$x == 1 && [regexp {^Number of Sequences: ([^\s]+)} $line -> val]} { $pvInfo append key sequences value $val } elseif {$x == 1 && [regexp {^Number of sequences better than ([^\s]+):} $line -> val]} { $pvInfo append key cutoff value $val } elseif {$x == 1 && [regexp {^length of database: ([^\s]+)} $line -> val]} { $pvInfo append key length value $val } elseif {[regexp {^Query= ([^\s]+)} $line -> id]} { set md5 [md5pure::md5 $id] $pvPositions append id $id md5 $md5 pos $pos } elseif {[regexp {No hits found} $line]} { $pvTop1 append id $id match 0 score 0 evalue 1 set hit -1 } elseif {[regexp {^Sequences producing significant alignments} $line]} { set hit 0 } elseif {[regexp {^>([^\s]+)(.+)} $line]} { set hit -1 ; } elseif {$hit >= 0 && [regexp {^([^\s]+).+\s([0-9]+)\s+([-e0-9\.]+)} $line -> matchID score evalue]} { incr hit regsub {^e} $evalue {1e} evalue if {$hit == 1} { $pvTop1 append id $id match $matchID score $score evalue $evalue } if {$hit <= $options(-hits)} { $pvTopX append id $id match $matchID score $score evalue $evalue } } } close $infh $db commit } } } proc test {} { source [file join [file dirname [info script]] Progress.tcl] BlastFile bfp -filename E:/links/project/goblet/data/goblet-results/20050209-143332--nrprot-test_blastp_nrprot #puts [bfp getBlast 27ccfed00cacd48a458a2057eec4e3c8 -md5 true] foreach x {0 1 2 3 4} { puts "ci010013000$x hasHit: [bfp hasHit ci010013000$x]" } puts "27ccfed00cacd48a458a2057eec4e3c8 hasHit: [bfp hasHit 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]" puts "QueryIDs: [bfp getQueryIds]" puts "MatchIDs for ci0100130000: [bfp getMatchIds ci0100130000]" puts "BlastInfo: [bfp blastInfo]" } ---- [schlenk] Those BLAST files are nice, but how about using the XML and/or ASN.1 output of the blastall program. Makes parsing some things easier, if you produce those instead of the plaintext version. (sadly the NCBI DTD's are all totally out of date and incorrect). ---- [DDG] Never used XML and ASN.1. XML seems to me to verbose for large files. But ASN.1 should be an urgent requirement. So an option -format asn.1|text might be required. May be we need then methods like `getBlastASN1', `getBlastText' where `getBlast' just forwards in dependence of the option -format.