Currently just a starter done by [DDG]. Please add comments and suggestions. package require snit 0.97 package require oomk package require md5pure # a type for retrieving sequences from fasta files # usage: # FastaFile ff -filename filename # puts [ff getSeq id] snit::type FastaFile { option -filename "" option -indexfile "" constructor {args} { $self configurelist $args if {$options(-indexfile) eq ""} { set options(-indexfile) $options(-filename).mk } if {![file exists $options(-indexfile)]} { $self CreateIndex } }; method CreateIndex {} { if [catch {open $options(-filename) r} infh] { puts stderr "Cannot open $options(-filename): $infh" return 0 } else { set db [mkstorage %AUTO% $options(-indexfile)] $db layout positions {id md5 pos} [$db view positions] as pvPositions while {[gets $infh line] >= 0} { if {[regexp {^>([^\s]+)} $line -> id]} { set pos [tell $infh] incr pos [expr [string length $line] * -1 - 2] set md5 [md5pure::md5 $id] $pvPositions append id $id md5 $md5 pos $pos } } close $infh $db commit $db close } }; method getSeq {id args} { # get the sequence either via id or via the md5 hash of the id set arg(-md5) false array set arg $args set db [mkstorage %AUTO% $options(-indexfile)] [$db view positions] as pvPositions if {$arg(-md5)} { [$pvPositions select -exact md5 $id] as pSel if {[$pSel size] > 0} { set id [$pSel get 0 id] } } [$pvPositions select -exact id $id] as pSel if {[$pSel size] > 0} { set pos [$pSel get 0 pos] } if {[info exists pos]} { incr pos -1 if [catch {open $options(-filename) r} infh] { puts stderr "Cannot open $options(-filename): $infh" return 0 } else { seek $infh $pos set flag false while {[gets $infh line] >= 0} { if {[regexp {^>} $line] && $flag} { break } elseif {[regexp {^>} $line]} { set flag true } append seq "$line\n" } close $infh return $seq } } else { return "" } }; } proc test {} { set file 20050210-134509--ciona2.seq set sf [FastaFile %AUTO% -filename /project/goblet/data/goblet-results/$file] puts [$sf getSeq ci0100130002] # example with a md5hash puts [$sf getSeq 1d206c9b9b91badbd93fb9f78594a9d4 -md5 true] } ---- [schlenk] Nice. I have some FASTA utils in Tcl too, (reading, splitting heading/body, joining head/body, validation, an iterator over a large multi sequence file, splitting and joining multi sequence files) all pure Tcl. Not sure if i can open source them though, have to ask. One comment on your code: why don't you use the tcllib md5 package, its more or less identical to md5pure and you have a good chance its already installed by a lot of people. --- [DDG] Agreed about possible use of md5. However I saw some bugs, problems in the tcllib mailing lists. And we need only to digest small strings. So may be the smaller md5pure is enough. My idea is also to use starkits as the main delivery platform. Then md5pure will be included as well. No tclkit user should have the need to install anything. Just start a tclkit-shell and begin working by fetching the latest biotcl.kit via http::geturl. I will release a first kit this week. Regarding your methods for FastaFile: It should be not difficult to implement them from scratch. ---- [Category Science]