Version 6 of FastaFile

Updated 2005-02-21 09:37:51 by DDG

Currently just a starter done by DDG. Please add comments and suggestions.

 package require snit 0.97
 package require oomk 
 package require md5pure
 # a type for retrieving sequences from fasta files
 # usage:
 # FastaFile ff -filename filename
 # puts [ff getSeq id]
 snit::type FastaFile {
    option -filename ""
    option -indexfile ""

    constructor {args} {
        $self configurelist $args
        if {$options(-indexfile) eq ""} {
            set options(-indexfile) $options(-filename).mk
        }
        if {![file exists $options(-indexfile)]} {
            $self CreateIndex
        } 
    };
    method CreateIndex {} {
        if [catch {open $options(-filename) r} infh] {
            puts stderr "Cannot open $options(-filename): $infh"
            return 0
        } else {
            set db [mkstorage %AUTO% $options(-indexfile)]
            $db layout positions {id md5 pos}
            [$db view positions] as pvPositions
            while {[gets $infh line] >= 0} {
                if {[regexp {^>([^\s]+)} $line -> id]} {
                    set pos [tell $infh]
                    incr pos [expr [string length $line] * -1 - 2]
                    set md5 [md5pure::md5 $id]
                    $pvPositions append id $id md5 $md5 pos $pos
                } 
            }
            close $infh
            $db commit 
            $db close

        }
    };
    method getSeq {id args} {
        # get the sequence either via id or via the md5 hash of the id
        set arg(-md5) false
        array set arg $args
        set db [mkstorage %AUTO% $options(-indexfile)]
        [$db view positions] as pvPositions
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} { 
                set id [$pSel get 0 id]
            } 
        }
        [$pvPositions select -exact id $id] as pSel
        if {[$pSel size] > 0} { 
            set pos [$pSel get 0 pos]
        } 
        if {[info exists pos]} {
            incr pos -1
            if [catch {open $options(-filename) r} infh] {
                puts stderr "Cannot open $options(-filename): $infh"
                return 0
            } else { 
                seek $infh $pos
                set flag false
                while {[gets $infh line] >= 0} {
                    if {[regexp {^>} $line] && $flag} {
                        break
                    } elseif {[regexp {^>} $line]} {
                        set flag true
                    }
                    append seq "$line\n"
                }
                close $infh
                return $seq
            } 
        } else {
            return ""
        }
    };
 }
 proc test {} {
    set file 20050210-134509--ciona2.seq
    set sf [FastaFile %AUTO% -filename /project/goblet/data/goblet-results/$file]
    puts [$sf getSeq ci0100130002]
    # example with a md5hash
    puts [$sf getSeq 1d206c9b9b91badbd93fb9f78594a9d4 -md5 true]

 }

schlenk Nice. I have some FASTA utils in Tcl too, (reading, splitting heading/body, joining head/body, validation, an iterator over a large multi sequence file, splitting and joining multi sequence files) all pure Tcl. Not sure if i can open source them though, have to ask. One comment on your code: why don't you use the tcllib md5 package, its more or less identical to md5pure and you have a good chance its already installed by a lot of people.


DDG Agreed about possible use of md5. However I saw some bugs, problems in the tcllib mailing lists. And we need only to digest small strings. So may be the smaller md5pure is enough. My idea is also to use starkits as the main delivery platform. Then md5pure will be included as well. No tclkit user should have the need to install anything. Just start a tclkit-shell and begin working by fetching the latest biotcl.kit via http::geturl. I will release a first kit this week. Regarding your methods for FastaFile: It should be not difficult to implement them from scratch.


Category Science