Version 1 of FastaFile

Updated 2005-02-18 14:35:54 by DDG

Currently just a starter done by DDG. Please add comments and suggestions.

 package require snit 0.97
 package require oomk 
 package require md5pure
 # a type for retrieving sequences from fasta files
 # usage:
 # FastaFile ff -filename filename
 # puts [ff getSeq id]
 snit::type FastaFile {
    option -filename ""
    option -indexfile ""

    constructor {args} {
        $self configurelist $args
        if {$options(-indexfile) eq ""} {
            set options(-indexfile) $options(-filename).mk
        }
        if {![file exists $options(-indexfile)]} {
            $self CreateIndex
        } 
    };
    method CreateIndex {} {
        if [catch {open $options(-filename) r} infh] {
            puts stderr "Cannot open $options(-filename): $infh"
            return 0
        } else {
            set db [mkstorage %AUTO% $options(-indexfile)]
            $db layout positions {id md5 pos}
            [$db view positions] as pvPositions
            while {[gets $infh line] >= 0} {
                if {[regexp {^>([^\s]+)} $line -> id]} {
                    set pos [tell $infh]
                    incr pos [expr [string length $line] * -1 - 2]
                    set md5 [md5pure::md5 $id]
                    $pvPositions append id $id md5 $md5 pos $pos
                } 
            }
            close $infh
            $db commit 
            $db close

        }
    };
    method getSeq {id args} {
        # get the sequence either via id or via the md5 hash of the id
        set arg(-md5) false
        array set arg $args
        set db [mkstorage %AUTO% $options(-indexfile)]
        [$db view positions] as pvPositions
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} { 
                set id [$pSel get 0 id]
            } 
        }
        [$pvPositions select -exact id $id] as pSel
        if {[$pSel size] > 0} { 
            set pos [$pSel get 0 pos]
        } 
        if {[info exists pos]} {
            incr pos -1
            if [catch {open $options(-filename) r} infh] {
                puts stderr "Cannot open $options(-filename): $infh"
                return 0
            } else { 
                seek $infh $pos
                set flag false
                while {[gets $infh line] >= 0} {
                    if {[regexp {^>} $line] && $flag} {
                        break
                    } elseif {[regexp {^>} $line]} {
                        set flag true
                    }
                    append seq "$line\n"
                }
                close $infh
                return $seq
            } 
        } else {
            return ""
        }
    };
 }
 proc test {} {
    set file 20050210-134509--ciona2.seq
    set sf [FastaFile %AUTO% -filename /project/goblet/data/goblet-results/$file]
    puts [$sf getSeq ci0100130002]

 }