Version 2 of tdelta

Updated 2004-10-28 23:07:12 by SEH

SEH tdelta.tcl:

Produce an rdiff-style delta signature of one file with respect to another, and re-create one file by applying the delta to the other.

Strong md5 checksums of file segments are stored in the delta signature, which reduces efficiency but enhances security.

 # Usage:

 # tdelta <reference file> <target file>
 #        Returns a delta of the target file with respect to the reference file. 
 #        i.e., using patch to apply the delta to the target file will re-create the reference file.

 # patch <target file> <delta signature> <output file (duplicate of reference file)>
 #        Reconstitute original reference file by applying delta to target file.

 # global variable: blockSize
 #        Size of file segments to compare.
 #         Smaller blockSize tends to create smaller delta.
 #         Larger blockSize tends to take more time to compute delta.



 if ![info exists blockSize] {variable blockSize 100}
 if ![info exists Mod] {variable Mod [expr pow(2,16)]}

 variable temp
 if ![info exists temp] {
        catch {set temp $env(TMP)}
        catch {set temp $env(TEMP)}
        catch {set temp $env(TRSYNC_TEMP)}
        if [catch {file mkdir $temp}] {set temp [pwd]}
 }
 if ![file writable $temp] {error "temp location not writable"}

 proc Backup {args} {
        return
 }

 proc ConstructFile {copyinstructions {eolNative 0} {backup {}}} {
        package require md5 1

        set fileToConstruct [lindex $copyinstructions 0]
        set existingFile [lindex $copyinstructions 1]
        set blockSize [lindex $copyinstructions 2]
        array set fileStats [lindex $copyinstructions 3]
        array set digestInstructionArray [lindex $copyinstructions 4]
        array set dataInstructionArray [lindex $copyinstructions 5]
        unset copyinstructions

        set existingFile [FileNameNormalize $existingFile]
        if {$fileToConstruct == {}} {file delete -force $existingFile ; return}
        set fileToConstruct [FileNameNormalize $fileToConstruct]

        if $eolNative {set eolNative [string is ascii -strict [array get dataInstructionArray]]}


        set temp $::temp

        set constructTag "trsync.[md5::md5 "[clock seconds] [clock clicks]"]"
        catch {
                set existingID [open $existingFile r]
                fconfigure $existingID -translation binary
        }        

        set constructID [open $temp/$constructTag w]
        fconfigure $constructID -translation binary

        set filePointer 1
        while {$filePointer <= $fileStats(size)} {
                if {[array names dataInstructionArray $filePointer] != {}} {
                        puts -nonewline $constructID $dataInstructionArray($filePointer)
                        set segmentLength [string length $dataInstructionArray($filePointer)]
                        array unset dataInstructionArray $filePointer
                        set filePointer [expr $filePointer + $segmentLength]
                } elseif {[array names digestInstructionArray $filePointer] != {}} {
                        if ![info exists existingID] {error "Corrupt copy instructions."}
                        set blockNumber [lindex $digestInstructionArray($filePointer) 0]
                        set blockMd5Sum [lindex $digestInstructionArray($filePointer) 1]

                        seek $existingID [expr $blockNumber * $blockSize]

                        set existingBlock [read $existingID $blockSize]
                        set existingBlockMd5Sum [md5::md5 $existingBlock]
                        if ![string equal -nocase $blockMd5Sum $existingBlockMd5Sum] {error "digest file contents mismatch"}
                        puts -nonewline $constructID $existingBlock

                        if $eolNative {set eolNative [string is ascii -strict $existingBlock]}

                        unset existingBlock
                        set filePointer [expr $filePointer + $blockSize]
                } else {
                        error "Corrupt copy instructions."
                }
        }
        close $constructID
        catch {close $existingID}

        if $eolNative {
                fcopy [set fin [open $temp/$constructTag r]] [set fout [open $temp/${constructTag}fcopy w]]
                close $fin
                close $fout
                file delete -force $temp/$constructTag
                set constructTag "${constructTag}fcopy"
        }

        catch {file attributes $temp/$constructTag -readonly 0} result
        catch {file attributes $temp/$constructTag -permissions rw-rw-rw-} result
        catch {file attributes $temp/$constructTag -owner $fileStats(uid)} result
        catch {file attributes $temp/$constructTag -group $fileStats(gid)} result
        catch {file mtime $temp/$constructTag $fileStats(mtime)} result
        catch {file atime $temp/$constructTag $fileStats(atime)} result
        catch {file attributes $existingFile -readonly 0} result
        catch {file attributes $existingFile -permissions rw-rw-rw-} result

        Backup $backup $fileToConstruct

        file mkdir [file dirname $fileToConstruct]
        file rename -force $temp/$constructTag $fileToConstruct
        array set attributes $fileStats(attributes)
        foreach attr [array names attributes] {
                catch {file attributes $fileToConstruct $attr $attributes($attr)} result
        }
        catch {file attributes $fileToConstruct -permissions $fileStats(mode)} result
        return
 }

 proc CopyInstructions {filename digest} {
        package require md5 1
        set filename [FileNameNormalize $filename]
        file stat $filename fileStats
        array set fileAttributes [file attributes $filename]
        set arrayadd attributes ; lappend arrayadd [array get fileAttributes] ; array set fileStats $arrayadd
        set fileSize $fileStats(size)
        set digestFileName [lindex $digest 0]
        set blockSize [lindex $digest 1]
        set digest [lrange $digest 2 end]

        if {[lsearch -exact $digest fingerprints] > -1} {
                set fingerPrints [lindex $digest end]
                set digest [lrange $digest 0 end-2]
                set f [open $filename r]
                fconfigure $f -translation binary
                set fileContents [read $f]
                set matchCount 0
                foreach fP $fingerPrints {
                        if {[string first $fP $fileContents] > -1} {incr matchCount}
                        if {$matchCount > 3} {break}
                }
                unset fileContents
                close $f
                if {$matchCount < 3} {set digest {}}
        }  

        set digestLength [llength $digest]
        for {set i 0} {$i < $digestLength} {incr i} {
                set arrayadd [lindex [lindex $digest $i] 1]
                lappend arrayadd $i
                array set Checksums $arrayadd
        }
        set f [open $filename r]
        fconfigure $f -translation binary
        set digestInstructions {}
        set dataInstructions {}
        set weakChecksum {}
        set startBlockPointer 0
        set endBlockPointer 0

        if ![array exists Checksums] {
                set dataInstructions 1
                lappend dataInstructions [read $f]
                set endBlockPointer $fileSize
        }

        while {$endBlockPointer < $fileSize} {
                set endBlockPointer [expr $startBlockPointer + $blockSize]
                incr startBlockPointer
                if {$weakChecksum == {}} {
                        set blockContents [read $f $blockSize]
                        set blockNumberSequence [SequenceBlock $blockContents]
                        set weakChecksumInfo [WeakChecksum $blockNumberSequence]
                        set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]]
                        set startDataPointer $startBlockPointer
                        set endDataPointer $startDataPointer
                        set dataBuffer {}
                }
                if {[array names Checksums $weakChecksum] != {}} {
                        set md5Sum [md5::md5 $blockContents]
                        set blockIndex $Checksums($weakChecksum)
                        set digestmd5Sum [lindex [lindex $digest $blockIndex] 0]
                        if [string equal -nocase $digestmd5Sum $md5Sum] {
                                if {$endDataPointer > $startDataPointer} {
                                        lappend dataInstructions $startDataPointer
                                        lappend dataInstructions $dataBuffer
                                }
                                lappend digestInstructions $startBlockPointer
                                lappend digestInstructions "$blockIndex $md5Sum"
                                set weakChecksum {}
                                set startBlockPointer $endBlockPointer
                                continue
                        }
                }
                if {$endBlockPointer >= $fileSize} {
                        lappend dataInstructions $startDataPointer
                        lappend dataInstructions $dataBuffer$blockContents
                        break
                }
                set rollChar [read $f 1]
                binary scan $rollChar c* rollNumber
                set rollNumber [expr ($rollNumber + 0x100)%0x100]
                lappend blockNumberSequence $rollNumber
                set blockNumberSequence [lrange $blockNumberSequence 1 end]

                binary scan $blockContents a1a* rollOffChar blockContents
                set blockContents $blockContents$rollChar
                set dataBuffer $dataBuffer$rollOffChar
                incr endDataPointer

                set weakChecksumInfo "[eval RollChecksum [lrange $weakChecksumInfo 1 5] $rollNumber] [lindex $blockNumberSequence 0]"
                set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]]
        }
        close $f

        lappend copyInstructions $filename
        lappend copyInstructions $digestFileName
        lappend copyInstructions $blockSize
        lappend copyInstructions [array get fileStats]
        lappend copyInstructions $digestInstructions
        lappend copyInstructions $dataInstructions
        return $copyInstructions
 }

 proc Digest {filename blockSize {sizecheck 0} {fingerprint 0}} {
        package require md5 1
        set filename [FileNameNormalize $filename]
        set digest "[list $filename] $blockSize"
        if {!([file isfile $filename] && [file readable $filename])} {return $digest}
        set fileSize [file size $filename]
        if {$sizecheck && ($fileSize < [expr $blockSize * 5])} {return $digest}

        set charsRead 0
        set f [open $filename r]
        fconfigure $f -translation binary
        while {$charsRead < $fileSize} {
                set blockContents [read $f $blockSize]
                set charsRead [expr $charsRead + $blockSize]
                set md5Sum [md5::md5 $blockContents]
                set blockNumberSequence [SequenceBlock $blockContents]
                set weakChecksum [lindex [WeakChecksum $blockNumberSequence] 0]
                lappend digest "$md5Sum [format %.0f $weakChecksum]"
        }

        if $fingerprint {
                set fileIncrement [expr $fileSize/10]
                set fpLocation [expr $fileSize - 21]
                set i 0
                while {$i < 10} {
                        if {$fpLocation < 0} {set fpLocation 0}
                        seek $f $fpLocation
                        lappend fingerPrints [read $f 20]
                        set fpLocation [expr $fpLocation - $fileIncrement]
                        incr i
                }
                lappend digest fingerprints
                lappend digest [lsort -unique $fingerPrints]
        }
        close $f
        return $digest
 }

 proc FileNameNormalize {filename} {
        file normalize $filename
 }

 proc RollChecksum {a(k,l)_ b(k,l)_ k l Xsub_k Xsub_l+1 } {
        set Mod $trsync::Mod

        set a(k+1,l+1)_ [expr ${a(k,l)_} - $Xsub_k + ${Xsub_l+1}]
        set b(k+1,l+1)_ [expr ${b(k,l)_} - (($l - $k + 1) * $Xsub_k) + ${a(k+1,l+1)_}]

        set a(k+1,l+1)_ [expr fmod(${a(k+1,l+1)_},$Mod)]
        set b(k+1,l+1)_ [expr fmod(${b(k+1,l+1)_},$Mod)]
        set s(k+1,l+1)_ [expr ${a(k+1,l+1)_} + ($Mod * ${b(k+1,l+1)_})]
        return "${s(k+1,l+1)_} ${a(k+1,l+1)_} ${b(k+1,l+1)_} [incr k] [incr l]"
 }

 proc SequenceBlock {blockcontents} {
        binary scan $blockcontents c* blockNumberSequence
        set blockNumberSequenceLength [llength $blockNumberSequence]
        for {set i 0} {$i < $blockNumberSequenceLength} {incr i} {
                set blockNumberSequence [lreplace $blockNumberSequence $i $i [expr ([lindex $blockNumberSequence $i] + 0x100)%0x100]]
        }
        return $blockNumberSequence
 }

 proc WeakChecksum {Xsub_k...Xsub_l} {
        set a(k,i)_ 0
        set b(k,i)_ 0
        set Mod $trsync::Mod
        set k 1
        set l [llength ${Xsub_k...Xsub_l}]
        for {set i $k} {$i <= $l} {incr i} {
                set Xsub_i [lindex ${Xsub_k...Xsub_l} [expr $i - 1]]
                set a(k,i)_ [expr ${a(k,i)_} + $Xsub_i]
                set b(k,i)_ [expr ${b(k,i)_} + (($l - $i + 1) * $Xsub_i)]
        }
        set a(k,l)_ [expr fmod(${a(k,i)_},$Mod)]
        set b(k,l)_ [expr fmod(${b(k,i)_},$Mod)]
        set s(k,l)_ [expr ${a(k,l)_} + ($Mod * ${b(k,l)_})]
        return "${s(k,l)_} ${a(k,l)_} ${b(k,l)_} $k $l [lindex ${Xsub_k...Xsub_l} 0]"
 }

 proc tdelta {referenceFile targetFile blockSize} {
        set signature [Digest $targetFile $blockSize]
        return [CopyInstructions $referenceFile $signature]
 }

 proc patch {targetFile copyInstructions fileToConstruct} {
        set copyInstructions [lreplace $copyInstructions 0 1 $fileToConstruct $targetFile]
        return [ConstructFile $copyInstructions]
 }

 set trsync::Mod $Mod