Finding duplicate files

MJ - While trying to find duplicate files in my images directory, I decided to hack together a small Tcl script to determine which files are duplicates. The script requires tcllib for md5 and fileutil. The script finds duplicates based on md5 hash (skipping large files for performance reasons) and stores the hashes array in an output file.


 package require Tcl 8.5
 package require md5
 package require fileutil

 # Exactly one argument is expected: the file the digest table is written to.
 if {[llength $argv] != 1} {
         puts stderr "usage [file tail [info script]] output"
         exit 1
 }
 lassign $argv output
 # Files of this size (in bytes) or larger are not hashed.
 # 10e7 is 10 * 10^7 = 1e8, i.e. roughly 100 MB.
 set max_size 10e7

 # file_md5 --
 #
 #       Filter callback for fileutil::find. Computes the MD5 digest of one
 #       plain file, appends the file's absolute name to the global array
 #       "hashes" under that digest, and prints "digest: name" on stdout.
 #
 # Arguments:
 #       filename  Name passed in by fileutil::find. It is not fully
 #                 qualified, so it is joined with [pwd] below.
 #
 # Results:
 #       Always returns 0, so fileutil::find itself selects nothing; the
 #       useful output is the side effect on "hashes". Entries that cannot
 #       be examined or read are reported on stderr and skipped instead of
 #       aborting the whole scan.
 proc file_md5 {filename} {
         variable hashes
         variable max_size
         # Plain string concatenation (not [file join]) so that a name
         # starting with ~ is not treated as a home-directory reference.
         set filename [pwd]/$filename
         # Hash plain files only. [file type] does not follow links, so
         # symbolic links (dangling or not), directories, FIFOs, sockets and
         # devices are all skipped. Recording a link as a "duplicate" of its
         # target would let a later clean-up delete the only real copy.
         if {[catch {file type $filename} type] || $type ne "file"} {
                 return 0
         }
         if {[catch {file size $filename} size]} {
                 puts stderr "Skipping $filename: $size"
                 return 0
         }
         if {$size >= $max_size} {
                 return 0
         }
         # An unreadable file must not stop the scan of everything else.
         if {[catch {md5::md5 -hex -filename $filename} digest]} {
                 puts stderr "Skipping $filename: $digest"
                 return 0
         }
         lappend hashes($digest) $filename
         puts "$digest: $filename"
         return 0
 }

 # Walk the current directory tree; file_md5 fills the global array "hashes"
 # as a side effect, so the list returned by fileutil::find is not used.
 fileutil::find . file_md5
 puts "Storing hashes in $output"
 # Save the array as a flat "digest file-list digest file-list ..." Tcl list
 # so that it can be restored later with [array set].
 set channel [open $output w]
 puts $channel [array get hashes]
 close $channel

And a script to delete the duplicate files (except the first one found). If you specify only the input file, a list of duplicates is displayed and nothing is deleted. Use at your own risk.

 package require Tcl 8.5

 # One or two arguments are accepted: the digest table to read and an
 # optional second word; any non-empty second word switches deletion on.
 if {$argc ni {1 2}} {
         puts stderr "usage [file tail [info script]] input ?delete?"
         exit 1
 }
 # With a single argument, lassign leaves "delete" as the empty string.
 lassign $argv input delete

 # Load the digest -> file-list table from $input, report every group of
 # files sharing a digest and, when "delete" is non-empty, remove all but the
 # first file of each group.
 set num_duplicates 0
 set num_deleted 0
 # Read everything and close the channel before parsing, so the channel is
 # not left open if the file content is not a valid key/value list.
 set f [open $input r]
 set data [read $f]
 close $f
 array set hashes $data
 foreach {hash files} [array get hashes] {
     if {[llength $files] <= 1} {
         continue
     }
     incr num_duplicates
     puts "Duplicates $files"
     if {$delete eq {}} {
         continue
     }
     # The table may be stale: if the file that is supposed to be kept has
     # disappeared, deleting the others could destroy the last copy.
     set keeper [lindex $files 0]
     if {![file exists $keeper]} {
         puts stderr " $keeper no longer exists, leaving the other copies alone"
         continue
     }
     puts " Keeping first file, deleting rest..."
     # Two different names can refer to the very same file, e.g. when it was
     # reached through a symbolic link to a directory. [file normalize]
     # yields a unique identifier for such names, so they can be detected.
     set keeper_id [file normalize $keeper]
     foreach file [lrange $files 1 end] {
         if {[file normalize $file] eq $keeper_id} {
             puts "  Skipping $file (same file as the one kept)"
             continue
         }
         # [file delete] is silent about missing files; check first so the
         # final count only contains files that were really removed.
         if {![file exists $file]} {
             puts "  Skipping $file (already gone)"
             continue
         }
         puts "  Deleting $file"
         # "--" ends option parsing; a failed delete is reported and the
         # remaining files are still processed.
         if {[catch {file delete -- $file} err]} {
             puts stderr "  Could not delete $file: $err"
         } else {
             incr num_deleted
         }
     }
 }
 puts "$num_duplicates distinct duplicates found, $num_deleted duplicates deleted"