#!/opt/bin/perl
#
# Find identical files using a hash of MD5 digests.
# Give the starting directory on the command line, and it will recursively
# inspect everything underneath that.

# v.0.1: It works, but there are a few misfeatures.

use strict;
use warnings;
use Digest::MD5;

if (@ARGV != 1) {
    print "Usage:\n\tfident directory\n";
    exit(1);
}

# Maps an MD5 hex digest to a reference to the list of paths of the files
# whose contents produced that digest. A list is used rather than a joined
# string so that any character (including ':') may appear in a file name.
our %md5s;

pdir($ARGV[0]);
outdups();

# pdir
#
# Process a directory. Go through the contents of a directory, handing
# regular files to pfile() and collecting subdirectory names on a stack;
# afterwards recurse into each collected subdirectory.
#
# Takes the directory path as its only argument. A directory that cannot
# be opened is reported on STDERR and skipped.
sub pdir {
    my ($dir) = @_;
    my @dirlist;

    opendir(my $dh, $dir) or do {
        warn "fident: cannot read directory $dir: $!\n";
        return;
    };

    while (defined(my $file = readdir($dh))) {
        # Skip the self and parent entries.
        next if $file eq '.' || $file eq '..';

        my $path = "$dir/$file";
        if (-f $path) {
            pfile($path);
        }
        elsif (-d $path && !-l $path) {
            # Only real directories are queued. Symlinked directories are
            # skipped so that a link cycle cannot make the same files be
            # hashed repeatedly and reported as duplicates of themselves.
            push @dirlist, $path;
        }
        # Anything else (sockets, FIFOs, dangling symlinks, ...) is ignored.
    }
    closedir($dh);

    # The handle is closed before recursing, so only one directory handle
    # is open at any time. Loop until the stack is empty: testing
    # "$#dirlist > 0" would leave the last queued directory unvisited.
    while (@dirlist) {
        pdir(pop @dirlist);
    }

    return;
}

# pfile
#
# Calculate an MD5 for a file, and add the name of the file to the list
# stored in %md5s under that digest.
#
# Takes the file path as its only argument. A file that cannot be opened
# is reported on STDERR and skipped.
sub pfile {
    my ($file) = @_;

    # Three-argument open: the name is never interpreted as a mode, and
    # leading/trailing whitespace in the name is preserved.
    open(my $in, '<', $file) or do {
        warn "fident: cannot read $file: $!\n";
        return;
    };
    binmode($in);    # digest the raw bytes

    my $digest = Digest::MD5->new->addfile($in)->hexdigest;
    close($in);

    push @{ $md5s{$digest} }, $file;

    return;
}

# outdups
#
# Go through the md5 hash, identifying digests with more than one filename
# and displaying those filenames. Each group is printed as the digest on a
# line of its own followed by one tab-indented line per file. Groups are
# printed in sorted digest order so that the output is reproducible.
sub outdups {
    for my $md5 (sort keys %md5s) {
        my @fnames = @{ $md5s{$md5} };
        next if @fnames < 2;

        print $md5, "\n";
        for my $fname (@fnames) {
            print "\t", $fname, "\n";
        }
    }

    return;
}