#!/opt/bin/perl
#
# Find identical files using a hash of MD5 digests.
# Give the starting directory on the command line, and it will recursively
# inspect everything underneath that.
# v.0.1: It works, but there are a few misfeatures.

use strict;
use warnings;
use Digest::MD5;

# Exactly one argument is expected: the directory to start scanning from.
unless (@ARGV == 1) {
    print "Usage:\n\tfident directory\n";
    exit(1);
}

# Maps an MD5 hex digest to a colon-joined list of the files that have it.
our %md5s;

# Hash everything under the starting directory, then report the duplicates.
pdir($ARGV[0]);
outdups();

# pdir
#
# Process a directory. Go through the contents of a directory, passing
# each plain file to pfile() and pushing each subdirectory name onto a
# stack. Once the directory handle has been closed, recurse into every
# directory on the stack.
#
# Takes one argument: the path of the directory to scan.
# A directory that cannot be opened is skipped silently.
#
# NOTE: the -f and -d tests follow symbolic links, so symlinked files and
# directories are visited too and a symlink cycle is not detected here.

sub pdir {
    my ($dir) = @_;
    my @dirlist;

    # A lexical handle keeps each level of the recursion independent of
    # every other one (a bareword DIR would be a single shared global).
    opendir(my $dh, $dir) or return;
    # Should do something better with bad directories (dangling symlinks)
    while (defined(my $file = readdir($dh))) {
        # Skip funny directories
        next if ($file eq ".") || ($file eq "..");

        my $path = "$dir/$file";
        if (-f $path) {
            pfile($path);
        } elsif (-d $path) {
            # Only real directories are worth queueing; anything else
            # (sockets, FIFOs, dangling symlinks) could not be opened anyway.
            push @dirlist, $path;
        }
    }
    closedir($dh);

    # Drain the stack completely. Testing "$#dirlist > 0" here would stop
    # with one entry still queued, so that subdirectory would never be
    # scanned.
    while (@dirlist) {
        pdir(pop(@dirlist));
    }
    return;
}

# pfile
#
# Calculate an MD5 for a file, and add the name of the file to the
# %md5s hash keyed by the hex digest.
#
# Takes one argument: the path of the file to digest.
# A file that cannot be opened or read is skipped silently.
#
# Names that share a digest are stored as one string joined with ":",
# which is the format outdups() splits on. A file name that itself
# contains ":" will therefore be broken apart in the report.

sub pfile {
    my ($file) = @_;

    # Three-argument open: the name is taken literally, so leading or
    # trailing whitespace and mode-like characters in it cannot change
    # which file gets opened.
    open(my $in, '<', $file) or return;
    # Digest the raw bytes, with no newline translation on any platform.
    binmode($in);

    my $ctx = Digest::MD5->new;
    # addfile() croaks on a read error; treat that the same way as a file
    # that could not be opened instead of aborting the whole scan.
    my $ok = eval { $ctx->addfile($in); 1 };
    close($in);
    return unless $ok;

    my $digest = $ctx->hexdigest();
    if (defined($md5s{$digest})) {
        $md5s{$digest} .= ":$file";
    } else {
        $md5s{$digest} = $file;
    }
    return;
}

# outdups
#
# Walk the %md5s hash. Every value is a colon-joined list of file names;
# when a list holds more than one name, print the digest on a line of its
# own followed by each name on a tab-indented line.

sub outdups {
    while (my ($digest, $joined) = each %md5s) {
        my @paths = split(/:/, $joined);

        # A single name means the file has no duplicate.
        next unless @paths > 1;

        print $digest, "\n";
        foreach my $path (@paths) {
            print "\t", $path, "\n";
        }
    }
}