#!/usr/bin/perl

# $Id: mddupes,v 1.14 2022/11/20 16:49:59 bscott Exp bscott $

# Find duplicate messages in Maildirs by Message-ID.
#
# Given a list of directory names, read every file in those directories.
# Treat every file as an RFC-822 message, and remember the Message-ID.
# Report on stdout any messages which duplicate a seen Message-ID.
# Each report line lists both file names, separated by tab.
# Warn if a Message-ID line is not found in a file.
# Progress and problems are reported to stderr.
#
# Assumes every entry in each directory is a message file.
# Subdirectories and/or non-message files will confuse it, at best.
# Feed it the names of the Maildir "new" and/or "cur" directories,
# while nothing else is using those directories, and it should be OK.
# The output of "findmaildirs -d" is suitable as an argument list.

########################################################################
# imports

use strict;
use warnings;
use English qw( -no_match_vars );
use autodie qw( :all );
use 5.012; # so readdir assigns to $_ in a lone while test

########################################################################
# global constants

use constant {

	# Mode strings for the second argument of three-argument open().
	# Naming them makes each open() call state its intent.
	READ_FROM => '<',  # read an existing file
	APPEND_TO => '>>', # add to the end of a file
	PIPE_TO   => '|-', # start a program and write to its stdin
	PIPE_FROM => '-|', # start a program and read from its stdout

};

########################################################################
# main program

die "too few arguments" unless (@ARGV);

# hash of seen Message-IDs
# key = Message-ID
# value = file name of first message which had that ID
my %msgs;

# counters for summary stats
my $filect = 0; # files opened
my $msgct = 0;  # files in which a Message-ID was found
my $dupes = 0;  # files whose Message-ID had already been seen

DIR: for my $dirname (@ARGV) {

	# progress to stderr
	warn "processing dir: $dirname\n";

	opendir (my $dir, $dirname);

	# A named lexical is used instead of $_ so the header scan cannot
	# clobber the directory entry.  The explicit defined() keeps an
	# entry named "0" from ending the loop early.
	MSG: while (defined (my $entry = readdir $dir)) {

		# skip the Unix directory glue
		next MSG if ($entry eq '.') or ($entry eq '..');

		my $msgfile = $dirname . '/' . $entry;

		# I suppose I could stat() everything to see if it was a file
		# but I didn't want the overhead of all those syscalls.
		# Non-message-files would be just as bad for it, and
		# stat() wouldn't catch those.

		# No explicit close below: $msg is closed when it goes out of
		# scope at the end of this iteration.  An explicit close under
		# autodie could turn a read error on a non-file entry into a
		# fatal error instead of the "no Message-ID" warning.
		open (my $msg, READ_FROM, $msgfile);

		$filect++;

		# look for a message-ID in this msg file
		my $msgID = find_message_id($msg);

		# if we didn't find a message ID, complain and skip
		# (test definedness, so an ID of "0" still counts as found)
		unless (defined $msgID) {
			warn "no Message-ID: $msgfile\n";
			next MSG;
			}

		$msgct++;

		# now check/add that ID to the msgs hash
		if (exists $msgs{$msgID}) {
			# Message-ID seen before - report both file names
			# (say is enabled by the file's "use 5.012")
			$dupes++;
			say "$msgs{$msgID}\t$msgfile";
			}
		else {
			# first sighting - remember which file had it
			$msgs{$msgID} = $msgfile;
			}

		} # MSG

	closedir $dir;

	} # DIR

# If every file yielded a Message-ID, say so rather than repeat the count.
# The counters are compared numerically and left untouched.
my $msgs_found = ($msgct == $filect) ? 'all were' : $msgct;

# summary stats to stderr
warn "$filect files, $msgs_found messages, $dupes duplicates\n";

########################################################################
# subroutines

# find_message_id($fh)
#
# Scan the header section of the message readable from file handle $fh
# and return the value of the first Message-ID header found, with
# surrounding blanks and the line ending (LF or CRLF) removed.
# Scanning stops at the first blank line (end-of-headers) or at EOF.
# Returns undef (in scalar context) if no Message-ID value was found.
#
# A folded header, where the value is on the continuation line
# following a bare "Message-ID:", is recognized as well.
sub find_message_id {
	my ($fh) = @_;

	# true only for the line right after a "Message-ID:" with no value
	my $value_pending = 0;

	LINE: while (my $line = <$fh>) {

		# strip the line ending first, so CRLF files behave like LF files
		$line =~ s/\r?\n\z//;

		last LINE if $line eq ''; # blank line = end-of-headers

		if ($value_pending) {
			$value_pending = 0;
			# a continuation line starts with a blank; its text is the ID
			if ($line =~ m/^[ \t]+(\S.*?)[ \t]*\z/) {
				return $1;
				}
			# not a continuation, so that header really was empty;
			# fall through and examine this line as an ordinary header
			}

		if ($line =~ m/^Message-ID:[ \t]*(.*?)[ \t]*\z/i) {
			my $value = $1;
			return $value if length $value;
			# empty value: it may be folded onto the next line
			$value_pending = 1;
			}

		}

	return; # no Message-ID in the headers
	}

# END
########################################################################
