#!/usr/bin/perl

# $Id: ckmddupes,v 1.87 2022/11/20 22:33:11 bscott Exp bscott $

# Confirm messages are duplicates in content, not just Message-ID.
#
# Read stdin, consisting of pairs of file names, each file assumed to be
# individual RFC-822 messages.  (The output of mbdupes is suitable as input.)
# Compare the pairs to see if they really are the same message.  Ignores
# headers which can vary (different transport paths for same msg).  Ignores
# if one file ends first and the other just has blank lines to EOF after that.
# Otherwise messages must match, all lines, headers and body, in same order.
# Report to stderr messages which differ in some way (not actually a dupe).
# Report to stdout both names of identical messages (true dupes).
#
# Will fail miserably if filenames contain embedded tabs or newlines.  Spaces
# should be OK.
#
# Scenarios:
#
# - If some lame mail system is generating duplicate Message-IDs for entirely
#   different messages, this will weed those out of the dupe list.
# - Bugs in the duplicate-detection scripts will hopefully be defused.
# - Two otherwise identical messages but with a footer added to one will not
#   be seen as dupes.
#
# Whether or not that last one is the right thing or not, depends on your
# scenario.
#
# This script weeds out false positives (not actually a dupe).  It does
# nothing to help find false negatives (two otherwise identical messages with
# different Message-IDs).

########################################################################
# imports

use strict;
use warnings;
use 5.012; # so readdir assigns to $_ in a lone while test
use English qw( -no_match_vars );
use autodie qw( :all );
no warnings qw( experimental::signatures );
use feature qw( signatures );

########################################################################
# globals

use constant {

# open() file I/O modes
READ_FROM => "<",  # read from an existing file
APPEND_TO => ">>", # append to an existing file
PIPE_TO   => "|-", # pipe to another program (we write to it)
PIPE_FROM => "-|", # pipe from another program (we read from it)


}; # constants

# option to print details of the difference with -d
my $detail = 0;

########################################################################
# main program

# command line argument
my $argc = scalar @ARGV;
die "too many arguments" if ($argc > 2);
if ($argc == 1) {
       my $arg = $ARGV[0];
	if ($arg eq '-d') {
		$detail = 1;
		shift;
		}
	else {
		die "unrecognized argument: $arg";
		}
	}

# read pairs of filenames from stdin
while (<>) {

	chomp;
	my @names = split /\t/;

	die "wrong number of file names" if (@names != 2);

	ckpair (@names);

}

########################################################################
# subroutines

# ----------------------------------------------------------------------
sub ckpair ($leftFN, $rightFN) {
# check a pair of message files, to see if they are identical
# if not, report reason and both names to stderr
# if same, report to both names stdout

my $good = 0; # was this pair of files actually a dupe?

open (my $leftFH,  READ_FROM, $leftFN);
open (my $rightFH, READ_FROM, $rightFN);

# for internal use by read_msg_line() across invocations
my $leftHDR = my $rightHDR = 1; 

# go through each file together, line by line
LINEPAIR: while (1) {

	my $left  = read_msg_line($leftFH, \$leftHDR);
	my $right = read_msg_line($rightFH, \$rightHDR);
	
	if ((not defined $left) and (not defined $right)) {
		# EOF on both, at the same time
		$good = 1;
		last LINEPAIR;
		}

	if ((not defined $left) or (not defined $right)) {
		# EOF on one, but not both
		# one file is shorter than the other
		
		my $shortfile;
		$shortfile = $leftFH  if (not defined $left);
		$shortfile = $rightFH if (not defined $right);
		if (ends_in_blanks($shortfile)) {
			$good = 1;
			}
		else {
			warn "lengths: $leftFN\t$rightFN\n";
			if ($detail) {
				warn "   LEFT: shorter\n" if not defined $left;
				warn "  RIGHT: shorter\n" if not defined $right;
				}
			}
		last LINEPAIR;
		}
	    
	if ($left ne $right) {
		warn "content: $leftFN\t$rightFN\n";
		if ($detail) {
			warn "   LEFT: $left\n";
			warn "  RIGHT: $right\n";
			}
		last LINEPAIR;
		}

	} # while

close ($leftFH);
close ($rightFH);

say "$leftFN\t$rightFN" if $good;

} # ckpair


# ----------------------------------------------------------------------
sub read_msg_line ($FH, $HDR) {
# Read a line from a file, assumed to be an RFC-822 message.
# Filter out selected headers, including folded continuation lines.

# $HDR is a ref to a boolean we use to track if we are in the message headers
# or body.  Caller sets HDR to true when the file is opened.  Once we see the
# end-of-headers break, we set HDR to false.  While HDR is true we look for
# headers we want to skip (and skip them).  Once HDR becomes false we stop	
# looking and just return every line of the body.

# Headers to skip, and thus ignore in the comparison.
# These can vary between otherwise identical messages, typically because
# the message took a different path to the destination, and/or, was
# processed by a different mail system, and thus had different tracking or
# identification headers added.
my $skip = qr{
	# headers start at the start of line, per the RFC
	^(
	# local flags that vary with message read status
	 (X-)?(Mozilla-)?Status	
	|(X-)?T?UIDL?	# sometimes X-UID, sometimes X-TUID, sometimes UIDL
	|(X-)?GM(AIL)?-(MSG|THR)ID
	|(X-)?GM(AIL)?-LABELS
	|(X-)?Gm-Message-State
	|X-IMAPbase
	|Thread-Index
	|X-AuditID
	|X-ASG-Debug-ID
	|X-ASG-Orig-Subj
	|X-Authentication-Warning
	# size parameters often added by later mail processing
	|Content-Length
	|Lines
	# tracking/tracing headers
	|(X-)?((Gmail|Default)-)?Received(-SPF)?
	|Return-Path
	|Delivered-To
	|X-Peer-Info
	|X-Greylist
	|X-Barracuda-URL
	|X-IronPort-Anti-Spam-(Filtered|Result)
	|X-bfccomputing-MailScanner(-SpamCheck)?
	|X-Spam-(Status|Level|Checker-Version)
	|(ARC-)?Authentication-Results
	|DomainKey-(Status|Signature)
	|DKIM-Signature
	|ARC-(Seal|Message-Signature)
	|X-OriginalArrivalTime
	|(X-)?(Keywords|Originating-IP|Google-Smtp-Source|(Original|Errors)-To)
	|X-getmail-retrieved-from-mailbox
	# colon at end of header name (rest of line ignored)
	):
	}xi;

# RFC "WSP" whitespace = space or tab
# continuation line = starts with 1-or-more WSP, followed by not-WSP
my $continuation = qr{^[\t ]+[^\t ]};

# Divides end-of-headers and start-of-body.  A blank line.
my $break = qr{^$};

# Flag to track if we skipped previous header, so we can skip continuations.
my $skipped = 0;

# The LINE loop is so if we read a line we want to skip, we can read another
# If the first line we read is a keeper, the loop only executes once.
# So "last LINE" means "keep this line", while "next LINE" means "skip one".
# If at any point we hit EOF, the EOF/undef value will get returned.
LINE: while (<$FH>) {

	chomp;

	# if we're not in the headers, just return the body line
	last LINE if not ${$HDR};

	# we are still in headers, do header processing

	if (m/$break/) {
		# end-of-headers, no need to check for more skips
		${$HDR} = 0;
		# keep and return "header break" line, as a line we read
		last LINE;
		}
	
	if (m/$skip/) {
		# We'll skip this line and try again.  Remember we skipped,
		# in case next LINE is a continuation of this one.
		$skipped = 1;
		next LINE;
		}

	next LINE if $skipped and m/$continuation/;

	# This header was not the break, and not a header to skip,
	# and not a continuation of a skipped header, so it must be
	# a header we want to keep.
	last LINE;

	}

return $_;

} # read_msg_line

# ----------------------------------------------------------------------
sub ends_in_blanks ($FH) {
# reads a file handle, returns true if only blank lines to end
	while(<$FH>) {
		chomp;
		next if m/^$/; # blank line
		return 0; # not a blank line
		}
	# reached end-of-file without seeing a non-blank line
	return 1;
} # ends_in_blanks


# ----------------------------------------------------------------------
sub cat ($FN) {
# test code - like cat(1) while filtering the headers

warn "---------- begin <$FN>";
open (my $FH,  READ_FROM, $FN);
while (defined (my $line = read_msg_line $FH)) {
	say "<$line>";
	}
warn "---------- end   <$FN>";

} # cat


# END
########################################################################
