#!/usr/bin/perl -w

# Pick (sets of) good starting words for Wordle.
# Based on letter frequency calculated previously (hard coded, see $alphabet).
# First argument is number of words in a set, to be used together.
# Remaining arguments, or standard input, is the word list (corpus).
# Output is possible wordsets, one per line.
#
# This proved to not be as useful as I hoped.
# Looking for just 1 word, it's just a slow shitty grep.
# Looking for 2 word sets, it found a few hundred combos, but I found one of
#   them manually just using grep an eyeball search, and one is all you need.
# Looking for 3 word sets, it runs forever until I kill it.
# I think to be really useful it would need to score words somehow.
# Or maybe use something smarter than brute force copies and searches.

use strict;
use warnings;
use English;
use feature qw( say );
use autodie qw( :all );

# Definitions:
# wordset
#	An array of strings.  Each string is a word.
#	Each word in a wordset is a possible starting word for the game,
# 	with the wordset taken as a whole for multi-word starter strategies.
# metaset
#	An array of refs to wordsets.
#	That is, an array of refs to arrays of words.
#	Each wordset in the metaset is a set of possible starters, as above.
#	The metaset is all the possible starter sets we came up with.
# Example literal wordset:
#	( 'foo', 'bar' )
# Example literal metaset:
#	( ['foo', 'bar'], ['ding', 'dong'], ['here', 'there'] )

# ----------------------------------------------------------------------
# constants

my $true = 1;
my $false = 0;

# target length of words we are expecting
my $targlen = 5;

# all letters, sorted by frequency in the corpus, most frequent first
my $alphabet = 'seaoriltnudycpmhgbkfwvzjxq';

# single ASCII space
my $space = ' ';

# ----------------------------------------------------------------------
# utility functions

sub uniq {
	# returns unique elements of the given list
	# https://stackoverflow.com/a/7657/6884780
	my %seen;
	grep !$seen{$_}++, @_;
	}

# ----------------------------------------------------------------------
# prepare

my $want = shift;
die "*** missing argument: number of starter words" if not defined $want;
die "*** argument must be integer word count" if ($want =~ m/[^0-9]/);

# slurp in all words from input
my @all = <>;
chomp @all;
die '*** no input words found' if (not @all) or ($#all < 0);

# stats for sanity checking
warn '* count of input words is ' . ($#all + 1) . "\n";
warn "* first word is <$all[0]>\n";
warn "* last  word is <$all[$#all]>\n";

# ----------------------------------------------------------------------
# this function will be called recursively as the main body of program

sub fish($$@);
sub fish($$@) {

my $need = shift;
my $letters = shift;
my @words = @_;

# this will be our list of refs to wordsets, to eventually return
my @metaset = ();

# regexp pattern: character class of letters to search
# starts out as just letters, then a regexp string, then compiled
my $pat;

# Number-of-words times desired-length gives us the top X letters to
# distribute across all words.  We'll search for words in that space.
$pat = substr $letters, 0, ($targlen * $need);

# wrap $pat in the regexp syntax to search for words made of those characters
$pat = "^[$pat]{$targlen}\$";

# compile to a regular expression
# don't use /o or the next function call it will fuck up
$pat = qr/$pat/;

# Not using foreach as we need the index value for splice().
WORD: for (my $wordidx = 0; $wordidx <= $#words; $wordidx++) {

	my $word = $words[$wordidx];
	
	# skip words that don't match
	next WORD if $word !~ m/$pat/;

	# split into letters, remove dupes, scalar to count unique letters
	# if less than $targlen, must be repeated letter(s), skip
	next WORD if (scalar uniq split //, $word) != $targlen;

	#warn "<$word> <$wordidx> <$need> <$letters>"; # DEBUG

	# This will be used to build the wordset to add to @metaset.
	# Exactly how we do that varies, since we may need more words.
	my @wordset;
	
	# do we have all the words we need?
	if ($need <= 1) {
		# This word is all we need, so we'll just be using that.
		# It's still an array, so it can be used as a wordset.
		@wordset = $word;
		}
	else {
		# we need more words after this one
		
		# strip the matching letters from $letters
		my $remain_ltrs = $letters;
		$remain_ltrs =~ s/[$word]//g;
	
		# make a copy of @words, then delete the matching word
		my @remain_words = @words;
		splice( @remain_words, $wordidx, 1 );

		# search the remaining words, using the remaining letters
		my @child_metaset =
			fish( ($need - 1), $remain_ltrs, @remain_words );

		# First, make sure we found something, there may be no more.
		# If not, skip this word entirely, we need them all.
		next WORD if not @child_metaset;

		# @subsets should be a list of wordsets of possible starters,
		# with the end case having wordsets with one word in each set.
		# We'll need to pick out each returned wordset from @subsets,
		# build a new wordset with the current $word first,
		# and add the new wordset to our @wordsets for eventual
		# return.
		foreach my $child_wordset_ref (@child_metaset) {
			@wordset = ($word, @$child_wordset_ref);
			}

		} # end else

		# either way, we add a ref to our @wordset to the @metaset
		push @metaset, \@wordset;
	
	} # WORD

return @metaset;

} # sub fish()
# ----------------------------------------------------------------------

# call fish() on input; after, process the returns to produce output
my @metaset = fish($want, $alphabet, @all);

die '*** zero wordsets found' if (not @metaset) or ($#metaset < 0);

# output - go through outer array, print each member set
#	each set is words separated by spaces
#	each member set is one per line, separate by newlines
foreach my $wordset_ref (@metaset) {
	say join $space, @$wordset_ref;
}

warn '* count of wordsets is ' . ($#metaset + 1) . "\n";

# END
