#! /usr/bin/perl -p

##############################################################################
#
# PURPOSE
#
# Fixes Microsoft-isms in what should be standard ASCII text.
# Also does a few general cleanups (end-of-line markers, for example).
#
# USAGE
# 
# Standard Unix filter.
#
# CREDITS
#
# Original program by Benjamin Scott <code@dragonhawk.org>, 1 Jan 2002.
# 
# Some character codes were obtained from "demoroniser", which was written by
# John Walker, and placed in the public domain in January of 1998.
#
# LEGAL
#
# This program was placed in the public domain on 1 Jan 2002.
# This program has NO WARRANTY.  USE IT STRICTLY AT YOUR OWN RISK.
#
##############################################################################

# fix end-of-line markers
s(\x0D$)();	# CR at Perl EOL - CR+LF EOL-markers - remove CR leaving NL
s(\x0D)(\n)g;	# CR in Perl mid-line - CR EOL-markers - replace CR with NL

# translate Microsoft non-characters to ASCII standard

# multi-byte runs
s(\xE2\x80\x93)(--)g	; # em dash
s(\xE2\x80\x98)(')g	; # opening single quote
s(\xE2\x80\x99)(')g	; # closing single quote or apostrophe
s(\xE2\x80\x9C)(")g	; # opening double quote
s(\xE2\x80\x9D)(")g	; # closing double quote
s(\xE2\x80\xA6)(...)g	; # ellipsis

# one-to-one character translations
s(\x7E)(!)g	; # exclaimation mark (bang)
s(\x82)(,)g	; # comma
s(\x84)(,,)g	; # double comma
s(\x85)(...)g	; # ellipsis
s(\x88)(^)g	; # caret
s(\x8B)(<)g	; # less-than
s(\x9B)(>)g	; # greater-than
s(\x91)(')g	; # opening single quote
s(\x92)(')g	; # closing single quote
s(\x93)(")g	; # opening double quote
s(\x94)(")g	; # closing double quote
s(\x95)(*)g	; # star/asterisk
s(\x96)(-)g	; # short dash (en dash)
s(\x96)(--)g	; # long dash (em dash)
s(\x97)(--)g	; # another long dash (em dash)
s(\x98)(~)g	; # superscript tilde
s{\x99}{(TM)}g	; # trademark symbol
s(\x83)(f)g	; # fancy "f"
s(\x8C)(Oe)g	; # dipthong
s(\x9C)(oe)g	; # dipthong
s(\xB4)(')g	; # apostrophe
s(\xD2)(")g	; # opening double quote
s(\xD3)(")g	; # closing double quote
s(\xD5)(')g	; # apostrophe
s(\xDF)(')g	; # apostrophe
s(\xA0)( )g	; # not really sure, but whitespace seems to be okay
s(\xE9)(e)g	; # fancy "e"
s(\xC2)()g	; # a null associated with apostrophes (?)

# if any remaining non-characters are found:
# - warn
# - convert to text representation (hexidecimal base) of ordinal value
# match pattern is character class of non-printable characters
while (m/([\x00-\x08\x10-\x1F\x7E-\xFF])/) {
	$char = $1;
	$val = unpack ('C', $char);		# ordinal value
	$str = sprintf ("[0x%X]", $val);	# hex with 0x prefix
	warn("no translation for $str");
	s/$char/$str/g;
}

# END OF FILE