#!/usr/bin/perl
# ------------------------------------------------------------------
#                          2-Word-post-v2.0
# Perl post-processor for MS Word RFC/Internet-draft template output
#
#                              J. Touch
#                        touch@strayalpha.com
#                     http://www.strayalpha.com
#
#                   Copyright (c) 2022
#
# Revision date: Mar. 1, 2022
# ------------------------------------------------------------------
#
# Copyright (c) 2020-2022 by J. Touch
# Copyright (c) 2004-2016 by the University of Southern California.
# All rights reserved.
#
# Permission to use, copy, modify, and distribute this software and
# its documentation in source and binary forms for non-commercial
# purposes and without fee is hereby granted, provided that the above
# copyright notice appear in all copies and that both the copyright
# notice and this permission notice appear in supporting
# documentation, and that any documentation, advertising materials,
# and other materials related to such distribution and use acknowledge
# that the software was developed by the University of Southern
# California, Information Sciences Institute and augmented by
# J. Touch.  The name of the University and J. Touch may not be used
# to endorse or promote products derived from this software without
# specific prior written permission.
#
# THE UNIVERSITY OF SOUTHERN CALIFORNIA AND J. TOUCH MAKE NO
# REPRESENTATIONS ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY
# PURPOSE.  THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS
# OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
# ------------------------------------------------------------------
#
# usage:  
#        2-Word-post-v2.0.pl [inputfile.txt] > [outputfile.txt]
#
# function:
#     replaces -^M    - with - (regardless of space after ^M) (seen 2016)
#     removes indent on each line (blank print margin, typ. 5 chars)
#     converts cr/lf to cr 
#     converts 'smart quotes' to regular quotes (single and double)
#       this includes converting '' to "
#     converts 'smart hyphens' (EM-dash, EN-dash) to regular hyphen
#     omits blank lines between footer and next-page header
#     inserts formfeed (ff) between footer and next-page header
#     removes end-of-line whitespace
#     checks for illegal chars (not printable ASCII, cr, lf, ff)
#     checks for page lengths exceeded
#     checks for line lengths exceeded
#     prints errors indicating page and line on that page
#
#        illegal character errors are posted to STDERR
#
#        returns the logical OR of codes indicating errors found:
#                0x00 no error
#                0x01 if any illegal characters found
#                0x02 if any page length exceeds $maxpagelen
#                0x04 if any line length exceeds $maxlinelen
#
# ------------------------------------------------------------------

# ------------------------------------------------------------------
# VARIABLES
# ------------------------------------------------------------------

$pagenum = 1;          # start on page 1, not 0

$maxpagelen = 66;      # max lines per page

$maxlinelen = 73;      # max chars per line (includes trailing CR)

                       # specific error codes
%codes = (
    'none' => 0x00,
    'char' => 0x01,
    'page' => 0x02,
    'line' => 0x04,
    );

%codestrings = (
    'none' => '(no error)',
    'char' => 'invalid character code',
    'page' => "exceeded $maxpagelen lines per page",
    'line' => "exceeded $maxlinelen chars per line",
    );

$errorcode = $codes{'none'};

$indentlen = -1;       # how many spaces to eat from the beginning
                       # of each line; ought to be 5. negative flag
                       # means it is not yet initialized

$indentstr = "     ";  # until known otherwise, assume 5 spaces

$killwhite = 1;        # flag kills space between footer, header
                       # start in 'between footer and header' mode,
                       # so eats all whitespace before the first line


# ------------------------------------------------------------------
# ERROR SUBROUTINE
# ------------------------------------------------------------------
sub printerr ($) {
  my ($errstring) = shift;

  print STDERR "ERROR: $codestrings{$errstring} ", 
    "on line $linenum on page $pagenum of text input file\n";
  $errorcode |= $codes{$errstring};
  return;
}


# ------------------------------------------------------------------
# MAIN
# ------------------------------------------------------------------

while ($line = <>) {
    $line =~ s/\-\r\s\s+\-/\-/g; # remove odd hyphen spacing seen in 2016
    $line =~ s/\r//g;         # remove Unix-style end-of-line
    # if this line is NOT empty, start printing again (see below)
    if ($line !~ /^\s*$/) {
	$killwhite = 0;
	if ($indentlen < 0) {
	    # discover margin indent
	    $line =~ /^((\s)*)/;
	    $indentstr = $1;
	    $indentlen = length($indentstr);
	}
    }
    # remove the margin indent
    $line =~ s/^($indentstr)//;
    # change special hyphens, quotes to regular ones
    $line =~ s/\221\221/\"/g;
    $line =~ s/\222\222/\"/g;
    $line =~ tr/\140\221\222\223\224\226\227\255/\'\'\'\"\"\-\-\-/;
    # omit end-of-line whitespace
    $line =~ s/\s+\n/\n/g;
    # print unless we're between the end of one page
    # and the beginning of the next
    if ($killwhite != 1) { 
	# check to see if we have any invalid characters left
	# 012 = new line, 014 = form feed, 015 = carriage return
	# 040-176 = printable ASCIIs  
	if ($line =~ /([^\012\014\015\040-\176])/) {
	    printerr('char');
	    # note - we don't abort processing here, to find all
        # the unprintable characters in the doc in one pass
	} 
	$linenum++;
	if ($linenum > $maxpagelen) {
	    printerr('page');
	}
	if (length($line) > $maxlinelen) {
	    printerr('line');
	}
	print $line;
    }
    # check to see if this is the end of a page; 
    # if so, then print a form feed (ctl-L), and
    # kill the printing of subsequent empty lines
    if ($line =~ /\[Page \d+\]\s+$/) {
	print "\f\n";
	$killwhite = 1;
	$linenum = 0;
	$pagenum++;
    }
}
exit($errorcode);