: Use /bin/sh
#
# $Id: findaffix.X,v 1.15 1994/01/25 07:11:29 geoff Exp $
#
# Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. All modifications to the source code must be clearly marked as
#    such.  Binary redistributions based on modified source code
#    must be clearly marked as modified versions in the documentation
#    and/or other materials provided with the distribution.
# 4. All advertising materials mentioning features or use of this software
#    must display the following acknowledgment:
#      This product includes software developed by Geoff Kuenning and
#      other unpaid contributors.
# 5. The name of Geoff Kuenning may not be used to endorse or promote
#    products derived from this software without specific prior
#    written permission.
#
# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
#	Find possible affixes for use with ispell
#
#	Usage:
#
#	findaffix [-p | -s] [-f] [-c] [-m min] [-M max] [-e elim] [-l low] \
#	  [-t tabchar] [files]
#
#	Each common prefix (-p) or suffix (-s, default) is presented, along
#	with statistics to indicate how useful such an affix might be in
#	reducing the size of the input file.  Only those affixes which
#	produce a legal root (one found in the original input) are reported.
#
#	If the "-c" option is not given, the output lines are in the
#	following format:
#
#		strip/add/count/bytes
#
#	where "strip" is the string that should be stripped from a root
#	word before adding the affix, "add" is the affix to be added, "count"
#	is a count of the number of times that this "strip/add" combination
#	appears, and "bytes" is an estimate of the number of bytes that
#	will be saved in the raw dictionary file if this combination is
#	added to the affix file.  The field separator in the output will
#	normally be the tab character specified by the "-t" switch;  the
#	default is a slash ("/").
#
#	If the "-c" ("clean output") option is given, the appearance of
#	the output is made cleaner by changing it to:
#
#		-strip+add<tab>count<tab>bytes
#
#	where "strip," "add," "count," and "bytes" are as before, and "<tab>"
#	represents the ASCII tab character.
#
#	The method used to generate possible affixes will also generate
#	longer affixes which have common headers or trailers.  For example,
#	the two words "moth" and "mother" will generate not only the obvious
#	substition "+er" but also "-h+her" and "-th+ther" (and possibly
#	even longer ones, depending on the value of "min").  To prevent
#	cluttering the output with such affixes, any affix pair that shares
#	a common header (or, for prefixes, trailer) string longer than
#	"elim" characters (default 1) will be suppressed.  You may want to
#	set "elim" to a value greater than 1 if your language has string
#	characters;  usually the need for this parameter will become obvious
#	when you examine the output of your findaffix run.
#
#	Normally, the output is sorted on the "bytes" field.  If the "-f"
#	flag is given, the output is sorted according to the "count" field.
#
#	No affix longer than "max" characters (default 8) will be reported.
#	Smaller values of "max" will make the script run faster.
#
#	Affixes which appear fewer than "low" times (default 10) are
#	suppressed.  This significantly reduces the size of the output file.
#
#	Affixes which generate stems shorter than "min" characters (default 3)
#	are suppressed.  (A stem is the word after the "strip" string has
#	been removed, and before the "add" string has been added.)  This
#	reduces both the running time and the size of the output file.  "Min"
#	should only be set to 1 if you have a *lot* of free time and disk
#	space.
#
#	The script requires a non-blank field-separator character for internal
#	use.  Normally, this character is a slash ("/"), but if the slash
#	appears as a character in the input word list, a different character
#	can be specified with the "-t" switch.
#
#	If the input files are ispell dictionaries, they should be expanded
#	before being fed to this script.
#
#	If the input files contains characters other than [A-Za-z], they
#	should be translated to lowercase before being fed to this script.
#
# $Log: findaffix.X,v $
# Revision 1.15  1994/01/25  07:11:29  geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#
TDIR=${TMPDIR-/usr/tmp}
TMP=${TDIR}/faff$$
SORTTMP="-T ${TDIR}"			# !!SORTTMP!!
USAGE='Usage:  findaffix [-p | -s] [-f] [-c] [-e elim] [-m min] [-M max] [-l low] [-t tabch] [files]'
LOOP='
    i = len - maxlim + 1
    if (i < minstem + 1)
	i = minstem + 1
    for (  ;  i <= len;  i++)
	print substr ($0, 1, i - 1) tabch substr ($0, i) tabch len
    print $0 tabch tabch len'
ELIM='$1!=$2 \
    {
    if (substr ($1, 1, elimlen) != substr ($2, 1, elimlen))
	print
    }'
maxlim=8
minstem=3
elimlen=1
lowcount=10
cleanout=no
finalsortopts='+3rn -4 +2rn -3 +1 -2 +0 -1'
tabch=/
while :
do
    case "$1" in
	-p)
	    LOOP='
		lim = len - minstem
		if (lim > maxlim)
		    lim = maxlim
		for (i = 1;  i <= lim;  i++)
		    print substr ($0, i + 1) tabch substr ($0, 1, i) tabch len
		print $0 tabch tabch len'
	    ELIM='$1!=$2 \
		{
		if (substr ($1, length ($1), elimlen) \
		  != substr ($2, length ($2), elimlen))
		    print
		}'
	    shift
	    ;;
	-s)
	    shift
	    ;;
	-f)
	    finalsortopts='+2rn -3 +3rn -4 +1 -2 +0 -1'
	    shift
	    ;;
	-c)
	    cleanout=yes
	    shift
	    ;;
	-e)
	    elimlen=$2
	    shift; shift
	    ;;
	-m)
	    minstem=$2
	    shift; shift
	    ;;
	-M)
	    maxlim=$2
	    shift; shift
	    ;;
	-l)
	    lowcount=$2
	    shift; shift
	    ;;
	-t)
	    tabch="$2"
	    shift; shift
	    ;;
	-*)
	    echo "$USAGE" 1>&2
	    exit 1
	    ;;
	*)
	    break
	    ;;
    esac
done
trap "/bin/rm -f ${TMP}*; exit 1" 1 2 15
trap "/bin/rm -f ${TMP}*; exit 0" 13
#
# We are ready to do the work.  First, we collect all input, translate it
# to lowercase, sort it (dropping duplications), and save it for later.
#
if [ $# -ne 0 ]
then
    cat "$@" | tr '[A-Z]' '[a-z]'
else
    tr '[A-Z]' '[a-z]'
fi \
  | sort -u $SORTTMP > ${TMP}a
#
# Now the monstrous pipeline.  The awk command produces several lines for
# each input word.  Each line contains a possible stem (first field),
# a possible affix, and the length of the original word.  The loop which
# does this was placed into the LOOP variable by the code above (q.v.).
#
# The first sort puts this output into an order appropriate for feeding
# to 'join'.  The join command then combines stems and affixes, and for
# each puts out an affix to strip, an affix to add, and the length of
# the word before and after modification.
#
# From here on out the job is relatively easy.  The second 'awk' gets rid
# of lines that have the same strip and add affixes, and also eliminates
# lines where the strip and add affix have a common leading (for suffixes)
# or trailing (for prefixes) substring, or where the strip affix is longer
# than the add affix (this is all done by the $ELIM variable, which is also
# set up by the code above.  The second sort collects identical affixes;
# the third 'awk' functions like 'uniq -c', replacing duplicate affixes
# with a count and summing the estimate of bytes saved.  It also eliminates
# any affixes which appear less frequently than the minimum ("lowcount").
# Finally, the third sort ($finalsortopts) rearranges the list in the chosen
# sort order.
#
awk "BEGIN{minstem=$minstem; maxlim=$maxlim; tabch="'"'"$tabch"'"}
    {
    len = length ($0)
    if (len < 2)
	next
    '"$LOOP"'
    }' < ${TMP}a \
  | sort "-t$tabch" +0 -1 +1 $SORTTMP -o ${TMP}a
join "-t$tabch" -o 1.2 2.2 2.3 ${TMP}a ${TMP}a \
  | awk "-F$tabch" "BEGIN{elimlen=$elimlen}$ELIM" \
  | sort "-t$tabch" +1 -2 +0 -1 $SORTTMP \
  | awk "-F$tabch" 'BEGIN{tabch="'"$tabch"'"; lowcount='"$lowcount"'}
	{
	if ($1 == last1  &&  $2 == last2)
	    {
	    count++
	    totchars += $3
	    }
	else
	    {
	    if ((last1 != ""  ||  last2 != "")  &&  count >= lowcount)
		print last1 tabch last2 tabch count tabch totchars
	    count = 1
	    last1 = $1
	    last2 = $2
	    totchars = $3
	    }
	}
    END {
	if ((last1 != ""  ||  last2 != "")  &&  count >= lowcount)
	    print last1 tabch last2 tabch count tabch totchars
	}' \
  | sort "-t$tabch" $finalsortopts $SORTTMP \
  | if [ "$cleanout" = "yes" ]
    then
	case "$tabch" in
	    /)
		sedsub=/
		sedsep=';'
		;;
	    .|\*|\[|\^|\$|\\)
		sedsub="\\$tabch"
		sedsep=/
		;;
	    *)
		sedsub="$tabch"
		sedsep=/
		;;
	esac
	exec sed -e "s$sedsep$sedsub$sedsep	${sedsep}g" \
	  -e 's/	/+/' -e 's/^/-/' \
	  -e 's/^-+/+/' -e 's/+	/	/'
    else
	exec cat
    fi
/bin/rm -f ${TMP}?