#!/bin/sh # # $Id: findaffix.X,v 1.15 1994/01/25 07:11:29 geoff Exp $ # # Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. All modifications to the source code must be clearly marked as # such. Binary redistributions based on modified source code # must be clearly marked as modified versions in the documentation # and/or other materials provided with the distribution. # 4. All advertising materials mentioning features or use of this software # must display the following acknowledgment: # This product includes software developed by Geoff Kuenning and # other unpaid contributors. # 5. The name of Geoff Kuenning may not be used to endorse or promote # products derived from this software without specific prior # written permission. # # THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # Find possible affixes for use with ispell # # Usage: # # findaffix [-p | -s] [-f] [-c] [-m min] [-M max] [-e elim] [-l low] \ # [-t tabchar] [files] # # Each common prefix (-p) or suffix (-s, default) is presented, along # with statistics to indicate how useful such an affix might be in # reducing the size of the input file. Only those affixes which # produce a legal root (one found in the original input) are reported. # # If the "-c" option is not given, the output lines are in the # following format: # # strip/add/count/bytes # # where "strip" is the string that should be stripped from a root # word before adding the affix, "add" is the affix to be added, "count" # is a count of the number of times that this "strip/add" combination # appears, and "bytes" is an estimate of the number of bytes that # will be saved in the raw dictionary file if this combination is # added to the affix file. The field separator in the output will # normally be the tab character specified by the "-t" switch; the # default is a slash ("/"). # # If the "-c" ("clean output") option is given, the appearance of # the output is made cleaner by changing it to: # # -strip+addcountbytes # # where "strip," "add," "count," and "bytes" are as before, and "" # represents the ASCII tab character. # # The method used to generate possible affixes will also generate # longer affixes which have common headers or trailers. For example, # the two words "moth" and "mother" will generate not only the obvious # substition "+er" but also "-h+her" and "-th+ther" (and possibly # even longer ones, depending on the value of "min"). To prevent # cluttering the output with such affixes, any affix pair that shares # a common header (or, for prefixes, trailer) string longer than # "elim" characters (default 1) will be suppressed. You may want to # set "elim" to a value greater than 1 if your language has string # characters; usually the need for this parameter will become obvious # when you examine the output of your findaffix run. # # Normally, the output is sorted on the "bytes" field. If the "-f" # flag is given, the output is sorted according to the "count" field. # # No affix longer than "max" characters (default 8) will be reported. # Smaller values of "max" will make the script run faster. # # Affixes which appear fewer than "low" times (default 10) are # suppressed. This significantly reduces the size of the output file. # # Affixes which generate stems shorter than "min" characters (default 3) # are suppressed. (A stem is the word after the "strip" string has # been removed, and before the "add" string has been added.) This # reduces both the running time and the size of the output file. "Min" # should only be set to 1 if you have a *lot* of free time and disk # space. # # The script requires a non-blank field-separator character for internal # use. Normally, this character is a slash ("/"), but if the slash # appears as a character in the input word list, a different character # can be specified with the "-t" switch. # # If the input files are ispell dictionaries, they should be expanded # before being fed to this script. # # If the input files contains characters other than [A-Za-z], they # should be translated to lowercase before being fed to this script. # # $Log: findaffix.X,v $ # Revision 1.15 1994/01/25 07:11:29 geoff # Get rid of all old RCS log lines in preparation for the 3.1 release. # # TDIR=${TMPDIR-/usr/tmp} TMP=${TDIR}/faff$$ SORTTMP="-T ${TDIR}" # !!SORTTMP!! USAGE='Usage: findaffix [-p | -s] [-f] [-c] [-e elim] [-m min] [-M max] [-l low] [-t tabch] [files]' LOOP=' i = len - maxlim + 1 if (i < minstem + 1) i = minstem + 1 for ( ; i <= len; i++) print substr ($0, 1, i - 1) tabch substr ($0, i) tabch len print $0 tabch tabch len' ELIM='$1!=$2 \ { if (substr ($1, 1, elimlen) != substr ($2, 1, elimlen)) print }' maxlim=8 minstem=3 elimlen=1 lowcount=10 cleanout=no finalsortopts='+3rn -4 +2rn -3 +1 -2 +0 -1' tabch=/ while : do case "$1" in -p) LOOP=' lim = len - minstem if (lim > maxlim) lim = maxlim for (i = 1; i <= lim; i++) print substr ($0, i + 1) tabch substr ($0, 1, i) tabch len print $0 tabch tabch len' ELIM='$1!=$2 \ { if (substr ($1, length ($1), elimlen) \ != substr ($2, length ($2), elimlen)) print }' shift ;; -s) shift ;; -f) finalsortopts='+2rn -3 +3rn -4 +1 -2 +0 -1' shift ;; -c) cleanout=yes shift ;; -e) elimlen=$2 shift; shift ;; -m) minstem=$2 shift; shift ;; -M) maxlim=$2 shift; shift ;; -l) lowcount=$2 shift; shift ;; -t) tabch="$2" shift; shift ;; -*) echo "$USAGE" 1>&2 exit 1 ;; *) break ;; esac done trap "/bin/rm -f ${TMP}*; exit 1" 1 2 15 trap "/bin/rm -f ${TMP}*; exit 0" 13 # # We are ready to do the work. First, we collect all input, translate it # to lowercase, sort it (dropping duplications), and save it for later. # if [ $# -ne 0 ] then cat "$@" | tr '[A-Z]' '[a-z]' else tr '[A-Z]' '[a-z]' fi \ | sort -u $SORTTMP > ${TMP}a # # Now the monstrous pipeline. The awk command produces several lines for # each input word. Each line contains a possible stem (first field), # a possible affix, and the length of the original word. The loop which # does this was placed into the LOOP variable by the code above (q.v.). # # The first sort puts this output into an order appropriate for feeding # to 'join'. The join command then combines stems and affixes, and for # each puts out an affix to strip, an affix to add, and the length of # the word before and after modification. # # From here on out the job is relatively easy. The second 'awk' gets rid # of lines that have the same strip and add affixes, and also eliminates # lines where the strip and add affix have a common leading (for suffixes) # or trailing (for prefixes) substring, or where the strip affix is longer # than the add affix (this is all done by the $ELIM variable, which is also # set up by the code above. The second sort collects identical affixes; # the third 'awk' functions like 'uniq -c', replacing duplicate affixes # with a count and summing the estimate of bytes saved. It also eliminates # any affixes which appear less frequently than the minimum ("lowcount"). # Finally, the third sort ($finalsortopts) rearranges the list in the chosen # sort order. # awk "BEGIN{minstem=$minstem; maxlim=$maxlim; tabch="'"'"$tabch"'"} { len = length ($0) if (len < 2) next '"$LOOP"' }' < ${TMP}a \ | sort "-t$tabch" +0 -1 +1 $SORTTMP -o ${TMP}a join "-t$tabch" -o 1.2 2.2 2.3 ${TMP}a ${TMP}a \ | awk "-F$tabch" "BEGIN{elimlen=$elimlen}$ELIM" \ | sort "-t$tabch" +1 -2 +0 -1 $SORTTMP \ | awk "-F$tabch" 'BEGIN{tabch="'"$tabch"'"; lowcount='"$lowcount"'} { if ($1 == last1 && $2 == last2) { count++ totchars += $3 } else { if ((last1 != "" || last2 != "") && count >= lowcount) print last1 tabch last2 tabch count tabch totchars count = 1 last1 = $1 last2 = $2 totchars = $3 } } END { if ((last1 != "" || last2 != "") && count >= lowcount) print last1 tabch last2 tabch count tabch totchars }' \ | sort "-t$tabch" $finalsortopts $SORTTMP \ | if [ "$cleanout" = "yes" ] then case "$tabch" in /) sedsub=/ sedsep=';' ;; .|\*|\[|\^|\$|\\) sedsub="\\$tabch" sedsep=/ ;; *) sedsub="$tabch" sedsep=/ ;; esac exec sed -e "s$sedsep$sedsub$sedsep ${sedsep}g" \ -e 's/ /+/' -e 's/^/-/' \ -e 's/^-+/+/' -e 's/+ / /' else exec cat fi /bin/rm -f ${TMP}?