Commits

Miki Tebeka committed 4d25fb3

flattening

Comments (0)

Files changed (81)

libstemmer_c/MANIFEST

-README
-src_c/stem_ISO_8859_1_danish.c
-src_c/stem_ISO_8859_1_danish.h
-src_c/stem_ISO_8859_1_dutch.c
-src_c/stem_ISO_8859_1_dutch.h
-src_c/stem_ISO_8859_1_english.c
-src_c/stem_ISO_8859_1_english.h
-src_c/stem_ISO_8859_1_finnish.c
-src_c/stem_ISO_8859_1_finnish.h
-src_c/stem_ISO_8859_1_french.c
-src_c/stem_ISO_8859_1_french.h
-src_c/stem_ISO_8859_1_german.c
-src_c/stem_ISO_8859_1_german.h
-src_c/stem_ISO_8859_1_hungarian.c
-src_c/stem_ISO_8859_1_hungarian.h
-src_c/stem_ISO_8859_1_italian.c
-src_c/stem_ISO_8859_1_italian.h
-src_c/stem_ISO_8859_1_norwegian.c
-src_c/stem_ISO_8859_1_norwegian.h
-src_c/stem_ISO_8859_1_porter.c
-src_c/stem_ISO_8859_1_porter.h
-src_c/stem_ISO_8859_1_portuguese.c
-src_c/stem_ISO_8859_1_portuguese.h
-src_c/stem_ISO_8859_1_spanish.c
-src_c/stem_ISO_8859_1_spanish.h
-src_c/stem_ISO_8859_1_swedish.c
-src_c/stem_ISO_8859_1_swedish.h
-src_c/stem_ISO_8859_2_romanian.c
-src_c/stem_ISO_8859_2_romanian.h
-src_c/stem_KOI8_R_russian.c
-src_c/stem_KOI8_R_russian.h
-src_c/stem_UTF_8_danish.c
-src_c/stem_UTF_8_danish.h
-src_c/stem_UTF_8_dutch.c
-src_c/stem_UTF_8_dutch.h
-src_c/stem_UTF_8_english.c
-src_c/stem_UTF_8_english.h
-src_c/stem_UTF_8_finnish.c
-src_c/stem_UTF_8_finnish.h
-src_c/stem_UTF_8_french.c
-src_c/stem_UTF_8_french.h
-src_c/stem_UTF_8_german.c
-src_c/stem_UTF_8_german.h
-src_c/stem_UTF_8_hungarian.c
-src_c/stem_UTF_8_hungarian.h
-src_c/stem_UTF_8_italian.c
-src_c/stem_UTF_8_italian.h
-src_c/stem_UTF_8_norwegian.c
-src_c/stem_UTF_8_norwegian.h
-src_c/stem_UTF_8_porter.c
-src_c/stem_UTF_8_porter.h
-src_c/stem_UTF_8_portuguese.c
-src_c/stem_UTF_8_portuguese.h
-src_c/stem_UTF_8_romanian.c
-src_c/stem_UTF_8_romanian.h
-src_c/stem_UTF_8_russian.c
-src_c/stem_UTF_8_russian.h
-src_c/stem_UTF_8_spanish.c
-src_c/stem_UTF_8_spanish.h
-src_c/stem_UTF_8_swedish.c
-src_c/stem_UTF_8_swedish.h
-src_c/stem_UTF_8_turkish.c
-src_c/stem_UTF_8_turkish.h
-runtime/api.c
-runtime/api.h
-runtime/header.h
-runtime/utilities.c
-libstemmer/libstemmer.c
-libstemmer/libstemmer_utf8.c
-libstemmer/modules.h
-libstemmer/modules_utf8.h
-include/libstemmer.h

libstemmer_c/Makefile

-include mkinc.mak
-CFLAGS=-Iinclude
-all: libstemmer.o stemwords
-libstemmer.o: $(snowball_sources:.c=.o)
-	$(AR) -cru $@ $^
-stemwords: examples/stemwords.o libstemmer.o
-	$(CC) -o $@ $^
-clean:
-	rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o

libstemmer_c/README

-libstemmer_c
-============
-
-This document pertains to the C version of the libstemmer distribution,
-available for download from:
-
-http://snowball.tartarus.org/dist/libstemmer_c.tgz
-
-
-Compiling the library
-=====================
-
-A simple makefile is provided for Unix style systems.  On such systems, it
-should be possible simply to run "make", and the file "libstemmer.o"
-and the example program "stemwords" will be generated.
-
-If this doesn't work on your system, you need to write your own build
-system (or call the compiler directly).  The files to compile are
-all contained in the "libstemmer", "runtime" and "src_c" directories,
-and the public header file is contained in the "include" directory.
-
-The library comes in two flavours: UTF-8 only, and UTF-8 plus other character
-sets.  To use the UTF-8 only flavour, compile "libstemmer_utf8.c" instead of
-"libstemmer.c".
-
-For convenience "mkinc.mak" is a makefile fragment listing the source files and
-header files used to compile the standard version of the library.
-"mkinc_utf8.mak" is a comparable makefile fragment listing just the source
-files for the UTF-8 only version of the library.
-
-
-Using the library
-=================
-
-The library provides a simple C API.  Essentially, a new stemmer can
-be obtained by using "sb_stemmer_new".  "sb_stemmer_stem" is then
-used to stem a word, "sb_stemmer_length" returns the stemmed
-length of the last word processed, and "sb_stemmer_delete" is
-used to delete a stemmer.
-
-Creating a stemmer is a relatively expensive operation - the expected
-usage pattern is that a new stemmer is created when needed, used
-to stem many words, and deleted after some time.
-
-Stemmers are re-entrant, but not threadsafe.  In other words, if
-you wish to access the same stemmer object from multiple threads,
-you must ensure that all access is protected by a mutex or similar
-device.
-
-libstemmer does not currently incorporate any mechanism for caching the results
-of stemming operations.  Such caching can greatly increase the performance of a
-stemmer under certain situations, so suitable patches will be considered for
-inclusion.
-
-The standard libstemmer sources contain an algorithm for each of the supported
-languages.  The algorithm may be selected using the English name of the
-language, or using the 2 or 3 letter ISO 639 language codes.  In addition,
-the traditional "Porter" stemming algorithm for English is included for
-backwards compatibility purposes, but we recommend use of the "English"
-stemmer in preference for new projects.
-
-(Some minor algorithms which are included only as curiosities in the snowball
-website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
-included in the standard libstemmer sources.  These are not really supported by
-the snowball project, but it would be possible to compile a modified libstemmer
-library containing these if desired.)
-
-
-The stemwords example
-=====================
-
-The stemwords example program allows you to run any of the stemmers
-compiled into the libstemmer library on a sample vocabulary.  For
-details on how to use it, run it with the "-h" command line option.
-
-
-Using the library in a larger system
-====================================
-
-If you are incorporating the library into the build system of a larger
-program, I recommend copying the unpacked tarball without modification into
-a subdirectory of the sources of your program.  Future versions of the
-library are intended to keep the same structure, so this will keep the
-work required to move to a new version of the library to a minimum.
-
-As an additional convenience, the list of source and header files used
-in the library is detailed in mkinc.mak - a file which is in a suitable
-format for inclusion by a Makefile.  By including this file in your build
-system, you can link the snowball system into your program with a few
-extra rules.
-
-Using the library in a system using GNU autotools
-=================================================
-
-The libstemmer_c library can be integrated into a larger system which uses the
-GNU autotool framework (and in particular, automake and autoconf) as follows:
-
-1) Unpack libstemmer_c.tgz in the top level project directory so that there is
-   a libstemmer_c subdirectory of the top level directory of the project.
-
-2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
-   
-noinst_LTLIBRARIES = libstemmer.la
-include $(srcdir)/mkinc.mak
-noinst_HEADERS = $(snowball_headers)
-libstemmer_la_SOURCES = $(snowball_sources) 
-
-(You may also need to add other lines to this, for example, if you are using
-compiler options which are not compatible with compiling the libstemmer
-library.)
-
-3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
-   configure.ac file.
-
-4) Add to the top level makefile the following lines (or modify existing
-   assignments to these variables appropriately):
-
-AUTOMAKE_OPTIONS = subdir-objects
-AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
-SUBDIRS=libstemmer_c
-<name>_LIBADD = libstemmer_c/libstemmer.la
-
-(Where <name> is the name of the library or executable which links against
-libstemmer.) 
-

libstemmer_c/examples/stemwords.c

-/* This is a simple program which uses libstemmer to provide a command
- * line interface for stemming using any of the algorithms provided.
- */
-
-#include <stdio.h>
-#include <stdlib.h> /* for malloc, free */
-#include <string.h> /* for memmove */
-#include <ctype.h>  /* for isupper, tolower */
-
-#include "libstemmer.h"
-
-const char * progname;
-static int pretty = 1;
-
-/** Stem each line of f_in with `stemmer` and write the result to f_out.
- *
- *  Every input line is treated as a single word; characters for which
- *  isupper() holds are lower-cased before stemming.  The file-scope
- *  `pretty` flag selects the layout: 1 writes "word -> stem", 2 writes the
- *  word, padding, and then the stem; any other value writes the stem alone.
- *
- *  Returns silently if the line buffer cannot be allocated or grown; exits
- *  with status 1 if sb_stemmer_stem() returns NULL.
- */
-static void
-stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
-{
-#define INC 10
-    int lim = INC;
-    sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
-
-    /* Without a buffer nothing can be stored; give up the same (silent)
-     * way the realloc failure path below does. */
-    if (b == 0) return;
-
-    while(1) {
-        int ch = getc(f_in);
-        if (ch == EOF) {
-            free(b); return;
-        }
-        {
-            int i = 0;      /* number of bytes stored in b for this line */
-            int inlen = 0;  /* number of characters, used for column padding */
-            while(1) {
-                if (ch == '\n' || ch == EOF) break;
-                if (i == lim) {
-                    /* Grow through a temporary so b stays valid (and can be
-                     * freed) if realloc fails. */
-                    sb_symbol * newb;
-                    newb = (sb_symbol *)
-                            realloc(b, (lim + INC) * sizeof(sb_symbol));
-                    if (newb == 0) goto error;
-                    b = newb;
-                    lim = lim + INC;
-                }
-                /* Update count of utf-8 characters: bytes in 0x80..0xBF
-                 * are not counted. */
-                if (ch < 0x80 || ch > 0xBF) inlen += 1;
-                /* force lower case: */
-                if (isupper(ch)) ch = tolower(ch);
-
-                b[i] = ch;
-                i++;
-                ch = getc(f_in);
-            }
-
-            {
-                const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
-                if (stemmed == NULL)
-                {
-                    fprintf(stderr, "Out of memory");
-                    exit(1);
-                }
-                else
-                {
-                    if (pretty == 1) {
-                        fwrite(b, i, 1, f_out);
-                        fputs(" -> ", f_out);
-                    } else if (pretty == 2) {
-                        fwrite(b, i, 1, f_out);
-                        /* Pad only when there is a stem to print: short
-                         * words are padded out to 30 characters, longer
-                         * ones push the stem onto the next line, indented
-                         * by 30 spaces. */
-                        if (sb_stemmer_length(stemmer) > 0) {
-                            int j;
-                            if (inlen < 30) {
-                                for (j = 30 - inlen; j > 0; j--)
-                                    fputs(" ", f_out);
-                            } else {
-                                fputs("\n", f_out);
-                                for (j = 30; j > 0; j--)
-                                    fputs(" ", f_out);
-                            }
-                        }
-                    }
-
-                    fputs((char *)stemmed, f_out);
-                    putc('\n', f_out);
-                }
-            }
-        }
-    }
-error:
-    /* free() accepts a null pointer, so no guard is needed. */
-    free(b);
-    return;
-}
-
-/** Display the command line syntax, and then exit.
- *
- *  The text is written to stdout with the program name taken from the
- *  file-scope `progname`.
- *
- *  @param n The value to exit with.
- */
-static void
-usage(int n)
-{
-    printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
-          "\n"
-          "The input file consists of a list of words to be stemmed, one per\n"
-          "line. Words should be in lower case, but (for English) A-Z letters\n"
-          "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
-          "used.\n"
-          "\n"
-          "If -c is given, the argument is the character encoding of the input\n"
-          "and output files.  If it is omitted, the UTF-8 encoding is used.\n"
-          "\n"
-          "If -p is given the output file consists of each word of the input\n"
-          "file followed by \"->\" followed by its stemmed equivalent.\n"
-          "If -p2 is given the output file is a two column layout containing\n"
-          "the input words in the first column and the stemmed equivalents in\n"
-          "the second column.\n"
-          "Otherwise, the output file consists of the stemmed words, one per\n"
-          "line.\n"
-          "\n"
-          "-h displays this help\n",
-          progname);
-    exit(n);
-}
-
-/** Entry point: parse the command line, open the files and stem the input.
- *
- *  Recognised options are -l <language>, -i <input file>, -o <output file>,
- *  -c <character encoding>, -p, -p2 and -h.  Input defaults to stdin,
- *  output to stdout, and the language to "english".
- *
- *  @return 0 on success.  Every failure path prints a message to stderr and
- *  exits with status 1.
- */
-int
-main(int argc, char * argv[])
-{
-    char * in = 0;
-    char * out = 0;
-    FILE * f_in;
-    FILE * f_out;
-    struct sb_stemmer * stemmer;
-
-    char * language = "english";
-    char * charenc = NULL;
-
-    char * s;
-    int i = 1;
-    pretty = 0;
-
-    progname = argv[0];
-
-    while(i < argc) {
-        s = argv[i++];
-        if (s[0] == '-') {
-            if (strcmp(s, "-o") == 0) {
-                if (i >= argc) {
-                    fprintf(stderr, "%s requires an argument\n", s);
-                    exit(1);
-                }
-                out = argv[i++];
-            } else if (strcmp(s, "-i") == 0) {
-                if (i >= argc) {
-                    fprintf(stderr, "%s requires an argument\n", s);
-                    exit(1);
-                }
-                in = argv[i++];
-            } else if (strcmp(s, "-l") == 0) {
-                if (i >= argc) {
-                    fprintf(stderr, "%s requires an argument\n", s);
-                    exit(1);
-                }
-                language = argv[i++];
-            } else if (strcmp(s, "-c") == 0) {
-                if (i >= argc) {
-                    fprintf(stderr, "%s requires an argument\n", s);
-                    exit(1);
-                }
-                charenc = argv[i++];
-            } else if (strcmp(s, "-p2") == 0) {
-                pretty = 2;
-            } else if (strcmp(s, "-p") == 0) {
-                pretty = 1;
-            } else if (strcmp(s, "-h") == 0) {
-                usage(0);
-            } else {
-                fprintf(stderr, "option %s unknown\n", s);
-                usage(1);
-            }
-        } else {
-            fprintf(stderr, "unexpected parameter %s\n", s);
-            usage(1);
-        }
-    }
-
-    /* prepare the files */
-    f_in = (in == 0) ? stdin : fopen(in, "r");
-    if (f_in == 0) {
-        fprintf(stderr, "file %s not found\n", in);
-        exit(1);
-    }
-    f_out = (out == 0) ? stdout : fopen(out, "w");
-    if (f_out == 0) {
-        fprintf(stderr, "file %s cannot be opened\n", out);
-        exit(1);
-    }
-
-    /* do the stemming process: */
-    stemmer = sb_stemmer_new(language, charenc);
-    if (stemmer == 0) {
-        if (charenc == NULL) {
-            fprintf(stderr, "language `%s' not available for stemming\n", language);
-            exit(1);
-        } else {
-            fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
-            exit(1);
-        }
-    }
-    stem_file(stemmer, f_in, f_out);
-    sb_stemmer_delete(stemmer);
-
-    /* Only close streams this program opened itself. */
-    if (in != 0) (void) fclose(f_in);
-    /* fclose() flushes buffered output, so a failure here means the
-     * results were not completely written; report it instead of
-     * exiting with status 0. */
-    if (out != 0 && fclose(f_out) != 0) {
-        fprintf(stderr, "error writing file %s\n", out);
-        exit(1);
-    }
-
-    return 0;
-}
-

libstemmer_c/include/libstemmer.h

-
-/* Make header file work when included from C++ */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct sb_stemmer;
-typedef unsigned char sb_symbol;
-
-/* FIXME - should be able to get a version number for each stemming
- * algorithm (which will be incremented each time the output changes). */
-
-/** Returns an array of the names of the available stemming algorithms.
- *  Note that these are the canonical names - aliases (ie, other names for
- *  the same algorithm) will not be included in the list.
- *  The list is terminated with a null pointer.
- *
- *  The list must not be modified in any way.
- */
-const char ** sb_stemmer_list(void);
-
-/** Create a new stemmer object, using the specified algorithm, for the
- *  specified character encoding.
- *
- *  All algorithms will usually be available in UTF-8, but may also be
- *  available in other character encodings.
- *
- *  @param algorithm The algorithm name.  This is either the English
- *  name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
- *  language.  Note that case is significant in this parameter - the
- *  value should be supplied in lower case.
- *
- *  @param charenc The character encoding.  NULL may be passed as
- *  this value, in which case UTF-8 encoding will be assumed. Otherwise,
- *  the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
- *  "ISO_8859_2" (ie, Latin 2) or "KOI8_R" (Russian).  Note that
- *  case is significant in this parameter.
- *
- *  @return NULL if the specified algorithm is not recognised, or the
- *  algorithm is not available for the requested encoding.  Otherwise,
- *  returns a pointer to a newly created stemmer for the requested algorithm.
- *  The returned pointer must be deleted by calling sb_stemmer_delete().
- *
- *  @note NULL will also be returned if an out of memory error occurs.
- */
-struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
-
-/** Delete a stemmer object.
- *
- *  This frees all resources allocated for the stemmer.  After calling
- *  this function, the supplied stemmer may no longer be used in any way.
- *
- *  It is safe to pass a null pointer to this function - this will have
- *  no effect.
- */
-void                sb_stemmer_delete(struct sb_stemmer * stemmer);
-
-/** Stem a word.
- *
- *  The return value is owned by the stemmer - it must not be freed or
- *  modified, and it will become invalid when the stemmer is called again,
- *  or if the stemmer is freed.
- *
- *  The length of the return value can be obtained using sb_stemmer_length().
- *
- *  If an out-of-memory error occurs, this will return NULL.
- */
-const sb_symbol *   sb_stemmer_stem(struct sb_stemmer * stemmer,
-				    const sb_symbol * word, int size);
-
-/** Get the length of the result of the last stemmed word.
- *  This should not be called before sb_stemmer_stem() has been called.
- */
-int                 sb_stemmer_length(struct sb_stemmer * stemmer);
-
-#ifdef __cplusplus
-}
-#endif
-

libstemmer_c/libstemmer/libstemmer.c

-
-#include <stdlib.h>
-#include <string.h>
-#include "../include/libstemmer.h"
-#include "../runtime/api.h"
-#include "modules.h"
-
-struct sb_stemmer {   /* One stemmer instance: the chosen module's callbacks plus its environment. */
-    struct SN_env * (*create)(void);   /* called once by sb_stemmer_new() to build env */
-    void (*close)(struct SN_env *);    /* called with env by sb_stemmer_delete() */
-    int (*stem)(struct SN_env *);      /* called by sb_stemmer_stem(); a negative result is treated as failure */
-
-    struct SN_env * env;               /* value returned by create() */
-};
-
-extern const char **
-sb_stemmer_list(void)
-{   /* Returns the algorithm_names table as-is; nothing is copied. */
-    return algorithm_names;   /* not defined in this file - presumably supplied by the included modules header */
-}
-
-/* Translate an encoding name into its stemmer_encoding_t value.
- *
- * A null name selects ENC_UTF_8.  Otherwise the encodings[] table is
- * scanned up to its null-name terminator for an exact strcmp() match;
- * ENC_UNKNOWN is returned when no entry matches.
- */
-static stemmer_encoding_t
-sb_getenc(const char * charenc)
-{
-    struct stemmer_encoding * entry = encodings;
-
-    if (charenc == NULL) return ENC_UTF_8;
-
-    while (entry->name != 0) {
-        if (strcmp(entry->name, charenc) == 0) return entry->enc;
-        entry++;
-    }
-    return ENC_UNKNOWN;
-}
-
-/* Create a stemmer for the named algorithm in the given encoding.
- *
- * `algorithm` is matched exactly (strcmp) against the names in modules[];
- * `charenc` is resolved by sb_getenc(), so NULL selects UTF-8.
- *
- * Returns NULL when `algorithm` is NULL, when the encoding or the
- * algorithm/encoding pair is not found, or when allocating the stemmer or
- * its environment fails.  A non-NULL result must be released with
- * sb_stemmer_delete().
- */
-extern struct sb_stemmer *
-sb_stemmer_new(const char * algorithm, const char * charenc)
-{
-    stemmer_encoding_t enc;
-    struct stemmer_modules * module;
-    struct sb_stemmer * stemmer;
-
-    /* strcmp() on a null pointer is undefined; treat it as "not recognised". */
-    if (algorithm == NULL) return NULL;
-
-    enc = sb_getenc(charenc);
-    if (enc == ENC_UNKNOWN) return NULL;
-
-    /* modules[] is terminated by an entry whose name is null. */
-    for (module = modules; module->name != 0; module++) {
-        if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
-    }
-    if (module->name == NULL) return NULL;
-
-    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
-    if (stemmer == NULL) return NULL;
-
-    stemmer->create = module->create;
-    stemmer->close = module->close;
-    stemmer->stem = module->stem;
-
-    stemmer->env = stemmer->create();
-    if (stemmer->env == NULL)
-    {
-        /* Release the half-built stemmer through the normal delete path. */
-        sb_stemmer_delete(stemmer);
-        return NULL;
-    }
-
-    return stemmer;
-}
-
-/* Release a stemmer created by sb_stemmer_new().
- *
- * A null pointer is accepted and ignored.  The environment is handed to
- * the module's close callback when one is set, and the stemmer structure
- * itself is always freed - previously a stemmer without a close callback
- * was returned from early and leaked.
- */
-void
-sb_stemmer_delete(struct sb_stemmer * stemmer)
-{
-    if (stemmer == 0) return;
-    if (stemmer->close != 0) {
-        stemmer->close(stemmer->env);
-        stemmer->close = 0;
-    }
-    free(stemmer);
-}
-
-/* Stem `word` (`size` symbols long) and return the result buffer.
- *
- * The word is loaded into the stemmer's environment with SN_set_current();
- * if that reports failure the environment length is reset to 0 and NULL is
- * returned.  NULL is also returned when the stem callback yields a negative
- * value.  On success the buffer is zero-terminated at its current length
- * and returned; it belongs to the environment, not the caller.
- */
-const sb_symbol *
-sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
-{
-    struct SN_env * env = stemmer->env;
-
-    if (SN_set_current(env, size, (const symbol *)(word)) != 0) {
-        env->l = 0;
-        return NULL;
-    }
-    if (stemmer->stem(env) < 0) return NULL;
-
-    env->p[env->l] = 0;
-    return (const sb_symbol *)(env->p);
-}
-
-int
-sb_stemmer_length(struct sb_stemmer * stemmer)
-{   /* Returns the environment's current length field; stemmer is dereferenced without a null check. */
-    return stemmer->env->l;   /* same index at which sb_stemmer_stem() zero-terminates its result */
-}

libstemmer_c/libstemmer/libstemmer_c.in

-
-#include <stdlib.h>
-#include <string.h>
-#include "../include/libstemmer.h"
-#include "../runtime/api.h"
-#include "@MODULES_H@"
-
-struct sb_stemmer {   /* One stemmer instance: the chosen module's callbacks plus its environment. */
-    struct SN_env * (*create)(void);   /* called once by sb_stemmer_new() to build env */
-    void (*close)(struct SN_env *);    /* called with env by sb_stemmer_delete() */
-    int (*stem)(struct SN_env *);      /* called by sb_stemmer_stem(); a negative result is treated as failure */
-
-    struct SN_env * env;               /* value returned by create() */
-};
-
-extern const char **
-sb_stemmer_list(void)
-{   /* Returns the algorithm_names table as-is; nothing is copied. */
-    return algorithm_names;   /* not defined in this file - presumably supplied by the included modules header */
-}
-
-/* Translate an encoding name into its stemmer_encoding_t value.
- *
- * A null name selects ENC_UTF_8.  Otherwise the encodings[] table is
- * scanned up to its null-name terminator for an exact strcmp() match;
- * ENC_UNKNOWN is returned when no entry matches.
- */
-static stemmer_encoding_t
-sb_getenc(const char * charenc)
-{
-    struct stemmer_encoding * entry = encodings;
-
-    if (charenc == NULL) return ENC_UTF_8;
-
-    while (entry->name != 0) {
-        if (strcmp(entry->name, charenc) == 0) return entry->enc;
-        entry++;
-    }
-    return ENC_UNKNOWN;
-}
-
-/* Create a stemmer for the named algorithm in the given encoding.
- *
- * `algorithm` is matched exactly (strcmp) against the names in modules[];
- * `charenc` is resolved by sb_getenc(), so NULL selects UTF-8.
- *
- * Returns NULL when `algorithm` is NULL, when the encoding or the
- * algorithm/encoding pair is not found, or when allocating the stemmer or
- * its environment fails.  A non-NULL result must be released with
- * sb_stemmer_delete().
- */
-extern struct sb_stemmer *
-sb_stemmer_new(const char * algorithm, const char * charenc)
-{
-    stemmer_encoding_t enc;
-    struct stemmer_modules * module;
-    struct sb_stemmer * stemmer;
-
-    /* strcmp() on a null pointer is undefined; treat it as "not recognised". */
-    if (algorithm == NULL) return NULL;
-
-    enc = sb_getenc(charenc);
-    if (enc == ENC_UNKNOWN) return NULL;
-
-    /* modules[] is terminated by an entry whose name is null. */
-    for (module = modules; module->name != 0; module++) {
-        if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
-    }
-    if (module->name == NULL) return NULL;
-
-    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
-    if (stemmer == NULL) return NULL;
-
-    stemmer->create = module->create;
-    stemmer->close = module->close;
-    stemmer->stem = module->stem;
-
-    stemmer->env = stemmer->create();
-    if (stemmer->env == NULL)
-    {
-        /* Release the half-built stemmer through the normal delete path. */
-        sb_stemmer_delete(stemmer);
-        return NULL;
-    }
-
-    return stemmer;
-}
-
-/* Release a stemmer created by sb_stemmer_new().
- *
- * A null pointer is accepted and ignored.  The environment is handed to
- * the module's close callback when one is set, and the stemmer structure
- * itself is always freed - previously a stemmer without a close callback
- * was returned from early and leaked.
- */
-void
-sb_stemmer_delete(struct sb_stemmer * stemmer)
-{
-    if (stemmer == 0) return;
-    if (stemmer->close != 0) {
-        stemmer->close(stemmer->env);
-        stemmer->close = 0;
-    }
-    free(stemmer);
-}
-
-/* Stem `word` (`size` symbols long) and return the result buffer.
- *
- * The word is loaded into the stemmer's environment with SN_set_current();
- * if that reports failure the environment length is reset to 0 and NULL is
- * returned.  NULL is also returned when the stem callback yields a negative
- * value.  On success the buffer is zero-terminated at its current length
- * and returned; it belongs to the environment, not the caller.
- */
-const sb_symbol *
-sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
-{
-    struct SN_env * env = stemmer->env;
-
-    if (SN_set_current(env, size, (const symbol *)(word)) != 0) {
-        env->l = 0;
-        return NULL;
-    }
-    if (stemmer->stem(env) < 0) return NULL;
-
-    env->p[env->l] = 0;
-    return (const sb_symbol *)(env->p);
-}
-
-int
-sb_stemmer_length(struct sb_stemmer * stemmer)
-{   /* Returns the environment's current length field; stemmer is dereferenced without a null check. */
-    return stemmer->env->l;   /* same index at which sb_stemmer_stem() zero-terminates its result */
-}

libstemmer_c/libstemmer/libstemmer_utf8.c

-
-#include <stdlib.h>
-#include <string.h>
-#include "../include/libstemmer.h"
-#include "../runtime/api.h"
-#include "modules_utf8.h"
-
-struct sb_stemmer {   /* One stemmer instance: the chosen module's callbacks plus its environment. */
-    struct SN_env * (*create)(void);   /* called once by sb_stemmer_new() to build env */
-    void (*close)(struct SN_env *);    /* called with env by sb_stemmer_delete() */
-    int (*stem)(struct SN_env *);      /* called by sb_stemmer_stem(); a negative result is treated as failure */
-
-    struct SN_env * env;               /* value returned by create() */
-};
-
-extern const char **
-sb_stemmer_list(void)
-{   /* Returns the algorithm_names table as-is; nothing is copied. */
-    return algorithm_names;   /* not defined in this file - presumably supplied by the included modules header */
-}
-
-/* Translate an encoding name into its stemmer_encoding_t value.
- *
- * A null name selects ENC_UTF_8.  Otherwise the encodings[] table is
- * scanned up to its null-name terminator for an exact strcmp() match;
- * ENC_UNKNOWN is returned when no entry matches.
- */
-static stemmer_encoding_t
-sb_getenc(const char * charenc)
-{
-    struct stemmer_encoding * entry = encodings;
-
-    if (charenc == NULL) return ENC_UTF_8;
-
-    while (entry->name != 0) {
-        if (strcmp(entry->name, charenc) == 0) return entry->enc;
-        entry++;
-    }
-    return ENC_UNKNOWN;
-}
-
-/* Create a stemmer for the named algorithm in the given encoding.
- *
- * `algorithm` is matched exactly (strcmp) against the names in modules[];
- * `charenc` is resolved by sb_getenc(), so NULL selects UTF-8.
- *
- * Returns NULL when `algorithm` is NULL, when the encoding or the
- * algorithm/encoding pair is not found, or when allocating the stemmer or
- * its environment fails.  A non-NULL result must be released with
- * sb_stemmer_delete().
- */
-extern struct sb_stemmer *
-sb_stemmer_new(const char * algorithm, const char * charenc)
-{
-    stemmer_encoding_t enc;
-    struct stemmer_modules * module;
-    struct sb_stemmer * stemmer;
-
-    /* strcmp() on a null pointer is undefined; treat it as "not recognised". */
-    if (algorithm == NULL) return NULL;
-
-    enc = sb_getenc(charenc);
-    if (enc == ENC_UNKNOWN) return NULL;
-
-    /* modules[] is terminated by an entry whose name is null. */
-    for (module = modules; module->name != 0; module++) {
-        if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
-    }
-    if (module->name == NULL) return NULL;
-
-    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
-    if (stemmer == NULL) return NULL;
-
-    stemmer->create = module->create;
-    stemmer->close = module->close;
-    stemmer->stem = module->stem;
-
-    stemmer->env = stemmer->create();
-    if (stemmer->env == NULL)
-    {
-        /* Release the half-built stemmer through the normal delete path. */
-        sb_stemmer_delete(stemmer);
-        return NULL;
-    }
-
-    return stemmer;
-}
-
-/* Release a stemmer created by sb_stemmer_new().
- *
- * A null pointer is accepted and ignored.  The environment is handed to
- * the module's close callback when one is set, and the stemmer structure
- * itself is always freed - previously a stemmer without a close callback
- * was returned from early and leaked.
- */
-void
-sb_stemmer_delete(struct sb_stemmer * stemmer)
-{
-    if (stemmer == 0) return;
-    if (stemmer->close != 0) {
-        stemmer->close(stemmer->env);
-        stemmer->close = 0;
-    }
-    free(stemmer);
-}
-
-/* Stem `word` (`size` symbols long) and return the result buffer.
- *
- * The word is loaded into the stemmer's environment with SN_set_current();
- * if that reports failure the environment length is reset to 0 and NULL is
- * returned.  NULL is also returned when the stem callback yields a negative
- * value.  On success the buffer is zero-terminated at its current length
- * and returned; it belongs to the environment, not the caller.
- */
-const sb_symbol *
-sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
-{
-    struct SN_env * env = stemmer->env;
-
-    if (SN_set_current(env, size, (const symbol *)(word)) != 0) {
-        env->l = 0;
-        return NULL;
-    }
-    if (stemmer->stem(env) < 0) return NULL;
-
-    env->p[env->l] = 0;
-    return (const sb_symbol *)(env->p);
-}
-
-int
-sb_stemmer_length(struct sb_stemmer * stemmer)
-{   /* Returns the environment's current length field; stemmer is dereferenced without a null check. */
-    return stemmer->env->l;   /* same index at which sb_stemmer_stem() zero-terminates its result */
-}

libstemmer_c/libstemmer/modules.h

-/* libstemmer/modules.h: List of stemming modules.
- *
- * This file is generated by mkmodules.pl from a list of module names.
- * Do not edit manually.
- *
- * Modules included by this file are: danish, dutch, english, finnish, french,
- * german, hungarian, italian, norwegian, porter, portuguese, romanian,
- * russian, spanish, swedish, turkish
- */
-
-#include "../src_c/stem_ISO_8859_1_danish.h"
-#include "../src_c/stem_UTF_8_danish.h"
-#include "../src_c/stem_ISO_8859_1_dutch.h"
-#include "../src_c/stem_UTF_8_dutch.h"
-#include "../src_c/stem_ISO_8859_1_english.h"
-#include "../src_c/stem_UTF_8_english.h"
-#include "../src_c/stem_ISO_8859_1_finnish.h"
-#include "../src_c/stem_UTF_8_finnish.h"
-#include "../src_c/stem_ISO_8859_1_french.h"
-#include "../src_c/stem_UTF_8_french.h"
-#include "../src_c/stem_ISO_8859_1_german.h"
-#include "../src_c/stem_UTF_8_german.h"
-#include "../src_c/stem_ISO_8859_1_hungarian.h"
-#include "../src_c/stem_UTF_8_hungarian.h"
-#include "../src_c/stem_ISO_8859_1_italian.h"
-#include "../src_c/stem_UTF_8_italian.h"
-#include "../src_c/stem_ISO_8859_1_norwegian.h"
-#include "../src_c/stem_UTF_8_norwegian.h"
-#include "../src_c/stem_ISO_8859_1_porter.h"
-#include "../src_c/stem_UTF_8_porter.h"
-#include "../src_c/stem_ISO_8859_1_portuguese.h"
-#include "../src_c/stem_UTF_8_portuguese.h"
-#include "../src_c/stem_ISO_8859_2_romanian.h"
-#include "../src_c/stem_UTF_8_romanian.h"
-#include "../src_c/stem_KOI8_R_russian.h"
-#include "../src_c/stem_UTF_8_russian.h"
-#include "../src_c/stem_ISO_8859_1_spanish.h"
-#include "../src_c/stem_UTF_8_spanish.h"
-#include "../src_c/stem_ISO_8859_1_swedish.h"
-#include "../src_c/stem_UTF_8_swedish.h"
-#include "../src_c/stem_UTF_8_turkish.h"
-
-typedef enum {   /* Identifiers for the character encodings a module can be built for. */
-  ENC_UNKNOWN=0,   /* "no such encoding"; also the value stored in the encodings[] terminator */
-  ENC_ISO_8859_1,
-  ENC_ISO_8859_2,
-  ENC_KOI8_R,
-  ENC_UTF_8
-} stemmer_encoding_t;
-
-struct stemmer_encoding {   /* One row of the encoding-name lookup table. */
-  const char * name;        /* encoding name as a string, e.g. "UTF_8"; 0 in the terminating row */
-  stemmer_encoding_t enc;   /* enum value that the name maps to */
-};
-static struct stemmer_encoding encodings[] = {   /* encoding name -> enum value */
-  {"ISO_8859_1", ENC_ISO_8859_1},
-  {"ISO_8859_2", ENC_ISO_8859_2},
-  {"KOI8_R", ENC_KOI8_R},
-  {"UTF_8", ENC_UTF_8},
-  {0,ENC_UNKNOWN}   /* terminator: a null name marks the end of the table */
-};
-
-struct stemmer_modules {   /* One row of the module table: a (name, encoding) pair and its entry points. */
-  const char * name;                 /* algorithm name or language code, e.g. "danish", "da", "dan" */
-  stemmer_encoding_t enc; 
-  struct SN_env * (*create)(void);   /* builds the module's environment */
-  void (*close)(struct SN_env *);    /* releases an environment built by create */
-  int (*stem)(struct SN_env *);      /* runs the stemming algorithm on an environment */
-};
-static struct stemmer_modules modules[] = {
-  {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
-  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
-  {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
-  {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
-  {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
-  {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
-  {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
-  {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
-  {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
-  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
-  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
-  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
-  {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
-  {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
-  {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
-  {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
-  {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
-  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
-  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
-  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
-  {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
-  {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
-  {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
-  {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
-  {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
-  {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
-  {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
-  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
-  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
-  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
-  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
-  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
-  {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
-  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
-  {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
-  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
-  {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
-  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
-  {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
-  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
-  {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
-  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
-  {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
-  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
-  {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
-  {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
-  {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
-  {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
-  {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
-  {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
-  {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
-  {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
-  {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
-  {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
-  {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
-  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
-  {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
-  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
-  {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
-  {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
-  {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
-  {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
-  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
-  {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
-  {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
-  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
-  {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
-  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
-  {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
-  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
-  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
-  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
-  {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
-  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
-  {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
-  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
-  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
-  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
-  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
-  {0,ENC_UNKNOWN,0,0,0}
-};
-static const char * algorithm_names[] = {
-  "danish", 
-  "dutch", 
-  "english", 
-  "finnish", 
-  "french", 
-  "german", 
-  "hungarian", 
-  "italian", 
-  "norwegian", 
-  "porter", 
-  "portuguese", 
-  "romanian", 
-  "russian", 
-  "spanish", 
-  "swedish", 
-  "turkish", 
-  0
-};

libstemmer_c/libstemmer/modules.txt

-# This file contains a list of stemmers to include in the distribution.
-# The format is a set of space separated lines - on each line:
-#  First item is name of stemmer.
-#  Second item is comma separated list of character sets.
-#  Third item is comma separated list of names to refer to the stemmer by.
-#
-# Lines starting with a #, or blank lines, are ignored.
-
-# List all the main algorithms for each language, in UTF-8, and also with
-# the most commonly used encoding.
-
-danish          UTF_8,ISO_8859_1        danish,da,dan
-dutch           UTF_8,ISO_8859_1        dutch,nl,dut,nld
-english         UTF_8,ISO_8859_1        english,en,eng
-finnish         UTF_8,ISO_8859_1        finnish,fi,fin
-french          UTF_8,ISO_8859_1        french,fr,fre,fra
-german          UTF_8,ISO_8859_1        german,de,ger,deu
-hungarian       UTF_8,ISO_8859_1        hungarian,hu,hun
-italian         UTF_8,ISO_8859_1        italian,it,ita
-norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
-portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
-romanian        UTF_8,ISO_8859_2        romanian,ro,rum,ron
-russian         UTF_8,KOI8_R            russian,ru,rus
-spanish         UTF_8,ISO_8859_1        spanish,es,esl,spa
-swedish         UTF_8,ISO_8859_1        swedish,sv,swe
-turkish         UTF_8                   turkish,tr,tur
-
-# Also include the traditional porter algorithm for english.
-# The porter algorithm is included in the libstemmer distribution to assist
-# with backwards compatibility, but for new systems the english algorithm
-# should be used in preference.
-porter          UTF_8,ISO_8859_1        porter
-
-# Some other stemmers in the snowball project are not included in the standard
-# distribution. To compile a libstemmer with them in, add them to this list,
-# and regenerate the distribution. (You will need a full source checkout for
-# this.) They are included in the snowball website as curiosities, but are not
-# intended for general use, and use of them is not fully supported.  These
-# algorithms are:
-#
-# german2          - This is a slight modification of the german stemmer.
-#german2          UTF_8,ISO_8859_1        german2
-#
-# kraaij_pohlmann  - This is a different dutch stemmer.
-#kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann
-#
-# lovins           - This is an english stemmer, but fairly outdated, and
-#                    only really applicable to a restricted type of input text
-#                    (keywords in academic publications).
-#lovins           UTF_8,ISO_8859_1        lovins

libstemmer_c/libstemmer/modules_utf8.h

-/* libstemmer/modules_utf8.h: List of stemming modules.
- *
- * This file is generated by mkmodules.pl from a list of module names.
- * Do not edit manually.
- *
- * Modules included by this file are: danish, dutch, english, finnish, french,
- * german, hungarian, italian, norwegian, porter, portuguese, romanian,
- * russian, spanish, swedish, turkish
- */
-
-#include "../src_c/stem_UTF_8_danish.h"
-#include "../src_c/stem_UTF_8_dutch.h"
-#include "../src_c/stem_UTF_8_english.h"
-#include "../src_c/stem_UTF_8_finnish.h"
-#include "../src_c/stem_UTF_8_french.h"
-#include "../src_c/stem_UTF_8_german.h"
-#include "../src_c/stem_UTF_8_hungarian.h"
-#include "../src_c/stem_UTF_8_italian.h"
-#include "../src_c/stem_UTF_8_norwegian.h"
-#include "../src_c/stem_UTF_8_porter.h"
-#include "../src_c/stem_UTF_8_portuguese.h"
-#include "../src_c/stem_UTF_8_romanian.h"
-#include "../src_c/stem_UTF_8_russian.h"
-#include "../src_c/stem_UTF_8_spanish.h"
-#include "../src_c/stem_UTF_8_swedish.h"
-#include "../src_c/stem_UTF_8_turkish.h"
-
-typedef enum {
-  ENC_UNKNOWN=0,
-  ENC_UTF_8
-} stemmer_encoding_t;
-
-struct stemmer_encoding {
-  const char * name;
-  stemmer_encoding_t enc;
-};
-static struct stemmer_encoding encodings[] = {
-  {"UTF_8", ENC_UTF_8},
-  {0,ENC_UNKNOWN}
-};
-
-struct stemmer_modules {
-  const char * name;
-  stemmer_encoding_t enc; 
-  struct SN_env * (*create)(void);
-  void (*close)(struct SN_env *);
-  int (*stem)(struct SN_env *);
-};
-static struct stemmer_modules modules[] = {
-  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
-  {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
-  {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
-  {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
-  {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
-  {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
-  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
-  {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
-  {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
-  {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
-  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
-  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
-  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
-  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
-  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
-  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
-  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
-  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
-  {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
-  {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
-  {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
-  {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
-  {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
-  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
-  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
-  {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
-  {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
-  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
-  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
-  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
-  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
-  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
-  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
-  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
-  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
-  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
-  {0,ENC_UNKNOWN,0,0,0}
-};
-static const char * algorithm_names[] = {
-  "danish", 
-  "dutch", 
-  "english", 
-  "finnish", 
-  "french", 
-  "german", 
-  "hungarian", 
-  "italian", 
-  "norwegian", 
-  "porter", 
-  "portuguese", 
-  "romanian", 
-  "russian", 
-  "spanish", 
-  "swedish", 
-  "turkish", 
-  0
-};

libstemmer_c/libstemmer/modules_utf8.txt

-# This file contains a list of stemmers to include in the distribution.
-# The format is a set of space separated lines - on each line:
-#  First item is name of stemmer.
-#  Second item is comma separated list of character sets.
-#  Third item is comma separated list of names to refer to the stemmer by.
-#
-# Lines starting with a #, or blank lines, are ignored.
-
-# List all the main algorithms for each language, in UTF-8.
-
-danish          UTF_8                   danish,da,dan
-dutch           UTF_8                   dutch,nl,dut,nld
-english         UTF_8                   english,en,eng
-finnish         UTF_8                   finnish,fi,fin
-french          UTF_8                   french,fr,fre,fra
-german          UTF_8                   german,de,ger,deu
-hungarian       UTF_8                   hungarian,hu,hun
-italian         UTF_8                   italian,it,ita
-norwegian       UTF_8                   norwegian,no,nor
-portuguese      UTF_8                   portuguese,pt,por
-romanian        UTF_8                   romanian,ro,rum,ron
-russian         UTF_8                   russian,ru,rus
-spanish         UTF_8                   spanish,es,esl,spa
-swedish         UTF_8                   swedish,sv,swe
-turkish         UTF_8                   turkish,tr,tur
-
-# Also include the traditional porter algorithm for english.
-# The porter algorithm is included in the libstemmer distribution to assist
-# with backwards compatibility, but for new systems the english algorithm
-# should be used in preference.
-porter          UTF_8                   porter
-
-# Some other stemmers in the snowball project are not included in the standard
-# distribution. To compile a libstemmer with them in, add them to this list,
-# and regenerate the distribution. (You will need a full source checkout for
-# this.) They are included in the snowball website as curiosities, but are not
-# intended for general use, and use of them is not fully supported.  These
-# algorithms are:
-#
-# german2          - This is a slight modification of the german stemmer.
-#german2          UTF_8                   german2
-#
-# kraaij_pohlmann  - This is a different dutch stemmer.
-#kraaij_pohlmann  UTF_8                   kraaij_pohlmann
-#
-# lovins           - This is an english stemmer, but fairly outdated, and
-#                    only really applicable to a restricted type of input text
-#                    (keywords in academic publications).
-#lovins           UTF_8                   lovins

libstemmer_c/mkinc.mak

-# libstemmer/mkinc.mak: List of stemming module source files
-#
-# This file is generated by mkmodules.pl from a list of module names.
-# Do not edit manually.
-#
-# Modules included by this file are: danish, dutch, english, finnish, french,
-# german, hungarian, italian, norwegian, porter, portuguese, romanian,
-# russian, spanish, swedish, turkish
-
-snowball_sources= \
-  src_c/stem_ISO_8859_1_danish.c \
-  src_c/stem_UTF_8_danish.c \
-  src_c/stem_ISO_8859_1_dutch.c \
-  src_c/stem_UTF_8_dutch.c \
-  src_c/stem_ISO_8859_1_english.c \
-  src_c/stem_UTF_8_english.c \
-  src_c/stem_ISO_8859_1_finnish.c \
-  src_c/stem_UTF_8_finnish.c \
-  src_c/stem_ISO_8859_1_french.c \
-  src_c/stem_UTF_8_french.c \
-  src_c/stem_ISO_8859_1_german.c \
-  src_c/stem_UTF_8_german.c \
-  src_c/stem_ISO_8859_1_hungarian.c \
-  src_c/stem_UTF_8_hungarian.c \
-  src_c/stem_ISO_8859_1_italian.c \
-  src_c/stem_UTF_8_italian.c \
-  src_c/stem_ISO_8859_1_norwegian.c \
-  src_c/stem_UTF_8_norwegian.c \
-  src_c/stem_ISO_8859_1_porter.c \
-  src_c/stem_UTF_8_porter.c \
-  src_c/stem_ISO_8859_1_portuguese.c \
-  src_c/stem_UTF_8_portuguese.c \
-  src_c/stem_ISO_8859_2_romanian.c \
-  src_c/stem_UTF_8_romanian.c \
-  src_c/stem_KOI8_R_russian.c \
-  src_c/stem_UTF_8_russian.c \
-  src_c/stem_ISO_8859_1_spanish.c \
-  src_c/stem_UTF_8_spanish.c \
-  src_c/stem_ISO_8859_1_swedish.c \
-  src_c/stem_UTF_8_swedish.c \
-  src_c/stem_UTF_8_turkish.c \
-  runtime/api.c \
-  runtime/utilities.c \
-  libstemmer/libstemmer.c
-
-snowball_headers= \
-  src_c/stem_ISO_8859_1_danish.h \
-  src_c/stem_UTF_8_danish.h \
-  src_c/stem_ISO_8859_1_dutch.h \
-  src_c/stem_UTF_8_dutch.h \
-  src_c/stem_ISO_8859_1_english.h \
-  src_c/stem_UTF_8_english.h \
-  src_c/stem_ISO_8859_1_finnish.h \
-  src_c/stem_UTF_8_finnish.h \
-  src_c/stem_ISO_8859_1_french.h \
-  src_c/stem_UTF_8_french.h \
-  src_c/stem_ISO_8859_1_german.h \
-  src_c/stem_UTF_8_german.h \
-  src_c/stem_ISO_8859_1_hungarian.h \
-  src_c/stem_UTF_8_hungarian.h \
-  src_c/stem_ISO_8859_1_italian.h \
-  src_c/stem_UTF_8_italian.h \
-  src_c/stem_ISO_8859_1_norwegian.h \
-  src_c/stem_UTF_8_norwegian.h \
-  src_c/stem_ISO_8859_1_porter.h \
-  src_c/stem_UTF_8_porter.h \
-  src_c/stem_ISO_8859_1_portuguese.h \
-  src_c/stem_UTF_8_portuguese.h \
-  src_c/stem_ISO_8859_2_romanian.h \
-  src_c/stem_UTF_8_romanian.h \
-  src_c/stem_KOI8_R_russian.h \
-  src_c/stem_UTF_8_russian.h \
-  src_c/stem_ISO_8859_1_spanish.h \
-  src_c/stem_UTF_8_spanish.h \
-  src_c/stem_ISO_8859_1_swedish.h \
-  src_c/stem_UTF_8_swedish.h \
-  src_c/stem_UTF_8_turkish.h \
-  include/libstemmer.h \
-  libstemmer/modules.h \
-  runtime/api.h \
-  runtime/header.h
-

libstemmer_c/mkinc_utf8.mak

-# libstemmer/mkinc_utf8.mak: List of stemming module source files
-#
-# This file is generated by mkmodules.pl from a list of module names.
-# Do not edit manually.
-#
-# Modules included by this file are: danish, dutch, english, finnish, french,
-# german, hungarian, italian, norwegian, porter, portuguese, romanian,
-# russian, spanish, swedish, turkish
-
-snowball_sources= \
-  src_c/stem_UTF_8_danish.c \
-  src_c/stem_UTF_8_dutch.c \
-  src_c/stem_UTF_8_english.c \
-  src_c/stem_UTF_8_finnish.c \
-  src_c/stem_UTF_8_french.c \
-  src_c/stem_UTF_8_german.c \
-  src_c/stem_UTF_8_hungarian.c \
-  src_c/stem_UTF_8_italian.c \
-  src_c/stem_UTF_8_norwegian.c \
-  src_c/stem_UTF_8_porter.c \
-  src_c/stem_UTF_8_portuguese.c \
-  src_c/stem_UTF_8_romanian.c \
-  src_c/stem_UTF_8_russian.c \
-  src_c/stem_UTF_8_spanish.c \
-  src_c/stem_UTF_8_swedish.c \
-  src_c/stem_UTF_8_turkish.c \
-  runtime/api.c \
-  runtime/utilities.c \
-  libstemmer/libstemmer_utf8.c
-
-snowball_headers= \
-  src_c/stem_UTF_8_danish.h \
-  src_c/stem_UTF_8_dutch.h \
-  src_c/stem_UTF_8_english.h \
-  src_c/stem_UTF_8_finnish.h \
-  src_c/stem_UTF_8_french.h \
-  src_c/stem_UTF_8_german.h \
-  src_c/stem_UTF_8_hungarian.h \
-  src_c/stem_UTF_8_italian.h \
-  src_c/stem_UTF_8_norwegian.h \
-  src_c/stem_UTF_8_porter.h \
-  src_c/stem_UTF_8_portuguese.h \
-  src_c/stem_UTF_8_romanian.h \
-  src_c/stem_UTF_8_russian.h \
-  src_c/stem_UTF_8_spanish.h \
-  src_c/stem_UTF_8_swedish.h \
-  src_c/stem_UTF_8_turkish.h \
-  include/libstemmer.h \
-  libstemmer/modules_utf8.h \
-  runtime/api.h \
-  runtime/header.h
-

libstemmer_c/runtime/api.c

-
-#include <stdlib.h> /* for calloc, free */
-#include "header.h"
-
-/* Allocate a zero-filled stemmer environment.
- *
- * S_size, I_size and B_size are the number of symbol strings, ints and
- * unsigned chars to reserve in the S, I and B arrays; an array whose size
- * is 0 is left as a null pointer.  Every S slot receives its own string
- * from create_s().
- *
- * Returns the new environment, or NULL if any allocation fails; in that
- * case whatever was already built is handed to SN_close_env().
- */
-extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
-{
-    struct SN_env * env = (struct SN_env *) calloc(1, sizeof(struct SN_env));
-    if (!env) return NULL;
-
-    env->p = create_s();
-    if (!env->p) goto fail;
-
-    if (S_size != 0)
-    {
-        int n;
-        env->S = (symbol * *) calloc(S_size, sizeof(symbol *));
-        if (!env->S) goto fail;
-
-        for (n = 0; n < S_size; n++)
-        {
-            env->S[n] = create_s();
-            if (!env->S[n]) goto fail;
-        }
-    }
-
-    if (I_size != 0)
-    {
-        env->I = (int *) calloc(I_size, sizeof(int));
-        if (!env->I) goto fail;
-    }
-
-    if (B_size != 0)
-    {
-        env->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
-        if (!env->B) goto fail;
-    }
-
-    return env;
-
-fail:
-    SN_close_env(env, S_size);
-    return NULL;
-}
-
-/* Release an environment created by SN_create_env().
- *
- * z may be NULL, and may be only partially constructed: SN_create_env()
- * calls this on its failure path, where z->S can still be a null pointer
- * even though S_size is non-zero.  The S array is therefore walked only
- * when it was actually allocated.  S_size must be the value that was
- * passed to SN_create_env().
- */
-extern void SN_close_env(struct SN_env * z, int S_size)
-{
-    if (z == NULL) return;
-    if (z->S != NULL)
-    {
-        int i;
-        for (i = 0; i < S_size; i++)
-        {
-            lose_s(z->S[i]);   /* unfilled slots are NULL; lose_s ignores them */
-        }
-        free(z->S);
-    }
-    free(z->I);
-    free(z->B);
-    if (z->p) lose_s(z->p);
-    free(z);
-}
-
-/* Replace the whole current string (positions 0 .. z->l) with the `size`
- * symbols starting at s, then move the cursor back to position 0.
- * Returns whatever replace_s() returned.
- */
-extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
-{
-    const int status = replace_s(z, 0, z->l, size, s, NULL);
-
-    z->c = 0;
-    return status;
-}
-

libstemmer_c/runtime/api.h

-
-typedef unsigned char symbol;
-
-/* Or replace 'char' above with 'short' for 16 bit characters.
-
-   More precisely, replace 'char' with whatever type guarantees the
-   character width you need. Note however that sizeof(symbol) should divide
-   HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
-   there is an alignment problem. In the unlikely event of a problem here,
-   consult Martin Porter.
-
-*/
-/* Working state of one stemmer instance.  SN_create_env() allocates it
- * zero-filled and SN_close_env() releases it.
- */
-struct SN_env {
-    symbol * p;   /* current string, obtained from create_s() */
-    int c; int l; int lb; int bra; int ket;   /* c: cursor index into p;
-       l: forward limit and lb: backward limit tested by the grouping and
-       eq_s routines; bra/ket: presumably slice bounds -- not used by the
-       code visible here, TODO confirm */
-    symbol * * S;   /* S_size strings, each from create_s(); NULL when S_size is 0 */
-    int * I;   /* I_size zero-initialised ints; NULL when I_size is 0 */
-    unsigned char * B;   /* B_size zero-initialised bytes; NULL when B_size is 0 */
-};
-
-extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
-extern void SN_close_env(struct SN_env * z, int S_size);
-
-extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
-

libstemmer_c/runtime/header.h

-
-#include <limits.h>
-
-#include "api.h"
-
-#define MAXINT INT_MAX
-#define MININT INT_MIN
-
-/* Size in bytes of the two-int header stored in front of every symbol
- * string (see create_s).  Parenthesised so that the macro expands safely
- * inside larger expressions such as `x / HEAD`.
- */
-#define HEAD (2*sizeof(int))
-
-/* Accessors for the header of symbol string p: the int immediately before
- * p holds the size, the one before that holds the capacity.  SIZE and
- * CAPACITY remain usable as assignment targets.
- */
-#define SIZE(p)        (((int *)(p))[-1])
-#define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
-#define CAPACITY(p)    (((int *)(p))[-2])
-/* One entry of a string table searched by find_among / find_among_b. */
-struct among
-{   int s_size;     /* number of chars in string */
-    const symbol * s;       /* search string */
-    int substring_i;/* index of the next table entry to try when this one
-                       is rejected (longest matching substring); negative
-                       when there is none */
-    int result;     /* result of the lookup */
-    int (* function)(struct SN_env *);   /* optional extra test: when
-                       non-null, find_among calls it after the string has
-                       matched and accepts the entry only if it returns
-                       non-zero */
-};
-
-extern symbol * create_s(void);
-extern void lose_s(symbol * p);
-
-extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
-
-extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-
-extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
-
-extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
-extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
-extern int eq_v(struct SN_env * z, const symbol * p);
-extern int eq_v_b(struct SN_env * z, const symbol * p);
-
-extern int find_among(struct SN_env * z, const struct among * v, int v_size);
-extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
-
-extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
-extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
-extern int slice_from_v(struct SN_env * z, const symbol * p);
-extern int slice_del(struct SN_env * z);
-
-extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
-extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
-
-extern symbol * slice_to(struct SN_env * z, symbol * p);
-extern symbol * assign_to(struct SN_env * z, symbol * p);
-
-extern void debug(struct SN_env * z, int number, int line_count);
-

libstemmer_c/runtime/utilities.c

-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "header.h"
-
-#define unless(C) if(!(C))
-
-#define CREATE_SIZE 1
-
-/* Allocate a new symbol string.
- *
- * The block holds a HEAD-byte header followed by room for CREATE_SIZE + 1
- * symbols; the returned pointer addresses the first symbol, just past the
- * header.  Both the capacity and the size recorded in the header are set
- * to CREATE_SIZE.  The symbols themselves are not initialised.
- * Returns NULL if malloc fails.
- */
-extern symbol * create_s(void) {
-    symbol * s;
-    char * block = (char *) malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
-
-    if (!block) return NULL;
-    s = (symbol *) (block + HEAD);
-    CAPACITY(s) = CREATE_SIZE;
-    SET_SIZE(s, CREATE_SIZE);
-    return s;
-}
-
-/* Free a symbol string obtained from create_s(); a NULL pointer is ignored.
- * The allocation starts HEAD bytes before p, so that is the address freed.
- */
-extern void lose_s(symbol * p) {
-    if (p != NULL) {
-        char * block = (char *) p - HEAD;
-        free(block);
-    }
-}
-
-/*
-   new_c = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
-   if n is positive, or -n characters backwards from p + c - 1 if n is
-   negative. new_c is the new cursor position, or -1 on failure, i.e. when
-   the limit (l going forwards, lb going backwards) is reached before all
-   the requested characters have been skipped.
-
-   -- used to implement hop and next in the utf8 case.
-*/
-
-extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
-    int b;
-    if (n >= 0) {
-        for (; n > 0; n--) {
-            if (c >= l) return -1;   /* no character left to skip */
-            b = p[c++];
-            if (b >= 0xC0) {   /* 1100 0000 */
-                /* lead byte of a multi-byte character: also consume the
-                   continuation bytes that follow it */
-                while (c < l) {
-                    b = p[c];
-                    if (b >= 0xC0 || b < 0x80) break;
-                    /* break unless b is 10------ */
-                    c++;
-                }
-            }
-        }
-    } else {
-        for (; n < 0; n++) {
-            if (c <= lb) return -1;   /* no character left to skip */
-            b = p[--c];
-            if (b >= 0x80) {   /* 1000 0000 */
-                /* inside a multi-byte character: step back until its
-                   lead byte (>= 0xC0) or the backward limit is reached */
-                while (c > lb) {
-                    b = p[c];
-                    if (b >= 0xC0) break; /* 1100 0000 */
-                    c--;
-                }
-            }
-        }
-    }
-    return c;
-}
-
-/* Code for character groupings: utf8 cases */
-/* Decode the UTF-8 character starting at p[c], reading no further than
-   index l - 1.  The code point is stored in *slot and the number of bytes
-   it occupies (1, 2 or 3) is returned; 0 is returned, and *slot is left
-   untouched, when c is already at the limit l.  A sequence cut short by
-   the limit is decoded from the bytes that are available.  Every lead
-   byte >= 0xE0 is treated as starting a three-byte sequence, so four-byte
-   forms are not decoded as such. */
-static int get_utf8(const symbol * p, int c, int l, int * slot) {
-    int b0, b1;
-    if (c >= l) return 0;
-    b0 = p[c++];
-    if (b0 < 0xC0 || c == l) {   /* 1100 0000 */
-        * slot = b0; return 1;   /* single byte, or nothing follows the lead byte */
-    }
-    b1 = p[c++];
-    if (b0 < 0xE0 || c == l) {   /* 1110 0000 */
-        * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
-    }
-    * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
-}
-
-/* Decode the UTF-8 character that ends just before p[c], reading no
-   further back than index lb.  The code point is stored in *slot and the
-   number of bytes it occupies (1, 2 or 3) is returned; 0 is returned, and
-   *slot is left untouched, when c is already at the backward limit lb.
-
-   b0 is the last byte of the character, b1 the byte before it and b2 the
-   byte before that.  For a three-byte character b2 is the lead byte and
-   supplies the top four bits of the code point, so the cursor has to step
-   back once more before it is read (re-reading p[c] would fetch b1 a
-   second time and give e.g. 0x1042 instead of 0x3042 for E3 81 82). */
-static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
-    int b0, b1, b2;
-    if (c <= lb) return 0;
-    b0 = p[--c];
-    if (b0 < 0x80 || c == lb) {   /* 1000 0000 */
-        * slot = b0; return 1;
-    }
-    b1 = p[--c];
-    if (b1 >= 0xC0 || c == lb) {   /* 1100 0000 */
-        * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
-    }
-    b2 = p[--c];   /* c > lb is guaranteed here, so this stays in range */
-    * slot = (b2 & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
-}
-
-/* Forward grouping test, UTF-8 text.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor forward over characters that are in the grouping - one
-   character when repeat is 0, as many as possible otherwise.
-   Returns -1 if no character is available at the cursor, the byte width
-   of the character at the cursor if that character is not in the grouping
-   (cursor left in front of it), and 0 after a single successful step when
-   repeat is 0. */
-extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        const int w = get_utf8(z->p, z->c, z->l, & ch);
-        if (w == 0) return -1;
-        if (ch < min || ch > max) return w;
-        ch -= min;
-        if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return w;
-        z->c += w;
-        if (!repeat) return 0;
-    }
-}
-
-/* Backward grouping test, UTF-8 text.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor backward over characters that are in the grouping - one
-   character when repeat is 0, as many as possible otherwise.
-   Returns -1 if no character is available before the cursor, the byte
-   width of the character before the cursor if that character is not in
-   the grouping (cursor left after it), and 0 after a single successful
-   step when repeat is 0. */
-extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        const int w = get_b_utf8(z->p, z->c, z->lb, & ch);
-        if (w == 0) return -1;
-        if (ch < min || ch > max) return w;
-        ch -= min;
-        if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return w;
-        z->c -= w;
-        if (!repeat) return 0;
-    }
-}
-
-/* Forward non-grouping test, UTF-8 text.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor forward over characters that are NOT in the grouping -
-   one character when repeat is 0, as many as possible otherwise.
-   Returns -1 if no character is available at the cursor, the byte width
-   of the character at the cursor if that character is in the grouping
-   (cursor left in front of it), and 0 after a single successful step when
-   repeat is 0. */
-extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        const int w = get_utf8(z->p, z->c, z->l, & ch);
-        if (w == 0) return -1;
-        if (ch >= min && ch <= max) {
-            const int bit = ch - min;
-            if (s[bit >> 3] & (0X1 << (bit & 0X7))) return w;
-        }
-        z->c += w;
-        if (!repeat) return 0;
-    }
-}
-
-/* Backward non-grouping test, UTF-8 text.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor backward over characters that are NOT in the grouping -
-   one character when repeat is 0, as many as possible otherwise.
-   Returns -1 if no character is available before the cursor, the byte
-   width of the character before the cursor if that character is in the
-   grouping (cursor left after it), and 0 after a single successful step
-   when repeat is 0. */
-extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        const int w = get_b_utf8(z->p, z->c, z->lb, & ch);
-        if (w == 0) return -1;
-        if (ch >= min && ch <= max) {
-            const int bit = ch - min;
-            if (s[bit >> 3] & (0X1 << (bit & 0X7))) return w;
-        }
-        z->c -= w;
-        if (!repeat) return 0;
-    }
-}
-
-/* Code for character groupings: non-utf8 cases */
-
-/* Forward grouping test, one symbol per character.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor forward over characters that are in the grouping - one
-   character when repeat is 0, as many as possible otherwise.
-   Returns -1 if the cursor is at the limit z->l, 1 if the character at
-   the cursor is not in the grouping (cursor left in front of it), and 0
-   after a single successful step when repeat is 0. */
-extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        if (z->c >= z->l) return -1;
-        ch = z->p[z->c];
-        if (ch < min || ch > max) return 1;
-        ch -= min;
-        if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 1;
-        z->c++;
-        if (!repeat) return 0;
-    }
-}
-
-/* Backward grouping test, one symbol per character.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor backward over characters that are in the grouping - one
-   character when repeat is 0, as many as possible otherwise.
-   Returns -1 if the cursor is at the backward limit z->lb, 1 if the
-   character before the cursor is not in the grouping (cursor left after
-   it), and 0 after a single successful step when repeat is 0. */
-extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        if (z->c <= z->lb) return -1;
-        ch = z->p[z->c - 1];
-        if (ch < min || ch > max) return 1;
-        ch -= min;
-        if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 1;
-        z->c--;
-        if (!repeat) return 0;
-    }
-}
-
-/* Forward non-grouping test, one symbol per character.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor forward over characters that are NOT in the grouping -
-   one character when repeat is 0, as many as possible otherwise.
-   Returns -1 if the cursor is at the limit z->l, 1 if the character at
-   the cursor is in the grouping (cursor left in front of it), and 0 after
-   a single successful step when repeat is 0. */
-extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        if (z->c >= z->l) return -1;
-        ch = z->p[z->c];
-        if (ch >= min && ch <= max) {
-            const int bit = ch - min;
-            if (s[bit >> 3] & (0X1 << (bit & 0X7))) return 1;
-        }
-        z->c++;
-        if (!repeat) return 0;
-    }
-}
-
-/* Backward non-grouping test, one symbol per character.
-   s is a bitmap of the grouping: bit (ch - min) is set when character ch
-   belongs to it; characters outside min..max never belong.
-   Moves the cursor backward over characters that are NOT in the grouping -
-   one character when repeat is 0, as many as possible otherwise.
-   Returns -1 if the cursor is at the backward limit z->lb, 1 if the
-   character before the cursor is in the grouping (cursor left after it),
-   and 0 after a single successful step when repeat is 0. */
-extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
-    for (;;) {
-        int ch;
-        if (z->c <= z->lb) return -1;
-        ch = z->p[z->c - 1];
-        if (ch >= min && ch <= max) {
-            const int bit = ch - min;
-            if (s[bit >> 3] & (0X1 << (bit & 0X7))) return 1;
-        }
-        z->c--;
-        if (!repeat) return 0;
-    }
-}
-
-/* Forward literal match: if the s_size symbols at s appear at the cursor
-   (and fit before the limit z->l), advance the cursor past them and
-   return 1; otherwise leave the cursor alone and return 0. */
-extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
-    const int remaining = z->l - z->c;
-
-    if (remaining < s_size) return 0;
-    if (memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
-    z->c += s_size;
-    return 1;
-}
-
-/* Backward literal match: if the s_size symbols at s appear immediately
-   before the cursor (and fit after the backward limit z->lb), move the
-   cursor back over them and return 1; otherwise leave the cursor alone and
-   return 0. */
-extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
-    const int available = z->c - z->lb;
-
-    if (available < s_size) return 0;
-    if (memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
-    z->c -= s_size;
-    return 1;
-}
-
-/* Forward match against the symbol string p, whose length is taken from
-   its header via SIZE(p); see eq_s for the result. */
-extern int eq_v(struct SN_env * z, const symbol * p) {
-    const int length = SIZE(p);
-    return eq_s(z, length, p);
-}
-
-/* Backward match against the symbol string p, whose length is taken from
-   its header via SIZE(p); see eq_s_b for the result. */
-extern int eq_v_b(struct SN_env * z, const symbol * p) {
-    const int length = SIZE(p);
-    return eq_s_b(z, length, p);
-}
-/* Look up the text at the cursor in the table v of v_size entries.
-
-   The first loop bisects the table (the comparison logic presumes v is
-   sorted by its s strings - TODO confirm against the table generator),
-   remembering in common_i / common_j how many leading symbols are already
-   known to match at the lower / upper end of the interval so they are not
-   compared again.  The second loop starts from the entry the bisection
-   settled on and follows the substring_i chain until an entry is fully
-   matched and accepted.
-
-   On success the cursor is set just past the matched string and the
-   entry's result is returned.  An entry with a non-null function is only
-   accepted if that function returns non-zero.  Returns 0 when the chain
-   is exhausted; in that case the cursor is left unchanged unless a
-   rejected entry had already been fully matched, in which case it stays
-   just past that entry's string. */
-extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
-
-    int i = 0;
-    int j = v_size;
-
-    int c = z->c; int l = z->l;
-    symbol * q = z->p + c;   /* text being looked up starts here */
-
-    const struct among * w;
-
-    int common_i = 0;
-    int common_j = 0;
-
-    int first_key_inspected = 0;
-
-    while(1) {
-        int k = i + ((j - i) >> 1);   /* midpoint of the interval [i, j) */
-        int diff = 0;
-        int common = common_i < common_j ? common_i : common_j; /* smaller */
-        w = v + k;
-        {
-            int i2; for (i2 = common; i2 < w->s_size; i2++) {
-                if (c + common == l) { diff = -1; break; }   /* text ran out: it sorts before w->s */
-                diff = q[common] - w->s[i2];
-                if (diff != 0) break;
-                common++;
-            }
-        }
-        if (diff < 0) { j = k; common_j = common; }
-                 else { i = k; common_i = common; }
-        if (j - i <= 1) {
-            if (i > 0) break; /* v->s has been inspected */
-            if (j == i) break; /* only one item in v */
-
-            /* - but now we need to go round once more to get
-               v->s inspected. This looks messy, but is actually
-               the optimal approach.  */
-
-            if (first_key_inspected) break;
-            first_key_inspected = 1;
-        }
-    }
-    while(1) {
-        w = v + i;
-        if (common_i >= w->s_size) {   /* the whole of w->s matched */
-            z->c = c + w->s_size;
-            if (w->function == 0) return w->result;
-            {
-                int res = w->function(z);
-                z->c = c + w->s_size;   /* set the cursor again after the call */
-                if (res) return w->result;
-            }
-        }
-        i = w->substring_i;   /* fall back to the next shorter candidate */
-        if (i < 0) return 0;
-    }
-}
-
-/* find_among_b is for backwards processing. Same comments apply */
-
-/* Look up the text ending at the cursor in the table v of v_size entries,
- * comparing from the cursor leftwards and from the end of each entry's
- * string towards its start.
- *
- * Phase 1 is a binary search over v.
- * NOTE(review): the search presumably relies on v being sorted by the
- * reversed strings -- confirm against the table generator.
- *
- * Phase 2 starts at the entry the search settled on and follows the
- * substring_i links. The first entry whose whole string was matched, and
- * whose optional function (if any) returns non-zero, is accepted: the cursor
- * is set to c - s_size and the entry's result is returned.
- *
- * Returns 0 when the chain ends (substring_i < 0) without an accepted entry.
- * In that case the cursor is not restored if a candidate's function was
- * tried and rejected along the way.
- */
-extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
-
-    /* [i, j) is the range of table entries still under consideration. */
-    int i = 0;
-    int j = v_size;
-
-    int c = z->c; int lb = z->lb;
-    /* q points at the symbol immediately before the cursor. */
-    symbol * q = z->p + c - 1;
-
-    const struct among * w;
-
-    /* Number of trailing symbols known to match at the lower (i) and
-       upper (j) ends of the range. */
-    int common_i = 0;
-    int common_j = 0;
-
-    int first_key_inspected = 0;
-
-    while(1) {
-        int k = i + ((j - i) >> 1);
-        int diff = 0;
-        int common = common_i < common_j ? common_i : common_j;
-        w = v + k;
-        {
-            /* Walk the entry's string from its end, skipping symbols already
-               known to match. */
-            int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
-                /* Text ran out at the backward limit: treat it as sorting first. */
-                if (c - common == lb) { diff = -1; break; }
-                diff = q[- common] - w->s[i2];
-                if (diff != 0) break;
-                common++;
-            }
-        }
-        /* diff < 0: the text sorts before entry k, so k becomes the upper
-           bound; otherwise k becomes the lower bound. */
-        if (diff < 0) { j = k; common_j = common; }
-                 else { i = k; common_i = common; }
-        if (j - i <= 1) {
-            if (i > 0) break;
-            if (j == i) break;
-            /* One extra pass so that entry 0 itself gets compared. */
-            if (first_key_inspected) break;
-            first_key_inspected = 1;
-        }
-    }
-    while(1) {
-        w = v + i;
-        /* Entry i qualifies only if every one of its symbols was matched. */
-        if (common_i >= w->s_size) {
-            z->c = c - w->s_size;
-            if (w->function == 0) return w->result;
-            {
-                int res = w->function(z);
-                /* Set the cursor again after the call, which receives z. */
-                z->c = c - w->s_size;
-                if (res) return w->result;
-            }
-        }
-        /* Move on to the linked entry; a negative link ends the search. */
-        i = w->substring_i;
-        if (i < 0) return 0;
-    }
-}
-
-
-/* Grow the symbol buffer p so that it can hold at least n symbols.
- * The allocation starts HEAD bytes before p, so that is the address passed
- * to realloc; 20 symbols of slack plus one extra symbol are requested.
- * On success the (possibly moved) buffer is returned with its recorded
- * capacity updated. On allocation failure the old buffer is released via
- * lose_s and NULL is returned.
- */
-static symbol * increase_size(symbol * p, int n) {
-    int capacity = n + 20;
-    symbol * grown;
-    void * block = realloc((char *) p - HEAD,
-                           HEAD + (capacity + 1) * sizeof(symbol));
-    if (block == NULL) {
-        lose_s(p);
-        return NULL;
-    }
-    grown = (symbol *) ((char *) block + HEAD);
-    CAPACITY(grown) = capacity;
-    return grown;
-}
-
-/* Replace the symbols of z->p in the range [c_bra, c_ket) with the s_size
- * symbols at s, creating z->p first if it is NULL.
- * The tail of the buffer is shifted by the change in length, and z->l and
- * z->c are adjusted to match: a cursor at or after c_ket moves with the
- * tail, and a cursor strictly inside the replaced range is moved to c_bra.
- * If adjptr is not NULL, the change in length is stored through it.
- * Returns 0 on success, -1 on error.
- * On error z->p has been freed and set to NULL.
- */
-extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
-{
-    int delta;
-
-    if (z->p == NULL) {
-        z->p = create_s();
-        if (z->p == NULL) return -1;
-    }
-
-    delta = s_size - (c_ket - c_bra);
-    if (delta != 0) {
-        int old_len = SIZE(z->p);
-        int new_len = delta + old_len;
-
-        if (new_len > CAPACITY(z->p)) {
-            z->p = increase_size(z->p, new_len);
-            if (z->p == NULL) return -1;
-        }
-
-        /* Shift everything after the replaced range into its new place. */
-        memmove(z->p + c_ket + delta,
-                z->p + c_ket,
-                (old_len - c_ket) * sizeof(symbol));
-        SET_SIZE(z->p, new_len);
-        z->l += delta;
-
-        if (z->c >= c_ket) {
-            z->c += delta;
-        } else if (z->c > c_bra) {
-            z->c = c_bra;
-        }
-    }
-
-    if (s_size != 0) {
-        memmove(z->p + c_bra, s, s_size * sizeof(symbol));
-    }
-    if (adjptr != NULL) {
-        *adjptr = delta;
-    }
-    return 0;
-}
-
-/* Validate the current slice of z.
- * The slice is acceptable when 0 <= bra <= ket <= l, z->p is not NULL and
- * l does not exceed SIZE(z->p). Returns 0 if acceptable, -1 otherwise.
- */
-static int slice_check(struct SN_env * z) {
-    /* z->p is tested for NULL before SIZE(z->p) is evaluated. */
-    int ok = z->bra >= 0 &&
-             z->bra <= z->ket &&
-             z->ket <= z->l &&
-             z->p != NULL &&
-             z->l <= SIZE(z->p); /* this last test could be removed */
-
-    if (ok) return 0;
-
-#if 0
-    fprintf(stderr, "faulty slice operation:\n");
-    debug(z, -1, 0);
-#endif
-    return -1;
-}
-
-/* Replace the current slice [z->bra, z->ket) with the s_size symbols at s.
- * Returns -1 if the slice fails slice_check, otherwise whatever replace_s
- * returns.
- */
-extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
-    return slice_check(z) ? -1
-                          : replace_s(z, z->bra, z->ket, s_size, s, NULL);
-}
-
-/* Replace the current slice with the contents of the sized symbol buffer p;
- * its length is taken from SIZE(p) and the work is delegated to
- * slice_from_s.
- */
-extern int slice_from_v(struct SN_env * z, const symbol * p) {
-    int length = SIZE(p);
-    return slice_from_s(z, length, p);
-}
-
-/* Delete the current slice by replacing it with zero symbols. */
-extern int slice_del(struct SN_env * z) {
-    return slice_from_s(z, 0, NULL);
-}
-
-/* Replace the range [bra, ket) of z->p with the s_size symbols at s, then
- * shift the slice markers z->bra and z->ket by the change in length when
- * they lie at or after bra.
- * Returns 0 on success, -1 if replace_s fails.
- */
-extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
-    int shift;
-
-    if (replace_s(z, bra, ket, s_size, s, &shift) != 0) {
-        return -1;
-    }
-    if (z->bra >= bra) z->bra += shift;
-    if (z->ket >= bra) z->ket += shift;
-    return 0;
-}
-
-/* Same as insert_s, but the replacement text is the sized symbol buffer p,
- * whose length is taken from SIZE(p).
- * Returns 0 on success, -1 if the underlying replacement fails.
- */
-extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
-    return insert_s(z, bra, ket, SIZE(p), p);
-}
-
-extern symbol * slice_to(struct SN_env * z, symbol * p) {
-    if (slice_check(z)) {
-        lose_s(p);
-        return NULL;
-    }
-    {
-        int len = z->ket - z->bra;
-        if (CAPACITY(p) < len) {
-            p = increase_size(p, len);
-            if (p == NULL)
-                return NULL;
-        }
-        memmove(p, z->p + z->bra, len * sizeof(symbol));
-        SET_SIZE(p, len);
-    }