Commits

Miki Tebeka committed 5e5d1c7

here we go

Comments (0)

Files changed (81)

libstemmer_c/MANIFEST

+README
+src_c/stem_ISO_8859_1_danish.c
+src_c/stem_ISO_8859_1_danish.h
+src_c/stem_ISO_8859_1_dutch.c
+src_c/stem_ISO_8859_1_dutch.h
+src_c/stem_ISO_8859_1_english.c
+src_c/stem_ISO_8859_1_english.h
+src_c/stem_ISO_8859_1_finnish.c
+src_c/stem_ISO_8859_1_finnish.h
+src_c/stem_ISO_8859_1_french.c
+src_c/stem_ISO_8859_1_french.h
+src_c/stem_ISO_8859_1_german.c
+src_c/stem_ISO_8859_1_german.h
+src_c/stem_ISO_8859_1_hungarian.c
+src_c/stem_ISO_8859_1_hungarian.h
+src_c/stem_ISO_8859_1_italian.c
+src_c/stem_ISO_8859_1_italian.h
+src_c/stem_ISO_8859_1_norwegian.c
+src_c/stem_ISO_8859_1_norwegian.h
+src_c/stem_ISO_8859_1_porter.c
+src_c/stem_ISO_8859_1_porter.h
+src_c/stem_ISO_8859_1_portuguese.c
+src_c/stem_ISO_8859_1_portuguese.h
+src_c/stem_ISO_8859_1_spanish.c
+src_c/stem_ISO_8859_1_spanish.h
+src_c/stem_ISO_8859_1_swedish.c
+src_c/stem_ISO_8859_1_swedish.h
+src_c/stem_ISO_8859_2_romanian.c
+src_c/stem_ISO_8859_2_romanian.h
+src_c/stem_KOI8_R_russian.c
+src_c/stem_KOI8_R_russian.h
+src_c/stem_UTF_8_danish.c
+src_c/stem_UTF_8_danish.h
+src_c/stem_UTF_8_dutch.c
+src_c/stem_UTF_8_dutch.h
+src_c/stem_UTF_8_english.c
+src_c/stem_UTF_8_english.h
+src_c/stem_UTF_8_finnish.c
+src_c/stem_UTF_8_finnish.h
+src_c/stem_UTF_8_french.c
+src_c/stem_UTF_8_french.h
+src_c/stem_UTF_8_german.c
+src_c/stem_UTF_8_german.h
+src_c/stem_UTF_8_hungarian.c
+src_c/stem_UTF_8_hungarian.h
+src_c/stem_UTF_8_italian.c
+src_c/stem_UTF_8_italian.h
+src_c/stem_UTF_8_norwegian.c
+src_c/stem_UTF_8_norwegian.h
+src_c/stem_UTF_8_porter.c
+src_c/stem_UTF_8_porter.h
+src_c/stem_UTF_8_portuguese.c
+src_c/stem_UTF_8_portuguese.h
+src_c/stem_UTF_8_romanian.c
+src_c/stem_UTF_8_romanian.h
+src_c/stem_UTF_8_russian.c
+src_c/stem_UTF_8_russian.h
+src_c/stem_UTF_8_spanish.c
+src_c/stem_UTF_8_spanish.h
+src_c/stem_UTF_8_swedish.c
+src_c/stem_UTF_8_swedish.h
+src_c/stem_UTF_8_turkish.c
+src_c/stem_UTF_8_turkish.h
+runtime/api.c
+runtime/api.h
+runtime/header.h
+runtime/utilities.c
+libstemmer/libstemmer.c
+libstemmer/libstemmer_utf8.c
+libstemmer/modules.h
+libstemmer/modules_utf8.h
+include/libstemmer.h

libstemmer_c/Makefile

+include mkinc.mak
+CFLAGS=-Iinclude
+all: libstemmer.o stemwords
+libstemmer.o: $(snowball_sources:.c=.o)
+	$(AR) -cru $@ $^
+stemwords: examples/stemwords.o libstemmer.o
+	$(CC) -o $@ $^
+clean:
+	rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o

libstemmer_c/README

+libstemmer_c
+============
+
+This document pertains to the C version of the libstemmer distribution,
+available for download from:
+
+http://snowball.tartarus.org/dist/libstemmer_c.tgz
+
+
+Compiling the library
+=====================
+
+A simple makefile is provided for Unix style systems.  On such systems, it
+should be possible simply to run "make", and the file "libstemmer.o"
+and the example program "stemwords" will be generated.
+
+If this doesn't work on your system, you need to write your own build
+system (or call the compiler directly).  The files to compile are
+all contained in the "libstemmer", "runtime" and "src_c" directories,
+and the public header file is contained in the "include" directory.
+
+The library comes in two flavours; UTF-8 only, and UTF-8 plus other character
+sets.  To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of
+"libstemmer.c".
+
+For convenience "mkinc.mak" is a makefile fragment listing the source files and
+header files used to compile the standard version of the library.
+"mkinc_utf8.mak" is a comparable makefile fragment listing just the source
+files for the UTF-8 only version of the library.
+
+
+Using the library
+=================
+
+The library provides a simple C API.  Essentially, a new stemmer can
+be obtained by using "sb_stemmer_new".  "sb_stemmer_stem" is then
+used to stem a word, "sb_stemmer_length" returns the stemmed
+length of the last word processed, and "sb_stemmer_delete" is
+used to delete a stemmer.
+
+Creating a stemmer is a relatively expensive operation - the expected
+usage pattern is that a new stemmer is created when needed, used
+to stem many words, and deleted after some time.
+
+Stemmers are re-entrant, but not threadsafe.  In other words, if
+you wish to access the same stemmer object from multiple threads,
+you must ensure that all access is protected by a mutex or similar
+device.
+
+libstemmer does not currently incorporate any mechanism for caching the results
+of stemming operations.  Such caching can greatly increase the performance of a
+stemmer under certain situations, so suitable patches will be considered for
+inclusion.
+
+The standard libstemmer sources contain an algorithm for each of the supported
+languages.  The algorithm may be selected using the english name of the
+language, or using the 2 or 3 letter ISO 639 language codes.  In addition,
+the traditional "Porter" stemming algorithm for english is included for
+backwards compatibility purposes, but we recommend use of the "English"
+stemmer in preference for new projects.
+
+(Some minor algorithms which are included only as curiosities in the snowball
+website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
+included in the standard libstemmer sources.  These are not really supported by
+the snowball project, but it would be possible to compile a modified libstemmer
+library containing these if desired.)
+
+
+The stemwords example
+=====================
+
+The stemwords example program allows you to run any of the stemmers
+compiled into the libstemmer library on a sample vocabulary.  For
+details on how to use it, run it with the "-h" command line option.
+
+
+Using the library in a larger system
+====================================
+
+If you are incorporating the library into the build system of a larger
+program, I recommend copying the unpacked tarball without modification into
+a subdirectory of the sources of your program.  Future versions of the
+library are intended to keep the same structure, so this will keep the
+work required to move to a new version of the library to a minimum.
+
+As an additional convenience, the list of source and header files used
+in the library is detailed in mkinc.mak - a file which is in a suitable
+format for inclusion by a Makefile.  By including this file in your build
+system, you can link the snowball system into your program with a few
+extra rules.
+
+Using the library in a system using GNU autotools
+=================================================
+
+The libstemmer_c library can be integrated into a larger system which uses the
+GNU autotool framework (and in particular, automake and autoconf) as follows:
+
+1) Unpack libstemmer_c.tgz in the top level project directory so that there is
+   a libstemmer_c subdirectory of the top level directory of the project.
+
+2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
+   
+noinst_LTLIBRARIES = libstemmer.la
+include $(srcdir)/mkinc.mak
+noinst_HEADERS = $(snowball_headers)
+libstemmer_la_SOURCES = $(snowball_sources) 
+
+(You may also need to add other lines to this, for example, if you are using
+compiler options which are not compatible with compiling the libstemmer
+library.)
+
+3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
+   configure.ac file.
+
+4) Add to the top level makefile the following lines (or modify existing
+   assignments to these variables appropriately):
+
+AUTOMAKE_OPTIONS = subdir-objects
+AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
+SUBDIRS=libstemmer_c
+<name>_LIBADD = libstemmer_c/libstemmer.la
+
+(Where <name> is the name of the library or executable which links against
+libstemmer.) 
+

libstemmer_c/examples/stemwords.c

+/* This is a simple program which uses libstemmer to provide a command
+ * line interface for stemming using any of the algorithms provided.
+ */
+
+#include <stdio.h>
+#include <stdlib.h> /* for malloc, free */
+#include <string.h> /* for memmove */
+#include <ctype.h>  /* for isupper, tolower */
+
+#include "libstemmer.h"
+
+const char * progname;
+static int pretty = 1;
+
+static void
+stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
+{
+#define INC 10
+    int lim = INC;
+    sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
+
+    while(1) {
+        int ch = getc(f_in);
+        if (ch == EOF) {
+            free(b); return;
+        }
+        {
+            int i = 0;
+	    int inlen = 0;
+            while(1) {
+                if (ch == '\n' || ch == EOF) break;
+                if (i == lim) {
+                    sb_symbol * newb;
+		    newb = (sb_symbol *)
+			    realloc(b, (lim + INC) * sizeof(sb_symbol));
+		    if (newb == 0) goto error;
+		    b = newb;
+                    lim = lim + INC;
+                }
+		/* Update count of utf-8 characters. */
+		if (ch < 0x80 || ch > 0xBF) inlen += 1;
+                /* force lower case: */
+                if (isupper(ch)) ch = tolower(ch);
+
+                b[i] = ch;
+		i++;
+                ch = getc(f_in);
+            }
+
+	    {
+		const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
+                if (stemmed == NULL)
+                {
+                    fprintf(stderr, "Out of memory");
+                    exit(1);
+                }
+                else
+		{
+		    if (pretty == 1) {
+			fwrite(b, i, 1, f_out);
+			fputs(" -> ", f_out);
+		    } else if (pretty == 2) {
+			fwrite(b, i, 1, f_out);
+			if (sb_stemmer_length(stemmer) > 0) {
+			    int j;
+			    if (inlen < 30) {
+				for (j = 30 - inlen; j > 0; j--)
+				    fputs(" ", f_out);
+			    } else {
+				fputs("\n", f_out);
+				for (j = 30; j > 0; j--)
+				    fputs(" ", f_out);
+			    }
+			}
+		    }
+
+		    fputs((char *)stemmed, f_out);
+		    putc('\n', f_out);
+		}
+            }
+        }
+    }
+error:
+    if (b != 0) free(b);
+    return;
+}
+
+/** Display the command line syntax, and then exit.
+ *  @param n The value to exit with.
+ */
+static void
+usage(int n)
+{
+    printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
+	  "\n"
+	  "The input file consists of a list of words to be stemmed, one per\n"
+	  "line. Words should be in lower case, but (for English) A-Z letters\n"
+	  "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
+	  "used.\n"
+	  "\n"
+	  "If -c is given, the argument is the character encoding of the input\n"
+          "and output files.  If it is omitted, the UTF-8 encoding is used.\n"
+	  "\n"
+	  "If -p is given the output file consists of each word of the input\n"
+	  "file followed by \"->\" followed by its stemmed equivalent.\n"
+	  "If -p2 is given the output file is a two column layout containing\n"
+	  "the input words in the first column and the stemmed eqivalents in\n"
+	  "the second column.\n"
+	  "Otherwise, the output file consists of the stemmed words, one per\n"
+	  "line.\n"
+	  "\n"
+	  "-h displays this help\n",
+	  progname);
+    exit(n);
+}
+
+int
+main(int argc, char * argv[])
+{
+    char * in = 0;
+    char * out = 0;
+    FILE * f_in;
+    FILE * f_out;
+    struct sb_stemmer * stemmer;
+
+    char * language = "english";
+    char * charenc = NULL;
+
+    char * s;
+    int i = 1;
+    pretty = 0;
+
+    progname = argv[0];
+
+    while(i < argc) {
+	s = argv[i++];
+	if (s[0] == '-') {
+	    if (strcmp(s, "-o") == 0) {
+		if (i >= argc) {
+		    fprintf(stderr, "%s requires an argument\n", s);
+		    exit(1);
+		}
+		out = argv[i++];
+	    } else if (strcmp(s, "-i") == 0) {
+		if (i >= argc) {
+		    fprintf(stderr, "%s requires an argument\n", s);
+		    exit(1);
+		}
+		in = argv[i++];
+	    } else if (strcmp(s, "-l") == 0) {
+		if (i >= argc) {
+		    fprintf(stderr, "%s requires an argument\n", s);
+		    exit(1);
+		}
+		language = argv[i++];
+	    } else if (strcmp(s, "-c") == 0) {
+		if (i >= argc) {
+		    fprintf(stderr, "%s requires an argument\n", s);
+		    exit(1);
+		}
+		charenc = argv[i++];
+	    } else if (strcmp(s, "-p2") == 0) {
+		pretty = 2;
+	    } else if (strcmp(s, "-p") == 0) {
+		pretty = 1;
+	    } else if (strcmp(s, "-h") == 0) {
+		usage(0);
+	    } else {
+		fprintf(stderr, "option %s unknown\n", s);
+		usage(1);
+	    }
+	} else {
+	    fprintf(stderr, "unexpected parameter %s\n", s);
+	    usage(1);
+	}
+    }
+
+    /* prepare the files */
+    f_in = (in == 0) ? stdin : fopen(in, "r");
+    if (f_in == 0) {
+	fprintf(stderr, "file %s not found\n", in);
+	exit(1);
+    }
+    f_out = (out == 0) ? stdout : fopen(out, "w");
+    if (f_out == 0) {
+	fprintf(stderr, "file %s cannot be opened\n", out);
+	exit(1);
+    }
+
+    /* do the stemming process: */
+    stemmer = sb_stemmer_new(language, charenc);
+    if (stemmer == 0) {
+        if (charenc == NULL) {
+            fprintf(stderr, "language `%s' not available for stemming\n", language);
+            exit(1);
+        } else {
+            fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
+            exit(1);
+        }
+    }
+    stem_file(stemmer, f_in, f_out);
+    sb_stemmer_delete(stemmer);
+
+    if (in != 0) (void) fclose(f_in);
+    if (out != 0) (void) fclose(f_out);
+
+    return 0;
+}
+

libstemmer_c/include/libstemmer.h

+
+/* Make header file work when included from C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sb_stemmer;
+typedef unsigned char sb_symbol;
+
+/* FIXME - should be able to get a version number for each stemming
+ * algorithm (which will be incremented each time the output changes). */
+
+/** Returns an array of the names of the available stemming algorithms.
+ *  Note that these are the canonical names - aliases (ie, other names for
+ *  the same algorithm) will not be included in the list.
+ *  The list is terminated with a null pointer.
+ *
+ *  The list must not be modified in any way.
+ */
+const char ** sb_stemmer_list(void);
+
+/** Create a new stemmer object, using the specified algorithm, for the
+ *  specified character encoding.
+ *
+ *  All algorithms will usually be available in UTF-8, but may also be
+ *  available in other character encodings.
+ *
+ *  @param algorithm The algorithm name.  This is either the english
+ *  name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
+ *  language.  Note that case is significant in this parameter - the
+ *  value should be supplied in lower case.
+ *
+ *  @param charenc The character encoding.  NULL may be passed as
+ *  this value, in which case UTF-8 encoding will be assumed. Otherwise,
+ *  the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
+ *  "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian).  Note that
+ *  case is significant in this parameter.
+ *
+ *  @return NULL if the specified algorithm is not recognised, or the
+ *  algorithm is not available for the requested encoding.  Otherwise,
+ *  returns a pointer to a newly created stemmer for the requested algorithm.
+ *  The returned pointer must be deleted by calling sb_stemmer_delete().
+ *
+ *  @note NULL will also be returned if an out of memory error occurs.
+ */
+struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
+
+/** Delete a stemmer object.
+ *
+ *  This frees all resources allocated for the stemmer.  After calling
+ *  this function, the supplied stemmer may no longer be used in any way.
+ *
+ *  It is safe to pass a null pointer to this function - this will have
+ *  no effect.
+ */
+void                sb_stemmer_delete(struct sb_stemmer * stemmer);
+
+/** Stem a word.
+ *
+ *  The return value is owned by the stemmer - it must not be freed or
+ *  modified, and it will become invalid when the stemmer is called again,
+ *  or if the stemmer is freed.
+ *
+ *  The length of the return value can be obtained using sb_stemmer_length().
+ *
+ *  If an out-of-memory error occurs, this will return NULL.
+ */
+const sb_symbol *   sb_stemmer_stem(struct sb_stemmer * stemmer,
+				    const sb_symbol * word, int size);
+
+/** Get the length of the result of the last stemmed word.
+ *  This should not be called before sb_stemmer_stem() has been called.
+ */
+int                 sb_stemmer_length(struct sb_stemmer * stemmer);
+
+#ifdef __cplusplus
+}
+#endif
+

libstemmer_c/libstemmer/libstemmer.c

+
+#include <stdlib.h>
+#include <string.h>
+#include "../include/libstemmer.h"
+#include "../runtime/api.h"
+#include "modules.h"
+
+struct sb_stemmer {
+    struct SN_env * (*create)(void);
+    void (*close)(struct SN_env *);
+    int (*stem)(struct SN_env *);
+
+    struct SN_env * env;
+};
+
+extern const char **
+sb_stemmer_list(void)
+{
+    return algorithm_names;
+}
+
+static stemmer_encoding_t
+sb_getenc(const char * charenc)
+{
+    struct stemmer_encoding * encoding;
+    if (charenc == NULL) return ENC_UTF_8;
+    for (encoding = encodings; encoding->name != 0; encoding++) {
+	if (strcmp(encoding->name, charenc) == 0) break;
+    }
+    if (encoding->name == NULL) return ENC_UNKNOWN;
+    return encoding->enc;
+}
+
+extern struct sb_stemmer *
+sb_stemmer_new(const char * algorithm, const char * charenc)
+{
+    stemmer_encoding_t enc;
+    struct stemmer_modules * module;
+    struct sb_stemmer * stemmer;
+
+    enc = sb_getenc(charenc);
+    if (enc == ENC_UNKNOWN) return NULL;
+
+    for (module = modules; module->name != 0; module++) {
+	if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+    }
+    if (module->name == NULL) return NULL;
+    
+    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+    if (stemmer == NULL) return NULL;
+
+    stemmer->create = module->create;
+    stemmer->close = module->close;
+    stemmer->stem = module->stem;
+
+    stemmer->env = stemmer->create();
+    if (stemmer->env == NULL)
+    {
+        sb_stemmer_delete(stemmer);
+        return NULL;
+    }
+
+    return stemmer;
+}
+
+void
+sb_stemmer_delete(struct sb_stemmer * stemmer)
+{
+    if (stemmer == 0) return;
+    if (stemmer->close == 0) return;
+    stemmer->close(stemmer->env);
+    stemmer->close = 0;
+    free(stemmer);
+}
+
+const sb_symbol *
+sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
+{
+    int ret;
+    if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
+    {
+        stemmer->env->l = 0;
+        return NULL;
+    }
+    ret = stemmer->stem(stemmer->env);
+    if (ret < 0) return NULL;
+    stemmer->env->p[stemmer->env->l] = 0;
+    return (const sb_symbol *)(stemmer->env->p);
+}
+
+int
+sb_stemmer_length(struct sb_stemmer * stemmer)
+{
+    return stemmer->env->l;
+}

libstemmer_c/libstemmer/libstemmer_c.in

+
+#include <stdlib.h>
+#include <string.h>
+#include "../include/libstemmer.h"
+#include "../runtime/api.h"
+#include "@MODULES_H@"
+
+struct sb_stemmer {
+    struct SN_env * (*create)(void);
+    void (*close)(struct SN_env *);
+    int (*stem)(struct SN_env *);
+
+    struct SN_env * env;
+};
+
+extern const char **
+sb_stemmer_list(void)
+{
+    return algorithm_names;
+}
+
+static stemmer_encoding_t
+sb_getenc(const char * charenc)
+{
+    struct stemmer_encoding * encoding;
+    if (charenc == NULL) return ENC_UTF_8;
+    for (encoding = encodings; encoding->name != 0; encoding++) {
+	if (strcmp(encoding->name, charenc) == 0) break;
+    }
+    if (encoding->name == NULL) return ENC_UNKNOWN;
+    return encoding->enc;
+}
+
+extern struct sb_stemmer *
+sb_stemmer_new(const char * algorithm, const char * charenc)
+{
+    stemmer_encoding_t enc;
+    struct stemmer_modules * module;
+    struct sb_stemmer * stemmer;
+
+    enc = sb_getenc(charenc);
+    if (enc == ENC_UNKNOWN) return NULL;
+
+    for (module = modules; module->name != 0; module++) {
+	if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+    }
+    if (module->name == NULL) return NULL;
+    
+    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+    if (stemmer == NULL) return NULL;
+
+    stemmer->create = module->create;
+    stemmer->close = module->close;
+    stemmer->stem = module->stem;
+
+    stemmer->env = stemmer->create();
+    if (stemmer->env == NULL)
+    {
+        sb_stemmer_delete(stemmer);
+        return NULL;
+    }
+
+    return stemmer;
+}
+
+void
+sb_stemmer_delete(struct sb_stemmer * stemmer)
+{
+    if (stemmer == 0) return;
+    if (stemmer->close == 0) return;
+    stemmer->close(stemmer->env);
+    stemmer->close = 0;
+    free(stemmer);
+}
+
+const sb_symbol *
+sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
+{
+    int ret;
+    if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
+    {
+        stemmer->env->l = 0;
+        return NULL;
+    }
+    ret = stemmer->stem(stemmer->env);
+    if (ret < 0) return NULL;
+    stemmer->env->p[stemmer->env->l] = 0;
+    return (const sb_symbol *)(stemmer->env->p);
+}
+
+int
+sb_stemmer_length(struct sb_stemmer * stemmer)
+{
+    return stemmer->env->l;
+}

libstemmer_c/libstemmer/libstemmer_utf8.c

+
+#include <stdlib.h>
+#include <string.h>
+#include "../include/libstemmer.h"
+#include "../runtime/api.h"
+#include "modules_utf8.h"
+
+struct sb_stemmer {
+    struct SN_env * (*create)(void);
+    void (*close)(struct SN_env *);
+    int (*stem)(struct SN_env *);
+
+    struct SN_env * env;
+};
+
+extern const char **
+sb_stemmer_list(void)
+{
+    return algorithm_names;
+}
+
+static stemmer_encoding_t
+sb_getenc(const char * charenc)
+{
+    struct stemmer_encoding * encoding;
+    if (charenc == NULL) return ENC_UTF_8;
+    for (encoding = encodings; encoding->name != 0; encoding++) {
+	if (strcmp(encoding->name, charenc) == 0) break;
+    }
+    if (encoding->name == NULL) return ENC_UNKNOWN;
+    return encoding->enc;
+}
+
+extern struct sb_stemmer *
+sb_stemmer_new(const char * algorithm, const char * charenc)
+{
+    stemmer_encoding_t enc;
+    struct stemmer_modules * module;
+    struct sb_stemmer * stemmer;
+
+    enc = sb_getenc(charenc);
+    if (enc == ENC_UNKNOWN) return NULL;
+
+    for (module = modules; module->name != 0; module++) {
+	if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+    }
+    if (module->name == NULL) return NULL;
+    
+    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+    if (stemmer == NULL) return NULL;
+
+    stemmer->create = module->create;
+    stemmer->close = module->close;
+    stemmer->stem = module->stem;
+
+    stemmer->env = stemmer->create();
+    if (stemmer->env == NULL)
+    {
+        sb_stemmer_delete(stemmer);
+        return NULL;
+    }
+
+    return stemmer;
+}
+
+void
+sb_stemmer_delete(struct sb_stemmer * stemmer)
+{
+    if (stemmer == 0) return;
+    if (stemmer->close == 0) return;
+    stemmer->close(stemmer->env);
+    stemmer->close = 0;
+    free(stemmer);
+}
+
+const sb_symbol *
+sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
+{
+    int ret;
+    if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
+    {
+        stemmer->env->l = 0;
+        return NULL;
+    }
+    ret = stemmer->stem(stemmer->env);
+    if (ret < 0) return NULL;
+    stemmer->env->p[stemmer->env->l] = 0;
+    return (const sb_symbol *)(stemmer->env->p);
+}
+
+int
+sb_stemmer_length(struct sb_stemmer * stemmer)
+{
+    return stemmer->env->l;
+}

libstemmer_c/libstemmer/modules.h

+/* libstemmer/modules.h: List of stemming modules.
+ *
+ * This file is generated by mkmodules.pl from a list of module names.
+ * Do not edit manually.
+ *
+ * Modules included by this file are: danish, dutch, english, finnish, french,
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
+ * russian, spanish, swedish, turkish
+ */
+
+#include "../src_c/stem_ISO_8859_1_danish.h"
+#include "../src_c/stem_UTF_8_danish.h"
+#include "../src_c/stem_ISO_8859_1_dutch.h"
+#include "../src_c/stem_UTF_8_dutch.h"
+#include "../src_c/stem_ISO_8859_1_english.h"
+#include "../src_c/stem_UTF_8_english.h"
+#include "../src_c/stem_ISO_8859_1_finnish.h"
+#include "../src_c/stem_UTF_8_finnish.h"
+#include "../src_c/stem_ISO_8859_1_french.h"
+#include "../src_c/stem_UTF_8_french.h"
+#include "../src_c/stem_ISO_8859_1_german.h"
+#include "../src_c/stem_UTF_8_german.h"
+#include "../src_c/stem_ISO_8859_1_hungarian.h"
+#include "../src_c/stem_UTF_8_hungarian.h"
+#include "../src_c/stem_ISO_8859_1_italian.h"
+#include "../src_c/stem_UTF_8_italian.h"
+#include "../src_c/stem_ISO_8859_1_norwegian.h"
+#include "../src_c/stem_UTF_8_norwegian.h"
+#include "../src_c/stem_ISO_8859_1_porter.h"
+#include "../src_c/stem_UTF_8_porter.h"
+#include "../src_c/stem_ISO_8859_1_portuguese.h"
+#include "../src_c/stem_UTF_8_portuguese.h"
+#include "../src_c/stem_ISO_8859_2_romanian.h"
+#include "../src_c/stem_UTF_8_romanian.h"
+#include "../src_c/stem_KOI8_R_russian.h"
+#include "../src_c/stem_UTF_8_russian.h"
+#include "../src_c/stem_ISO_8859_1_spanish.h"
+#include "../src_c/stem_UTF_8_spanish.h"
+#include "../src_c/stem_ISO_8859_1_swedish.h"
+#include "../src_c/stem_UTF_8_swedish.h"
+#include "../src_c/stem_UTF_8_turkish.h"
+
+typedef enum {
+  ENC_UNKNOWN=0,
+  ENC_ISO_8859_1,
+  ENC_ISO_8859_2,
+  ENC_KOI8_R,
+  ENC_UTF_8
+} stemmer_encoding_t;
+
+struct stemmer_encoding {
+  const char * name;
+  stemmer_encoding_t enc;
+};
+static struct stemmer_encoding encodings[] = {
+  {"ISO_8859_1", ENC_ISO_8859_1},
+  {"ISO_8859_2", ENC_ISO_8859_2},
+  {"KOI8_R", ENC_KOI8_R},
+  {"UTF_8", ENC_UTF_8},
+  {0,ENC_UNKNOWN}
+};
+
+struct stemmer_modules {
+  const char * name;
+  stemmer_encoding_t enc; 
+  struct SN_env * (*create)(void);
+  void (*close)(struct SN_env *);
+  int (*stem)(struct SN_env *);
+};
+static struct stemmer_modules modules[] = {
+  {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
+  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+  {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
+  {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+  {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
+  {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+  {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+  {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+  {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
+  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+  {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
+  {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+  {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
+  {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+  {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
+  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+  {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
+  {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+  {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
+  {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+  {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+  {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+  {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
+  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+  {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
+  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+  {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
+  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+  {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
+  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+  {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
+  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+  {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
+  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+  {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+  {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
+  {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+  {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
+  {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+  {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
+  {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+  {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
+  {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+  {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
+  {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
+  {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
+  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+  {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
+  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+  {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+  {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+  {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+  {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
+  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+  {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+  {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
+  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+  {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
+  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+  {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
+  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+  {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
+  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+  {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
+  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+  {0,ENC_UNKNOWN,0,0,0}
+};
+static const char * algorithm_names[] = {
+  "danish", 
+  "dutch", 
+  "english", 
+  "finnish", 
+  "french", 
+  "german", 
+  "hungarian", 
+  "italian", 
+  "norwegian", 
+  "porter", 
+  "portuguese", 
+  "romanian", 
+  "russian", 
+  "spanish", 
+  "swedish", 
+  "turkish", 
+  0
+};

libstemmer_c/libstemmer/modules.txt

+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+#  First item is name of stemmer.
+#  Second item is comma separated list of character sets.
+#  Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8, and also with
+# the most commonly used encoding.
+
+danish          UTF_8,ISO_8859_1        danish,da,dan
+dutch           UTF_8,ISO_8859_1        dutch,nl,dut,nld
+english         UTF_8,ISO_8859_1        english,en,eng
+finnish         UTF_8,ISO_8859_1        finnish,fi,fin
+french          UTF_8,ISO_8859_1        french,fr,fre,fra
+german          UTF_8,ISO_8859_1        german,de,ger,deu
+hungarian       UTF_8,ISO_8859_1        hungarian,hu,hun
+italian         UTF_8,ISO_8859_1        italian,it,ita
+norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
+portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
+romanian        UTF_8,ISO_8859_2        romanian,ro,rum,ron
+russian         UTF_8,KOI8_R            russian,ru,rus
+spanish         UTF_8,ISO_8859_1        spanish,es,esl,spa
+swedish         UTF_8,ISO_8859_1        swedish,sv,swe
+turkish         UTF_8                   turkish,tr,tur
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter          UTF_8,ISO_8859_1        porter
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is is not fully supported.  These
+# algorithms are:
+#
+# german2          - This is a slight modification of the german stemmer.
+#german2          UTF_8,ISO_8859_1        german2
+#
+# kraaij_pohlmann  - This is a different dutch stemmer.
+#kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann
+#
+# lovins           - This is an english stemmer, but fairly outdated, and
+#                    only really applicable to a restricted type of input text
+#                    (keywords in academic publications).
+#lovins           UTF_8,ISO_8859_1        lovins

libstemmer_c/libstemmer/modules_utf8.h

+/* libstemmer/modules_utf8.h: List of stemming modules.
+ *
+ * This file is generated by mkmodules.pl from a list of module names.
+ * Do not edit manually.
+ *
+ * Modules included by this file are: danish, dutch, english, finnish, french,
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
+ * russian, spanish, swedish, turkish
+ */
+
+#include "../src_c/stem_UTF_8_danish.h"
+#include "../src_c/stem_UTF_8_dutch.h"
+#include "../src_c/stem_UTF_8_english.h"
+#include "../src_c/stem_UTF_8_finnish.h"
+#include "../src_c/stem_UTF_8_french.h"
+#include "../src_c/stem_UTF_8_german.h"
+#include "../src_c/stem_UTF_8_hungarian.h"
+#include "../src_c/stem_UTF_8_italian.h"
+#include "../src_c/stem_UTF_8_norwegian.h"
+#include "../src_c/stem_UTF_8_porter.h"
+#include "../src_c/stem_UTF_8_portuguese.h"
+#include "../src_c/stem_UTF_8_romanian.h"
+#include "../src_c/stem_UTF_8_russian.h"
+#include "../src_c/stem_UTF_8_spanish.h"
+#include "../src_c/stem_UTF_8_swedish.h"
+#include "../src_c/stem_UTF_8_turkish.h"
+
+typedef enum {
+  ENC_UNKNOWN=0,
+  ENC_UTF_8
+} stemmer_encoding_t;
+
+struct stemmer_encoding {
+  const char * name;
+  stemmer_encoding_t enc;
+};
+static struct stemmer_encoding encodings[] = {
+  {"UTF_8", ENC_UTF_8},
+  {0,ENC_UNKNOWN}
+};
+
+struct stemmer_modules {
+  const char * name;
+  stemmer_encoding_t enc; 
+  struct SN_env * (*create)(void);
+  void (*close)(struct SN_env *);
+  int (*stem)(struct SN_env *);
+};
+static struct stemmer_modules modules[] = {
+  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+  {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+  {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+  {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+  {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+  {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+  {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+  {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+  {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+  {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+  {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+  {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+  {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+  {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
+  {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+  {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+  {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+  {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+  {0,ENC_UNKNOWN,0,0,0}
+};
+static const char * algorithm_names[] = {
+  "danish", 
+  "dutch", 
+  "english", 
+  "finnish", 
+  "french", 
+  "german", 
+  "hungarian", 
+  "italian", 
+  "norwegian", 
+  "porter", 
+  "portuguese", 
+  "romanian", 
+  "russian", 
+  "spanish", 
+  "swedish", 
+  "turkish", 
+  0
+};

libstemmer_c/libstemmer/modules_utf8.txt

+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+#  First item is name of stemmer.
+#  Second item is comma separated list of character sets.
+#  Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8.
+
+danish          UTF_8                   danish,da,dan
+dutch           UTF_8                   dutch,nl,dut,nld
+english         UTF_8                   english,en,eng
+finnish         UTF_8                   finnish,fi,fin
+french          UTF_8                   french,fr,fre,fra
+german          UTF_8                   german,de,ger,deu
+hungarian       UTF_8                   hungarian,hu,hun
+italian         UTF_8                   italian,it,ita
+norwegian       UTF_8                   norwegian,no,nor
+portuguese      UTF_8                   portuguese,pt,por
+romanian        UTF_8                   romanian,ro,rum,ron
+russian         UTF_8                   russian,ru,rus
+spanish         UTF_8                   spanish,es,esl,spa
+swedish         UTF_8                   swedish,sv,swe
+turkish         UTF_8                   turkish,tr,tur
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter          UTF_8                   porter
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is is not fully supported.  These
+# algorithms are:
+#
+# german2          - This is a slight modification of the german stemmer.
+#german2          UTF_8                   german2
+#
+# kraaij_pohlmann  - This is a different dutch stemmer.
+#kraaij_pohlmann  UTF_8                   kraaij_pohlmann
+#
+# lovins           - This is an english stemmer, but fairly outdated, and
+#                    only really applicable to a restricted type of input text
+#                    (keywords in academic publications).
+#lovins           UTF_8                   lovins

libstemmer_c/mkinc.mak

+# libstemmer/mkinc.mak: List of stemming module source files
+#
+# This file is generated by mkmodules.pl from a list of module names.
+# Do not edit manually.
+#
+# Modules included by this file are: danish, dutch, english, finnish, french,
+# german, hungarian, italian, norwegian, porter, portuguese, romanian,
+# russian, spanish, swedish, turkish
+
+snowball_sources= \
+  src_c/stem_ISO_8859_1_danish.c \
+  src_c/stem_UTF_8_danish.c \
+  src_c/stem_ISO_8859_1_dutch.c \
+  src_c/stem_UTF_8_dutch.c \
+  src_c/stem_ISO_8859_1_english.c \
+  src_c/stem_UTF_8_english.c \
+  src_c/stem_ISO_8859_1_finnish.c \
+  src_c/stem_UTF_8_finnish.c \
+  src_c/stem_ISO_8859_1_french.c \
+  src_c/stem_UTF_8_french.c \
+  src_c/stem_ISO_8859_1_german.c \
+  src_c/stem_UTF_8_german.c \
+  src_c/stem_ISO_8859_1_hungarian.c \
+  src_c/stem_UTF_8_hungarian.c \
+  src_c/stem_ISO_8859_1_italian.c \
+  src_c/stem_UTF_8_italian.c \
+  src_c/stem_ISO_8859_1_norwegian.c \
+  src_c/stem_UTF_8_norwegian.c \
+  src_c/stem_ISO_8859_1_porter.c \
+  src_c/stem_UTF_8_porter.c \
+  src_c/stem_ISO_8859_1_portuguese.c \
+  src_c/stem_UTF_8_portuguese.c \
+  src_c/stem_ISO_8859_2_romanian.c \
+  src_c/stem_UTF_8_romanian.c \
+  src_c/stem_KOI8_R_russian.c \
+  src_c/stem_UTF_8_russian.c \
+  src_c/stem_ISO_8859_1_spanish.c \
+  src_c/stem_UTF_8_spanish.c \
+  src_c/stem_ISO_8859_1_swedish.c \
+  src_c/stem_UTF_8_swedish.c \
+  src_c/stem_UTF_8_turkish.c \
+  runtime/api.c \
+  runtime/utilities.c \
+  libstemmer/libstemmer.c
+
+snowball_headers= \
+  src_c/stem_ISO_8859_1_danish.h \
+  src_c/stem_UTF_8_danish.h \
+  src_c/stem_ISO_8859_1_dutch.h \
+  src_c/stem_UTF_8_dutch.h \
+  src_c/stem_ISO_8859_1_english.h \
+  src_c/stem_UTF_8_english.h \
+  src_c/stem_ISO_8859_1_finnish.h \
+  src_c/stem_UTF_8_finnish.h \
+  src_c/stem_ISO_8859_1_french.h \
+  src_c/stem_UTF_8_french.h \
+  src_c/stem_ISO_8859_1_german.h \
+  src_c/stem_UTF_8_german.h \
+  src_c/stem_ISO_8859_1_hungarian.h \
+  src_c/stem_UTF_8_hungarian.h \
+  src_c/stem_ISO_8859_1_italian.h \
+  src_c/stem_UTF_8_italian.h \
+  src_c/stem_ISO_8859_1_norwegian.h \
+  src_c/stem_UTF_8_norwegian.h \
+  src_c/stem_ISO_8859_1_porter.h \
+  src_c/stem_UTF_8_porter.h \
+  src_c/stem_ISO_8859_1_portuguese.h \
+  src_c/stem_UTF_8_portuguese.h \
+  src_c/stem_ISO_8859_2_romanian.h \
+  src_c/stem_UTF_8_romanian.h \
+  src_c/stem_KOI8_R_russian.h \
+  src_c/stem_UTF_8_russian.h \
+  src_c/stem_ISO_8859_1_spanish.h \
+  src_c/stem_UTF_8_spanish.h \
+  src_c/stem_ISO_8859_1_swedish.h \
+  src_c/stem_UTF_8_swedish.h \
+  src_c/stem_UTF_8_turkish.h \
+  include/libstemmer.h \
+  libstemmer/modules.h \
+  runtime/api.h \
+  runtime/header.h
+

libstemmer_c/mkinc_utf8.mak

+# libstemmer/mkinc_utf8.mak: List of stemming module source files
+#
+# This file is generated by mkmodules.pl from a list of module names.
+# Do not edit manually.
+#
+# Modules included by this file are: danish, dutch, english, finnish, french,
+# german, hungarian, italian, norwegian, porter, portuguese, romanian,
+# russian, spanish, swedish, turkish
+
+snowball_sources= \
+  src_c/stem_UTF_8_danish.c \
+  src_c/stem_UTF_8_dutch.c \
+  src_c/stem_UTF_8_english.c \
+  src_c/stem_UTF_8_finnish.c \
+  src_c/stem_UTF_8_french.c \
+  src_c/stem_UTF_8_german.c \
+  src_c/stem_UTF_8_hungarian.c \
+  src_c/stem_UTF_8_italian.c \
+  src_c/stem_UTF_8_norwegian.c \
+  src_c/stem_UTF_8_porter.c \
+  src_c/stem_UTF_8_portuguese.c \
+  src_c/stem_UTF_8_romanian.c \
+  src_c/stem_UTF_8_russian.c \
+  src_c/stem_UTF_8_spanish.c \
+  src_c/stem_UTF_8_swedish.c \
+  src_c/stem_UTF_8_turkish.c \
+  runtime/api.c \
+  runtime/utilities.c \
+  libstemmer/libstemmer_utf8.c
+
+snowball_headers= \
+  src_c/stem_UTF_8_danish.h \
+  src_c/stem_UTF_8_dutch.h \
+  src_c/stem_UTF_8_english.h \
+  src_c/stem_UTF_8_finnish.h \
+  src_c/stem_UTF_8_french.h \
+  src_c/stem_UTF_8_german.h \
+  src_c/stem_UTF_8_hungarian.h \
+  src_c/stem_UTF_8_italian.h \
+  src_c/stem_UTF_8_norwegian.h \
+  src_c/stem_UTF_8_porter.h \
+  src_c/stem_UTF_8_portuguese.h \
+  src_c/stem_UTF_8_romanian.h \
+  src_c/stem_UTF_8_russian.h \
+  src_c/stem_UTF_8_spanish.h \
+  src_c/stem_UTF_8_swedish.h \
+  src_c/stem_UTF_8_turkish.h \
+  include/libstemmer.h \
+  libstemmer/modules_utf8.h \
+  runtime/api.h \
+  runtime/header.h
+

libstemmer_c/runtime/api.c

+
+#include <stdlib.h> /* for calloc, free */
+#include "header.h"
+
+extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
+{
+    struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
+    if (z == NULL) return NULL;
+    z->p = create_s();
+    if (z->p == NULL) goto error;
+    if (S_size)
+    {
+        int i;
+        z->S = (symbol * *) calloc(S_size, sizeof(symbol *));
+        if (z->S == NULL) goto error;
+
+        for (i = 0; i < S_size; i++)
+        {
+            z->S[i] = create_s();
+            if (z->S[i] == NULL) goto error;
+        }
+    }
+
+    if (I_size)
+    {
+        z->I = (int *) calloc(I_size, sizeof(int));
+        if (z->I == NULL) goto error;
+    }
+
+    if (B_size)
+    {
+        z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
+        if (z->B == NULL) goto error;
+    }
+
+    return z;
+error:
+    SN_close_env(z, S_size);
+    return NULL;
+}
+
+extern void SN_close_env(struct SN_env * z, int S_size)
+{
+    if (z == NULL) return;
+    if (S_size)
+    {
+        int i;
+        for (i = 0; i < S_size; i++)
+        {
+            lose_s(z->S[i]);
+        }
+        free(z->S);
+    }
+    free(z->I);
+    free(z->B);
+    if (z->p) lose_s(z->p);
+    free(z);
+}
+
+extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
+{
+    int err = replace_s(z, 0, z->l, size, s, NULL);
+    z->c = 0;
+    return err;
+}
+

libstemmer_c/runtime/api.h

+
+typedef unsigned char symbol;
+
+/* Or replace 'char' above with 'short' for 16 bit characters.
+
+   More precisely, replace 'char' with whatever type guarantees the
+   character width you need. Note however that sizeof(symbol) should divide
+   HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
+   there is an alignment problem. In the unlikely event of a problem here,
+   consult Martin Porter.
+
+*/
+
+struct SN_env {
+    symbol * p;
+    int c; int l; int lb; int bra; int ket;
+    symbol * * S;
+    int * I;
+    unsigned char * B;
+};
+
+extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
+extern void SN_close_env(struct SN_env * z, int S_size);
+
+extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
+

libstemmer_c/runtime/header.h

+
+#include <limits.h>
+
+#include "api.h"
+
+#define MAXINT INT_MAX
+#define MININT INT_MIN
+
+#define HEAD 2*sizeof(int)
+
+#define SIZE(p)        ((int *)(p))[-1]
+#define SET_SIZE(p, n) ((int *)(p))[-1] = n
+#define CAPACITY(p)    ((int *)(p))[-2]
+
+struct among
+{   int s_size;     /* number of chars in string */
+    const symbol * s;       /* search string */
+    int substring_i;/* index to longest matching substring */
+    int result;     /* result of the lookup */
+    int (* function)(struct SN_env *);
+};
+
+extern symbol * create_s(void);
+extern void lose_s(symbol * p);
+
+extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
+
+extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+
+extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+
+extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
+extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
+extern int eq_v(struct SN_env * z, const symbol * p);
+extern int eq_v_b(struct SN_env * z, const symbol * p);
+
+extern int find_among(struct SN_env * z, const struct among * v, int v_size);
+extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
+
+extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
+extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
+extern int slice_from_v(struct SN_env * z, const symbol * p);
+extern int slice_del(struct SN_env * z);
+
+extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
+extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
+
+extern symbol * slice_to(struct SN_env * z, symbol * p);
+extern symbol * assign_to(struct SN_env * z, symbol * p);
+
+extern void debug(struct SN_env * z, int number, int line_count);
+

libstemmer_c/runtime/utilities.c

+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "header.h"
+
+#define unless(C) if(!(C))
+
+#define CREATE_SIZE 1
+
+extern symbol * create_s(void) {
+    symbol * p;
+    void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
+    if (mem == NULL) return NULL;
+    p = (symbol *) (HEAD + (char *) mem);
+    CAPACITY(p) = CREATE_SIZE;
+    SET_SIZE(p, CREATE_SIZE);
+    return p;
+}
+
+extern void lose_s(symbol * p) {
+    if (p == NULL) return;
+    free((char *) p - HEAD);
+}
+
+/*
+   new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
+   if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
+   position, or 0 on failure.
+
+   -- used to implement hop and next in the utf8 case.
+*/
+
+extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
+    int b;
+    if (n >= 0) {
+        for (; n > 0; n--) {
+            if (c >= l) return -1;
+            b = p[c++];
+            if (b >= 0xC0) {   /* 1100 0000 */
+                while (c < l) {
+                    b = p[c];
+                    if (b >= 0xC0 || b < 0x80) break;
+                    /* break unless b is 10------ */
+                    c++;
+                }
+            }
+        }
+    } else {
+        for (; n < 0; n++) {
+            if (c <= lb) return -1;
+            b = p[--c];
+            if (b >= 0x80) {   /* 1000 0000 */
+                while (c > lb) {
+                    b = p[c];
+                    if (b >= 0xC0) break; /* 1100 0000 */
+                    c--;
+                }
+            }
+        }
+    }
+    return c;
+}
+
+/* Code for character groupings: utf8 cases */
+
+static int get_utf8(const symbol * p, int c, int l, int * slot) {
+    int b0, b1;
+    if (c >= l) return 0;
+    b0 = p[c++];
+    if (b0 < 0xC0 || c == l) {   /* 1100 0000 */
+        * slot = b0; return 1;
+    }
+    b1 = p[c++];
+    if (b0 < 0xE0 || c == l) {   /* 1110 0000 */
+        * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
+    }
+    * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
+}
+
+static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
+    int b0, b1;
+    if (c <= lb) return 0;
+    b0 = p[--c];
+    if (b0 < 0x80 || c == lb) {   /* 1000 0000 */
+        * slot = b0; return 1;
+    }
+    b1 = p[--c];
+    if (b1 >= 0xC0 || c == lb) {   /* 1100 0000 */
+        * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
+    }
+    * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
+}
+
+extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	int w = get_utf8(z->p, z->c, z->l, & ch);
+	unless (w) return -1;
+	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return w;
+	z->c += w;
+    } while (repeat);
+    return 0;
+}
+
+extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+	unless (w) return -1;
+	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return w;
+	z->c -= w;
+    } while (repeat);
+    return 0;
+}
+
+extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	int w = get_utf8(z->p, z->c, z->l, & ch);
+	unless (w) return -1;
+	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return w;
+	z->c += w;
+    } while (repeat);
+    return 0;
+}
+
+extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+	unless (w) return -1;
+	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return w;
+	z->c -= w;
+    } while (repeat);
+    return 0;
+}
+
+/* Code for character groupings: non-utf8 cases */
+
+extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	if (z->c >= z->l) return -1;
+	ch = z->p[z->c];
+	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return 1;
+	z->c++;
+    } while (repeat);
+    return 0;
+}
+
+extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	if (z->c <= z->lb) return -1;
+	ch = z->p[z->c - 1];
+	if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return 1;
+	z->c--;
+    } while (repeat);
+    return 0;
+}
+
+extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	if (z->c >= z->l) return -1;
+	ch = z->p[z->c];
+	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return 1;
+	z->c++;
+    } while (repeat);
+    return 0;
+}
+
+extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    do {
+	int ch;
+	if (z->c <= z->lb) return -1;
+	ch = z->p[z->c - 1];
+	unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+	    return 1;
+	z->c--;
+    } while (repeat);
+    return 0;
+}
+
+extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
+    if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
+    z->c += s_size; return 1;
+}
+
+extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
+    if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
+    z->c -= s_size; return 1;
+}
+
+extern int eq_v(struct SN_env * z, const symbol * p) {
+    return eq_s(z, SIZE(p), p);
+}
+
+extern int eq_v_b(struct SN_env * z, const symbol * p) {
+    return eq_s_b(z, SIZE(p), p);
+}
+
+extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
+
+    int i = 0;
+    int j = v_size;
+
+    int c = z->c; int l = z->l;
+    symbol * q = z->p + c;
+
+    const struct among * w;
+
+    int common_i = 0;
+    int common_j = 0;
+
+    int first_key_inspected = 0;
+
+    while(1) {
+        int k = i + ((j - i) >> 1);
+        int diff = 0;
+        int common = common_i < common_j ? common_i : common_j; /* smaller */
+        w = v + k;
+        {
+            int i2; for (i2 = common; i2 < w->s_size; i2++) {
+                if (c + common == l) { diff = -1; break; }
+                diff = q[common] - w->s[i2];
+                if (diff != 0) break;
+                common++;
+            }
+        }
+        if (diff < 0) { j = k; common_j = common; }
+                 else { i = k; common_i = common; }
+        if (j - i <= 1) {
+            if (i > 0) break; /* v->s has been inspected */
+            if (j == i) break; /* only one item in v */
+
+            /* - but now we need to go round once more to get
+               v->s inspected. This looks messy, but is actually
+               the optimal approach.  */
+
+            if (first_key_inspected) break;
+            first_key_inspected = 1;
+        }
+    }
+    while(1) {
+        w = v + i;
+        if (common_i >= w->s_size) {
+            z->c = c + w->s_size;
+            if (w->function == 0) return w->result;
+            {
+                int res = w->function(z);
+                z->c = c + w->s_size;
+                if (res) return w->result;
+            }
+        }
+        i = w->substring_i;
+        if (i < 0) return 0;
+    }
+}
+
+/* find_among_b is for backwards processing. Same comments apply */
+
+extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
+
+    int i = 0;
+    int j = v_size;
+
+    int c = z->c; int lb = z->lb;
+    symbol * q = z->p + c - 1;
+
+    const struct among * w;
+
+    int common_i = 0;
+    int common_j = 0;
+
+    int first_key_inspected = 0;
+
+    while(1) {
+        int k = i + ((j - i) >> 1);
+        int diff = 0;
+        int common = common_i < common_j ? common_i : common_j;
+        w = v + k;
+        {
+            int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
+                if (c - common == lb) { diff = -1; break; }
+                diff = q[- common] - w->s[i2];
+                if (diff != 0) break;
+                common++;
+            }
+        }
+        if (diff < 0) { j = k; common_j = common; }
+                 else { i = k; common_i = common; }
+        if (j - i <= 1) {
+            if (i > 0) break;
+            if (j == i) break;
+            if (first_key_inspected) break;
+            first_key_inspected = 1;
+        }
+    }
+    while(1) {
+        w = v + i;
+        if (common_i >= w->s_size) {
+            z->c = c - w->s_size;
+            if (w->function == 0) return w->result;
+            {
+                int res = w->function(z);
+                z->c = c - w->s_size;
+                if (res) return w->result;
+            }
+        }
+        i = w->substring_i;
+        if (i < 0) return 0;
+    }
+}
+
+
+/* Increase the size of the buffer pointed to by p to at least n symbols.
+ * If insufficient memory, returns NULL and frees the old buffer.
+ */
+static symbol * increase_size(symbol * p, int n) {
+    symbol * q;
+    int new_size = n + 20;
+    void * mem = realloc((char *) p - HEAD,
+                         HEAD + (new_size + 1) * sizeof(symbol));
+    if (mem == NULL) {
+        lose_s(p);
+        return NULL;
+    }
+    q = (symbol *) (HEAD + (char *)mem);
+    CAPACITY(q) = new_size;
+    return q;
+}
+
+/* to replace symbols between c_bra and c_ket in z->p by the
+   s_size symbols at s.
+   Returns 0 on success, -1 on error.
+   Also, frees z->p (and sets it to NULL) on error.
+*/
+extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
+{
+    int adjustment;
+    int len;
+    if (z->p == NULL) {
+        z->p = create_s();
+        if (z->p == NULL) return -1;
+    }
+    adjustment = s_size - (c_ket - c_bra);
+    len = SIZE(z->p);
+    if (adjustment != 0) {
+        if (adjustment + len > CAPACITY(z->p)) {
+            z->p = increase_size(z->p, adjustment + len);
+            if (z->p == NULL) return -1;
+        }
+        memmove(z->p + c_ket + adjustment,
+                z->p + c_ket,
+                (len - c_ket) * sizeof(symbol));
+        SET_SIZE(z->p, adjustment + len);
+        z->l += adjustment;
+        if (z->c >= c_ket)
+            z->c += adjustment;
+        else
+            if (z->c > c_bra)
+                z->c = c_bra;
+    }
+    unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
+    if (adjptr != NULL)
+        *adjptr = adjustment;
+    return 0;
+}
+
+static int slice_check(struct SN_env * z) {
+
+    if (z->bra < 0 ||
+        z->bra > z->ket ||
+        z->ket > z->l ||
+        z->p == NULL ||
+        z->l > SIZE(z->p)) /* this line could be removed */
+    {
+#if 0
+        fprintf(stderr, "faulty slice operation:\n");
+        debug(z, -1, 0);
+#endif
+        return -1;
+    }
+    return 0;
+}
+
+extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
+    if (slice_check(z)) return -1;
+    return replace_s(z, z->bra, z->ket, s_size, s, NULL);
+}
+
+extern int slice_from_v(struct SN_env * z, const symbol * p) {
+    return slice_from_s(z, SIZE(p), p);
+}
+
+extern int slice_del(struct SN_env * z) {
+    return slice_from_s(z, 0, 0);
+}
+
+extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
+    int adjustment;
+    if (replace_s(z, bra, ket, s_size, s, &adjustment))
+        return -1;
+    if (bra <= z->bra) z->bra += adjustment;
+    if (bra <= z->ket) z->ket += adjustment;
+    return 0;
+}
+
+extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
+    int adjustment;
+    if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
+        return -1;
+    if (bra <= z->bra) z->bra += adjustment;
+    if (bra <= z->ket) z->ket += adjustment;
+    return 0;
+}
+
+extern symbol * slice_to(struct SN_env * z, symbol * p) {
+    if (slice_check(z)) {
+        lose_s(p);
+        return NULL;
+    }
+    {
+        int len = z->ket - z->bra;
+        if (CAPACITY(p) < len) {
+            p = increase_size(p, len);
+            if (p == NULL)
+                return NULL;
+        }
+        memmove(p, z->p + z->bra, len * sizeof(symbol));
+        SET_SIZE(p, len);
+    }
+    return p;
+}
+
+extern symbol * assign_to(struct SN_env * z, symbol * p) {
+    int len = z->l;
+    if (CAPACITY(p) < len) {
+        p = increase_size(p, len);
+        if (p == NULL)
+            return NULL;
+    }
+    memmove(p, z->p, len * sizeof(symbol));
+    SET_SIZE(p, len);
+    return p;
+}
+
+#if 0
+extern void debug(struct SN_env * z, int number, int line_count) {
+    int i;
+    int limit = SIZE(z->p);
+    /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
+    if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
+    for (i = 0; i <= limit; i++) {
+        if (z->lb == i) printf("{");
+        if (z->bra == i) printf("[");
+        if (z->c == i) printf("|");
+        if (z->ket == i) printf("]");
+        if (z->l == i) printf("}");
+        if (i < limit)
+        {   int ch = z->p[i];
+            if (ch == 0) ch = '#';
+            printf("%c", ch);
+        }
+    }
+    printf("'\n");
+}
+#endif