1 Fix affixes with void replacement (AFAIK, it's only russian)

2 Optimize regex execution
2025-06-01 00:01:20 -04:00 · 2004-06-23 11:06:11 +00:00 · 2004-06-23 11:06:11 +00:00 · de55c0cef6
commit de55c0cef6
parent 153d5d31eb
6 changed files with 338 additions and 75 deletions
--- a/contrib/tsearch2/ispell/Makefile
+++ b/contrib/tsearch2/ispell/Makefile
@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.5 2003/11/29 19:51:36 pgsql Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.6 2004/06/23 11:06:11 teodor Exp $
 subdir = contrib/tsearch2/ispell
 top_builddir = ../../..
@ -8,7 +8,7 @@ include $(top_builddir)/src/Makefile.global
 PG_CPPFLAGS = -I$(srcdir)/.. $(CPPFLAGS)
 override CFLAGS += $(CFLAGS_SL)
-SUBOBJS = spell.o 
+SUBOBJS = spell.o regis.o 
 all: SUBSYS.o
--- a/contrib/tsearch2/ispell/regis.c
+++ b/contrib/tsearch2/ispell/regis.c
@ -0,0 +1,151 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include "regis.h"
 #include "common.h"
 /*
  * RS_isRegis
  *		Decide whether an affix mask can be handled by the simple "regis"
  *		matcher instead of the full regular-expression engine.
  *
  * The accepted grammar is exactly the one RS_compile() parses: a sequence of
  * items, each being either a single letter, a set "[abc]" (at least one
  * letter) or a negated set "[^abc]".  Checking only the character set is not
  * enough: strings such as "abc]", "^a" or "[abc" consist of permitted
  * characters but are not well-formed, and handing them to RS_compile() would
  * raise an error instead of letting the caller fall back to a real regex.
  *
  * str: mask to examine (NUL-terminated).
  *
  * Returns 1 if str is a well-formed regis pattern, 0 otherwise.  A NULL or
  * empty string yields 1, as it always has.
  */
 int
 RS_isRegis(const char *str) {
 	/* local names: the RS_IN_* macros are defined further down the file */
 	enum {
 		ST_WAIT,		/* outside of any set */
 		ST_SET_OPEN,	/* just consumed '[' */
 		ST_SET_BODY		/* inside "[x..." or "[^..." */
 	} state = ST_WAIT;
 	const unsigned char *ptr = (const unsigned char *)str;

 	if (ptr == NULL)
 		return 1;

 	for (; *ptr; ptr++) {
 		switch (state) {
 			case ST_WAIT:
 				if ( isalpha(*ptr) )
 					break;
 				if ( *ptr != '[' )
 					return 0;
 				state = ST_SET_OPEN;
 				break;
 			case ST_SET_OPEN:
 				/* first position of a set: '^' or the first member */
 				if ( *ptr != '^' && !isalpha(*ptr) )
 					return 0;
 				state = ST_SET_BODY;
 				break;
 			case ST_SET_BODY:
 				if ( isalpha(*ptr) )
 					break;
 				if ( *ptr != ']' )
 					return 0;
 				state = ST_WAIT;
 				break;
 		}
 	}

 	/* a set that was opened must also have been closed */
 	return (state == ST_WAIT) ? 1 : 0;
 }
 #define RS_IN_ONEOF	1
 #define RS_IN_ONEOF_IN	2
 #define RS_IN_NONEOF	3
 #define RS_IN_WAIT	4
 /*
  * newRegisNode
  *		Allocate a zero-filled RegisNode with room for len+1 bytes of set
  *		data and, when prev is given, link it in as prev's successor.
  *
  * Reports "No memory" through ts_error() if malloc fails.
  * Returns the new node.
  */
 static RegisNode*
 newRegisNode(RegisNode *prev, int len) {
 	size_t		nbytes = RNHDRSZ+len+1;
 	RegisNode  *node = (RegisNode*)malloc(nbytes);

 	if (node == NULL)
 		ts_error(ERROR, "No memory"); 

 	memset(node, 0, nbytes);

 	if (prev != NULL)
 		prev->next = node;

 	return node;
 }
 /*
  * RS_compile
  *		Translate the regis pattern str into a linked list of RegisNode
  *		hanging off r.
  *
  * Each top-level letter and each bracketed set ("[abc]" or "[^abc]") becomes
  * one node, i.e. one node per character position to be matched; the number
  * of nodes is stored in r->nchar.
  *
  * r:        output structure; it is overwritten, not freed first.
  * issuffix: non-zero if the pattern is anchored at the end of the word,
  *           zero if it is anchored at the beginning.
  * str:      pattern text.
  *
  * Malformed patterns are reported through ts_error(ERROR, ...).  That
  * includes a pattern that ends inside a '[' set: without the final check
  * such input would silently be compiled as if the set had been closed.
  *
  * Returns 0.
  */
 int
 RS_compile(Regis *r, int issuffix, const char *str) {
 	int i,len = strlen(str);
 	int state = RS_IN_WAIT;
 	RegisNode	*ptr=NULL;

 	memset(r,0,sizeof(Regis));
 	r->issuffix = (issuffix) ? 1 : 0;

 	for(i=0;i<len;i++) {
 		unsigned char c = ((const unsigned char*)str)[i];

 		if ( state == RS_IN_WAIT ) {
 			if ( isalpha(c) || c=='[' ) {
 				/* a letter or an opening bracket starts a new node */
 				RegisNode *node = newRegisNode(ptr,len);

 				if ( ptr == NULL )
 					r->node = node;
 				ptr = node;
 				ptr->type = RSF_ONEOF;
 				if ( c=='[' )
 					state=RS_IN_ONEOF;
 				else {
 					/* single letter: a one-element set */
 					ptr->data[ 0 ] = c;
 					ptr->len=1;
 				}
 			} else 
 				ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
 		} else if ( state == RS_IN_ONEOF ) {
 			/* first position after '[': negation marker or first member */
 			if ( c=='^' ) {
 				ptr->type = RSF_NONEOF;
 				state=RS_IN_NONEOF;
 			} else if ( isalpha(c) ) {
 				ptr->data[ 0 ] = c;
 				ptr->len=1;
 				state=RS_IN_ONEOF_IN;
 			} else
 				ts_error(ERROR,"Error in regis: %s at pos %d\n", str,  i+1);
 		} else if ( state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF ) {
 			/* inside a set: collect members until the closing bracket */
 			if ( isalpha(c) ) {
 				ptr->data[ ptr->len ] = c;
 				ptr->len++;
 			} else if ( c==']' ) {
 				state=RS_IN_WAIT;
 			} else
 				ts_error(ERROR,"Error in regis: %s at pos %d\n", str,  i+1);
 		} else
 			ts_error(ERROR,"Internal error in RS_compile: %d\n", state);
 	}

 	/* the pattern must not end in the middle of a '[' set */
 	if ( state != RS_IN_WAIT )
 		ts_error(ERROR,"Error in regis: %s: missing ']'\n", str);

 	/* one node per character position */
 	for(ptr = r->node; ptr; ptr=ptr->next)
 		r->nchar++;

 	return 0;
 }
 /*
  * RS_free
  *		Release every node of a compiled regis and clear the list head.
  *		The Regis structure itself is not freed.
  */
 void 
 RS_free(Regis *r) {
 	RegisNode *node;
 	RegisNode *following;

 	for (node = r->node; node != NULL; node = following) {
 		/* fetch the link before the node goes away */
 		following = node->next;
 		free(node);
 	}

 	r->node = NULL;
 }
 /*
  * RS_execute
  *		Match str against a compiled regis.
  *
  * r:   compiled pattern.
  * str: text to test.
  * len: length of str in bytes; a negative value means "use strlen(str)".
  *
  * A suffix pattern is compared with the last r->nchar bytes of str, any
  * other pattern with the first r->nchar bytes.
  *
  * Returns 1 if every node accepts its byte, 0 otherwise (including the case
  * where str is shorter than the pattern).
  */
 int 
 RS_execute(Regis *r, const char *str, int len) {
 	RegisNode *node;
 	unsigned char *cur = (unsigned char*)str;

 	if (len<0)
 		len=strlen(str);

 	if (len<r->nchar)
 		return 0;

 	if ( r->issuffix ) 
 		cur += len - r->nchar;

 	for (node = r->node; node; node = node->next, cur++) {
 		int		inset;

 		if ( node->type != RSF_ONEOF && node->type != RSF_NONEOF ) {
 			ts_error(ERROR,"RS_execute: Unknown type node: %d\n", node->type);
 			continue;
 		}

 		/* an empty set is compared against its (zeroed) first data byte */
 		if ( node->len==0 )
 			inset = ( *cur == *(node->data) );
 		else
 			inset = ( strchr((char*)node->data, *cur) != NULL );

 		if ( node->type == RSF_ONEOF ) {
 			if ( !inset )
 				return 0;
 		} else {
 			if ( inset )
 				return 0;
 		}
 	}

 	return 1;
 }
--- a/contrib/tsearch2/ispell/regis.h
+++ b/contrib/tsearch2/ispell/regis.h
@ -0,0 +1,34 @@
 #ifndef __REGIS_H__
 #define __REGIS_H__
 #include "postgres.h" 
 #include <stddef.h>
 /*
  * One element of a compiled regis pattern: the set of bytes that is allowed
  * (or forbidden) at one character position.
  */
 typedef struct RegisNode {
 	/*
 	 * type: RSF_ONEOF or RSF_NONEOF.
 	 * len:  number of bytes stored in data.
 	 */
 	uint32	
 		type:2,
 		len:16,
 		unused:14;
 	/* node for the next character position, or NULL at the end of the list */
 	struct RegisNode *next;
 	/*
 	 * Members of the set.  Declared with one element, but newRegisNode()
 	 * allocates extra bytes behind the struct so that it can hold more.
 	 */
 	unsigned char 	data[1];
 } RegisNode;
 /*
  * Size of the part of a RegisNode that precedes its variable-length data.
  * This has to be the real offset of the member: adding up the sizes of the
  * preceding fields ignores the padding the compiler inserts in front of the
  * pointer (4 bytes on typical 64-bit ABIs), which makes allocations based on
  * this macro too small.
  */
 #define  RNHDRSZ 	(offsetof(RegisNode, data))
 #define	RSF_ONEOF	1
 #define	RSF_NONEOF	2
 /*
  * A compiled regis pattern, filled in by RS_compile().
  */
 typedef struct Regis {
 	/* first node of the list, one node per character position */
 	RegisNode *node;
 	/*
 	 * issuffix: 1 if the pattern is matched against the end of the word,
 	 *           0 if it is matched against the beginning.
 	 * nchar:    number of nodes in the list.
 	 */
 	uint32	
 		issuffix:1,
 		nchar:16,
 		unused:15;
 } Regis;
 int RS_isRegis(const char *str);
 int RS_compile(Regis *r, int issuffix, const char *str);
 void RS_free(Regis *r);
 /* returns 1 if the string matches */
 int RS_execute(Regis *r, const char *str, int len); 
 #endif
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@ -190,24 +190,24 @@ FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
 {
 	SPNode *node = Conf->Dictionary;
 	SPNodeData *StopLow, *StopHigh, *StopMiddle;
-	int level=0, wrdlen=strlen(word);
+	uint8 *ptr =(uint8*)word;
-	while( node && level<wrdlen) {
+	while( node && *ptr) {
 		StopLow = node->data;
 		StopHigh = node->data+node->length;
 		while (StopLow < StopHigh) {
-			StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
-			if ( StopMiddle->val == ((uint8*)(word))[level] ) {
+			if ( StopMiddle->val == *ptr ) {
-				if ( wrdlen==level+1 && StopMiddle->isword ) {
+				if ( *(ptr+1)=='\0' && StopMiddle->isword ) {
 					if ( compoundonly && !StopMiddle->compoundallow )
 						return 0;
 					if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
 						return 1;
 				}
 				node=StopMiddle->node;
-				level++;
+				ptr++;
 				break;
-			} else if ( StopMiddle->val < ((uint8*)(word))[level] ) {
+			} else if ( StopMiddle->val < *ptr ) {
 				StopLow = StopMiddle + 1;
 			} else {
 				StopHigh = StopMiddle;
@ -236,19 +236,32 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
 		}
 		MEMOUT(Conf->Affix);
 	}
 	if (type == 's')
 		sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
 	else
 		sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
 	Conf->Affix[Conf->naffixes].compile = 1;
 	Conf->Affix[Conf->naffixes].flagflags = flagflags;
 	Conf->Affix[Conf->naffixes].flag = flag;
 	Conf->Affix[Conf->naffixes].type = type;
-	strcpy(Conf->Affix[Conf->naffixes].find, find);
+        if ( strcmp(mask,".")==0 ) {
-	strcpy(Conf->Affix[Conf->naffixes].repl, repl);
+                Conf->Affix[Conf->naffixes].issimple=1;
-	Conf->Affix[Conf->naffixes].replen = strlen(repl);
+                Conf->Affix[Conf->naffixes].isregis=0;
-	Conf->naffixes++;
+                *( Conf->Affix[Conf->naffixes].mask )='\0';
        } else if ( RS_isRegis(mask) ) {
                Conf->Affix[Conf->naffixes].issimple=0;
                Conf->Affix[Conf->naffixes].isregis=1;
                strcpy(Conf->Affix[Conf->naffixes].mask, mask);
        } else {
                Conf->Affix[Conf->naffixes].issimple=0;
                Conf->Affix[Conf->naffixes].isregis=0;
                if (type == FF_SUFFIX)
                        sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
                else
                        sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
        }
        Conf->Affix[Conf->naffixes].compile = 1;
        Conf->Affix[Conf->naffixes].flagflags = flagflags;
        Conf->Affix[Conf->naffixes].flag = flag;
        Conf->Affix[Conf->naffixes].type = type;
        strcpy(Conf->Affix[Conf->naffixes].find, find);
        strcpy(Conf->Affix[Conf->naffixes].repl, repl);
        Conf->Affix[Conf->naffixes].replen = strlen(repl);
        Conf->naffixes++;
 	return (0);
 }
@ -366,7 +379,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
 				continue;
 		}
-		NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
+		NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
 	}
 	fclose(affix);
@ -550,6 +563,46 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) {
 	return rs;
 }
 /*
  * mkVoidAffix
  *		Put a "void" node in front of the prefix or suffix tree.
  *
  * The new node has isvoid set and a single data element.  That element
  * points to the previous root of the tree and lists every affix in the
  * relevant part of Conf->Affix whose replen is 0.
  *
  * Conf:        dictionary being built.
  * issuffix:    non-zero to work on Conf->Suffix and the affixes
  *              [startsuffix, Conf->naffixes); zero to work on Conf->Prefix
  *              and the affixes [0, startsuffix).
  * startsuffix: index of the first suffix in Conf->Affix.
  */
 static void
 mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) {
 	int i,cnt=0;
 	int start = (issuffix) ? startsuffix : 0;
 	int end = (issuffix) ? Conf->naffixes : startsuffix;
 	/*
 	 * The node carries exactly one data element, which is what
 	 * sizeof(AffixNode) describes.  Computing the size as
 	 * ANHRDSZ + sizeof(AffixNodeData) is too small whenever the compiler
 	 * pads between the header word and data[].
 	 */
 	AffixNode       *Affix = (AffixNode*)malloc( sizeof(AffixNode) );

 	MEMOUT(Affix);
 	memset(Affix, 0, sizeof(AffixNode) );
 	Affix->length=1;
 	Affix->isvoid=1;

 	/* chain the old root behind the void node */
 	if (issuffix) {
 		Affix->data->node=Conf->Suffix;
 		Conf->Suffix = Affix;
 	} else {
 		Affix->data->node=Conf->Prefix;
 		Conf->Prefix = Affix;
 	}

 	/* first pass: count the affixes with an empty replacement */
 	for(i=start;i<end;i++)
 		if (Conf->Affix[i].replen==0)
 			cnt++;

 	if ( cnt==0 )
 		return;

 	Affix->data->aff = (AFFIX**)malloc( sizeof(AFFIX*) * cnt );
 	MEMOUT(Affix->data->aff);
 	Affix->data->naff = (uint32)cnt;

 	/* second pass: record them */
 	cnt=0; 
 	for(i=start;i<end;i++)
 		if (Conf->Affix[i].replen==0) {
 			Affix->data->aff[cnt] = Conf->Affix + i;
 			cnt++;
 		}
 }
 void
 NISortAffixes(IspellDict * Conf)
 {
@ -584,6 +637,8 @@ NISortAffixes(IspellDict * Conf)
 	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p'); 
 	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
        mkVoidAffix(Conf, 1, firstsuffix);
        mkVoidAffix(Conf, 0, firstsuffix);
 }
 static AffixNodeData*
@ -591,17 +646,23 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
 	AffixNodeData *StopLow, *StopHigh, *StopMiddle;
 	uint8 symbol;
        if ( node->isvoid ) { /* search void affixes */
                if (node->data->naff)
                        return node->data;
                node = node->data->node;
        }
 	while( node && *level<wrdlen) {
 		StopLow = node->data;
 		StopHigh = node->data+node->length;
 		while (StopLow < StopHigh) {
-			StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
 			symbol = GETWCHAR(word,wrdlen,*level,type);
 			if ( StopMiddle->val == symbol ) {
 				(*level)++;
 				if ( StopMiddle->naff ) 
 					return StopMiddle;
 				node=StopMiddle->node;
 				(*level)++;
 				break;
 			} else if ( StopMiddle->val < symbol ) {
 				StopLow = StopMiddle + 1;
@ -617,11 +678,6 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
 static char *
 CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
 	regmatch_t	subs[2];		/* workaround for apache&linux */
 	int			err;
 	pg_wchar   *data;
 	size_t		data_len;
 	int			dat_len;
 	if ( flagflags & FF_COMPOUNDONLYAFX ) {
 		if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
@ -631,7 +687,7 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
 			return NULL;
 	} 
-	if ( Affix->type=='s' ) {
+	if ( Affix->type==FF_SUFFIX ) {
 		strcpy(newword, word);
 		strcpy(newword + len - Affix->replen, Affix->find);
 	} else {
@ -639,34 +695,50 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
 		strcat(newword, word + Affix->replen);
 	}
-	if (Affix->compile)
+        if ( Affix->issimple ) {
-	{
+                return newword;
-		int wmasklen,masklen = strlen(Affix->mask);
+        } else if ( Affix->isregis ) {
-		pg_wchar *mask;
+                if (Affix->compile) {
-		mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
+                        RS_compile(&(Affix->reg.regis), (Affix->type==FF_SUFFIX) ? 1 : 0, Affix->mask);
-		wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
+                        Affix->compile = 0;
-		
+                }
-		err = pg_regcomp(&(Affix->reg), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
+                if ( RS_execute(&(Affix->reg.regis), newword, -1) )
-		pfree(mask);
+                        return newword;
-		if (err)
+	} else {
 		regmatch_t	subs[2];		/* workaround for apache&linux */
 		int			err;
 		pg_wchar   *data;
 		size_t		data_len;
 		int	dat_len;
 		if (Affix->compile)
 		{
-			/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
+			int wmasklen,masklen = strlen(Affix->mask);
-			pg_regfree(&(Affix->reg));
+			pg_wchar *mask;
-			return (NULL);
+			mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
 			wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
 			err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
 			pfree(mask);
 			if (err)
 			{
 				/* regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE); */
 				pg_regfree(&(Affix->reg.regex));
 				return (NULL);
 			}
 			Affix->compile = 0;
 		}
 		Affix->compile = 0;
 	}
-	/* Convert data string to wide characters */
+		/* Convert data string to wide characters */
-	dat_len = strlen(newword);
+		dat_len = strlen(newword);
-	data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+		data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-	data_len = pg_mb2wchar_with_len(newword, data, dat_len);
+		data_len = pg_mb2wchar_with_len(newword, data, dat_len);
-	if (!(err = pg_regexec(&(Affix->reg), data,dat_len,NULL, 1, subs, 0))) {
+		if (!(err = pg_regexec(&(Affix->reg.regex), data,dat_len,NULL, 1, subs, 0))) {
-			pfree(data); 
+				pfree(data); 
-			return newword;
+				return newword;
 		}
 		pfree(data);
 	}
 	pfree(data);
 	return NULL;
 }
@ -715,7 +787,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
 			}
 		}
 		pnode = prefix->node;
 		plevel++;
 	}
 	/* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
@ -754,13 +825,11 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
 						}
 					}
 					pnode = prefix->node;
 					plevel++;
 				} 
 			}
 		}
 		snode=suffix->node;
 		slevel++;
 	}
 	if (cur == forms) {
@ -1013,8 +1082,12 @@ NIFree(IspellDict * Conf)
 	for (i = 0; i < Conf->naffixes; i++)
 	{
-		if (Affix[i].compile == 0)
+		if (Affix[i].compile == 0) {
-			pg_regfree(&(Affix[i].reg));
+                        if ( Affix[i].isregis )
                                RS_free(&(Affix[i].reg.regis));
                        else
 				pg_regfree(&(Affix[i].reg.regex));
 		}
 	}
 	if (Conf->Spell) {
 		for (i = 0; i < Conf->nspell; i++)
--- a/contrib/tsearch2/ispell/spell.h
+++ b/contrib/tsearch2/ispell/spell.h
@ -3,6 +3,7 @@
 #include <sys/types.h>
 #include "regex/regex.h"
 #include "regis.h"
 #include "c.h"
@ -40,20 +41,29 @@ typedef struct spell_struct
 typedef struct aff_struct
 {
-	char		flag;
+        uint32
-	char		flagflags;
+                flag:8,
-	char		type;
+                type:2,
-	char		mask[33];
+                compile:1,
-	char		find[16];
+                flagflags:3,
-	char		repl[16];
+                issimple:1,
-	regex_t		reg;
+                isregis:1,
-	size_t		replen;
+                unused:1,
-	char		compile;
+                replen:16;
        char            mask[32];
        char            find[16];
        char            repl[16];
        union {
                regex_t         regex;
                Regis           regis;
        } reg;
 }	AFFIX;
 #define FF_CROSSPRODUCT 	0x01
 #define FF_COMPOUNDWORD 	0x02
 #define FF_COMPOUNDONLYAFX      0x04
 #define FF_SUFFIX               2
 #define FF_PREFIX               1
 struct AffixNode;
@ -66,18 +76,13 @@ typedef struct {
 } AffixNodeData;
 typedef struct AffixNode {
-	uint32 length;
+        uint32  isvoid:1,
                length:31;
 	AffixNodeData	data[1];
 } AffixNode;
 /*
  * Size of the part of an AffixNode that precedes data[].  Use the real member
  * offset: sizeof(uint32) does not account for alignment padding in front of
  * data[], so sizes computed from it can come out too small.
  */
 #define ANHRDSZ        (offsetof(AffixNode, data))
 typedef struct Tree_struct
 {
 	int			Left[256],
 				Right[256];
 }	Tree_struct;
 typedef struct {
 	char *affix;
 	int len;
--- a/contrib/tsearch2/tsearch.sql.in
+++ b/contrib/tsearch2/tsearch.sql.in
@ -816,7 +816,7 @@ CREATE OPERATOR CLASS tsvector_ops
        FUNCTION        1       tsvector_cmp(tsvector, tsvector);
 --example of ISpell dictionary
--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_id=4;
+--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
 --example of synonym dict
 --update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
 END;