mirror of
https://github.com/postgres/postgres.git
synced 2025-05-28 00:03:23 -04:00
Improve support of Hunspell in ispell dictionary.
Now it's possible to load recent version of Hunspell for several languages. To handle these dictionaries Hunspell patch adds support for: * FLAG long - sets the double extended ASCII character flag type * FLAG num - sets the decimal number flag type (from 1 to 65535) * AF parameter - alias for flag's set Also it moves test dictionaries into separate directory. Author: Artur Zakirov with editorization by me
This commit is contained in:
parent
9445db925e
commit
d78a7d9c7f
@ -2615,18 +2615,41 @@ SELECT plainto_tsquery('supernova star');
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To create an <application>Ispell</> dictionary, use the built-in
|
||||
<literal>ispell</literal> template and specify several parameters:
|
||||
To create an <application>Ispell</> dictionary perform these steps:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<para>
|
||||
download dictionary configuration files. <productname>OpenOffice</>
|
||||
extension files have the <filename>.oxt</> extension. It is necessary
|
||||
to extract <filename>.aff</> and <filename>.dic</> files, change
|
||||
extensions to <filename>.affix</> and <filename>.dict</>. For some
|
||||
dictionary files it is also needed to convert characters to the UTF-8
|
||||
encoding with commands (for example, for a Norwegian language dictionary):
|
||||
<programlisting>
|
||||
CREATE TEXT SEARCH DICTIONARY english_ispell (
|
||||
TEMPLATE = ispell,
|
||||
DictFile = english,
|
||||
AffFile = english,
|
||||
StopWords = english
|
||||
);
|
||||
iconv -f ISO_8859-1 -t UTF-8 -o nn_no.affix nn_NO.aff
|
||||
iconv -f ISO_8859-1 -t UTF-8 -o nn_no.dict nn_NO.dic
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
copy files to the <filename>$SHAREDIR/tsearch_data</> directory
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
load files into PostgreSQL with the following command:
|
||||
<programlisting>
|
||||
CREATE TEXT SEARCH DICTIONARY english_hunspell (
|
||||
TEMPLATE = ispell,
|
||||
DictFile = en_us,
|
||||
AffFile = en_us,
|
||||
Stopwords = english);
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
Here, <literal>DictFile</>, <literal>AffFile</>, and <literal>StopWords</>
|
||||
@ -2642,6 +2665,56 @@ CREATE TEXT SEARCH DICTIONARY english_ispell (
|
||||
example, a Snowball dictionary, which recognizes everything.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The <filename>.affix</> file of <application>Ispell</> has the following
|
||||
structure:
|
||||
<programlisting>
|
||||
prefixes
|
||||
flag *A:
|
||||
. > RE # As in enter > reenter
|
||||
suffixes
|
||||
flag T:
|
||||
E > ST # As in late > latest
|
||||
[^AEIOU]Y > -Y,IEST # As in dirty > dirtiest
|
||||
[AEIOU]Y > EST # As in gray > grayest
|
||||
[^EY] > EST # As in small > smallest
|
||||
</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
And the <filename>.dict</> file has the following structure:
|
||||
<programlisting>
|
||||
lapse/ADGRS
|
||||
lard/DGRS
|
||||
large/PRTY
|
||||
lark/MRS
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Format of the <filename>.dict</> file is:
|
||||
<programlisting>
|
||||
basic_form/affix_class_name
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
In the <filename>.affix</> file every affix flag is described in the
|
||||
following format:
|
||||
<programlisting>
|
||||
condition > [-stripping_letters,] adding_affix
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Here, condition has a format similar to the format of regular expressions.
|
||||
It can use groupings <literal>[...]</> and <literal>[^...]</>.
|
||||
For example, <literal>[AEIOU]Y</> means that the last letter of the word
|
||||
is <literal>"y"</> and the penultimate letter is <literal>"a"</>,
|
||||
<literal>"e"</>, <literal>"i"</>, <literal>"o"</> or <literal>"u"</>.
|
||||
<literal>[^EY]</> means that the last letter is neither <literal>"e"</>
|
||||
nor <literal>"y"</>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Ispell dictionaries support splitting compound words;
|
||||
a useful feature.
|
||||
@ -2663,6 +2736,65 @@ SELECT ts_lexize('norwegian_ispell', 'sjokoladefabrikk');
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<application>MySpell</> format is a subset of <application>Hunspell</>.
|
||||
The <filename>.affix</> file of <application>Hunspell</> has the following
|
||||
structure:
|
||||
<programlisting>
|
||||
PFX A Y 1
|
||||
PFX A 0 re .
|
||||
SFX T N 4
|
||||
SFX T 0 st e
|
||||
SFX T y iest [^aeiou]y
|
||||
SFX T 0 est [aeiou]y
|
||||
SFX T 0 est [^ey]
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The first line of an affix class is the header. Fields of an affix rule are
|
||||
listed after the header:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<para>
|
||||
parameter name (PFX or SFX)
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
flag (name of the affix class)
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
stripping characters from beginning (at prefix) or end (at suffix) of the
|
||||
word
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
adding affix
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
condition that has a format similar to the format of regular expressions.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
The <filename>.dict</> file looks like the <filename>.dict</> file of
|
||||
<application>Ispell</>:
|
||||
<programlisting>
|
||||
larder/M
|
||||
lardy/RT
|
||||
large/RSPMYT
|
||||
largehearted
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<application>MySpell</> does not support compound words.
|
||||
|
@ -13,8 +13,11 @@ include $(top_builddir)/src/Makefile.global
|
||||
|
||||
DICTDIR=tsearch_data
|
||||
|
||||
DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
|
||||
ispell_sample.affix ispell_sample.dict
|
||||
DICTFILES=dicts/synonym_sample.syn dicts/thesaurus_sample.ths \
|
||||
dicts/hunspell_sample.affix \
|
||||
dicts/ispell_sample.affix dicts/ispell_sample.dict \
|
||||
dicts/hunspell_sample_long.affix dicts/hunspell_sample_long.dict \
|
||||
dicts/hunspell_sample_num.affix dicts/hunspell_sample_num.dict
|
||||
|
||||
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
|
||||
dict_simple.o dict_synonym.o dict_thesaurus.o \
|
||||
|
35
src/backend/tsearch/dicts/hunspell_sample_long.affix
Normal file
35
src/backend/tsearch/dicts/hunspell_sample_long.affix
Normal file
@ -0,0 +1,35 @@
|
||||
FLAG long
|
||||
|
||||
AF 7
|
||||
AF cZ #1
|
||||
AF cL #2
|
||||
AF sGsJpUsS #3
|
||||
AF sSpB #4
|
||||
AF cZsS #5
|
||||
AF sScZs\ #6
|
||||
AF sA #7
|
||||
|
||||
COMPOUNDFLAG cZ
|
||||
ONLYINCOMPOUND cL
|
||||
|
||||
PFX pB Y 1
|
||||
PFX pB 0 re .
|
||||
|
||||
PFX pU N 1
|
||||
PFX pU 0 un .
|
||||
|
||||
SFX sJ Y 1
|
||||
SFX sJ 0 INGS [^E]
|
||||
|
||||
SFX sG Y 1
|
||||
SFX sG 0 ING [^E]
|
||||
|
||||
SFX sS Y 1
|
||||
SFX sS 0 S [^SXZHY]
|
||||
|
||||
SFX sA Y 1
|
||||
SFX sA Y IES [^AEIOU]Y
|
||||
|
||||
SFX s\ N 1
|
||||
SFX s\ 0 Y/2 [^Y]
|
||||
|
8
src/backend/tsearch/dicts/hunspell_sample_long.dict
Normal file
8
src/backend/tsearch/dicts/hunspell_sample_long.dict
Normal file
@ -0,0 +1,8 @@
|
||||
book/3
|
||||
booking/4
|
||||
footballklubber
|
||||
foot/5
|
||||
football/1
|
||||
ball/6
|
||||
klubber/1
|
||||
sky/7
|
26
src/backend/tsearch/dicts/hunspell_sample_num.affix
Normal file
26
src/backend/tsearch/dicts/hunspell_sample_num.affix
Normal file
@ -0,0 +1,26 @@
|
||||
FLAG num
|
||||
|
||||
COMPOUNDFLAG 101
|
||||
ONLYINCOMPOUND 102
|
||||
|
||||
PFX 201 Y 1
|
||||
PFX 201 0 re .
|
||||
|
||||
PFX 202 N 1
|
||||
PFX 202 0 un .
|
||||
|
||||
SFX 301 Y 1
|
||||
SFX 301 0 INGS [^E]
|
||||
|
||||
SFX 302 Y 1
|
||||
SFX 302 0 ING [^E]
|
||||
|
||||
SFX 303 Y 1
|
||||
SFX 303 0 S [^SXZHY]
|
||||
|
||||
SFX 304 Y 1
|
||||
SFX 304 Y IES [^AEIOU]Y
|
||||
|
||||
SFX 305 N 1
|
||||
SFX 305 0 Y/102 [^Y]
|
||||
|
8
src/backend/tsearch/dicts/hunspell_sample_num.dict
Normal file
8
src/backend/tsearch/dicts/hunspell_sample_num.dict
Normal file
@ -0,0 +1,8 @@
|
||||
book/302,301,202,303
|
||||
booking/303,201
|
||||
footballklubber
|
||||
foot/101,303
|
||||
football/101
|
||||
ball/303,101,305
|
||||
klubber/101
|
||||
sky/304
|
@ -5,6 +5,54 @@
|
||||
*
|
||||
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
|
||||
*
|
||||
* Ispell dictionary
|
||||
* -----------------
|
||||
*
|
||||
* Rules of dictionaries are defined in two files with .affix and .dict
|
||||
* extensions. They are used by spell checker programs Ispell and Hunspell.
|
||||
*
|
||||
* An .affix file declares morphological rules to get a basic form of words.
|
||||
* The format of an .affix file has different structure for Ispell and Hunspell
|
||||
* dictionaries. The Hunspell format is more complicated. But when an .affix
|
||||
* file is imported and compiled, it is stored in the same structure AffixNode.
|
||||
*
|
||||
* A .dict file stores a list of basic forms of words with references to
|
||||
* affix rules. The format of a .dict file has the same structure for Ispell
|
||||
* and Hunspell dictionaries.
|
||||
*
|
||||
* Compilation of a dictionary
|
||||
* ---------------------------
|
||||
*
|
||||
* A compiled dictionary is stored in the IspellDict structure. Compilation of
|
||||
* a dictionary is divided into several steps:
|
||||
* - NIImportDictionary() - stores each word of a .dict file in the
|
||||
* temporary Spell field.
|
||||
* - NIImportAffixes() - stores affix rules of an .affix file in the
|
||||
* Affix field (not temporary) if an .affix file has the Ispell format.
|
||||
* -> NIImportOOAffixes() - stores affix rules if an .affix file has the
|
||||
* Hunspell format. The AffixData field is initialized if AF parameter
|
||||
* is defined.
|
||||
* - NISortDictionary() - builds a prefix tree (Trie) from the words list
|
||||
* and stores it in the Dictionary field. The words list is got from the
|
||||
* Spell field. The AffixData field is initialized if AF parameter is not
|
||||
* defined.
|
||||
* - NISortAffixes():
|
||||
* - builds a list of compound affixes from the affix list and stores it
|
||||
* in the CompoundAffix.
|
||||
* - builds prefix trees (Trie) from the affix list for prefixes and suffixes
|
||||
* and stores them in Suffix and Prefix fields.
|
||||
* The affix list is got from the Affix field.
|
||||
*
|
||||
* Memory management
|
||||
* -----------------
|
||||
*
|
||||
* The IspellDict structure has the Spell field which is used only in compile
|
||||
* time. The Spell field stores a words list. It can take a lot of memory.
|
||||
* Therefore when a dictionary is compiled this field is cleared by
|
||||
* NIFinishBuild().
|
||||
*
|
||||
* All resources which should be cleared by NIFinishBuild() are initialized using
|
||||
* tmpalloc() and tmpalloc0().
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/tsearch/spell.c
|
||||
@ -150,10 +198,12 @@ cmpspell(const void *s1, const void *s2)
|
||||
{
|
||||
return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word));
|
||||
}
|
||||
|
||||
static int
|
||||
cmpspellaffix(const void *s1, const void *s2)
|
||||
{
|
||||
return (strncmp((*(SPELL *const *) s1)->p.flag, (*(SPELL *const *) s2)->p.flag, MAXFLAGLEN));
|
||||
return (strcmp((*(SPELL *const *) s1)->p.flag,
|
||||
(*(SPELL *const *) s2)->p.flag));
|
||||
}
|
||||
|
||||
static char *
|
||||
@ -220,6 +270,11 @@ strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compares affixes.
|
||||
* First compares the type of an affix. Prefixes should go before suffixes.
|
||||
* If types are equal then compares replaceable string.
|
||||
*/
|
||||
static int
|
||||
cmpaffix(const void *s1, const void *s2)
|
||||
{
|
||||
@ -237,6 +292,176 @@ cmpaffix(const void *s1, const void *s2)
|
||||
(const unsigned char *) a2->repl);
|
||||
}
|
||||
|
||||
/*
|
||||
* Gets an affix flag from string representation (a set of affixes).
|
||||
*
|
||||
* Several flags can be stored in a single string. Flags can be represented by:
|
||||
* - 1 character (FM_CHAR).
|
||||
* - 2 characters (FM_LONG).
|
||||
* - numbers from 1 to 65000 (FM_NUM).
|
||||
*
|
||||
* Depending on the flagMode an affix string can have the following format:
|
||||
* - FM_CHAR: ABCD
|
||||
* Here we have 4 flags: A, B, C and D
|
||||
* - FM_LONG: ABCDE*
|
||||
* Here we have 3 flags: AB, CD and E*
|
||||
* - FM_NUM: 200,205,50
|
||||
* Here we have 3 flags: 200, 205 and 50
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* sflag: string representation (a set of affixes) of an affix flag.
|
||||
* sflagnext: returns reference to the start of a next affix flag in the sflag.
|
||||
*
|
||||
* Returns an integer representation of the affix flag.
|
||||
*/
|
||||
static uint16
|
||||
DecodeFlag(IspellDict *Conf, char *sflag, char **sflagnext)
|
||||
{
|
||||
int32 s;
|
||||
char *next;
|
||||
unsigned char *usflag;
|
||||
|
||||
switch (Conf->flagMode)
|
||||
{
|
||||
case FM_LONG:
|
||||
/*
|
||||
* Hunspell docs says flag could contains only
|
||||
* ASCII characters
|
||||
*/
|
||||
if (!(pg_mblen(sflag) == 1 && isascii(sflag[0]) &&
|
||||
pg_mblen(sflag + 1) == 1 && isascii(sflag[1])))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("non-ASCII affix flag \"%s\"", sflag)));
|
||||
|
||||
usflag = (unsigned char *)sflag;
|
||||
s = ((int)usflag[0]) << 8 | ((int)usflag[1]);
|
||||
if (sflagnext)
|
||||
/* Go to start of the next flag */
|
||||
*sflagnext = sflag + 2;
|
||||
break;
|
||||
case FM_NUM:
|
||||
s = strtol(sflag, &next, 10);
|
||||
if (s <= 0 || s > FLAGNUM_MAXSIZE)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("invalid affix flag \"%s\"", sflag)));
|
||||
|
||||
if (sflagnext)
|
||||
{
|
||||
/* Go to start of the next flag */
|
||||
if (next)
|
||||
{
|
||||
bool met_comma = false;
|
||||
|
||||
while (*next)
|
||||
{
|
||||
if (!(pg_mblen(next) == 1 && isascii(*next)))
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("non-ASCII affix flag \"%s\"",
|
||||
sflag)));
|
||||
}
|
||||
else if (isdigit(*next))
|
||||
{
|
||||
if (!met_comma)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("invalid affix flag \"%s\"",
|
||||
sflag)));
|
||||
break;
|
||||
}
|
||||
else if (*next == ',')
|
||||
{
|
||||
if (met_comma)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("invalid affix flag \"%s\"",
|
||||
sflag)));
|
||||
met_comma = true;
|
||||
}
|
||||
else if (!isspace(*next))
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("invalid character in affix flag \"%s\"", sflag)));
|
||||
}
|
||||
|
||||
next++;
|
||||
}
|
||||
|
||||
if (*next == '\0')
|
||||
next = NULL;
|
||||
}
|
||||
|
||||
*sflagnext = next;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (!(pg_mblen(sflag) == 1 && isascii(*sflag)))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("non-ASCII affix flag \"%s\"", sflag)));
|
||||
|
||||
s = *sflag;
|
||||
if (sflagnext)
|
||||
/* Go to start of the next flag */
|
||||
*sflagnext = sflag + pg_mblen(sflag);
|
||||
}
|
||||
|
||||
return (uint16)s;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks if the affix set Conf->AffixData[affix] contains affixflag.
|
||||
* Conf->AffixData[affix] is the string representation of an affix flags.
|
||||
* Conf->AffixData[affix] does not contain affixflag if this flag is not used
|
||||
* actually by the .dict file.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* affix: index of the Conf->AffixData array.
|
||||
* affixflag: integer representation of the affix flag.
|
||||
*
|
||||
* Returns true if the string Conf->AffixData[affix] contains affixflag,
|
||||
* otherwise returns false.
|
||||
*/
|
||||
static bool
|
||||
IsAffixFlagInUse(IspellDict *Conf, int affix, uint16 affixflag)
|
||||
{
|
||||
char *flagcur;
|
||||
char *flagnext = NULL;
|
||||
|
||||
if (affixflag == 0)
|
||||
return true;
|
||||
|
||||
flagcur = Conf->AffixData[affix];
|
||||
|
||||
while (*flagcur)
|
||||
{
|
||||
/* Compare first affix flag in flagcur with affixflag */
|
||||
if (DecodeFlag(Conf, flagcur, &flagnext) == affixflag)
|
||||
return true;
|
||||
/* Otherwise go to next flag */
|
||||
if (flagnext)
|
||||
flagcur = flagnext;
|
||||
/* If we have not flags anymore then exit */
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
/* Could not find affixflag */
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Adds the new word into the temporary array Spell.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* word: new word.
|
||||
* flag: set of affix flags. Integer representation of flag can be got by
|
||||
* DecodeFlag().
|
||||
*/
|
||||
static void
|
||||
NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
|
||||
{
|
||||
@ -255,14 +480,18 @@ NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
|
||||
}
|
||||
Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
|
||||
strcpy(Conf->Spell[Conf->nspell]->word, word);
|
||||
strlcpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
|
||||
Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
|
||||
? cpstrdup(Conf, flag) : VoidString;
|
||||
Conf->nspell++;
|
||||
}
|
||||
|
||||
/*
|
||||
* import dictionary
|
||||
* Imports dictionary into the temporary array Spell.
|
||||
*
|
||||
* Note caller must already have applied get_tsearch_config_filename
|
||||
* Note caller must already have applied get_tsearch_config_filename.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* filename: path to the .dict file.
|
||||
*/
|
||||
void
|
||||
NIImportDictionary(IspellDict *Conf, const char *filename)
|
||||
@ -280,6 +509,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
|
||||
{
|
||||
char *s,
|
||||
*pstr;
|
||||
/* Set of affix flags */
|
||||
const char *flag;
|
||||
|
||||
/* Extract flag from the line */
|
||||
@ -324,7 +554,30 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
|
||||
tsearch_readline_end(&trst);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Searches a basic form of word in the prefix tree. This word was generated
|
||||
* using an affix rule. This rule may not be present in an affix set of
|
||||
* a basic form of word.
|
||||
*
|
||||
* For example, we have the entry in the .dict file:
|
||||
* meter/GMD
|
||||
*
|
||||
* The affix rule with the flag S:
|
||||
* SFX S y ies [^aeiou]y
|
||||
* is not presented here.
|
||||
*
|
||||
* The affix rule with the flag M:
|
||||
* SFX M 0 's .
|
||||
* is presented here.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* word: basic form of word.
|
||||
* affixflag: integer representation of the affix flag, by which a basic form of
|
||||
* word was generated.
|
||||
* flag: compound flag used to compare with StopMiddle->compoundflag.
|
||||
*
|
||||
* Returns 1 if the word was found in the prefix tree, else returns 0.
|
||||
*/
|
||||
static int
|
||||
FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
|
||||
{
|
||||
@ -349,13 +602,22 @@ FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
|
||||
{
|
||||
if (flag == 0)
|
||||
{
|
||||
/*
|
||||
* The word can be formed only with another word.
|
||||
* And in the flag parameter there is not a sign
|
||||
* that we search compound words.
|
||||
*/
|
||||
if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
|
||||
return 0;
|
||||
}
|
||||
else if ((flag & StopMiddle->compoundflag) == 0)
|
||||
return 0;
|
||||
|
||||
if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
|
||||
/*
|
||||
* Check if this affix rule is present in the affix set
|
||||
* with index StopMiddle->affix.
|
||||
*/
|
||||
if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
|
||||
return 1;
|
||||
}
|
||||
node = StopMiddle->node;
|
||||
@ -373,6 +635,24 @@ FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Adds a new affix rule to the Affix field.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* flag: integer representation of the affix flag ('\' in the below example).
|
||||
* flagflags: set of flags from the flagval field for this affix rule. This set
|
||||
* is listed after '/' character in the added string (repl).
|
||||
*
|
||||
* For example L flag in the hunspell_sample.affix:
|
||||
* SFX \ 0 Y/L [^Y]
|
||||
*
|
||||
* mask: condition for search ('[^Y]' in the above example).
|
||||
* find: stripping characters from beginning (at prefix) or end (at suffix)
|
||||
* of the word ('0' in the above example, 0 means that there is no
|
||||
* stripping character).
|
||||
* repl: adding string after stripping ('Y' in the above example).
|
||||
* type: FF_SUFFIX or FF_PREFIX.
|
||||
*/
|
||||
static void
|
||||
NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
|
||||
{
|
||||
@ -394,18 +674,21 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
|
||||
|
||||
Affix = Conf->Affix + Conf->naffixes;
|
||||
|
||||
if (strcmp(mask, ".") == 0)
|
||||
/* This affix rule can be applied for words with any ending */
|
||||
if (strcmp(mask, ".") == 0 || *mask == '\0')
|
||||
{
|
||||
Affix->issimple = 1;
|
||||
Affix->isregis = 0;
|
||||
}
|
||||
/* This affix rule will use regis to search word ending */
|
||||
else if (RS_isRegis(mask))
|
||||
{
|
||||
Affix->issimple = 0;
|
||||
Affix->isregis = 1;
|
||||
RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false,
|
||||
RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
|
||||
*mask ? mask : VoidString);
|
||||
}
|
||||
/* This affix rule will use regex_t to search word ending */
|
||||
else
|
||||
{
|
||||
int masklen;
|
||||
@ -457,7 +740,6 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
|
||||
Conf->naffixes++;
|
||||
}
|
||||
|
||||
|
||||
/* Parsing states for parse_affentry() and friends */
|
||||
#define PAE_WAIT_MASK 0
|
||||
#define PAE_INMASK 1
|
||||
@ -712,9 +994,16 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
|
||||
|
||||
*pmask = *pfind = *prepl = '\0';
|
||||
|
||||
return (*mask && (*find || *repl)) ? true : false;
|
||||
return (*mask && (*find || *repl));
|
||||
}
|
||||
|
||||
/*
|
||||
* Sets up a correspondence for the affix parameter with the affix flag.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* s: affix flag in string.
|
||||
* val: affix parameter.
|
||||
*/
|
||||
static void
|
||||
addFlagValue(IspellDict *Conf, char *s, uint32 val)
|
||||
{
|
||||
@ -731,12 +1020,66 @@ addFlagValue(IspellDict *Conf, char *s, uint32 val)
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("multibyte flag character is not allowed")));
|
||||
|
||||
Conf->flagval[*(unsigned char *) s] = (unsigned char) val;
|
||||
Conf->flagval[DecodeFlag(Conf, s, (char **)NULL)] = (unsigned char) val;
|
||||
Conf->usecompound = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Import an affix file that follows MySpell or Hunspell format
|
||||
* Returns a set of affix parameters which correspondence to the set of affix
|
||||
* flags s.
|
||||
*/
|
||||
static int
|
||||
getFlagValues(IspellDict *Conf, char *s)
|
||||
{
|
||||
uint32 flag = 0;
|
||||
char *flagcur;
|
||||
char *flagnext = NULL;
|
||||
|
||||
flagcur = s;
|
||||
while (*flagcur)
|
||||
{
|
||||
flag |= Conf->flagval[DecodeFlag(Conf, flagcur, &flagnext)];
|
||||
if (flagnext)
|
||||
flagcur = flagnext;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
return flag;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a flag set using the s parameter.
|
||||
*
|
||||
* If Conf->useFlagAliases is true then the s parameter is index of the
|
||||
* Conf->AffixData array and function returns its entry.
|
||||
* Else function returns the s parameter.
|
||||
*/
|
||||
static char *
|
||||
getFlags(IspellDict *Conf, char *s)
|
||||
{
|
||||
if (Conf->useFlagAliases)
|
||||
{
|
||||
int curaffix = strtol(s, (char **)NULL, 10);
|
||||
|
||||
if (curaffix && curaffix <= Conf->nAffixData)
|
||||
/*
|
||||
* Do not substract 1 from curaffix
|
||||
* because empty string was added in NIImportOOAffixes
|
||||
*/
|
||||
return Conf->AffixData[curaffix];
|
||||
else
|
||||
return VoidString;
|
||||
}
|
||||
else
|
||||
return s;
|
||||
}
|
||||
|
||||
/*
|
||||
* Import an affix file that follows MySpell or Hunspell format.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* filename: path to the .affix file.
|
||||
*/
|
||||
static void
|
||||
NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
||||
@ -751,7 +1094,10 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
||||
char repl[BUFSIZ],
|
||||
*prepl;
|
||||
bool isSuffix = false;
|
||||
int flag = 0;
|
||||
int naffix = 0,
|
||||
curaffix = 0;
|
||||
int flag = 0,
|
||||
sflaglen = 0;
|
||||
char flagflags = 0;
|
||||
tsearch_readline_state trst;
|
||||
char *recoded;
|
||||
@ -759,6 +1105,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
||||
/* read file to find any flag */
|
||||
memset(Conf->flagval, 0, sizeof(Conf->flagval));
|
||||
Conf->usecompound = false;
|
||||
Conf->useFlagAliases = false;
|
||||
Conf->flagMode = FM_CHAR;
|
||||
|
||||
if (!tsearch_readline_begin(&trst, filename))
|
||||
ereport(ERROR,
|
||||
@ -806,10 +1154,18 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
||||
while (*s && t_isspace(s))
|
||||
s += pg_mblen(s);
|
||||
|
||||
if (*s && STRNCMP(s, "default") != 0)
|
||||
ereport(ERROR,
|
||||
if (*s)
|
||||
{
|
||||
if (STRNCMP(s, "long") == 0)
|
||||
Conf->flagMode = FM_LONG;
|
||||
else if (STRNCMP(s, "num") == 0)
|
||||
Conf->flagMode = FM_NUM;
|
||||
else if (STRNCMP(s, "default") != 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("Ispell dictionary supports only default flag value")));
|
||||
errmsg("Ispell dictionary supports only default, "
|
||||
"long and num flag value")));
|
||||
}
|
||||
}
|
||||
|
||||
pfree(recoded);
|
||||
@ -834,27 +1190,77 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
||||
if (ptype)
|
||||
pfree(ptype);
|
||||
ptype = lowerstr_ctx(Conf, type);
|
||||
|
||||
/* First try to parse AF parameter (alias compression) */
|
||||
if (STRNCMP(ptype, "af") == 0)
|
||||
{
|
||||
/* First line is the number of aliases */
|
||||
if (!Conf->useFlagAliases)
|
||||
{
|
||||
Conf->useFlagAliases = true;
|
||||
naffix = atoi(sflag);
|
||||
if (naffix == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("invalid number of flag vector aliases")));
|
||||
|
||||
/* Also reserve place for empty flag set */
|
||||
naffix++;
|
||||
|
||||
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
|
||||
Conf->lenAffixData = Conf->nAffixData = naffix;
|
||||
|
||||
/* Add empty flag set into AffixData */
|
||||
Conf->AffixData[curaffix] = VoidString;
|
||||
curaffix++;
|
||||
}
|
||||
/* Other lines are aliases */
|
||||
else
|
||||
{
|
||||
if (curaffix < naffix)
|
||||
{
|
||||
Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
|
||||
curaffix++;
|
||||
}
|
||||
}
|
||||
goto nextline;
|
||||
}
|
||||
/* Else try to parse prefixes and suffixes */
|
||||
if (fields_read < 4 ||
|
||||
(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
|
||||
goto nextline;
|
||||
|
||||
sflaglen = strlen(sflag);
|
||||
if (sflaglen == 0
|
||||
|| (sflaglen > 1 && Conf->flagMode == FM_CHAR)
|
||||
|| (sflaglen > 2 && Conf->flagMode == FM_LONG))
|
||||
goto nextline;
|
||||
|
||||
/*
|
||||
* Affix header. For example:
|
||||
* SFX \ N 1
|
||||
*/
|
||||
if (fields_read == 4)
|
||||
{
|
||||
if (strlen(sflag) != 1)
|
||||
goto nextline;
|
||||
flag = *sflag;
|
||||
isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
|
||||
/* Convert the affix flag to int */
|
||||
flag = DecodeFlag(Conf, sflag, (char **)NULL);
|
||||
|
||||
isSuffix = (STRNCMP(ptype, "sfx") == 0);
|
||||
if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
|
||||
flagflags = FF_CROSSPRODUCT;
|
||||
else
|
||||
flagflags = 0;
|
||||
}
|
||||
/*
|
||||
* Affix fields. For example:
|
||||
* SFX \ 0 Y/L [^Y]
|
||||
*/
|
||||
else
|
||||
{
|
||||
char *ptr;
|
||||
int aflg = 0;
|
||||
|
||||
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
|
||||
if (flag == 0)
|
||||
goto nextline;
|
||||
prepl = lowerstr_ctx(Conf, repl);
|
||||
/* Find position of '/' in lowercased string "prepl" */
|
||||
@ -866,11 +1272,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
||||
*/
|
||||
*ptr = '\0';
|
||||
ptr = repl + (ptr - prepl) + 1;
|
||||
while (*ptr)
|
||||
{
|
||||
aflg |= Conf->flagval[*(unsigned char *) ptr];
|
||||
ptr++;
|
||||
}
|
||||
aflg |= getFlagValues(Conf, getFlags(Conf, ptr));
|
||||
}
|
||||
pfind = lowerstr_ctx(Conf, find);
|
||||
pmask = lowerstr_ctx(Conf, mask);
|
||||
@ -928,6 +1330,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
|
||||
|
||||
memset(Conf->flagval, 0, sizeof(Conf->flagval));
|
||||
Conf->usecompound = false;
|
||||
Conf->useFlagAliases = false;
|
||||
Conf->flagMode = FM_CHAR;
|
||||
|
||||
while ((recoded = tsearch_readline(&trst)) != NULL)
|
||||
{
|
||||
@ -1044,6 +1448,12 @@ isnewformat:
|
||||
NIImportOOAffixes(Conf, filename);
|
||||
}
|
||||
|
||||
/*
|
||||
* Merges two affix flag sets and stores a new affix flag set into
|
||||
* Conf->AffixData.
|
||||
*
|
||||
* Returns index of a new affix flag set.
|
||||
*/
|
||||
static int
|
||||
MergeAffix(IspellDict *Conf, int a1, int a2)
|
||||
{
|
||||
@ -1068,21 +1478,25 @@ MergeAffix(IspellDict *Conf, int a1, int a2)
|
||||
return Conf->nAffixData - 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a set of affix parameters which correspond to the set of affix
|
||||
* flags with the given index.
|
||||
*/
|
||||
static uint32
|
||||
makeCompoundFlags(IspellDict *Conf, int affix)
|
||||
{
|
||||
uint32 flag = 0;
|
||||
char *str = Conf->AffixData[affix];
|
||||
|
||||
while (str && *str)
|
||||
{
|
||||
flag |= Conf->flagval[*(unsigned char *) str];
|
||||
str++;
|
||||
}
|
||||
|
||||
return (flag & FF_DICTFLAGMASK);
|
||||
char *str = Conf->AffixData[affix];
|
||||
return (getFlagValues(Conf, str) & FF_DICTFLAGMASK);
|
||||
}
|
||||
|
||||
/*
|
||||
* Makes a prefix tree for the given level.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* low: lower index of the Conf->Spell array.
|
||||
* high: upper index of the Conf->Spell array.
|
||||
* level: current prefix tree level.
|
||||
*/
|
||||
static SPNode *
|
||||
mkSPNode(IspellDict *Conf, int low, int high, int level)
|
||||
{
|
||||
@ -1115,6 +1529,7 @@ mkSPNode(IspellDict *Conf, int low, int high, int level)
|
||||
{
|
||||
if (lastchar)
|
||||
{
|
||||
/* Next level of the prefix tree */
|
||||
data->node = mkSPNode(Conf, lownew, i, level + 1);
|
||||
lownew = i;
|
||||
data++;
|
||||
@ -1154,6 +1569,7 @@ mkSPNode(IspellDict *Conf, int low, int high, int level)
|
||||
}
|
||||
}
|
||||
|
||||
/* Next level of the prefix tree */
|
||||
data->node = mkSPNode(Conf, lownew, high, level + 1);
|
||||
|
||||
return rs;
|
||||
@ -1172,44 +1588,83 @@ NISortDictionary(IspellDict *Conf)
|
||||
|
||||
/* compress affixes */
|
||||
|
||||
/* Count the number of different flags used in the dictionary */
|
||||
|
||||
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
|
||||
|
||||
naffix = 0;
|
||||
for (i = 0; i < Conf->nspell; i++)
|
||||
{
|
||||
if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
|
||||
naffix++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill in Conf->AffixData with the affixes that were used in the
|
||||
* dictionary. Replace textual flag-field of Conf->Spell entries with
|
||||
* indexes into Conf->AffixData array.
|
||||
* If we use flag aliases then we need to use Conf->AffixData filled
|
||||
* in the NIImportOOAffixes().
|
||||
*/
|
||||
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
|
||||
|
||||
curaffix = -1;
|
||||
for (i = 0; i < Conf->nspell; i++)
|
||||
if (Conf->useFlagAliases)
|
||||
{
|
||||
if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
|
||||
for (i = 0; i < Conf->nspell; i++)
|
||||
{
|
||||
curaffix++;
|
||||
Assert(curaffix < naffix);
|
||||
Conf->AffixData[curaffix] = cpstrdup(Conf, Conf->Spell[i]->p.flag);
|
||||
curaffix = strtol(Conf->Spell[i]->p.flag, (char **)NULL, 10);
|
||||
if (curaffix && curaffix <= Conf->nAffixData)
|
||||
Conf->Spell[i]->p.d.affix = curaffix;
|
||||
else
|
||||
/*
|
||||
* If Conf->Spell[i]->p.flag is empty, then get empty value of
|
||||
* Conf->AffixData (0 index).
|
||||
*/
|
||||
Conf->Spell[i]->p.d.affix = 0;
|
||||
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
|
||||
}
|
||||
}
|
||||
/* Otherwise fill Conf->AffixData here */
|
||||
else
|
||||
{
|
||||
/* Count the number of different flags used in the dictionary */
|
||||
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
|
||||
cmpspellaffix);
|
||||
|
||||
naffix = 0;
|
||||
for (i = 0; i < Conf->nspell; i++)
|
||||
{
|
||||
if (i == 0
|
||||
|| strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
|
||||
naffix++;
|
||||
}
|
||||
|
||||
Conf->Spell[i]->p.d.affix = curaffix;
|
||||
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
|
||||
/*
|
||||
* Fill in Conf->AffixData with the affixes that were used in the
|
||||
* dictionary. Replace textual flag-field of Conf->Spell entries with
|
||||
* indexes into Conf->AffixData array.
|
||||
*/
|
||||
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
|
||||
|
||||
curaffix = -1;
|
||||
for (i = 0; i < Conf->nspell; i++)
|
||||
{
|
||||
if (i == 0
|
||||
|| strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]))
|
||||
{
|
||||
curaffix++;
|
||||
Assert(curaffix < naffix);
|
||||
Conf->AffixData[curaffix] = cpstrdup(Conf,
|
||||
Conf->Spell[i]->p.flag);
|
||||
}
|
||||
|
||||
Conf->Spell[i]->p.d.affix = curaffix;
|
||||
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
|
||||
}
|
||||
|
||||
Conf->lenAffixData = Conf->nAffixData = naffix;
|
||||
}
|
||||
|
||||
Conf->lenAffixData = Conf->nAffixData = naffix;
|
||||
|
||||
/* Start building a prefix tree */
|
||||
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
|
||||
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Makes a prefix tree for the given level using the repl string of an affix
|
||||
* rule. Affixes with an empty replace string are not included in the prefix tree.
|
||||
* These affixes are included by mkVoidAffix().
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* low: lower index of the Conf->Affix array.
|
||||
* high: upper index of the Conf->Affix array.
|
||||
* level: current prefix tree level.
|
||||
* type: FF_SUFFIX or FF_PREFIX.
|
||||
*/
|
||||
static AffixNode *
|
||||
mkANode(IspellDict *Conf, int low, int high, int level, int type)
|
||||
{
|
||||
@ -1247,6 +1702,7 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type)
|
||||
{
|
||||
if (lastchar)
|
||||
{
|
||||
/* Next level of the prefix tree */
|
||||
data->node = mkANode(Conf, lownew, i, level + 1, type);
|
||||
if (naff)
|
||||
{
|
||||
@ -1267,6 +1723,7 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type)
|
||||
}
|
||||
}
|
||||
|
||||
/* Next level of the prefix tree */
|
||||
data->node = mkANode(Conf, lownew, high, level + 1, type);
|
||||
if (naff)
|
||||
{
|
||||
@ -1281,6 +1738,10 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type)
|
||||
return rs;
|
||||
}
|
||||
|
||||
/*
|
||||
* Makes the root void node in the prefix tree. The root void node is created
|
||||
* for affixes which have empty replace string ("repl" field).
|
||||
*/
|
||||
static void
|
||||
mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
|
||||
{
|
||||
@ -1304,11 +1765,12 @@ mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
|
||||
Conf->Prefix = Affix;
|
||||
}
|
||||
|
||||
|
||||
/* Count affixes with empty replace string */
|
||||
for (i = start; i < end; i++)
|
||||
if (Conf->Affix[i].replen == 0)
|
||||
cnt++;
|
||||
|
||||
/* There are no affixes with an empty replace string */
|
||||
if (cnt == 0)
|
||||
return;
|
||||
|
||||
@ -1324,18 +1786,31 @@ mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks if the affixflag is used by dictionary. Conf->AffixData does not
|
||||
* contain affixflag if this flag is not actually used by the .dict file.
|
||||
*
|
||||
* Conf: current dictionary.
|
||||
* affixflag: integer representation of the affix flag.
|
||||
*
|
||||
* Returns true if the Conf->AffixData array contains affixflag, otherwise
|
||||
* returns false.
|
||||
*/
|
||||
static bool
|
||||
isAffixInUse(IspellDict *Conf, char flag)
|
||||
isAffixInUse(IspellDict *Conf, uint16 affixflag)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < Conf->nAffixData; i++)
|
||||
if (strchr(Conf->AffixData[i], flag) != NULL)
|
||||
if (IsAffixFlagInUse(Conf, i, affixflag))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
|
||||
*/
|
||||
void
|
||||
NISortAffixes(IspellDict *Conf)
|
||||
{
|
||||
@ -1347,6 +1822,7 @@ NISortAffixes(IspellDict *Conf)
|
||||
if (Conf->naffixes == 0)
|
||||
return;
|
||||
|
||||
/* Store compound affixes in the Conf->CompoundAffix array */
|
||||
if (Conf->naffixes > 1)
|
||||
qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
|
||||
Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
|
||||
@ -1359,7 +1835,7 @@ NISortAffixes(IspellDict *Conf)
|
||||
firstsuffix = i;
|
||||
|
||||
if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
|
||||
isAffixInUse(Conf, (char) Affix->flag))
|
||||
isAffixInUse(Conf, Affix->flag))
|
||||
{
|
||||
if (ptr == Conf->CompoundAffix ||
|
||||
ptr->issuffix != (ptr - 1)->issuffix ||
|
||||
@ -1370,7 +1846,7 @@ NISortAffixes(IspellDict *Conf)
|
||||
/* leave only unique and minimal suffixes */
|
||||
ptr->affix = Affix->repl;
|
||||
ptr->len = Affix->replen;
|
||||
ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false;
|
||||
ptr->issuffix = (Affix->type == FF_SUFFIX);
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
@ -1378,6 +1854,7 @@ NISortAffixes(IspellDict *Conf)
|
||||
ptr->affix = NULL;
|
||||
Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
|
||||
|
||||
/* Start building a prefix tree */
|
||||
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
|
||||
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
|
||||
mkVoidAffix(Conf, true, firstsuffix);
|
||||
@ -1825,7 +2302,7 @@ SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int
|
||||
|
||||
if (StopLow < StopHigh)
|
||||
{
|
||||
if (level == FF_COMPOUNDBEGIN)
|
||||
if (startpos == 0)
|
||||
compoundflag = FF_COMPOUNDBEGIN;
|
||||
else if (level == wordlen - 1)
|
||||
compoundflag = FF_COMPOUNDLAST;
|
||||
|
@ -19,18 +19,18 @@
|
||||
#include "tsearch/ts_public.h"
|
||||
|
||||
/*
|
||||
* Max length of a flag name. Names longer than this will be truncated
|
||||
* to the maximum.
|
||||
* SPNode and SPNodeData are used to represent prefix tree (Trie) to store
|
||||
* a words list.
|
||||
*/
|
||||
#define MAXFLAGLEN 16
|
||||
|
||||
struct SPNode;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32 val:8,
|
||||
isword:1,
|
||||
/* Stores compound flags listed below */
|
||||
compoundflag:4,
|
||||
/* Reference to an entry of the AffixData field */
|
||||
affix:19;
|
||||
struct SPNode *node;
|
||||
} SPNodeData;
|
||||
@ -43,7 +43,8 @@ typedef struct
|
||||
#define FF_COMPOUNDBEGIN 0x02
|
||||
#define FF_COMPOUNDMIDDLE 0x04
|
||||
#define FF_COMPOUNDLAST 0x08
|
||||
#define FF_COMPOUNDFLAG ( FF_COMPOUNDBEGIN | FF_COMPOUNDMIDDLE | FF_COMPOUNDLAST )
|
||||
#define FF_COMPOUNDFLAG ( FF_COMPOUNDBEGIN | FF_COMPOUNDMIDDLE | \
|
||||
FF_COMPOUNDLAST )
|
||||
#define FF_DICTFLAGMASK 0x0f
|
||||
|
||||
typedef struct SPNode
|
||||
@ -54,19 +55,24 @@ typedef struct SPNode
|
||||
|
||||
#define SPNHDRSZ (offsetof(SPNode,data))
|
||||
|
||||
|
||||
/*
|
||||
* Represents an entry in a words list.
|
||||
*/
|
||||
typedef struct spell_struct
|
||||
{
|
||||
union
|
||||
{
|
||||
/*
|
||||
* flag is filled in by NIImportDictionary. After NISortDictionary, d
|
||||
* is valid and flag is invalid.
|
||||
* flag is filled in by NIImportDictionary(). After NISortDictionary(),
|
||||
* d is used instead of flag.
|
||||
*/
|
||||
char flag[MAXFLAGLEN];
|
||||
char *flag;
|
||||
/* d is used in mkSPNode() */
|
||||
struct
|
||||
{
|
||||
/* Reference to an entry of the AffixData field */
|
||||
int affix;
|
||||
/* Length of the word */
|
||||
int len;
|
||||
} d;
|
||||
} p;
|
||||
@ -75,10 +81,14 @@ typedef struct spell_struct
|
||||
|
||||
#define SPELLHDRSZ (offsetof(SPELL, word))
|
||||
|
||||
/*
|
||||
* Represents an entry in an affix list.
|
||||
*/
|
||||
typedef struct aff_struct
|
||||
{
|
||||
uint32 flag:8,
|
||||
type:1,
|
||||
uint32 flag:16;
|
||||
/* FF_SUFFIX or FF_PREFIX */
|
||||
uint32 type:1,
|
||||
flagflags:7,
|
||||
issimple:1,
|
||||
isregis:1,
|
||||
@ -106,6 +116,10 @@ typedef struct aff_struct
|
||||
#define FF_SUFFIX 1
|
||||
#define FF_PREFIX 0
|
||||
|
||||
/*
|
||||
* AffixNode and AffixNodeData are used to represent prefix tree (Trie) to store
|
||||
* an affix list.
|
||||
*/
|
||||
struct AffixNode;
|
||||
|
||||
typedef struct
|
||||
@ -132,6 +146,16 @@ typedef struct
|
||||
bool issuffix;
|
||||
} CMPDAffix;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
FM_CHAR,
|
||||
FM_LONG,
|
||||
FM_NUM
|
||||
} FlagMode;
|
||||
|
||||
#define FLAGCHAR_MAXSIZE (1 << 8)
|
||||
#define FLAGNUM_MAXSIZE (1 << 16)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int maffixes;
|
||||
@ -142,14 +166,17 @@ typedef struct
|
||||
AffixNode *Prefix;
|
||||
|
||||
SPNode *Dictionary;
|
||||
/* Array of sets of affixes */
|
||||
char **AffixData;
|
||||
int lenAffixData;
|
||||
int nAffixData;
|
||||
bool useFlagAliases;
|
||||
|
||||
CMPDAffix *CompoundAffix;
|
||||
|
||||
unsigned char flagval[256];
|
||||
unsigned char flagval[FLAGNUM_MAXSIZE];
|
||||
bool usecompound;
|
||||
FlagMode flagMode;
|
||||
|
||||
/*
|
||||
* Remaining fields are only used during dictionary construction; they are
|
||||
|
@ -191,6 +191,198 @@ SELECT ts_lexize('hunspell', 'footballyklubber');
|
||||
{foot,ball,klubber}
|
||||
(1 row)
|
||||
|
||||
-- Test ISpell dictionary with hunspell affix file with FLAG long parameter
|
||||
CREATE TEXT SEARCH DICTIONARY hunspell_long (
|
||||
Template=ispell,
|
||||
DictFile=hunspell_sample_long,
|
||||
AffFile=hunspell_sample_long
|
||||
);
|
||||
SELECT ts_lexize('hunspell_long', 'skies');
|
||||
ts_lexize
|
||||
-----------
|
||||
{sky}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'bookings');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'booking');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'foot');
|
||||
ts_lexize
|
||||
-----------
|
||||
{foot}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'foots');
|
||||
ts_lexize
|
||||
-----------
|
||||
{foot}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'rebookings');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'rebooking');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'rebook');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'unbookings');
|
||||
ts_lexize
|
||||
-----------
|
||||
{book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'unbooking');
|
||||
ts_lexize
|
||||
-----------
|
||||
{book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'unbook');
|
||||
ts_lexize
|
||||
-----------
|
||||
{book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'footklubber');
|
||||
ts_lexize
|
||||
----------------
|
||||
{foot,klubber}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'footballklubber');
|
||||
ts_lexize
|
||||
------------------------------------------------------
|
||||
{footballklubber,foot,ball,klubber,football,klubber}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'ballyklubber');
|
||||
ts_lexize
|
||||
----------------
|
||||
{ball,klubber}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'footballyklubber');
|
||||
ts_lexize
|
||||
---------------------
|
||||
{foot,ball,klubber}
|
||||
(1 row)
|
||||
|
||||
-- Test ISpell dictionary with hunspell affix file with FLAG num parameter
|
||||
CREATE TEXT SEARCH DICTIONARY hunspell_num (
|
||||
Template=ispell,
|
||||
DictFile=hunspell_sample_num,
|
||||
AffFile=hunspell_sample_num
|
||||
);
|
||||
SELECT ts_lexize('hunspell_num', 'skies');
|
||||
ts_lexize
|
||||
-----------
|
||||
{sky}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'bookings');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'booking');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'foot');
|
||||
ts_lexize
|
||||
-----------
|
||||
{foot}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'foots');
|
||||
ts_lexize
|
||||
-----------
|
||||
{foot}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'rebookings');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'rebooking');
|
||||
ts_lexize
|
||||
----------------
|
||||
{booking,book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'rebook');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'unbookings');
|
||||
ts_lexize
|
||||
-----------
|
||||
{book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'unbooking');
|
||||
ts_lexize
|
||||
-----------
|
||||
{book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'unbook');
|
||||
ts_lexize
|
||||
-----------
|
||||
{book}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'footklubber');
|
||||
ts_lexize
|
||||
----------------
|
||||
{foot,klubber}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'footballklubber');
|
||||
ts_lexize
|
||||
------------------------------------------------------
|
||||
{footballklubber,foot,ball,klubber,football,klubber}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'ballyklubber');
|
||||
ts_lexize
|
||||
----------------
|
||||
{ball,klubber}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'footballyklubber');
|
||||
ts_lexize
|
||||
---------------------
|
||||
{foot,ball,klubber}
|
||||
(1 row)
|
||||
|
||||
-- Synonim dictionary
|
||||
CREATE TEXT SEARCH DICTIONARY synonym (
|
||||
Template=synonym,
|
||||
@ -277,6 +469,48 @@ SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky');
|
||||
'foot':B & 'ball':B & 'klubber':B & ( 'booking':A | 'book':A ) & 'sky'
|
||||
(1 row)
|
||||
|
||||
-- Test ispell dictionary with hunspell affix with FLAG long in configuration
|
||||
ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING
|
||||
REPLACE hunspell WITH hunspell_long;
|
||||
SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
|
||||
to_tsvector
|
||||
----------------------------------------------------------------------------------------------------
|
||||
'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3
|
||||
(1 row)
|
||||
|
||||
SELECT to_tsquery('hunspell_tst', 'footballklubber');
|
||||
to_tsquery
|
||||
------------------------------------------------------------------------------
|
||||
( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber'
|
||||
(1 row)
|
||||
|
||||
SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky');
|
||||
to_tsquery
|
||||
------------------------------------------------------------------------
|
||||
'foot':B & 'ball':B & 'klubber':B & ( 'booking':A | 'book':A ) & 'sky'
|
||||
(1 row)
|
||||
|
||||
-- Test ispell dictionary with hunspell affix with FLAG num in configuration
|
||||
ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING
|
||||
REPLACE hunspell_long WITH hunspell_num;
|
||||
SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
|
||||
to_tsvector
|
||||
----------------------------------------------------------------------------------------------------
|
||||
'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3
|
||||
(1 row)
|
||||
|
||||
SELECT to_tsquery('hunspell_tst', 'footballklubber');
|
||||
to_tsquery
|
||||
------------------------------------------------------------------------------
|
||||
( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber'
|
||||
(1 row)
|
||||
|
||||
SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky');
|
||||
to_tsquery
|
||||
------------------------------------------------------------------------
|
||||
'foot':B & 'ball':B & 'klubber':B & ( 'booking':A | 'book':A ) & 'sky'
|
||||
(1 row)
|
||||
|
||||
-- Test synonym dictionary in configuration
|
||||
CREATE TEXT SEARCH CONFIGURATION synonym_tst (
|
||||
COPY=english
|
||||
|
@ -48,6 +48,54 @@ SELECT ts_lexize('hunspell', 'footballklubber');
|
||||
SELECT ts_lexize('hunspell', 'ballyklubber');
|
||||
SELECT ts_lexize('hunspell', 'footballyklubber');
|
||||
|
||||
-- Test ISpell dictionary with hunspell affix file with FLAG long parameter
|
||||
CREATE TEXT SEARCH DICTIONARY hunspell_long (
|
||||
Template=ispell,
|
||||
DictFile=hunspell_sample_long,
|
||||
AffFile=hunspell_sample_long
|
||||
);
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'skies');
|
||||
SELECT ts_lexize('hunspell_long', 'bookings');
|
||||
SELECT ts_lexize('hunspell_long', 'booking');
|
||||
SELECT ts_lexize('hunspell_long', 'foot');
|
||||
SELECT ts_lexize('hunspell_long', 'foots');
|
||||
SELECT ts_lexize('hunspell_long', 'rebookings');
|
||||
SELECT ts_lexize('hunspell_long', 'rebooking');
|
||||
SELECT ts_lexize('hunspell_long', 'rebook');
|
||||
SELECT ts_lexize('hunspell_long', 'unbookings');
|
||||
SELECT ts_lexize('hunspell_long', 'unbooking');
|
||||
SELECT ts_lexize('hunspell_long', 'unbook');
|
||||
|
||||
SELECT ts_lexize('hunspell_long', 'footklubber');
|
||||
SELECT ts_lexize('hunspell_long', 'footballklubber');
|
||||
SELECT ts_lexize('hunspell_long', 'ballyklubber');
|
||||
SELECT ts_lexize('hunspell_long', 'footballyklubber');
|
||||
|
||||
-- Test ISpell dictionary with hunspell affix file with FLAG num parameter
|
||||
CREATE TEXT SEARCH DICTIONARY hunspell_num (
|
||||
Template=ispell,
|
||||
DictFile=hunspell_sample_num,
|
||||
AffFile=hunspell_sample_num
|
||||
);
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'skies');
|
||||
SELECT ts_lexize('hunspell_num', 'bookings');
|
||||
SELECT ts_lexize('hunspell_num', 'booking');
|
||||
SELECT ts_lexize('hunspell_num', 'foot');
|
||||
SELECT ts_lexize('hunspell_num', 'foots');
|
||||
SELECT ts_lexize('hunspell_num', 'rebookings');
|
||||
SELECT ts_lexize('hunspell_num', 'rebooking');
|
||||
SELECT ts_lexize('hunspell_num', 'rebook');
|
||||
SELECT ts_lexize('hunspell_num', 'unbookings');
|
||||
SELECT ts_lexize('hunspell_num', 'unbooking');
|
||||
SELECT ts_lexize('hunspell_num', 'unbook');
|
||||
|
||||
SELECT ts_lexize('hunspell_num', 'footklubber');
|
||||
SELECT ts_lexize('hunspell_num', 'footballklubber');
|
||||
SELECT ts_lexize('hunspell_num', 'ballyklubber');
|
||||
SELECT ts_lexize('hunspell_num', 'footballyklubber');
|
||||
|
||||
-- Synonim dictionary
|
||||
CREATE TEXT SEARCH DICTIONARY synonym (
|
||||
Template=synonym,
|
||||
@ -94,6 +142,22 @@ SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footb
|
||||
SELECT to_tsquery('hunspell_tst', 'footballklubber');
|
||||
SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky');
|
||||
|
||||
-- Test ispell dictionary with hunspell affix with FLAG long in configuration
|
||||
ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING
|
||||
REPLACE hunspell WITH hunspell_long;
|
||||
|
||||
SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
|
||||
SELECT to_tsquery('hunspell_tst', 'footballklubber');
|
||||
SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky');
|
||||
|
||||
-- Test ispell dictionary with hunspell affix with FLAG num in configuration
|
||||
ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING
|
||||
REPLACE hunspell_long WITH hunspell_num;
|
||||
|
||||
SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
|
||||
SELECT to_tsquery('hunspell_tst', 'footballklubber');
|
||||
SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky');
|
||||
|
||||
-- Test synonym dictionary in configuration
|
||||
CREATE TEXT SEARCH CONFIGURATION synonym_tst (
|
||||
COPY=english
|
||||
|
Loading…
x
Reference in New Issue
Block a user