Browse Source

Merge remote-tracking branch 'ValdisVitolins/master'

master
Reece H. Dunn 8 years ago
parent
commit
fe3e3350e4
3 changed files with 100 additions and 150 deletions
  1. 52
    113
      dictsource/lv_rules
  2. 2
    3
      docs/dictionary.md
  3. 46
    34
      src/libespeak-ng/dictionary.c

+ 52
- 113
dictsource/lv_rules View File

@@ -4,7 +4,7 @@

.replace
ó ȯ // replace o-acute with o-dot, as it is more logical for "short o"
ḩ h
// +---------------------------+
// | Suffixes of nouns |
// +---------------------------+
@@ -49,7 +49,7 @@
.L22 č ģ j k ļ ņ ŗ š ž dž bj pj mj vj // consonant narrowers of e
.L23 s i m // remaining ending of the 2. and 5. decl. words
.L27 č e ē f ģ h ḩ i ī j k ļ ņ q ŗ š w x y ž ei ie iu // all narrowers of e
// Also: .L41 — international consonants and diphthongs as narrowers of e/ē
// Also: .L41 — international consonants and diphthongs as narrowers of e/ē

// E wideners
.L24 a ā u ū ai au // vowel wideners of e
@@ -57,7 +57,7 @@
// but it is mixed with ȯ and ō
.L25 b c d g ķ l m n p r s t v z dz // consonant wideners of e
.L28 a ā b c d e ē g ķ l m n p r s t u ū v z ai au // all wideners of e

// +---------------------------+
// | Pronunciation of o |
@@ -92,13 +92,13 @@
// | Consonant groups |
// +---------------------------+

.L60 b d dz dž g ģ j l ļ m n ņ r v z ž // voiced (loud) consonants
.L60 b d dz dž g ģ j l ļ m n ņ r v z ž // voiced (loud) consonants
.L61 p t k ķ c č s š f h ḩ // unvoiced (silent) consonants
.L62 b c d f g h j k l m n p q r s t v w x z // hard consonants
.L63 č ģ ḩ ķ ļ ņ ŗ š ž // soft consonants
.L64 s t
.L65 t d j s š // root endings for verbs and adjectives
.L66 g ģ
.L66 g ģ
.L67 k ķ
.L68 n r
.L69 k s
@@ -131,13 +131,6 @@
.L93 m l n // vowel (pre)lengtheners — vowels after these are spelled longer than usual
// j is vowel (post)lengthener, but as only one is handled in group j


// +---------------------------+
// | All letters |
// +---------------------------+
// These are used for "." like padding
.L10 a ā b c č d e ē f g ģ h ḩ i ī j k ķ l ļ m n ņ o ō p q r ŗ s š t u ū v w x y z ž

// +---------------------------+
// | Pronunciation groups |
// +---------------------------+
@@ -163,10 +156,12 @@

.group c
c ts
L55) c (L55 ts>
c (C ts_!

.group č
č tS
L55) č (L55 tS>

.group d
d d
@@ -177,20 +172,16 @@
.group e
e e // Default as narrow e
ee e_|e // internationalisms, prefix: ne-e...
ei ei
ei (+ ei

// narrow e ---------------------------------------------------------------------- //
a) e (ro e // internationalsms
a) e (ro e // internationalisms
e (o e
// international consonants
e (L10L10L10L41 e
e (L10L10L41+ e
e (L10L41++ e
e (JL41+ e
e (L41+++ e
L41) e (@ e // narrow only if there is another syllable for ending
L41L10) e (@ e
L41L10L10) e (@< e
L41L10L10L10) e (@<< e
L41) e (@ e // narrow only if there is another syllable for ending
L41J) e (@+ e

// narrowing vowels follow
e (Cinā e
@@ -217,10 +208,7 @@ L41L10L10L10) e (@<< e
e (CL05_ e // 5. decl. words
e (CCL05_< e
e (CCCL05_<< e
e (L10L05_+ e
e (L10L10L05_ e
e (L10L10L10L05_< e
e (L10L10L10L10L05_<< e
e (JL05_+ e

// wide e ------------------------------------------------------------------------ //
// widening consonants follow
@@ -235,15 +223,16 @@ L41L10L10L10) e (@<< e
e (L01_ E
e (L25L01_ E
e (L25L25L01_ E
e (Jam E
L25) e (L18_+ E // vareni etc.

// specific words ----------------------------------------------------------------- //
// āb) e (L77L05_ e // ābele
b) e (dL82L05_ e // bedre
e (lL88 e // ..el[tš]
b) e (dL82L05_ e // bedre
_b) e (t_ e
_b) e (z e
_b) e (z_ e
_C) e (_ e // re, te, ve
_C) e (_ e // re, te, ve
_č) e (trL07_ e // četri
d) ebe (L83 ebe
d) ebe (t ebe
@@ -299,15 +288,11 @@ L41L10L10L10) e (@<< e

// narrow ē ---------------------------------------------------------------------- //
// narrowing international consonants
ē (L10L10L10L41 e:
ē (L10L10L41 e:
ē (L10L41 e:
ē (L41+ e:
L41) ē (@ e: // narrow only if there is another syllable for ending
L41L10) ē (@ e:
L41L10L10) ē (@< e:
L41L10L10L10) ē (@<< e:
L41L10L10L10L10) ē (@<<< e:
ē (JL41+ e:
ē (L41+ e:
L41) ē (@+ e: // narrow only if there is another syllable for ending
L41J) ē (@ e:
// narrowing vowels follow
ē (CCCCL21 e:
ē (CCCL21 e:
@@ -355,10 +340,7 @@ L41L10L10L10L10) ē (@<<< e:
ē (CL05_ e:
ē (CCL05_< e:
ē (CCCL05_<< e:
ē (L10L05_+ e:
ē (L10L10L05_ e:
ē (L10L10L10L05_< e:
ē (L10L10L10L10L05_<< e:
ē (JL05_+ e:


// specific words ------------------------------------------------------------------- //
@@ -370,7 +352,7 @@ L41L10L10L10L10) ē (@<<< e:
ēdē e:de:
ē (kL04_+ E: // ēka
ē (L86L13_ e: // ēst
ē (rkšķ e:
ē (rkšķ e:
ē (tā_ E:
ē (tL55_ E:
izp) ē (t e:
@@ -394,12 +376,13 @@ L41L10L10L10L10) ē (@<<< e:
šķ) ē (L77L05_ e: // šķēle
šķ) ē (L82L05_ e: // šķēre
_tāp) ē (c_ e: // tāpēc
t) ē (L90 e:
t) ē (L90 e:
t) ē (mL04_ e: // ..tēma
v) ērtē e:Rte:

.group f
f f
L55) f (L55 f>

.group g
g g
@@ -409,6 +392,7 @@ L41L10L10L10L10) ē (@<<< e:

.group h
h h
L55) h (L55 h>

.group i
i i
@@ -430,10 +414,12 @@ L41L10L10L10L10) ē (@<<< e:

.group k
k k
L55) k (L55 k>
k (ļ k} // tiny pause between k and ļ

.group ķ
ķ c
L55) ķ (L55 c>
ķ (L55 c} // tiny pause for sharper sound before short vowels

.group l
@@ -473,45 +459,14 @@ L41L10L10L10L10) ē (@<<< e:
L42) o o: // o prefixes
o (L43+ o // o postfixes

// o indicators before
L41L10L10L10L10L10L10L10L10L10) o (<<<<<<<<<< o
L41L10L10L10L10L10L10L10L10) o (<<<<<<<<< o
L41L10L10L10L10L10L10L10) o (<<<<<<< o
L41L10L10L10L10L10L10) o (<<<<<< o
L41L10L10L10L10L10) o (<<<<< o
L41L10L10L10L10) o (<<<< o
L41L10L10L10) o (<<< o
L41L10L10) o (<< o
L41L10) o (< o
L41) o o
// o indicators after
o (L10L10L10L10L10L10L10L41<<<<<< o
o (L10L10L10L10L10L10L41<<<<< o
o (L10L10L10L10L10L41<<<< o
o (L10L10L10L10L41<<< o
o (L10L10L10L41<< o
o (L10L10L41< o
o (L10L41 o
o (L41+ o

// uo indicators before (facultative writing)
L30L10L10L10L10L10L10L10) o (<<<<<<< uo
L30L10L10L10L10L10L10) o (<<<<<< uo
L30L10L10L10L10L10) o (<<<<< uo
L30L10L10L10L10) o (<<<< uo
L30L10L10L10) o (<<< uo
L30L10L10) o (<< uo
L30L10) o (< uo
L30) o uo
// uo indicators after (facultative writing)
o (L10L10L10L10L10L10L30<<<<< uo
o (L10L10L10L10L10L30<<<< uo
o (L10L10L10L10L30<<< uo
o (L10L10L10L30<< uo
o (L10L10L30< uo
o (L10L30 uo
o (L30+ uo
L41J) o (+ o // o indicators before
o (L41+ o // o indicators after
o (JL41+ o

L30J) o uo // uo indicators before (facultative writing)
L30) o (+ uo
o (JL30 uo // uo indicators after (facultative writing)
o (L30+ uo

o uo // words with Latvian roots are more common
o (_++ uo // common ending for Latvian words
@@ -547,27 +502,10 @@ L46L45L45) o (<< uo
otu (_< uotu
o (_$w_alt2++ uo // $alt2 (international) words in lv_list, which are spelled with uo at the end

L41L10L10L10L10L10L10L10) o (lL01_<<<<< o: // [fhqxw]..ols
L41L10L10L10L10L10L10) o (lL01_<<<< o:
L41L10L10L10L10L10) o (lL01_<<< o:
L41L10L10L10L10) o (lL01_<< o:
L41L10L10L10) o (lL01_< o:
L41L10L10) o (lL01_ o:
L41L10) o (lL01_+ o:

oL10L10L10L10L10L10L10) o (lL01_<<<<<< o: // o...ols
oL10L10L10L10L10L10) o (lL01_<<<<< o:
oL10L10L10L10L10) o (lL01_<<<< o:
oL10L10L10L10) o (lL01_<<< o:
oL10L10L10) o (lL01_<< o:
oL10L10) o (lL01_< o:
oL10) o (lL01_ o:

L41J) o (lL01_+ o: // [fhqxw]..ols
oJ) o (lL01_ o: // o...ols
o (L62% o




aer) o o:
agn) o o
agr) o o
@@ -597,7 +535,7 @@ L46L45L45) o (<< uo
br) o (š o
celm) o uo
cet) o o
cēl) o (L80 uo
cēl) o (L80 uo
cikl) o o:
cit) o o
_c) o (_+ o:
@@ -691,7 +629,7 @@ L46L45L45) o (<< uo
miel) o (L65 uo
miel) o uo
migl) o uo
mir) o (L80+ uo
mir) o (L80+ uo
miz) o uo
m) o (L86L14_+ uo // mosties
mon) o o
@@ -1043,7 +981,7 @@ L46L45L45) o (<< uo
v) ol (ej ol
v) ol (t ol
v) ol (u ol
z) ol (it ol
z) ol (it ol
@z) ol (L01_ o:l // ..zols
z) ol (L05_ uol // ..zole

@@ -1138,7 +1076,7 @@ L46L45L45) o (<< uo
hr) on (o on
ikr) on (+ o:n
im) on o>n
ir) on (i< o:n
ir) on (i< o:n
is) on on
itr) on o:n
j) on (i on
@@ -1205,23 +1143,21 @@ L46L45L45) o (<< uo
ž) on (g oN

.group oo
_aut) oo (stL01_ o:uo // autoosta
_aut) oo (stL01_ o:uo // autoosta
k) oo (per o:o
oo (_ oo:
z) oo oo:

.group op

L41L10L10L10sk) op (L01_ o: // ...skops
L41L10L10sk) op (L01_ o:
L41L10sk) op (L01_ o:

L41Jsk) op (L01_ o:p // ...skops
ad) op op
cikl) op (L01_ o:p // ciklops
d) op op
eir) op (L04_ o:p
eir) op op
_gal) op (L01_ op // galops
k) op (pr uop // ..koppr..
k) op (en op // Kopenhāgena
k) op (ēC op // ..kopēt../..kopēš..
k) op (ējL01_ uop // ..kopējs
@@ -1280,7 +1216,7 @@ L41L10L10L10sk) op (L01_ o: // ...skops
ab) or oR
ak) or oR
aleg) or o:R
alg) or oR
alg) or oR
am) or oR
a) or oR
aut) or (i@ oR
@@ -1513,7 +1449,7 @@ L41L10L10L10sk) op (L01_ o: // ...skops
@@) oz (L05_ o:z // ..oze
ozo (lL01_ uozuo // ..ozols
oz (on oz
ozo (_ o:zuo
ozo (_ o:zuo
p) oz oz
_r) oz (eL88L05_ oz // rozete
_r) oz (īL80L05_ oz // rozīne
@@ -1529,8 +1465,8 @@ L41L10L10L10sk) op (L01_ o: // ...skops

.group p
p p
// pus..number stress on next syllable
_) pus (vien p%us
L55) p (L55 p>
_) pus (vien p%us // pus..number stress on next syllable
_) pus (div p%us
_) pus (otr p%us
_) pus (trij p%us
@@ -1554,15 +1490,18 @@ L41L10L10L10sk) op (L01_ o: // ...skops
ŗ r

.group s
sh s_!h_! // probably no need to make distinct in other places
sh s_!h_! // probably no need to make distinct in other places
_) s (L61 s_ // make s distinct at start of the word before unvoiced consonants
s s
L55) s (L55 s>

.group š
š S
L55) š (L55 S>

.group t
t t
L55) t (L55 t>

.group u
ui ui

+ 2
- 3
docs/dictionary.md View File

@@ -120,7 +120,6 @@ those two groups is used.

A `~` entry in a letter group means that the group may also match no letter
at all, at the beginning or end of the word.
Beware of [issue #196](https://github.com/espeak-ng/espeak-ng/issues/196).

_For example:_

@@ -237,7 +236,7 @@ rule with more syllables.
|--------|-------------|
| `&` | A syllable which may be stressed (i.e. is not defined as unstressed). |
| `V` | Matches only if a previous word has indicated that a verb form is expected. |
| `xxJ` | Skip letters until `xx`. Simple `xx` means start of current word. `xx_yy` means `xx` as end of previous and `yy` as start of current word. If necessary more than one `J` can be used. |
| `xxJ` | Skip letters until `xx`. Simple `xx` means start of current word. `xx_yy` means `xx` as end of previous and `yy` as start of current word. If necessary, more than one `J` can be used, and an `Lxx` letter group can be used as the `xx` mark. |

e.g.

@@ -252,7 +251,7 @@ e.g.
|-------------|-------------|
| `+` | Force an increase in the score in this rule by 20 points (may be repeated for more effect). |
| `<` | Force a decrease in the score in this rule by 20 points (may be repeated for more effect). |
| `Jxx` | Skip letters until `xx`. Simple `xx` means end of current word. `xx_yy` means `xx` as end of current and `yy` as start of next word. If necessary more than one `J` can be used. |
| `Jxx` | Skip letters until `xx`. Simple `xx` means end of current word. `xx_yy` means `xx` as end of current and `yy` as start of next word. If necessary, more than one `J` can be used, and an `Lxx` letter group can be used as the `xx` mark. |
| `S<number>` | This number of matching characters are a standard suffix, remove them and retranslate the word. |
| `P<number>` | This number of matching characters are a standard prefix, remove them and retranslate the word. |
| `Lnn` | `nn` is a 2-digit decimal number in the range 01 to 20. Matches any of the letter sequences which have been defined for letter group `nn`. |

+ 46
- 34
src/libespeak-ng/dictionary.c View File

@@ -657,9 +657,22 @@ const char *GetTranslatedPhonemeString(int phoneme_mode)
return phon_out_buf;
}

static int LetterGroupNo(char *rule)
{
	/*
	 * Decode the letter group number stored in the byte at *rule.
	 *
	 * The byte holds the group number offset by 'A'. The difference is
	 * reduced modulo 256 by the unsigned char conversion, so the result is
	 * always in the range 0..255 regardless of whether plain char is
	 * signed or unsigned on the target platform.
	 */
	return (unsigned char)(*rule - 'A');
}

static int IsLetterGroup(Translator *tr, char *word, int group, int pre)
{
/* Match the word against a list of utf-8 strings.
* returns length of matching letter group or -1
*
* How this works:
*
@@ -693,20 +706,11 @@ static int IsLetterGroup(Translator *tr, char *word, int group, int pre)
} else
w = word;

// If no character is allowed in group
// at the start (for pre-rule) or end (post-rule)
// of the checked letter in the word, return true.
if (*p == '~' && *w == ' ') // word end checked because of comment below
return 1;
/* TODO: Need to investigate why word end mark _ doesn't work properly
* for post rule somewhere in MatchRule() function. or e.g.:
*
* .L01 ~ b c
* .group a
* _L01) a i // this works
* a (L01_ u // this doesn't work
*/
// If '~' (no character) is allowed in group, return 0.
if (*p == '~')
return 0;

// Check current group
while ((*p == *w) && (*w != 0)) {
w++;
p++;
@@ -721,7 +725,8 @@ static int IsLetterGroup(Translator *tr, char *word, int group, int pre)
while (*p++ != 0)
;
}
return 0;
// Not found
return -1;
}

static int IsLetter(Translator *tr, int letter, int group)
@@ -1743,7 +1748,7 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
switch (rb)
{
case RULE_LETTERGP:
letter_group = *rule++ - 'A';
letter_group = LetterGroupNo(rule++);
if (IsLetter(tr, letter_w, letter_group)) {
lg_pts = 20;
if (letter_group == 2)
@@ -1754,12 +1759,11 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
failed = 1;
break;
case RULE_LETTERGP2: // match against a list of utf-8 strings
letter_group = *rule++ - 'A';
if (letter_group < 0)
letter_group += 256;
if ((n_bytes = IsLetterGroup(tr, post_ptr-1, letter_group, 0)) > 0) {
letter_group = LetterGroupNo(rule++);
if ((n_bytes = IsLetterGroup(tr, post_ptr-1, letter_group, 0)) >= 0) {
add_points = (20-distance_right);
post_ptr += (n_bytes-1);
if (n_bytes > 0) // move pointer, if non-zero length group was found
post_ptr += (n_bytes-1);
} else
failed = 1;
break;
@@ -1876,15 +1880,18 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
case RULE_SKIPCHARS:
{
// '(Jxy' means 'skip characters until xy'
char *p = post_ptr + letter_xbytes;
char *p2 = p; // pointer to the previous character in the word
int rule_w; // first wide character of skip rule
char *p = post_ptr - 1; // to allow empty jump (without letter between), go one back
char *p2 = p; // pointer to the previous character in the word
int rule_w; // first wide character of skip rule
utf8_in(&rule_w, rule);
while ((letter_w != rule_w) && (letter_w != RULE_SPACE) && (letter_w != 0)) {
int g_bytes = -1; // bytes of successfully found character group
while ((letter_w != rule_w) && (letter_w != RULE_SPACE) && (letter_w != 0) && (g_bytes == -1)) {
if (rule_w == RULE_LETTERGP2)
g_bytes = IsLetterGroup(tr, p, LetterGroupNo(rule + 1), 0);
p2 = p;
p += utf8_in(&letter_w, p);
}
if (letter_w == rule_w)
if ((letter_w == rule_w) || (g_bytes >= 0))
post_ptr = p2;
}
break;
@@ -1949,7 +1956,7 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
switch (rb)
{
case RULE_LETTERGP:
letter_group = *rule++ - 'A';
letter_group = LetterGroupNo(rule++);
if (IsLetter(tr, letter_w, letter_group)) {
lg_pts = 20;
if (letter_group == 2)
@@ -1960,12 +1967,11 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
failed = 1;
break;
case RULE_LETTERGP2: // match against a list of utf-8 strings
letter_group = *rule++ - 'A'; // substracting 'A' makes letter_group equal to number in .Lxx definition
if(letter_group<0)
letter_group += 256;
if ((n_bytes = IsLetterGroup(tr, pre_ptr, letter_group, 1)) > 0) {
letter_group = LetterGroupNo(rule++);
if ((n_bytes = IsLetterGroup(tr, pre_ptr, letter_group, 1)) >= 0) {
add_points = (20-distance_right);
pre_ptr -= (n_bytes-1);
if (n_bytes > 0) // move pointer, if non-zero length group was found
pre_ptr -= (n_bytes-1);
} else
failed = 1;
break;
@@ -2079,18 +2085,24 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_

case RULE_SKIPCHARS: {
// 'xyJ)' means 'skip characters backwards until xy'
char *p = pre_ptr; // pointer to current character in word
char *p2 = p; // pointer to previous character in word
char *p = pre_ptr + 1; // to allow empty jump (without letter between), go one forward
char *p2 = p; // pointer to previous character in word
int g_bytes = -1; // bytes of successfully found character group

while ((*p != *rule) && (*p != RULE_SPACE) && (*p != 0)) {
while ((*p != *rule) && (*p != RULE_SPACE) && (*p != 0) && (g_bytes == -1)) {
p2 = p;
p--;
if (*rule == RULE_LETTERGP2)
g_bytes = IsLetterGroup(tr, p2, LetterGroupNo(rule + 1), 1);
}

// if succeed, set pre_ptr to next character after 'xy' and remaining
// 'xy' part is checked as usual in following cycles of PRE rule characters
if (*p == *rule)
pre_ptr = p2;
if (g_bytes >= 0)
pre_ptr = p2 + 1;

}
break;


Loading…
Cancel
Save