9 years ago · fe3e3350e4
--- a/dictsource/lv_rules
+++ b/dictsource/lv_rules
@@ -4,7 +4,7 @@

 .replace
      ó    ȯ                    // replace o-acute with o-dot, as it is more logical for "short o"

      ḩ    h
 // +---------------------------+
 // |     Suffixes of nouns     |
 // +---------------------------+
@@ -49,7 +49,7 @@
 .L22  č ģ j k ļ ņ ŗ š ž dž bj pj mj vj // consonant narrowers of e
 .L23  s i m                            // remaining ending of the 2. and 5. decl. words
 .L27  č e ē f ģ h ḩ i ī j k ļ ņ q ŗ š w x y ž ei ie iu // all narrowers of e
 // Also: .L41 — international consonants and diphthongs as a narrowers of e/ē 
 // Also: .L41 — international consonants and diphthongs as a narrowers of e/ē

 // E wideners
 .L24  a ā u ū ai au                    // vowel wideners of e
@@ -57,7 +57,7 @@
                                       // but it is mixed with ȯ and ō
 .L25  b c d g ķ l m n p r s t v z dz   // consonant wideners of e
 .L28  a ā b c d e ē g ķ l m n p r s t u ū v z ai au // all wideners of e
 


 // +---------------------------+
 // |     Pronunciation of o    |
@@ -92,13 +92,13 @@
 // |    Consonant groups       |
 // +---------------------------+

 .L60  b d dz dž g ģ j l ļ m n ņ r v z ž // voiced (loud) consonants 
 .L60  b d dz dž g ģ j l ļ m n ņ r v z ž // voiced (loud) consonants
 .L61  p t k ķ c č s š f h ḩ             // unvoiced (silent) consonants
 .L62  b c d f g h j k l m n p q r s t v w x z // hard consonants
 .L63  č ģ ḩ ķ ļ ņ ŗ š ž                       // soft consonants
 .L64  s t
 .L65  t d j s š                              // root endings for verbs and adjectives
 .L66  g ģ       
 .L66  g ģ
 .L67  k ķ
 .L68  n r
 .L69  k s
@@ -131,13 +131,6 @@
 .L93  m l n     // vowel (pre)lengtheners — vowels after these are spelled longer than usual
                // j is vowel (post)lengthener, but as only one is handled in group j


 // +---------------------------+
 // |        All letters        |
 // +---------------------------+
 // These are used for "." like padding
 .L10 a ā b c č d e ē f g ģ h ḩ i ī j k ķ l ļ m n ņ o ō p q r ŗ s š t u ū v w x y z ž

 // +---------------------------+
 // |   Pronunciation groups    |
 // +---------------------------+
@@ -163,10 +156,12 @@

 .group c
           c               ts
      L55) c (L55          ts>
           c (C            ts_!

 .group č
           č               tS
      L55) č (L55          tS>

 .group d
           d               d
@@ -177,20 +172,16 @@
 .group e
           e               e         // Default as narrow e
           ee              e_|e      // internationalisms, prefix: ne-e...
           ei              ei
           ei (+           ei

    // narrow e ---------------------------------------------------------------------- //
           a) e (ro            e     // internationalsms
           a) e (ro            e     // internationalisms
           e (o                e
                                     // international consonants
           e (L10L10L10L41     e
           e (L10L10L41+       e
           e (L10L41++         e
           e (JL41+            e
           e (L41+++           e
         L41) e (@             e     // narrow only if there is another syllable for ending
      L41L10) e (@             e
   L41L10L10) e (@<            e
 L41L10L10L10) e (@<<           e
      L41) e (@                e     // narrow only if there is another syllable for ending
     L41J) e (@+               e

                                     // narrowing vowels follow
           e (Cinā             e
@@ -217,10 +208,7 @@ L41L10L10L10) e (@<<           e
           e (CL05_            e     // 5. decl. words
           e (CCL05_<          e
           e (CCCL05_<<        e
           e (L10L05_+         e
           e (L10L10L05_       e
           e (L10L10L10L05_<   e
           e (L10L10L10L10L05_<< e
           e (JL05_+           e

    // wide e ------------------------------------------------------------------------ //
                                     // widening consonants follow
@@ -235,15 +223,16 @@ L41L10L10L10) e (@<<           e
           e (L01_             E
           e (L25L01_          E
           e (L25L25L01_       E
           e (Jam              E
      L25) e (L18_+            E     // vareni etc.

    // specific words ----------------------------------------------------------------- //
 //       āb) e (L77L05_          e         // ābele
        b) e (dL82L05_         e         // bedre  
           e (lL88             e         // ..el[tš]
        b) e (dL82L05_         e         // bedre
       _b) e (t_               e
       _b) e (z                e
       _b) e (z_               e
       _C) e (_                e         // re, te, ve 
       _C) e (_                e         // re, te, ve
       _č) e (trL07_           e         // četri
        d) ebe (L83            ebe
        d) ebe (t              ebe
@@ -299,15 +288,11 @@ L41L10L10L10) e (@<<           e

    // narrow ē ---------------------------------------------------------------------- //
                                     // narrowing international consonants
              ē (L10L10L10L41  e:
              ē (L10L10L41     e:
              ē (L10L41        e:
              ē (L41+          e:
         L41) ē (@             e:    // narrow only if there is another syllable for ending
      L41L10) ē (@             e:
   L41L10L10) ē (@<            e:
 L41L10L10L10) ē (@<<           e:
 L41L10L10L10L10) ē (@<<<       e:
              ē (JL41+         e:
              ē (L41+          e:
         L41) ē (@+            e:    // narrow only if there is another syllable for ending
        L41J) ē (@             e:
                                     // narrowing vowels follow
           ē (CCCCL21          e:
           ē (CCCL21           e:
@@ -355,10 +340,7 @@ L41L10L10L10L10) ē (@<<<       e:
           ē (CL05_            e:
           ē (CCL05_<          e:
           ē (CCCL05_<<        e:
           ē (L10L05_+         e:
           ē (L10L10L05_       e:
           ē (L10L10L10L05_<   e:
           ē (L10L10L10L10L05_<< e:
           ē (JL05_+           e:


    // specific words ------------------------------------------------------------------- //
@@ -370,7 +352,7 @@ L41L10L10L10L10) ē (@<<<       e:
           ēdē                 e:de:
           ē (kL04_+           E:    // ēka
           ē (L86L13_          e:    // ēst
           ē (rkšķ             e:    
           ē (rkšķ             e:
           ē (tā_              E:
           ē (tL55_            E:
      izp) ē (t                e:
@@ -394,12 +376,13 @@ L41L10L10L10L10) ē (@<<<       e:
       šķ) ē (L77L05_          e:    // šķēle
       šķ) ē (L82L05_          e:    // šķēre
     _tāp) ē (c_               e:    // tāpēc
        t) ē (L90              e: 
        t) ē (L90              e:
        t) ē (mL04_            e:    // ..tēma
        v) ērtē                e:Rte:

 .group f
           f               f
      L55) f (L55          f>

 .group g
           g               g
@@ -409,6 +392,7 @@ L41L10L10L10L10) ē (@<<<       e:

 .group h
           h               h
      L55) h (L55          h>

 .group i
           i               i
@@ -430,10 +414,12 @@ L41L10L10L10L10) ē (@<<<       e:

 .group k
           k               k
      L55) k (L55          k>
           k (ļ            k}       // tiny pause between k and ļ

 .group ķ
           ķ               c
      L55) ķ (L55          c>
           ķ (L55          c}       // tiny pause for sharper sound before short vowels

 .group l
@@ -473,45 +459,14 @@ L41L10L10L10L10) ē (@<<<       e:
      L42) o               o:        // o prefixes
           o (L43+         o         // o postfixes

                                     // o indicators before
 L41L10L10L10L10L10L10L10L10L10)   o (<<<<<<<<<< o
 L41L10L10L10L10L10L10L10L10)      o (<<<<<<<<<  o
 L41L10L10L10L10L10L10L10)         o (<<<<<<<    o
 L41L10L10L10L10L10L10)            o (<<<<<<     o
 L41L10L10L10L10L10)               o (<<<<<      o
   L41L10L10L10L10)               o (<<<<       o
     L41L10L10L10)                o (<<<        o
        L41L10L10)                o (<<         o
           L41L10)                o (<          o
              L41)                o             o
                                     // o indicators after
           o (L10L10L10L10L10L10L10L41<<<<<<    o
           o (L10L10L10L10L10L10L41<<<<<        o
           o (L10L10L10L10L10L41<<<<            o
           o (L10L10L10L10L41<<<                o
           o (L10L10L10L41<<                    o
           o (L10L10L41<                        o
           o (L10L41                            o
           o (L41+                              o

                                     // uo indicators before (facultative writing)
 L30L10L10L10L10L10L10L10)         o (<<<<<<<    uo
 L30L10L10L10L10L10L10)            o (<<<<<<     uo
 L30L10L10L10L10L10)               o (<<<<<      uo
   L30L10L10L10L10)               o (<<<<       uo
     L30L10L10L10)                o (<<<        uo
        L30L10L10)                o (<<         uo
           L30L10)                o (<          uo
              L30)                o             uo
                                     // uo indicators after (facultative writing)
           o (L10L10L10L10L10L10L30<<<<<        uo
           o (L10L10L10L10L10L30<<<<            uo
           o (L10L10L10L10L30<<<                uo
           o (L10L10L10L30<<                    uo
           o (L10L10L30<                        uo
           o (L10L30                            uo
           o (L30+                              uo
     L41J) o (+            o         // o indicators before
           o (L41+         o         // o indicators after
           o (JL41+        o

     L30J) o               uo        // uo indicators before (facultative writing)
      L30) o (+            uo
           o (JL30         uo        // uo indicators after (facultative writing)
           o (L30+         uo

           o               uo        // words with Latvian roots are more common
           o (_++          uo        // common ending for Latvian words
@@ -547,27 +502,10 @@ L46L45L45) o (<<           uo
           otu (_<         uotu
           o (_$w_alt2++   uo        // $alt2 (international) words in lv_list, which are spelled with uo at the end

 L41L10L10L10L10L10L10L10) o (lL01_<<<<< o: //  [fhqxw]..ols
    L41L10L10L10L10L10L10) o (lL01_<<<<  o:
       L41L10L10L10L10L10) o (lL01_<<<   o:
          L41L10L10L10L10) o (lL01_<<    o:
             L41L10L10L10) o (lL01_<     o:
                L41L10L10) o (lL01_      o:
                   L41L10) o (lL01_+     o:

   oL10L10L10L10L10L10L10) o (lL01_<<<<<< o: //  o...ols
      oL10L10L10L10L10L10) o (lL01_<<<<<  o:
         oL10L10L10L10L10) o (lL01_<<<<   o:
            oL10L10L10L10) o (lL01_<<<    o:
               oL10L10L10) o (lL01_<<     o:
                  oL10L10) o (lL01_<      o:
                     oL10) o (lL01_       o:

     L41J) o (lL01_+       o:        //  [fhqxw]..ols
       oJ) o (lL01_        o:        //  o...ols
           o (L62%         o




      aer) o               o:
      agn) o               o
      agr) o               o
@@ -597,7 +535,7 @@ L46L45L45) o (<<           uo
       br) o (š            o
     celm) o               uo
      cet) o               o
      cēl) o (L80          uo 
      cēl) o (L80          uo
     cikl) o               o:
      cit) o               o
       _c) o (_+           o:
@@ -691,7 +629,7 @@ L46L45L45) o (<<           uo
     miel) o (L65          uo
     miel) o               uo
     migl) o               uo
      mir) o (L80+         uo             
      mir) o (L80+         uo
      miz) o               uo
        m) o (L86L14_+     uo        // mosties
      mon) o               o
@@ -1043,7 +981,7 @@ L46L45L45) o (<<           uo
        v) ol (ej          ol
        v) ol (t           ol
        v) ol (u           ol
        z) ol (it          ol 
        z) ol (it          ol
       @z) ol (L01_        o:l       // ..zols
        z) ol (L05_        uol      // ..zole

@@ -1138,7 +1076,7 @@ L46L45L45) o (<<           uo
       hr) on (o           on
      ikr) on (+           o:n
       im) on              o>n
       ir) on (i<          o:n             
       ir) on (i<          o:n
       is) on              on
      itr) on              o:n
        j) on (i           on
@@ -1205,23 +1143,21 @@ L46L45L45) o (<<           uo
        ž) on (g           oN

 .group oo
     _aut) oo (stL01_      o:uo   // autoosta
     _aut) oo (stL01_      o:uo      // autoosta
        k) oo (per         o:o
           oo (_           oo:
        z) oo              oo:

 .group op

 L41L10L10L10sk) op (L01_   o:        //  ...skops
   L41L10L10sk) op (L01_   o:
      L41L10sk) op (L01_   o:

   L41Jsk) op (L01_        o:p       //  ...skops
       ad) op              op
     cikl) op (L01_        o:p       // ciklops
        d) op              op
      eir) op (L04_        o:p
      eir) op              op
     _gal) op (L01_        op        // galops
        k) op (pr          uop       // ..koppr..
        k) op (en          op        // Kopenhāgena
        k) op (ēC          op        // ..kopēt../..kopēš..
        k) op (ējL01_      uop       // ..kopējs
@@ -1280,7 +1216,7 @@ L41L10L10L10sk) op (L01_   o:        //  ...skops
       ab) or              oR
       ak) or              oR
     aleg) or              o:R
      alg) or              oR 
      alg) or              oR
       am) or              oR
        a) or              oR
      aut) or (i@          oR
@@ -1513,7 +1449,7 @@ L41L10L10L10sk) op (L01_   o:        //  ...skops
      @@)  oz (L05_        o:z       // ..oze
           ozo (lL01_      uozuo     // ..ozols
           oz (on          oz
           ozo (_          o:zuo 
           ozo (_          o:zuo
        p) oz              oz
       _r) oz (eL88L05_    oz        // rozete
       _r) oz (īL80L05_    oz        // rozīne
@@ -1529,8 +1465,8 @@ L41L10L10L10sk) op (L01_   o:        //  ...skops

 .group p
           p               p
                                     // pus..number stress on next syllable
        _) pus (vien       p%us
      L55) p (L55          p>   
        _) pus (vien       p%us      // pus..number stress on next syllable
        _) pus (div        p%us
        _) pus (otr        p%us
        _) pus (trij       p%us
@@ -1554,15 +1490,18 @@ L41L10L10L10sk) op (L01_   o:        //  ...skops
           ŗ               r

 .group s
           sh              s_!h_!    // probably no need to make distinct in other places 
           sh              s_!h_!    // probably no need to make distinct in other places
        _) s (L61          s_        // make s distinct at start of the word before unvoiced consonants
           s               s
      L55) s (L55          s>

 .group š
           š               S
      L55) š (L55          S>

 .group t
           t               t
      L55) t (L55          t>

 .group u
           ui              ui
--- a/docs/dictionary.md
+++ b/docs/dictionary.md
@@ -120,7 +120,6 @@ those two groups is used.

 `~` as a letter in a letter group means that the group may also match no letter
    at the beginning or end of the word.
    Beware of [issue #196](https://github.com/espeak-ng/espeak-ng/issues/196).

 _For example:_

@@ -237,7 +236,7 @@ rule with more syllables.
 |--------|-------------|
 | `&`    | A syllable which may be stressed (i.e. is not defined as unstressed). |
 | `V`    | Matches only if a previous word has indicated that a verb form is expected. |
 | `xxJ`  | Skip letters until `xx`. Simple `xx` means start of current word. `xx_yy` means `xx` as end of previous and `yy` as start of current word. If necessary more than one `J` can be used. |
 | `xxJ`  | Skip letters until `xx`. Simple `xx` means start of current word. `xx_yy` means `xx` as end of previous and `yy` as start of current word. If necessary, more than one `J` can be used, and an `Lxx` letter group can be used as the `xx` mark. |

 e.g.

@@ -252,7 +251,7 @@ e.g.
 |-------------|-------------|
 | `+`         | Force an increase in the score in this rule by 20 points (may be repeated for more effect). |
 | `<`         | Force a decrease in the score in this rule by 20 points (may be repeated for more effect). |
 | `Jxx`       | Skip letters until `xx`. Simple `xx` means end of current word. `xx_yy` means `xx` as end of current and `yy` as start of next word. If necessary more than one `J` can be used. |
 | `Jxx`       | Skip letters until `xx`. Simple `xx` means end of current word. `xx_yy` means `xx` as end of current and `yy` as start of next word. If necessary, more than one `J` can be used, and an `Lxx` letter group can be used as the `xx` mark. |
 | `S<number>` | This number of matching characters are a standard suffix, remove them and retranslate the word. |
 | `P<number>` | This number of matching characters are a standard prefix, remove them and retranslate the word. |
 | `Lnn`       | `nn` is a 2-digit decimal number in the range 01 to 20. Matches any of the letter sequences which have been defined for letter group `nn`. |
--- a/src/libespeak-ng/dictionary.c
+++ b/src/libespeak-ng/dictionary.c
@@ -657,9 +657,22 @@ const char *GetTranslatedPhonemeString(int phoneme_mode)
 	return phon_out_buf;
 }

 static int LetterGroupNo(char *rule)
 {
 	/*
 	 * Decodes the letter group number from the byte that rule points at.
 	 *
 	 * Subtracting 'A' from that byte makes the result equal to the number
 	 * in the .Lxx definition. A negative difference is wrapped around by
 	 * adding 256, so the returned value is never negative.
 	 */
 	int offset = *rule - 'A';
 	return (offset < 0) ? (offset + 256) : offset;
 }

 static int IsLetterGroup(Translator *tr, char *word, int group, int pre)
 {
 	/* Match the word against a list of utf-8 strings.
 	 * returns length of matching letter group or -1
 	 *
 	 * How this works:
 	 *
@@ -693,20 +706,11 @@ static int IsLetterGroup(Translator *tr, char *word, int group, int pre)
 		} else
 			w = word;

 		// If no character is allowed in group
 		// at the start (for pre-rule) or end (post-rule)
 		// of the checked letter in the word, return true.
 		if (*p == '~' && *w == ' ') // word end checked because of comment below
 			return 1;
 		/* TODO: Need to investigate why word end mark _ doesn't work properly
 		 * for post rule somewhere in MatchRule() function. or e.g.:
 		 *
 		 * .L01 ~ b c
 		 * .group a
 		 *  _L01) a       i  // this works
 		 *        a (L01_ u  // this doesn't work
 		 */
 		// If '~' (no character) is allowed in group, return 0.
 		if (*p == '~')
 			return 0;

 		//  Check current group
 		while ((*p == *w) && (*w != 0)) {
 			w++;
 			p++;
@@ -721,7 +725,8 @@ static int IsLetterGroup(Translator *tr, char *word, int group, int pre)
 		while (*p++ != 0)
 			;
 	}
 	return 0;
 	// Not found
 	return -1;
 }

 static int IsLetter(Translator *tr, int letter, int group)
@@ -1743,7 +1748,7 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
 				switch (rb)
 				{
 				case RULE_LETTERGP:
 					letter_group = *rule++ - 'A';
 					letter_group = LetterGroupNo(rule++);
 					if (IsLetter(tr, letter_w, letter_group)) {
 						lg_pts = 20;
 						if (letter_group == 2)
@@ -1754,12 +1759,11 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
 						failed = 1;
 					break;
 				case RULE_LETTERGP2: // match against a list of utf-8 strings
 					letter_group = *rule++ - 'A';
 					if (letter_group < 0)
 						letter_group += 256;
 					if ((n_bytes = IsLetterGroup(tr, post_ptr-1, letter_group, 0)) > 0) {
 					letter_group = LetterGroupNo(rule++);
 					if ((n_bytes = IsLetterGroup(tr, post_ptr-1, letter_group, 0)) >= 0) {
 						add_points = (20-distance_right);
 						post_ptr += (n_bytes-1);
 						if (n_bytes > 0) // move pointer, if non-zero length group was found
 							post_ptr += (n_bytes-1);
 					} else
 						failed = 1;
 					break;
@@ -1876,15 +1880,18 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
 				case RULE_SKIPCHARS:
 				{
 					// '(Jxy'  means 'skip characters until xy'
 					char *p = post_ptr + letter_xbytes;
 					char *p2 = p; // pointer to the previous character in the word
 					int rule_w;   // first wide character of skip rule
 					char *p = post_ptr - 1; // to allow empty jump (without letter between), go one back
 					char *p2 = p;		// pointer to the previous character in the word
 					int rule_w;		// first wide character of skip rule
 					utf8_in(&rule_w, rule);
 					while ((letter_w != rule_w) && (letter_w != RULE_SPACE) && (letter_w != 0)) {
 					int g_bytes = -1;	// bytes of successfully found character group
 					while ((letter_w != rule_w) && (letter_w != RULE_SPACE) && (letter_w != 0) && (g_bytes == -1)) {
 						if (rule_w == RULE_LETTERGP2)
 							g_bytes = IsLetterGroup(tr, p, LetterGroupNo(rule + 1), 0);
 						p2 = p;
 						p += utf8_in(&letter_w, p);
 					}
 					if (letter_w == rule_w)
 					if ((letter_w == rule_w) || (g_bytes >= 0))
 						post_ptr = p2;
 				}
 					break;
@@ -1949,7 +1956,7 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
 				switch (rb)
 				{
 				case RULE_LETTERGP:
 					letter_group = *rule++ - 'A';
 					letter_group = LetterGroupNo(rule++);
 					if (IsLetter(tr, letter_w, letter_group)) {
 						lg_pts = 20;
 						if (letter_group == 2)
@@ -1960,12 +1967,11 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_
 						failed = 1;
 					break;
 				case RULE_LETTERGP2: // match against a list of utf-8 strings
 					letter_group = *rule++ - 'A'; // substracting 'A' makes letter_group equal to number in .Lxx definition
 					if(letter_group<0)
 						letter_group += 256;
 					if ((n_bytes = IsLetterGroup(tr, pre_ptr, letter_group, 1)) > 0) {
 					letter_group = LetterGroupNo(rule++);
 					if ((n_bytes = IsLetterGroup(tr, pre_ptr, letter_group, 1)) >= 0) {
 						add_points = (20-distance_right);
 						pre_ptr -= (n_bytes-1);
 							if (n_bytes > 0)  // move pointer, if non-zero length group was found
 								pre_ptr -= (n_bytes-1);
 					} else
 						failed = 1;
 					break;
@@ -2079,18 +2085,24 @@ static void MatchRule(Translator *tr, char *word[], char *word_start, int group_

 				case RULE_SKIPCHARS: {
 					// 'xyJ)'  means 'skip characters backwards until xy'
 					char *p = pre_ptr;  // pointer to current character in word
 					char *p2 = p;       // pointer to previous character in word
 					char *p = pre_ptr + 1;	// to allow empty jump (without letter between), go one forward
 					char *p2 = p;		// pointer to previous character in word
 					int g_bytes = -1;	// bytes of successfully found character group

 					while ((*p != *rule) && (*p != RULE_SPACE) && (*p != 0)) {
 					while ((*p != *rule) && (*p != RULE_SPACE) && (*p != 0) && (g_bytes == -1)) {
 						p2 = p;
 						p--;
 						if (*rule == RULE_LETTERGP2)
 							g_bytes = IsLetterGroup(tr, p2, LetterGroupNo(rule + 1), 1);
 					}

 					// if succeed, set pre_ptr to next character after 'xy' and remaining
 					// 'xy' part is checked as usual in following cycles of PRE rule characters
 					if (*p == *rule)
 						pre_ptr = p2;
 					if (g_bytes >= 0)
 						pre_ptr = p2 + 1;

 				}
 					break;