7 years ago · c8f84f1f46
--- a/dictsource/en_rules
+++ b/dictsource/en_rules
@@ -50,6 +50,8 @@
   ﬀ   ff     // ligatures
   ﬁ   fi
   ﬂ   fl
   ﬅ   ft
   ﬆ   st

   ά   α      // some of polytonic greek to plain greek letters (list in el_rules is much longer)
   έ   ε
--- a/docs/dictionary.md
+++ b/docs/dictionary.md
@@ -509,9 +509,9 @@ each language. The number fragments are given in the `*_list` file.
 ## Character Substitution

 Character substitutions can be specified by using a `.replace` section
 at the start of the `*_rules` file. In each line one character can be
 replaced by one or two characters. (Source and target of replacement can consume
 up to four bytes.) This substitution is done to a word _before_ word is searched
 at the start of the `*_rules` file. In each line several characters can be
 replaced by several other characters. (Source and target of replacement can consume
 up to 40 bytes.) This substitution is done to a word _before_ the word is searched
 in `*_list` or `*_listx` file and translated using the spelling-to-phoneme rules.
 Only the lower-case version of the characters needs to be specified. e.g.:

--- a/src/libespeak-ng/compiledict.c
+++ b/src/libespeak-ng/compiledict.c
@@ -1448,30 +1448,53 @@ static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_t
 			break;
 		case 2: //  .replace
 		{
 			int replace1;
 			int replace2;
 			int from[N_WORD_BYTES + 1] = { 0 }; // Size of N_WORD_BYTES + null byte
 			int to[N_WORD_BYTES + 1] = { 0 };
 			char *p;

 			p = buf;
 			replace1 = 0;
 			replace2 = 0;
 			while (isspace2(*p)) p++;

 			while (isspace2(*p)) // skip spaces in xx_rules file
 				p++;

 			ix = 0;
 			while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
 				p += utf8_in(&c, p);
 				replace1 += (c << ix);
 				ix += 16;
 			while ((unsigned char) (*p) > ' ') { // prepare 'from' characters
 				p += utf8_in(&from[ix], p);
 				ix++;
 			}
 			while (isspace2(*p)) p++;
 			while (isspace2(*p)) // skip spaces again
 				p++;
 			ix = 0;
 			while ((unsigned char)(*p) > 0x20) {
 				p += utf8_in(&c, p);
 				replace2 += (c << ix);
 				ix += 16;
 			while ((unsigned char) (*p) > ' ') {  // prepare 'to' characters
 				p += utf8_in(&to[ix], p);
 				ix++;
 			}
 			if (replace1 != 0) {
 				Write4Bytes(f_out, replace1); // write as little-endian
 				Write4Bytes(f_out, replace2); // if big-endian, reverse the bytes in LoadDictionary()
 			// save into file
 			if (from[0] != 0) {
 				ix = 0;
 				while (from[ix] != 0) {
 					Write4Bytes(f_out, from[ix]);
 					ix++;
 					if (ix > N_WORD_BYTES / 4) {
 						fprintf(stderr,
 								"Too long .replace 'from' part (limit is: %d characters)\n",
 								N_WORD_BYTES / 4);
 						break;
 					}
 				}
 				Write4Bytes(f_out, ' '); // end of 'from'
 				ix = 0;
 				while (to[ix] != 0) {
 					Write4Bytes(f_out, to[ix]);
 					ix++;
 					if (ix > N_WORD_BYTES / 4) {
 						fprintf(stderr,
 								"Too long .replace 'to' part (limit is: %d characters)\n",
 								N_WORD_BYTES / 4);
 						break;
 					}
 				}
 				Write4Bytes(f_out, ' '); // end of 'to'
 			}
 		}
 			break;
--- a/src/libespeak-ng/dictionary.c
+++ b/src/libespeak-ng/dictionary.c
@@ -152,7 +152,7 @@ static void InitGroups(Translator *tr)
 			pw = (unsigned int *)(((intptr_t)p+4) & ~3); // advance to next word boundary
 			tr->langopts.replace_chars = pw;
 			while (pw[0] != 0)
 				pw += 2; // find the end of the replacement list, each entry is 2 words.
 				pw ++; // find the end of the replacement list
 			p = (char *)(pw+1);

 #ifdef ARCH_BIG
--- a/src/libespeak-ng/translate.c
+++ b/src/libespeak-ng/translate.c
@@ -1790,81 +1790,158 @@ static int EmbeddedCommand(unsigned int *source_index_out)
 	return 1;
 }

 // handle .replace rule in xx_rules file
 static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
 // Step through the .replace list and select the next 'from'/'to' pair.
 //
 // As the scanning below implies, the list is a run of int character codes in
 // which every 'from' group and every 'to' group is followed by ' ' and the
 // whole list is terminated by 0.
 //
 //   from       in:  the current 'from' group
 //              out: the 'from' group selected by this call
 //   to         out: start of the 'to' group that follows *from
 //   next_step  false on the first call (*from already points at the first
 //              group); true afterwards, so that the current 'from' and 'to'
 //              groups are skipped first
 //
 // Returns true when a pair with a non-empty 'to' part was found, false when
 // the terminating 0 was reached.
 //
 // NOTE(review): this is a diff view. The statements below that use c, tr,
 // replace_chars, ignore_next, c_lower and upper_case are not declared for or
 // used by the pointer-walking logic; they look like removed lines of the
 // former SubstituteChar() shown interleaved with the new body -- confirm
 // against the resulting file.
 static bool GetNextTranslateChars(const int **from, const int **to, bool next_step)
 {
 	int ix;
 	unsigned int word;
 	unsigned int new_c, c2 = ' ', c_lower;
 	int upper_case = 0;
 	static bool ignore_next = false;
 	const unsigned int *replace_chars;

 	if (ignore_next) {
 		ignore_next = false;
 		return 8;
 	// Set pointers to next 'from' and 'to' part for replacement
 	// next_step indicates second and succeeding replacement group
 	// Return true if got data, false, if reached end
 	if (next_step) {                        // first 'from' is pointed already
 		int spaces = 0;
 		// The pointer is advanced before it is tested, so the ' ' that ends the
 		// current 'from' and the ' ' that ends its 'to' are the two separators
 		// counted here; the loop also stops early on the terminating 0.
 		while (**from != 0 && spaces < 2) { // Get next 'from' position skipping 'to' part
 			(*from)++;
 			if (**from == ' ')
 				spaces++;
 		}
 		while (**from == ' ') // skip remaining spaces, if exists
 			(*from)++;
 	}
 	if (c == 0) return 0;
 	if (**from == 0)          // terminating 0: no further replacement groups
 		return false;

 	if ((replace_chars = tr->langopts.replace_chars) == NULL)
 		return c;
    *to = *from;              // start seeking 'to' from 'from' position
 	while (**to != 0 && **to != ' ')   // get next 'to' position
 		(*to)++;

 	// there is a list of character codes to be substituted with alternative codes
 	while (**to == ' ')       // skip remaining spaces, if exists
 		(*to)++;

 	if (iswupper(c_lower = c)) {
 		c_lower = towlower2(c, tr);
 		upper_case = 1;
 	// false when the list ended before any 'to' character was found
 	return **to != 0;
 }

 // Overwrite one matched 'from' sequence in the clause buffer with its 'to'
 // sequence.
 //
 //   matchstart  address of the pointer to the first byte of the match; the
 //               pointer itself is only read, the bytes it points at are
 //               rewritten
 //   from        'from' group as int character codes, ended by ' '
 //   to          'to' group as int character codes, ended by ' ' or 0
 //   upper_case  non-zero to pass the first byte of the replacement through
 //               toupper()
 //
 // Also prints a "Replace" trace line to f_trans when
 // espeakPHONEMES_TRACE is set in option_phonemes.
 //
 // NOTE(review): 'source' is used below for bounds checks but is not a
 // parameter or local of this function -- presumably a file-scope clause
 // buffer; confirm it is the same buffer that *matchstart points into.
 static void ReplaceMatchingChars(char **matchstart, const int *from, const int *to, int upper_case)
 {
 	// Replace characters from 'from' to 'to' group in source clause, starting
 	// with place indicated in matchstart.
 	// Rewrites remaining part of source, if length of 'from' and 'to' groups differs.

 	// NOTE(review): the copy always reads N_TR_SOURCE + 40 bytes starting at
 	// *matchstart, wherever the match lies; if the buffer behind *matchstart is
 	// itself N_TR_SOURCE + 40 bytes, a match past its first byte reads beyond
 	// its end -- confirm the buffer size.
 	char tmpsource[N_TR_SOURCE + 40]; // Prepare working copy of source
 	memcpy(&tmpsource, *matchstart, N_TR_SOURCE + 40);

 	// prepare from part
 	// Each int code is converted with utf8_out() (presumably to UTF-8, returning
 	// the number of bytes written); usedfrom ends up as the byte length of the
 	// 'from' text. Note this loop stops only on ' ', not on 0.
 	char charfrom[N_WORD_BYTES + 1] = { 0 };
 	int usedfrom = 0;
 	int const *pintfrom = from;
 	char *pcharfrom = charfrom;
 	while (*pintfrom != ' ') {
 		usedfrom += utf8_out(*pintfrom, pcharfrom + usedfrom);
 		pintfrom++;
 	}

 	// prepare to part
 	// Same conversion for the 'to' group; usedto is its byte length.
 	char charto[N_WORD_BYTES + 1] = { 0 };
 	int usedto = 0;
 	int const *pintto = to;
 	char *pcharto = charto;
 	while (*pintto != 0 && *pintto != ' ') {
 		usedto += utf8_out(*pintto, pcharto + usedto);
 		pintto++;
 	}

 	// Set up replacement in temporary buffer
 	char *psource = *matchstart;
 	char *ptmpsource = tmpsource;
 	pcharto = charto;

 	// Do replacement writing 'to' part in buffer
 	// toupper() is applied to a single byte of the converted 'to' text, so it
 	// can only affect a first character that occupies one byte.
 	while (*pcharto != 0 ) {
 		if(upper_case) {        // TODO: should check for capital letter from source (maybe using word_flags)
 			*ptmpsource = toupper(*pcharto);
 			upper_case = 0;     // capitalize only first character if necessary (will work in most cases)
 		}
 		else
 			*ptmpsource = *pcharto;
 		pcharto++;
 		ptmpsource++;
 	}

 	// If 'to' part is different than 'from' part, fix remaining part of the buffer.
 	// (Read after 'from' in source, write after 'to' in buffer.)
 	// Don't touch last 40 bytes of source, which could contain control data.
 	// When both parts have the same byte length nothing needs to move: the
 	// working copy already holds the original tail from the memcpy above.
 	// NOTE(review): the loop stops at the source's 0 byte without storing a
 	// terminator in tmpsource; when 'to' is shorter than 'from', bytes left
 	// over from the initial memcpy follow the copied tail and the write-back
 	// below continues through them -- confirm this is intended.
 	if (usedfrom != usedto) {
 		psource = *matchstart + usedfrom;
 		while (*psource != 0 && psource < &source[N_TR_SOURCE]
 				&& ptmpsource < &tmpsource[N_TR_SOURCE]) {
 			*ptmpsource = *psource;
 			ptmpsource++;
 			psource++;
 		}
 	}

 	// NOTE(review): the next six lines use new_c, ix, word, replace_chars and
 	// c_lower, none of which are declared in this function; they look like
 	// removed SubstituteChar() lines shown by the diff view -- confirm.
 	new_c = 0;
 	for (ix = 0; (word = replace_chars[ix]) != 0; ix += 2) {
 		if (c_lower == (word & 0xffff)) {
 			if ((word >> 16) == 0) {
 				new_c = replace_chars[ix+1];
 				break;
 	// Write temporary buffer back to source
 	// Copies from the start of the match up to the first 0 byte in the working
 	// copy, bounded by N_TR_SOURCE on both buffers; the 0 byte is not copied.
 	psource = *matchstart;
 	ptmpsource = tmpsource;
 	while (*ptmpsource != 0 && ptmpsource < &tmpsource[N_TR_SOURCE]
 			&& psource < &source[N_TR_SOURCE]) {
 		*psource = *ptmpsource;
 		ptmpsource++;
 		psource++;
 	}

 	// if requested, print trace
 	if (option_phonemes & espeakPHONEMES_TRACE)
 		fprintf(f_trans, "Replace   %s > %s\n", charfrom, charto);
 }

 // Apply every .replace pair of the translator to the clause text in place.
 //
 //   tr      translator; tr->langopts.replace_chars is the replacement list
 //           (nothing is done when it is NULL)
 //   source  clause buffer that is scanned and rewritten
 //
 // For each 'from'/'to' pair returned by GetNextTranslateChars() the whole
 // buffer is scanned from its start; every complete match of 'from' is handed
 // to ReplaceMatchingChars().
 //
 // NOTE(review): this is a diff view. The four statements inside the loop that
 // use word, next_in, new_c, replace_chars, ix and ignore_next are not declared
 // here and look like removed SubstituteChar() lines -- confirm against the
 // resulting file.
 static void TranslateChars(Translator *tr, char *source)
 {
 	// Replace characters using Translator.replace_chars in passed source buffer
 	char *startmatch;
 	const int *from;
 	const int *to;
 	bool upper_case = false;

 	if ((from = tr->langopts.replace_chars) == NULL)
 		return;

 	char *curchar = source; // pointer to the source clause
 	int curint;    // current character code, filled in by utf8_in()
 	int used;               // bytes used in buffer
 	const int *curfrom;     // pointer to current 'from'
 	bool next_step = false; // signal to jumping to next 'from' part in dictionary
 	while (GetNextTranslateChars(&from, &to, next_step)) {
 		next_step = true;
 		// restart the scan at the beginning of the buffer for this pair
 		curfrom = from;
 		curchar = source;
 		startmatch = 0;
 		curint = 0;
 		do {
 			used = utf8_in(&curint, curchar);
 			// 'continue' jumps to the loop condition, so a 0 character ends the
 			// scan here while other codes below ' ' are stepped over without
 			// resetting a partial match.
 			if (curint < ' ') {        // skip control characters
 				curchar += used;
 				continue;
 			}
 			if ((word >> 16) == (unsigned int)towlower2(next_in, tr)) {
 				new_c = replace_chars[ix+1];
 				ignore_next = true;
 				break;
 			// upper_case is overwritten for every character, so the value passed
 			// to ReplaceMatchingChars() is that of the last character of the match.
 			if ((upper_case = iswupper(curint)) == true) // remember, if uppercase
 				curint = ucd_tolower(curint);

 			if (curint == *curfrom) {  // if matches
 				if (startmatch == 0)
 					startmatch = curchar;
 				curfrom++;             // move check to next character
 				if (*curfrom == ' ') { // if full match, replace
 					// NOTE(review): scanning resumes with curchar += used below, i.e.
 					// at an offset computed before the buffer was rewritten -- confirm
 					// this is correct when 'from' and 'to' differ in byte length.
 					ReplaceMatchingChars(&startmatch, from, to, upper_case);
 					startmatch = 0;    // reset match to start of 'from'
 					curfrom = from;
 				}
 			} else {                   // reset match to start of 'from'
 				// NOTE(review): the character that broke a partial match is not
 				// compared again with the first 'from' character -- confirm that
 				// overlapping starts need not be handled.
 				startmatch = 0;
 				curfrom = from;
 			}
 		}
 			curchar += used;
 		} while (curint > 0);
 	}

 	if (new_c == 0)
 		return c; // no substitution

 	if (new_c & 0xffe00000) {
 		// there is a second character to be inserted
 		// don't convert the case of the second character unless the next letter is also upper case
 		c2 = new_c >> 16;
 		if (upper_case && iswupper(next_in))
 			c2 = ucd_toupper(c2);
 		*insert = c2;
 		new_c &= 0xffff;
 	}

 	if (upper_case)
 		new_c = ucd_toupper(new_c);

 	*wordflags |= FLAG_CHAR_REPLACED;
 	if (option_phonemes & espeakPHONEMES_TRACE) {
 		char msg[21] = {'R','e','p','l','a','c','e',':',' '};
 		char *index = msg;
 		index += 9;
 		index += utf8_out(c, index);
 		*index++ = ' ';
 		*index++ = '>';
 		*index++ = ' ';
 		index += utf8_out(new_c, index);
 		index += utf8_out(c2, index);
 		*index = 0;
 		fprintf(f_trans, "%s\n", msg);
 	}
 	return new_c;
 }

 static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
 static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert)
 {
 	// To allow language specific examination and replacement of characters

@@ -1930,8 +2007,7 @@ static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c,
 		}
 		break;
 	}
 	// handle .replace rule in xx_rules file
 	return SubstituteChar(tr, c, next_in, insert, wordflags);
 	return c;
 }

 static const char *UCase_ga[] = { "bp", "bhf", "dt", "gc", "hA", "mb", "nd", "ng", "ts", "tA", "nA", NULL };
@@ -2104,6 +2180,9 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
 	}
 	words[0].length = k;

 	// handle .replace rule of xx_rules file
 	TranslateChars(tr, source);

 	while (!finished && (ix < (int)sizeof(sbuf) - 1) && (n_ph_list2 < N_PHONEME_LIST-4)) {
 		prev_out2 = prev_out;
 		utf8_in2(&prev_out, &sbuf[ix-1], 1);
@@ -2206,7 +2285,7 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
 				word_flags |= FLAG_COMMA_AFTER;
 			}
 			// language specific character translations
 			c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted, &word_flags);
 			c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted);
 			if (c == 8)
 				continue; // ignore this character

--- a/tests/language-phonemes.test
+++ b/tests/language-phonemes.test
@@ -71,7 +71,7 @@ test_lang fr-CH 4e312a224c8976da90a7faec8edabbc2fe311310 "ma na n^a Na pa ta ka
 test_lang ga 63433f113b6507eb2af79d03e0f28aa08f1d643c "ma m;a mja na n;a nja Na N;a pa p;a t[a t;a ka k;a ca ba b;a bja d[a d;a ga g;a fa f;a sa Sa xa Ca C;a ha h;a wa va v;a vja ra Ra Qa Q\"a Q\"ja ja la l;a lja _:_ m@ m@/ ma ma# mE mE# mI mi mO mO# mU mU# mu1 m0 m0# mo: mA: me: mi: mu: m@U mi@ mu@ maI maI# maU maU# meI"
 test_lang gd 744f0ab9fb2661984f2bf2da9f9f150402f7118a "ma na n[a n^a pa #pa ta #ta t;a #t;a ka k;a #ka #k;a ba b;a #ba da d;a #da #d;a ga g;a #ga #g;a fa fja va vja sa Sa Ca J^a xa Qa ha hja dZa la l^a ja *a *;a Ra _:_ m& mI m@ mi: mi me: me mE: mE ma: ma ma2 mO: mO mo: mo mu: mu mu-: mu- mo-: mo- mi@ mia mu@ maI meI m@I mu-I maU mOU"
 test_lang gn a4e5ea82c978c2f592b9086ff482157f46614ba5 "ma na n^a n.a Na pa Ba ta ka k:a ?a ba da dZa ga fa Ta sa Sa S;a J^a ha la ra R2a wa _:_ ma me mi mo mu my ma~ me~ mi~ mo~ mu~ my~"
 test_lang grc ad3beb0255c55f385f4212111aea6de83dd9cb59 "ma na Na pa ta ka fa ba da ga sa za Ta va xa ha za Ra la ja wa _:_ my me mE mo mO mEU mOI mOU myI my: me: mE: mo: mO: mEU: mOI: mOU: myI:" # f = p_f
 test_lang grc fa595d58e82556b8aca26c38027677bf402f37b7 "ma na Na pa ta ka fa ba da ga sa za Ta va xa ha za Ra la ja wa _:_ my me mE mo mO mEU mOI mOU myI my: me: mE: mo: mO: mEU: mOI: mOU: myI:" # f = p_f
 test_lang gu be5801d73a5692e34ee1c30729b393170cc9414f "ma na n.a Na pa t#a t.a ca tSa ka ba d#a d.a Ja dZa ga p#a t.#a c#a k#a b#a d.#a g#a fa sa Sa za ha v#a la l.a ja *a Ra wa Qa _:_ mi mu mI mU mo m@ mE mV mO mi~ mu~ mI~ mV~ me~ mo~ mi: ma: me: me~: mo: maI maI~ maU maU~"
 test_lang hak a2c353ce28a8399b657e6b603219c914e16e957b "na Na la ja pa pha fa ta tha ka kha tS;a tS;ha S;a s.a tsa tsha ts.a ts.ha N-a _:_ ma mA mai mAu m@ m@r mE mei mi mi[ mi. miA miAu miE mio miou mo mo- mou mong mu muA mua muai mu@ mei muo my myu my& myE my@ myi _:_ ma11 ma21 ma214 ma22 ma33 ma35 ma44 ma51 ma53 ma55"
 test_lang hi ae76141a2b3fb3090795e74f6e4a0a6ebbb510e6 "ma na n.a n^a pa t#a t.a tSa ka qa p#a t.#a c#a k#a ba d#a d.a dZa ga b#a d.#a J#a g#a fa sa Sa xa va za Za Qa Ha ra r.a la ja _:_ mI mU me m@ mo mE mV mO ma mi: mu: me: mo: mE: mO: ma: m&: mI~ mi~ mU~ mu~ mU~ me~ mo~ mE~ mV~ mO~ ma~ mAI maU"
@@ -81,7 +81,7 @@ test_lang hy d3fc6d3778f8669abed91b397f28c0e2ecf3c1cc "ma na Na pa p#a ba ta t#a
 test_lang hyw 72f392c05e74e2fce620b9e1b0dc440481f597e3 "ma na Na pa p#a ba ta t#a da ka k#a ga tsa ts#a dza tSa tS#a dZa fa va sa za Sa Za Xa ha ja la ra r\"a Ra *a _:_ m@ mi my me ma mo mu mW maI meI mi@"
 test_lang ia 1be09d2d915404d00aacb7895edeed5ff947345a "ma na pa ba ta da ka ga tsa tSa dZa fa va sa za Sa ha ja wa la Ra _:_ ma me mo mi mu maI maU meI meU moI"
 test_lang id 75a57a020af2b62e3448792d3f6a945a9b2c6b75 "ma na n^a Na pa ba ta da ka ga ?a tSa dZa fa va Ta Da sa za Sa xa Qa ha ja wa la Ra R2a _:_ ma mE mO m@ me mo mi mu maI meI mOI maU m@U"
 test_lang is b27c88c4b4e47d80a80e690f3dfc2026821f39dd "m#a ma n#a na n^#a n^a N#a Na pa ta ca ka fa va Ta Da sa Ca J^a xa Qa ha l#a la tl#a r#a ra _:_ mi mi: mI mI: mE mE: ma ma: mO mO: mu mu: my my: mW mW: maI maI: meI meI: maU maU: moU moU: mYy mOI myI"
 test_lang is feb18f89c3bb8b7844efab08395fdc4aeec47530 "m#a ma n#a na n^#a n^a N#a Na pa ta ca ka fa va Ta Da sa Ca J^a xa Qa ha l#a la tl#a r#a ra _:_ mi mi: mI mI: mE mE: ma ma: mO mO: mu mu: my my: mW mW: maI maI: meI meI: maU maU: moU moU: mYy mOI myI"
 test_lang it 8780284c48a97327b120f554adb3c2dd5ffbc283 "ma na n^a pa ba ta da ka ga tsa dza tSa dZa fa va sa za Sa ja wa la l^a Ra *a ssa k~a tS~a dZ~a g~a Q~a S~a _:_ ma ma/ me me/ mE mi mi/ mi# mI mo mo/ mO mu mU maU maI moI"
 test_lang ja 602ba8d6db3e45b0966f72d6404eadb0e0837930 "ma na n^a Na N\\a pa ba ta da ka ga t_sa d_za t_s\\a d_z\\a p\\a sa za s\\a z\\a Ca ha r\`a ja wa _:_ mi mu me mo ma mi~ mu~"
 test_lang jbo 77a00420ef9c1b40f87800f5ffee921e68a48a32 "ma na Na pa ba ta da ka ga ?a fa va xa ha tSa dZa sa za Sa Za wa ja la Ra n-a r-a l-a _:_ ma me mi mo mu m@ maI maU meI moI"
@@ -97,5 +97,5 @@ test_lang lfn 044e27a5100528760a185e0773dccaca504b5bd4 "ma na Na pa ba ta da ka
 test_lang lt 615e503b996ea5f7b267ebd77b91e77c5b874e18 "ma m;a na n;a pa p;a ta t;a ka k;a ba b;a da d;a ga g;a tsa ts;a tSa tS;a dza dz;a dZa dZ;a fa f;a sa s;a Sa S;a xa x;a va v;a za z;a Za Z;a la l;a ra r;a ja _:_ m@ ma mA ma: me mE me: mee meA mi mI mi: mo mO mo: mu mU mu: mw mW mai mei mau muo moi mui mie maU meU moU maI meI"
 test_lang lv ebd5378940b27e39ac35cda90e027ee0fa625fde "ma na n^a Na pa ba ta da ca Ja ka ga tsa dza Dz\`a tSa dZa DZ\`a fa va sa za Sa Za xa ha ja la l^a Ra ra _:_ mi my mu mE me mo ma mi: my: mu: mE: me: mo: ma: mai mau mei mie miu mui muo muo\` moi"
 test_lang mi b6e622de46c33181cdfea351b907f932da9a0a1a "ma na Na pa ta ka fa ha ra wa _:_ ma ma: me me: mi mi: mo mo: mu mu:"
 test_lang mk 072d0a74acf54bea528e7dde427eb04808d38364 "ma na n^a Na pa ta xa k^a ka ba da Ja ga tsa tSa tS;a dza dZa dZ;a fa sa Sa xa va za Za l^a la ja Ra @-*a ra _:_ ma me mi mo mu mA mE ma: me: mi: mo: mu: moU"
 test_lang mk b21aa811cf39be7102be35bd635a3eaf10625d70 "ma na n^a Na pa ta xa k^a ka ba da Ja ga tsa tSa tS;a dza dZa dZ;a fa sa Sa xa va za Za l^a la ja Ra @-*a ra _:_ ma me mi mo mu mA mE ma: me: mi: mo: mu: moU"
 test_lang shn e568aca66c2f58fdaf5dda8a67f4d21f05710234 "ma na Ja Na pa p_ha ba ta t_ha da ka k_ha ga ?a fa sa za Ta ha tS;a Ra ja wa la _:_ mi mI mW mu me m@ mo mE ma ma: mO miu meu mEu mau ma:u mWi mui m@i moi mai ma:i mOi maW _:_ ma1 ma2 ma3 ma4 ma5 ma6"
--- a/tests/translate.test
+++ b/tests/translate.test
@@ -47,7 +47,8 @@ test_phonemes en " 'e@ri:z r'eInboU" "♈ 🌈"
 test_phonemes en " Ekskla#m'eIS@n kw'EstS@n m'A@k" "⁉"
 test_phonemes en " Ekskla#m'eIS@n kw'EstS@n m'A@k r'eInboU" "⁉ 🌈"
 test_phonemes en " r'oUlIN 0nD@ fl'o@ l'aafIN" "🤣" # skip words

 # Test replacement rules
 test_phonemes en " 'i:T@ d'i:m@n Ens,aIkl@p'oUdi@r- ,0n@m,at@p'oUIk a#m'i:b@ f'i:t@s l'0ft h'Ist@ri" "Æther dæmon encyclopœdia onomatopœic amœba fœtus loﬅ hiﬆory"
 # bug: https://github.com/espeak-ng/espeak-ng/issues/471
 test_phonemes sk " sm'eju:tsa s'a tv'a:R" "☺"
 test_phonemes sk " bl'ax sm'eju:tsa s'a tv'a:R" "blah ☺"