7 years ago · 9f0667de86
--- a/docs/dictionary.md
+++ b/docs/dictionary.md
@@ -509,9 +509,9 @@ each language. The number fragments are given in the `*_list` file.
 ## Character Substitution

 Character substitutions can be specified by using a `.replace` section
 at the start of the `*_rules` file. In each line one character can be
 replaced by one or two characters. (Source and target of replacement can consume
 up to four bytes.) This substitution is done to a word _before_ word is searched
 at the start of the `*_rules` file. In each line several characters can be
 replaced by several other characters. (Source and target of replacement can consume
 up to 40 bytes.) This substitution is done to a word _before_ the word is searched
 in the `*_list` or `*_listx` file and translated using the spelling-to-phoneme rules.
 Only the lower-case version of the characters needs to be specified. e.g.:

--- a/src/libespeak-ng/compiledict.c
+++ b/src/libespeak-ng/compiledict.c
@@ -1448,30 +1448,53 @@ static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_t
 			break;
 		case 2: //  .replace
 		{
 			int replace1;
 			int replace2;
 			int from[N_WORD_BYTES + 1] = { 0 }; // Size of N_WORD_BYTES + null byte
 			int to[N_WORD_BYTES + 1] = { 0 };
 			char *p;

 			p = buf;
 			replace1 = 0;
 			replace2 = 0;
 			while (isspace2(*p)) p++;

 			while (isspace2(*p)) // skip spaces in xx_rules file
 				p++;

 			ix = 0;
 			while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
 				p += utf8_in(&c, p);
 				replace1 += (c << ix);
 				ix += 16;
 			while ((unsigned char) (*p) > ' ') { // prepare 'from' characters
 				p += utf8_in(&from[ix], p);
 				ix++;
 			}
 			while (isspace2(*p)) p++;
 			while (isspace2(*p)) // skip spaces again
 				p++;
 			ix = 0;
 			while ((unsigned char)(*p) > 0x20) {
 				p += utf8_in(&c, p);
 				replace2 += (c << ix);
 				ix += 16;
 			while ((unsigned char) (*p) > ' ') {  // prepare 'to' characters
 				p += utf8_in(&to[ix], p);
 				ix++;
 			}
 			if (replace1 != 0) {
 				Write4Bytes(f_out, replace1); // write as little-endian
 				Write4Bytes(f_out, replace2); // if big-endian, reverse the bytes in LoadDictionary()
 			// save into file
 			if (from[0] != 0) {
 				ix = 0;
 				while (from[ix] != 0) {
 					Write4Bytes(f_out, from[ix]);
 					ix++;
 					if (ix > N_WORD_BYTES / 4) {
 						fprintf(stderr,
 								"Too long .replace 'from' part (limit is: %d characters)\n",
 								N_WORD_BYTES / 4);
 						break;
 					}
 				}
 				Write4Bytes(f_out, ' '); // end of 'from'
 				ix = 0;
 				while (to[ix] != 0) {
 					Write4Bytes(f_out, to[ix]);
 					ix++;
 					if (ix > N_WORD_BYTES / 4) {
 						fprintf(stderr,
 								"Too long .replace 'to' part (limit is: %d characters)\n",
 								N_WORD_BYTES / 4);
 						break;
 					}
 				}
 				Write4Bytes(f_out, ' '); // end of 'to'
 			}
 		}
 			break;
--- a/src/libespeak-ng/dictionary.c
+++ b/src/libespeak-ng/dictionary.c
@@ -152,7 +152,7 @@ static void InitGroups(Translator *tr)
 			pw = (unsigned int *)(((intptr_t)p+4) & ~3); // advance to next word boundary
 			tr->langopts.replace_chars = pw;
 			while (pw[0] != 0)
 				pw += 2; // find the end of the replacement list, each entry is 2 words.
 				pw ++; // find the end of the replacement list
 			p = (char *)(pw+1);

 #ifdef ARCH_BIG
--- a/src/libespeak-ng/translate.c
+++ b/src/libespeak-ng/translate.c
@@ -1864,6 +1864,157 @@ static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in,
 	return new_c;
 }

 static bool GetNextTranslateChars(const int **from, const int **to, bool next_step)
 {
	// Position *from and *to on a 'from'/'to' pair inside a replacement list.
	// The list is a zero-terminated sequence of int values in which ' ' (space)
	// separates the groups.
	//
	// from       in/out: on entry points at the current 'from' group; when
	//            next_step is true it is advanced past that group and the
	//            'to' group that follows it.
	// to         out: set to the first value after the space(s) that follow
	//            the 'from' group. Left untouched when *from is at the
	//            terminating zero.
	// next_step  false for the first call (*from is already in place),
	//            true for every following call.
	//
	// Returns true when *to ends up on a non-zero value, false otherwise.
	const int *cursor = *from;

	if (next_step) {
		// Walk forward until two separators have been seen. The value is
		// inspected after the increment, so the walk stops ON the second
		// separator.
		for (int separators = 0; *cursor != 0 && separators < 2;) {
			cursor++;
			if (*cursor == ' ')
				separators++;
		}
		// Step over that separator and any further ones.
		for (; *cursor == ' '; cursor++)
			;
		*from = cursor;
	}

	if (*cursor == 0)
		return false;

	// The 'to' group begins after the separator(s) that end the 'from' group.
	for (; *cursor != 0 && *cursor != ' '; cursor++)
		;
	for (; *cursor == ' '; cursor++)
		;
	*to = cursor;

	return *cursor != 0;
 }

 static void ReplaceMatchingChars(char **matchstart, const int *from, const int *to, int upper_case)
 {
 	// Replace characters from 'from' to 'to' group in source clause, starting
 	// with place indicated in matchstart.
 	// Rewrites remaining part of source, if length of 'from' and 'to' groups differs.

 	char tmpsource[N_TR_SOURCE + 40]; // Prepare working copy of source
 	memcpy(&tmpsource, *matchstart, N_TR_SOURCE + 40);

 	// prepare from part
 	char charfrom[N_WORD_BYTES + 1] = { 0 };
 	int usedfrom = 0;
 	int const *pintfrom = from;
 	char *pcharfrom = charfrom;
 	while (*pintfrom != ' ') {
 		usedfrom += utf8_out(*pintfrom, pcharfrom + usedfrom);
 		pintfrom++;
 	}

 	// prepare to part
 	char charto[N_WORD_BYTES + 1] = { 0 };
 	int usedto = 0;
 	int const *pintto = to;
 	char *pcharto = charto;
 	while (*pintto != 0 && *pintto != ' ') {
 		usedto += utf8_out(*pintto, pcharto + usedto);
 		pintto++;
 	}

 	// Set up replacement in temporary buffer
 	char *psource = *matchstart;
 	char *ptmpsource = tmpsource;
 	pcharto = charto;

 	// Do replacement writing 'to' part in buffer
 	while (*pcharto != 0 ) {
 		if(upper_case) {        // TODO: should check for capital letter from source
 			*ptmpsource = toupper(*pcharto);
 			upper_case = 0;     // capitalize only first character if necessary (will work in most cases)
 		}
 		else
 			*ptmpsource = *pcharto;
 		pcharto++;
 		ptmpsource++;
 	}

 	// If 'to' part is different than 'from' part, fix remaining part of the buffer.
 	// (Read after 'from' in source, write after 'to' in buffer.
 	// Don't touch last 40 bytes of source, which could contain control data.
 	if (usedfrom != usedto) {
 		psource = *matchstart + usedfrom;
 		while (*psource != 0 && psource < &source[N_TR_SOURCE]
 				&& ptmpsource < &tmpsource[N_TR_SOURCE]) {
 			*ptmpsource = *psource;
 			ptmpsource++;
 			psource++;
 		}
 	}

 	// Write temporary buffer back to source
 	psource = *matchstart;
 	ptmpsource = tmpsource;
 	while (*ptmpsource != 0 && ptmpsource < &tmpsource[N_TR_SOURCE]
 			&& psource < &source[N_TR_SOURCE]) {
 		*psource = *ptmpsource;
 		ptmpsource++;
 		psource++;
 	}

 	// if requested, print trace
 	if (option_phonemes & espeakPHONEMES_TRACE)
 		fprintf(f_trans, "Replace   %s > %s\n", charfrom, charto);
 }

 static void TranslateChars(Translator *tr, char *source)
 {
	// Apply every 'from' -> 'to' pair of tr->langopts.replace_chars to the
	// zero-terminated UTF-8 text in source. Does nothing when the list is NULL.
	//
	// Characters below ' ' in source are skipped and do not interrupt a match.
	// Upper-case source characters are compared through ucd_tolower().
	const int *from;
	const int *to;

	if ((from = tr->langopts.replace_chars) == NULL)
		return;

	bool next_step = false; // false only for the first pair, which 'from' already points at
	while (GetNextTranslateChars(&from, &to, next_step)) {
		next_step = true;

		const int *curfrom = from; // next value of 'from' to be matched
		char *curchar = source;    // current position in the source clause
		char *startmatch = NULL;   // where the match in progress began, if any
		bool upper_case = false;   // case of the character which began the match
		int curint = 0;            // current character of source

		do {
			int used = utf8_in(&curint, curchar); // bytes used by current character
			if (curint < ' ') {        // skip control characters (ends the loop at 0)
				curchar += used;
				continue;
			}
			bool is_upper = iswupper(curint) != 0;
			if (is_upper)
				curint = ucd_tolower(curint);

			if (curint == *curfrom) {  // if matches
				if (startmatch == NULL) {
					startmatch = curchar;
					// The replacement capitalizes its first character, so it is
					// the case of the first matched character that counts.
					upper_case = is_upper;
				}
				curfrom++;             // move check to next character
				curchar += used;
				if (*curfrom == ' ') { // if full match, replace
					ReplaceMatchingChars(&startmatch, from, to, upper_case);
					// Continue behind the inserted text (one character per
					// 'to' value) so that a replacement is never matched again
					// by the rule which produced it.
					curchar = startmatch;
					for (const int *t = to; *t != 0 && *t != ' ' && *curchar != 0; t++) {
						int skipped;
						curchar += utf8_in(&skipped, curchar);
					}
					startmatch = NULL; // reset match to start of 'from'
					curfrom = from;
				}
			} else if (startmatch != NULL) {
				// A partial match failed. A match could still begin inside the
				// part already consumed, so retry from the character after the
				// one which began the failed attempt.
				int first;
				curchar = startmatch + utf8_in(&first, startmatch);
				startmatch = NULL;
				curfrom = from;
			} else {
				curchar += used;
			}
		} while (curint > 0);
	}
 }

 static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
 {
 	// To allow language specific examination and replacement of characters
@@ -1930,8 +2081,7 @@ static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c,
 		}
 		break;
 	}
 	// handle .replace rule in xx_rules file
 	return SubstituteChar(tr, c, next_in, insert, wordflags);
 	return c;
 }

 static const char *UCase_ga[] = { "bp", "bhf", "dt", "gc", "hA", "mb", "nd", "ng", "ts", "tA", "nA", NULL };
@@ -2104,6 +2254,9 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
 	}
 	words[0].length = k;

 	// handle .replace rule of xx_rules file
 	TranslateChars(tr, source);

 	while (!finished && (ix < (int)sizeof(sbuf) - 1) && (n_ph_list2 < N_PHONEME_LIST-4)) {
 		prev_out2 = prev_out;
 		utf8_in2(&prev_out, &sbuf[ix-1], 1);