@@ -50,6 +50,8 @@ | |||
ff ff // ligatures | |||
fi fi | |||
fl fl | |||
ſt ft | |||
st st | |||
ά α // some polytonic Greek letters mapped to plain Greek letters (the list in el_rules is much longer) | |||
έ ε |
@@ -509,9 +509,9 @@ each language. The number fragments are given in the `*_list` file. | |||
## Character Substitution | |||
Character substitutions can be specified by using a `.replace` section | |||
at the start of the `*_rules` file. In each line one character can be | |||
replaced by one or two characters. (Source and target of replacement can consume | |||
up to four bytes.) This substitution is done to a word _before_ word is searched | |||
at the start of the `*_rules` file. Each line replaces a sequence of one or more | |||
characters with another sequence of characters. (The source and the target of a replacement can consume | |||
up to 40 bytes.) The substitution is applied to a word _before_ the word is looked up | |||
in the `*_list` or `*_listx` file and translated using the spelling-to-phoneme rules. | |||
Only the lower-case version of the characters needs to be specified. e.g.: | |||
@@ -1448,30 +1448,53 @@ static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_t | |||
break; | |||
case 2: // .replace | |||
{ | |||
int replace1; | |||
int replace2; | |||
int from[N_WORD_BYTES + 1] = { 0 }; // Size of N_WORD_BYTES + null byte | |||
int to[N_WORD_BYTES + 1] = { 0 }; | |||
char *p; | |||
p = buf; | |||
replace1 = 0; | |||
replace2 = 0; | |||
while (isspace2(*p)) p++; | |||
while (isspace2(*p)) // skip spaces in xx_rules file | |||
p++; | |||
ix = 0; | |||
while ((unsigned char)(*p) > 0x20) { // not space or zero-byte | |||
p += utf8_in(&c, p); | |||
replace1 += (c << ix); | |||
ix += 16; | |||
while ((unsigned char) (*p) > ' ') { // prepare 'from' characters | |||
p += utf8_in(&from[ix], p); | |||
ix++; | |||
} | |||
while (isspace2(*p)) p++; | |||
while (isspace2(*p)) // skip spaces again | |||
p++; | |||
ix = 0; | |||
while ((unsigned char)(*p) > 0x20) { | |||
p += utf8_in(&c, p); | |||
replace2 += (c << ix); | |||
ix += 16; | |||
while ((unsigned char) (*p) > ' ') { // prepare 'to' characters | |||
p += utf8_in(&to[ix], p); | |||
ix++; | |||
} | |||
if (replace1 != 0) { | |||
Write4Bytes(f_out, replace1); // write as little-endian | |||
Write4Bytes(f_out, replace2); // if big-endian, reverse the bytes in LoadDictionary() | |||
// save into file | |||
if (from[0] != 0) { | |||
ix = 0; | |||
while (from[ix] != 0) { | |||
Write4Bytes(f_out, from[ix]); | |||
ix++; | |||
if (ix > N_WORD_BYTES / 4) { | |||
fprintf(stderr, | |||
"Too long .replace 'from' part (limit is: %d characters)\n", | |||
N_WORD_BYTES / 4); | |||
break; | |||
} | |||
} | |||
Write4Bytes(f_out, ' '); // end of 'from' | |||
ix = 0; | |||
while (to[ix] != 0) { | |||
Write4Bytes(f_out, to[ix]); | |||
ix++; | |||
if (ix > N_WORD_BYTES / 4) { | |||
fprintf(stderr, | |||
"Too long .replace 'to' part (limit is: %d characters)\n", | |||
N_WORD_BYTES / 4); | |||
break; | |||
} | |||
} | |||
Write4Bytes(f_out, ' '); // end of 'to' | |||
} | |||
} | |||
break; |
@@ -152,7 +152,7 @@ static void InitGroups(Translator *tr) | |||
pw = (unsigned int *)(((intptr_t)p+4) & ~3); // advance to next word boundary | |||
tr->langopts.replace_chars = pw; | |||
while (pw[0] != 0) | |||
pw += 2; // find the end of the replacement list, each entry is 2 words. | |||
pw ++; // find the end of the replacement list | |||
p = (char *)(pw+1); | |||
#ifdef ARCH_BIG |
@@ -1790,81 +1790,158 @@ static int EmbeddedCommand(unsigned int *source_index_out) | |||
return 1; | |||
} | |||
// handle .replace rule in xx_rules file | |||
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags) | |||
static bool GetNextTranslateChars(const int **from, const int **to, bool next_step) | |||
{ | |||
int ix; | |||
unsigned int word; | |||
unsigned int new_c, c2 = ' ', c_lower; | |||
int upper_case = 0; | |||
static bool ignore_next = false; | |||
const unsigned int *replace_chars; | |||
if (ignore_next) { | |||
ignore_next = false; | |||
return 8; | |||
// Set pointers to next 'from' and 'to' part for replacement | |||
// next_step indicates second and succeeding replacement group | |||
// Return true if got data, false, if reached end | |||
if (next_step) { // first 'from' is pointed already | |||
int spaces = 0; | |||
while (**from != 0 && spaces < 2) { // Get next 'from' position skipping 'to' part | |||
(*from)++; | |||
if (**from == ' ') | |||
spaces++; | |||
} | |||
while (**from == ' ') // skip remaining spaces, if exists | |||
(*from)++; | |||
} | |||
if (c == 0) return 0; | |||
if (**from == 0) | |||
return false; | |||
if ((replace_chars = tr->langopts.replace_chars) == NULL) | |||
return c; | |||
*to = *from; // start seeking 'to' from 'from' position | |||
while (**to != 0 && **to != ' ') // get next 'to' position | |||
(*to)++; | |||
// there is a list of character codes to be substituted with alternative codes | |||
while (**to == ' ') // skip remaining spaces, if exists | |||
(*to)++; | |||
if (iswupper(c_lower = c)) { | |||
c_lower = towlower2(c, tr); | |||
upper_case = 1; | |||
return **to != 0; | |||
} | |||
static void ReplaceMatchingChars(char **matchstart, const int *from, const int *to, int upper_case) | |||
{ | |||
// Replace characters from 'from' to 'to' group in source clause, starting | |||
// with place indicated in matchstart. | |||
// Rewrites remaining part of source, if length of 'from' and 'to' groups differs. | |||
char tmpsource[N_TR_SOURCE + 40]; // Prepare working copy of source | |||
memcpy(&tmpsource, *matchstart, N_TR_SOURCE + 40); | |||
// prepare from part | |||
char charfrom[N_WORD_BYTES + 1] = { 0 }; | |||
int usedfrom = 0; | |||
int const *pintfrom = from; | |||
char *pcharfrom = charfrom; | |||
while (*pintfrom != ' ') { | |||
usedfrom += utf8_out(*pintfrom, pcharfrom + usedfrom); | |||
pintfrom++; | |||
} | |||
// prepare to part | |||
char charto[N_WORD_BYTES + 1] = { 0 }; | |||
int usedto = 0; | |||
int const *pintto = to; | |||
char *pcharto = charto; | |||
while (*pintto != 0 && *pintto != ' ') { | |||
usedto += utf8_out(*pintto, pcharto + usedto); | |||
pintto++; | |||
} | |||
// Set up replacement in temporary buffer | |||
char *psource = *matchstart; | |||
char *ptmpsource = tmpsource; | |||
pcharto = charto; | |||
// Do replacement writing 'to' part in buffer | |||
while (*pcharto != 0 ) { | |||
if(upper_case) { // TODO: should check for capital letter from source (maybe using word_flags) | |||
*ptmpsource = toupper(*pcharto); | |||
upper_case = 0; // capitalize only first character if necessary (will work in most cases) | |||
} | |||
else | |||
*ptmpsource = *pcharto; | |||
pcharto++; | |||
ptmpsource++; | |||
} | |||
// If 'to' part is different than 'from' part, fix remaining part of the buffer. | |||
// (Read after 'from' in source, write after 'to' in buffer. | |||
// Don't touch last 40 bytes of source, which could contain control data. | |||
if (usedfrom != usedto) { | |||
psource = *matchstart + usedfrom; | |||
while (*psource != 0 && psource < &source[N_TR_SOURCE] | |||
&& ptmpsource < &tmpsource[N_TR_SOURCE]) { | |||
*ptmpsource = *psource; | |||
ptmpsource++; | |||
psource++; | |||
} | |||
} | |||
new_c = 0; | |||
for (ix = 0; (word = replace_chars[ix]) != 0; ix += 2) { | |||
if (c_lower == (word & 0xffff)) { | |||
if ((word >> 16) == 0) { | |||
new_c = replace_chars[ix+1]; | |||
break; | |||
// Write temporary buffer back to source | |||
psource = *matchstart; | |||
ptmpsource = tmpsource; | |||
while (*ptmpsource != 0 && ptmpsource < &tmpsource[N_TR_SOURCE] | |||
&& psource < &source[N_TR_SOURCE]) { | |||
*psource = *ptmpsource; | |||
ptmpsource++; | |||
psource++; | |||
} | |||
// if requested, print trace | |||
if (option_phonemes & espeakPHONEMES_TRACE) | |||
fprintf(f_trans, "Replace %s > %s\n", charfrom, charto); | |||
} | |||
static void TranslateChars(Translator *tr, char *source) | |||
{ | |||
// Replace characters using Translator.replace_chars in passed source buffer | |||
char *startmatch; | |||
const int *from; | |||
const int *to; | |||
bool upper_case = false; | |||
if ((from = tr->langopts.replace_chars) == NULL) | |||
return; | |||
char *curchar = source; // pointer to the source clause | |||
int curint; // current UTF-16 character | |||
int used; // bytes used in buffer | |||
const int *curfrom; // pointer to current 'from' | |||
bool next_step = false; // signal to jumping to next 'from' part in dictionary | |||
while (GetNextTranslateChars(&from, &to, next_step)) { | |||
next_step = true; | |||
curfrom = from; | |||
curchar = source; | |||
startmatch = 0; | |||
curint = 0; | |||
do { | |||
used = utf8_in(&curint, curchar); | |||
if (curint < ' ') { // skip control characters | |||
curchar += used; | |||
continue; | |||
} | |||
if ((word >> 16) == (unsigned int)towlower2(next_in, tr)) { | |||
new_c = replace_chars[ix+1]; | |||
ignore_next = true; | |||
break; | |||
if ((upper_case = iswupper(curint)) == true) // remember, if uppercase | |||
curint = ucd_tolower(curint); | |||
if (curint == *curfrom) { // if matches | |||
if (startmatch == 0) | |||
startmatch = curchar; | |||
curfrom++; // move check to next character | |||
if (*curfrom == ' ') { // if full match, replace | |||
ReplaceMatchingChars(&startmatch, from, to, upper_case); | |||
startmatch = 0; // reset match to start of 'from' | |||
curfrom = from; | |||
} | |||
} else { // reset match to start of 'from' | |||
startmatch = 0; | |||
curfrom = from; | |||
} | |||
} | |||
curchar += used; | |||
} while (curint > 0); | |||
} | |||
if (new_c == 0) | |||
return c; // no substitution | |||
if (new_c & 0xffe00000) { | |||
// there is a second character to be inserted | |||
// don't convert the case of the second character unless the next letter is also upper case | |||
c2 = new_c >> 16; | |||
if (upper_case && iswupper(next_in)) | |||
c2 = ucd_toupper(c2); | |||
*insert = c2; | |||
new_c &= 0xffff; | |||
} | |||
if (upper_case) | |||
new_c = ucd_toupper(new_c); | |||
*wordflags |= FLAG_CHAR_REPLACED; | |||
if (option_phonemes & espeakPHONEMES_TRACE) { | |||
char msg[21] = {'R','e','p','l','a','c','e',':',' '}; | |||
char *index = msg; | |||
index += 9; | |||
index += utf8_out(c, index); | |||
*index++ = ' '; | |||
*index++ = '>'; | |||
*index++ = ' '; | |||
index += utf8_out(new_c, index); | |||
index += utf8_out(c2, index); | |||
*index = 0; | |||
fprintf(f_trans, "%s\n", msg); | |||
} | |||
return new_c; | |||
} | |||
static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags) | |||
static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert) | |||
{ | |||
// To allow language specific examination and replacement of characters | |||
@@ -1930,8 +2007,7 @@ static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, | |||
} | |||
break; | |||
} | |||
// handle .replace rule in xx_rules file | |||
return SubstituteChar(tr, c, next_in, insert, wordflags); | |||
return c; | |||
} | |||
static const char *UCase_ga[] = { "bp", "bhf", "dt", "gc", "hA", "mb", "nd", "ng", "ts", "tA", "nA", NULL }; | |||
@@ -2104,6 +2180,9 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change) | |||
} | |||
words[0].length = k; | |||
// handle .replace rule of xx_rules file | |||
TranslateChars(tr, source); | |||
while (!finished && (ix < (int)sizeof(sbuf) - 1) && (n_ph_list2 < N_PHONEME_LIST-4)) { | |||
prev_out2 = prev_out; | |||
utf8_in2(&prev_out, &sbuf[ix-1], 1); | |||
@@ -2206,7 +2285,7 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change) | |||
word_flags |= FLAG_COMMA_AFTER; | |||
} | |||
// language specific character translations | |||
c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted, &word_flags); | |||
c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted); | |||
if (c == 8) | |||
continue; // ignore this character | |||
@@ -71,7 +71,7 @@ test_lang fr-CH 4e312a224c8976da90a7faec8edabbc2fe311310 "ma na n^a Na pa ta ka | |||
test_lang ga 63433f113b6507eb2af79d03e0f28aa08f1d643c "ma m;a mja na n;a nja Na N;a pa p;a t[a t;a ka k;a ca ba b;a bja d[a d;a ga g;a fa f;a sa Sa xa Ca C;a ha h;a wa va v;a vja ra Ra Qa Q\"a Q\"ja ja la l;a lja _:_ m@ m@/ ma ma# mE mE# mI mi mO mO# mU mU# mu1 m0 m0# mo: mA: me: mi: mu: m@U mi@ mu@ maI maI# maU maU# meI" | |||
test_lang gd 744f0ab9fb2661984f2bf2da9f9f150402f7118a "ma na n[a n^a pa #pa ta #ta t;a #t;a ka k;a #ka #k;a ba b;a #ba da d;a #da #d;a ga g;a #ga #g;a fa fja va vja sa Sa Ca J^a xa Qa ha hja dZa la l^a ja *a *;a Ra _:_ m& mI m@ mi: mi me: me mE: mE ma: ma ma2 mO: mO mo: mo mu: mu mu-: mu- mo-: mo- mi@ mia mu@ maI meI m@I mu-I maU mOU" | |||
test_lang gn a4e5ea82c978c2f592b9086ff482157f46614ba5 "ma na n^a n.a Na pa Ba ta ka k:a ?a ba da dZa ga fa Ta sa Sa S;a J^a ha la ra R2a wa _:_ ma me mi mo mu my ma~ me~ mi~ mo~ mu~ my~" | |||
test_lang grc ad3beb0255c55f385f4212111aea6de83dd9cb59 "ma na Na pa ta ka fa ba da ga sa za Ta va xa ha za Ra la ja wa _:_ my me mE mo mO mEU mOI mOU myI my: me: mE: mo: mO: mEU: mOI: mOU: myI:" # f = p_f | |||
test_lang grc fa595d58e82556b8aca26c38027677bf402f37b7 "ma na Na pa ta ka fa ba da ga sa za Ta va xa ha za Ra la ja wa _:_ my me mE mo mO mEU mOI mOU myI my: me: mE: mo: mO: mEU: mOI: mOU: myI:" # f = p_f | |||
test_lang gu be5801d73a5692e34ee1c30729b393170cc9414f "ma na n.a Na pa t#a t.a ca tSa ka ba d#a d.a Ja dZa ga p#a t.#a c#a k#a b#a d.#a g#a fa sa Sa za ha v#a la l.a ja *a Ra wa Qa _:_ mi mu mI mU mo m@ mE mV mO mi~ mu~ mI~ mV~ me~ mo~ mi: ma: me: me~: mo: maI maI~ maU maU~" | |||
test_lang hak a2c353ce28a8399b657e6b603219c914e16e957b "na Na la ja pa pha fa ta tha ka kha tS;a tS;ha S;a s.a tsa tsha ts.a ts.ha N-a _:_ ma mA mai mAu m@ m@r mE mei mi mi[ mi. miA miAu miE mio miou mo mo- mou mong mu muA mua muai mu@ mei muo my myu my& myE my@ myi _:_ ma11 ma21 ma214 ma22 ma33 ma35 ma44 ma51 ma53 ma55" | |||
test_lang hi ae76141a2b3fb3090795e74f6e4a0a6ebbb510e6 "ma na n.a n^a pa t#a t.a tSa ka qa p#a t.#a c#a k#a ba d#a d.a dZa ga b#a d.#a J#a g#a fa sa Sa xa va za Za Qa Ha ra r.a la ja _:_ mI mU me m@ mo mE mV mO ma mi: mu: me: mo: mE: mO: ma: m&: mI~ mi~ mU~ mu~ mU~ me~ mo~ mE~ mV~ mO~ ma~ mAI maU" | |||
@@ -81,7 +81,7 @@ test_lang hy d3fc6d3778f8669abed91b397f28c0e2ecf3c1cc "ma na Na pa p#a ba ta t#a | |||
test_lang hyw 72f392c05e74e2fce620b9e1b0dc440481f597e3 "ma na Na pa p#a ba ta t#a da ka k#a ga tsa ts#a dza tSa tS#a dZa fa va sa za Sa Za Xa ha ja la ra r\"a Ra *a _:_ m@ mi my me ma mo mu mW maI meI mi@" | |||
test_lang ia 1be09d2d915404d00aacb7895edeed5ff947345a "ma na pa ba ta da ka ga tsa tSa dZa fa va sa za Sa ha ja wa la Ra _:_ ma me mo mi mu maI maU meI meU moI" | |||
test_lang id 75a57a020af2b62e3448792d3f6a945a9b2c6b75 "ma na n^a Na pa ba ta da ka ga ?a tSa dZa fa va Ta Da sa za Sa xa Qa ha ja wa la Ra R2a _:_ ma mE mO m@ me mo mi mu maI meI mOI maU m@U" | |||
test_lang is b27c88c4b4e47d80a80e690f3dfc2026821f39dd "m#a ma n#a na n^#a n^a N#a Na pa ta ca ka fa va Ta Da sa Ca J^a xa Qa ha l#a la tl#a r#a ra _:_ mi mi: mI mI: mE mE: ma ma: mO mO: mu mu: my my: mW mW: maI maI: meI meI: maU maU: moU moU: mYy mOI myI" | |||
test_lang is feb18f89c3bb8b7844efab08395fdc4aeec47530 "m#a ma n#a na n^#a n^a N#a Na pa ta ca ka fa va Ta Da sa Ca J^a xa Qa ha l#a la tl#a r#a ra _:_ mi mi: mI mI: mE mE: ma ma: mO mO: mu mu: my my: mW mW: maI maI: meI meI: maU maU: moU moU: mYy mOI myI" | |||
test_lang it 8780284c48a97327b120f554adb3c2dd5ffbc283 "ma na n^a pa ba ta da ka ga tsa dza tSa dZa fa va sa za Sa ja wa la l^a Ra *a ssa k~a tS~a dZ~a g~a Q~a S~a _:_ ma ma/ me me/ mE mi mi/ mi# mI mo mo/ mO mu mU maU maI moI" | |||
test_lang ja 602ba8d6db3e45b0966f72d6404eadb0e0837930 "ma na n^a Na N\\a pa ba ta da ka ga t_sa d_za t_s\\a d_z\\a p\\a sa za s\\a z\\a Ca ha r\`a ja wa _:_ mi mu me mo ma mi~ mu~" | |||
test_lang jbo 77a00420ef9c1b40f87800f5ffee921e68a48a32 "ma na Na pa ba ta da ka ga ?a fa va xa ha tSa dZa sa za Sa Za wa ja la Ra n-a r-a l-a _:_ ma me mi mo mu m@ maI maU meI moI" | |||
@@ -97,5 +97,5 @@ test_lang lfn 044e27a5100528760a185e0773dccaca504b5bd4 "ma na Na pa ba ta da ka | |||
test_lang lt 615e503b996ea5f7b267ebd77b91e77c5b874e18 "ma m;a na n;a pa p;a ta t;a ka k;a ba b;a da d;a ga g;a tsa ts;a tSa tS;a dza dz;a dZa dZ;a fa f;a sa s;a Sa S;a xa x;a va v;a za z;a Za Z;a la l;a ra r;a ja _:_ m@ ma mA ma: me mE me: mee meA mi mI mi: mo mO mo: mu mU mu: mw mW mai mei mau muo moi mui mie maU meU moU maI meI" | |||
test_lang lv ebd5378940b27e39ac35cda90e027ee0fa625fde "ma na n^a Na pa ba ta da ca Ja ka ga tsa dza Dz\`a tSa dZa DZ\`a fa va sa za Sa Za xa ha ja la l^a Ra ra _:_ mi my mu mE me mo ma mi: my: mu: mE: me: mo: ma: mai mau mei mie miu mui muo muo\` moi" | |||
test_lang mi b6e622de46c33181cdfea351b907f932da9a0a1a "ma na Na pa ta ka fa ha ra wa _:_ ma ma: me me: mi mi: mo mo: mu mu:" | |||
test_lang mk 072d0a74acf54bea528e7dde427eb04808d38364 "ma na n^a Na pa ta xa k^a ka ba da Ja ga tsa tSa tS;a dza dZa dZ;a fa sa Sa xa va za Za l^a la ja Ra @-*a ra _:_ ma me mi mo mu mA mE ma: me: mi: mo: mu: moU" | |||
test_lang mk b21aa811cf39be7102be35bd635a3eaf10625d70 "ma na n^a Na pa ta xa k^a ka ba da Ja ga tsa tSa tS;a dza dZa dZ;a fa sa Sa xa va za Za l^a la ja Ra @-*a ra _:_ ma me mi mo mu mA mE ma: me: mi: mo: mu: moU" | |||
test_lang shn e568aca66c2f58fdaf5dda8a67f4d21f05710234 "ma na Ja Na pa p_ha ba ta t_ha da ka k_ha ga ?a fa sa za Ta ha tS;a Ra ja wa la _:_ mi mI mW mu me m@ mo mE ma ma: mO miu meu mEu mau ma:u mWi mui m@i moi mai ma:i mOi maW _:_ ma1 ma2 ma3 ma4 ma5 ma6" |
@@ -47,7 +47,8 @@ test_phonemes en " 'e@ri:z r'eInboU" "♈ 🌈" | |||
test_phonemes en " Ekskla#m'eIS@n kw'EstS@n m'A@k" "⁉" | |||
test_phonemes en " Ekskla#m'eIS@n kw'EstS@n m'A@k r'eInboU" "⁉ 🌈" | |||
test_phonemes en " r'oUlIN 0nD@ fl'o@ l'aafIN" "🤣" # skip words | |||
# Test replacement rules | |||
test_phonemes en " 'i:T@ d'i:m@n Ens,aIkl@p'oUdi@r- ,0n@m,at@p'oUIk a#m'i:b@ f'i:t@s l'0ft h'Ist@ri" "Æther dæmon encyclopœdia onomatopœic amœba fœtus loſt history" | |||
# bug: https://github.com/espeak-ng/espeak-ng/issues/471 | |||
test_phonemes sk " sm'eju:tsa s'a tv'a:R" "☺" | |||
test_phonemes sk " bl'ax sm'eju:tsa s'a tv'a:R" "blah ☺" |