Browse Source

Merge remote-tracking branch 'valdisvi/master'

master
Reece H. Dunn 7 years ago
parent
commit
c8f84f1f46

+ 2
- 0
dictsource/en_rules View File

@@ -50,6 +50,8 @@
ff ff // ligatures
fi fi
fl fl
ſt ft
st st

ά α // some of polytonic greek to plain greek letters (list in el_rules is much longer)
έ ε

+ 3
- 3
docs/dictionary.md View File

@@ -509,9 +509,9 @@ each language. The number fragments are given in the `*_list` file.
## Character Substitution

Character substitutions can be specified by using a `.replace` section
at the start of the `*_rules` file. In each line one character can be
replaced by one or two characters. (Source and target of replacement can consume
up to four bytes.) This substitution is done to a word _before_ word is searched
at the start of the `*_rules` file. In each line, several characters can be
replaced by several other characters. (The source and target of a replacement can
consume up to 40 bytes.) This substitution is applied to a word _before_ the word is
searched for in the `*_list` or `*_listx` file and translated using the spelling-to-phoneme rules.
Only the lower-case version of the characters needs to be specified. e.g.:


+ 40
- 17
src/libespeak-ng/compiledict.c View File

@@ -1448,30 +1448,53 @@ static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_t
break;
case 2: // .replace
{
int replace1;
int replace2;
int from[N_WORD_BYTES + 1] = { 0 }; // Size of N_WORD_BYTES + null byte
int to[N_WORD_BYTES + 1] = { 0 };
char *p;

p = buf;
replace1 = 0;
replace2 = 0;
while (isspace2(*p)) p++;

while (isspace2(*p)) // skip spaces in xx_rules file
p++;

ix = 0;
while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
p += utf8_in(&c, p);
replace1 += (c << ix);
ix += 16;
while ((unsigned char) (*p) > ' ') { // prepare 'from' characters
p += utf8_in(&from[ix], p);
ix++;
}
while (isspace2(*p)) p++;
while (isspace2(*p)) // skip spaces again
p++;
ix = 0;
while ((unsigned char)(*p) > 0x20) {
p += utf8_in(&c, p);
replace2 += (c << ix);
ix += 16;
while ((unsigned char) (*p) > ' ') { // prepare 'to' characters
p += utf8_in(&to[ix], p);
ix++;
}
if (replace1 != 0) {
Write4Bytes(f_out, replace1); // write as little-endian
Write4Bytes(f_out, replace2); // if big-endian, reverse the bytes in LoadDictionary()
// save into file
if (from[0] != 0) {
ix = 0;
while (from[ix] != 0) {
Write4Bytes(f_out, from[ix]);
ix++;
if (ix > N_WORD_BYTES / 4) {
fprintf(stderr,
"Too long .replace 'from' part (limit is: %d characters)\n",
N_WORD_BYTES / 4);
break;
}
}
Write4Bytes(f_out, ' '); // end of 'from'
ix = 0;
while (to[ix] != 0) {
Write4Bytes(f_out, to[ix]);
ix++;
if (ix > N_WORD_BYTES / 4) {
fprintf(stderr,
"Too long .replace 'to' part (limit is: %d characters)\n",
N_WORD_BYTES / 4);
break;
}
}
Write4Bytes(f_out, ' '); // end of 'to'
}
}
break;

+ 1
- 1
src/libespeak-ng/dictionary.c View File

@@ -152,7 +152,7 @@ static void InitGroups(Translator *tr)
pw = (unsigned int *)(((intptr_t)p+4) & ~3); // advance to next word boundary
tr->langopts.replace_chars = pw;
while (pw[0] != 0)
pw += 2; // find the end of the replacement list, each entry is 2 words.
pw ++; // find the end of the replacement list
p = (char *)(pw+1);

#ifdef ARCH_BIG

+ 145
- 66
src/libespeak-ng/translate.c View File

@@ -1790,81 +1790,158 @@ static int EmbeddedCommand(unsigned int *source_index_out)
return 1;
}

// handle .replace rule in xx_rules file
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
static bool GetNextTranslateChars(const int **from, const int **to, bool next_step)
{
int ix;
unsigned int word;
unsigned int new_c, c2 = ' ', c_lower;
int upper_case = 0;
static bool ignore_next = false;
const unsigned int *replace_chars;

if (ignore_next) {
ignore_next = false;
return 8;
// Set the pointers to the next 'from' and 'to' parts of the replacement list.
// next_step indicates the second and succeeding replacement groups.
// Returns true if data was found, false if the end was reached.
if (next_step) { // first 'from' is pointed already
int spaces = 0;
while (**from != 0 && spaces < 2) { // Get next 'from' position skipping 'to' part
(*from)++;
if (**from == ' ')
spaces++;
}
while (**from == ' ') // skip remaining spaces, if exists
(*from)++;
}
if (c == 0) return 0;
if (**from == 0)
return false;

if ((replace_chars = tr->langopts.replace_chars) == NULL)
return c;
*to = *from; // start seeking 'to' from 'from' position
while (**to != 0 && **to != ' ') // get next 'to' position
(*to)++;

// there is a list of character codes to be substituted with alternative codes
while (**to == ' ') // skip remaining spaces, if exists
(*to)++;

if (iswupper(c_lower = c)) {
c_lower = towlower2(c, tr);
upper_case = 1;
return **to != 0;
}

static void ReplaceMatchingChars(char **matchstart, const int *from, const int *to, int upper_case)
{
// Replace the characters of the 'from' group with those of the 'to' group in the
// source clause, starting at the place indicated by matchstart.
// Rewrites the remaining part of source if the lengths of the 'from' and 'to' groups differ.

char tmpsource[N_TR_SOURCE + 40]; // Prepare working copy of source
memcpy(&tmpsource, *matchstart, N_TR_SOURCE + 40);

// prepare from part
char charfrom[N_WORD_BYTES + 1] = { 0 };
int usedfrom = 0;
int const *pintfrom = from;
char *pcharfrom = charfrom;
while (*pintfrom != ' ') {
usedfrom += utf8_out(*pintfrom, pcharfrom + usedfrom);
pintfrom++;
}

// prepare to part
char charto[N_WORD_BYTES + 1] = { 0 };
int usedto = 0;
int const *pintto = to;
char *pcharto = charto;
while (*pintto != 0 && *pintto != ' ') {
usedto += utf8_out(*pintto, pcharto + usedto);
pintto++;
}

// Set up replacement in temporary buffer
char *psource = *matchstart;
char *ptmpsource = tmpsource;
pcharto = charto;

// Do replacement writing 'to' part in buffer
while (*pcharto != 0 ) {
if(upper_case) { // TODO: should check for capital letter from source (maybe using word_flags)
*ptmpsource = toupper(*pcharto);
upper_case = 0; // capitalize only first character if necessary (will work in most cases)
}
else
*ptmpsource = *pcharto;
pcharto++;
ptmpsource++;
}

// If the 'to' part differs in length from the 'from' part, fix the remaining part of the buffer.
// (Read after 'from' in source, write after 'to' in buffer.)
// Don't touch the last 40 bytes of source, which could contain control data.
if (usedfrom != usedto) {
psource = *matchstart + usedfrom;
while (*psource != 0 && psource < &source[N_TR_SOURCE]
&& ptmpsource < &tmpsource[N_TR_SOURCE]) {
*ptmpsource = *psource;
ptmpsource++;
psource++;
}
}

new_c = 0;
for (ix = 0; (word = replace_chars[ix]) != 0; ix += 2) {
if (c_lower == (word & 0xffff)) {
if ((word >> 16) == 0) {
new_c = replace_chars[ix+1];
break;
// Write temporary buffer back to source
psource = *matchstart;
ptmpsource = tmpsource;
while (*ptmpsource != 0 && ptmpsource < &tmpsource[N_TR_SOURCE]
&& psource < &source[N_TR_SOURCE]) {
*psource = *ptmpsource;
ptmpsource++;
psource++;
}

// if requested, print trace
if (option_phonemes & espeakPHONEMES_TRACE)
fprintf(f_trans, "Replace %s > %s\n", charfrom, charto);
}

static void TranslateChars(Translator *tr, char *source)
{
// Replace characters using Translator.replace_chars in passed source buffer
char *startmatch;
const int *from;
const int *to;
bool upper_case = false;

if ((from = tr->langopts.replace_chars) == NULL)
return;

char *curchar = source; // pointer to the source clause
int curint; // current UTF-16 character
int used; // bytes used in buffer
const int *curfrom; // pointer to current 'from'
bool next_step = false; // signal to jumping to next 'from' part in dictionary
while (GetNextTranslateChars(&from, &to, next_step)) {
next_step = true;
curfrom = from;
curchar = source;
startmatch = 0;
curint = 0;
do {
used = utf8_in(&curint, curchar);
if (curint < ' ') { // skip control characters
curchar += used;
continue;
}
if ((word >> 16) == (unsigned int)towlower2(next_in, tr)) {
new_c = replace_chars[ix+1];
ignore_next = true;
break;
if ((upper_case = iswupper(curint)) == true) // remember, if uppercase
curint = ucd_tolower(curint);

if (curint == *curfrom) { // if matches
if (startmatch == 0)
startmatch = curchar;
curfrom++; // move check to next character
if (*curfrom == ' ') { // if full match, replace
ReplaceMatchingChars(&startmatch, from, to, upper_case);
startmatch = 0; // reset match to start of 'from'
curfrom = from;
}
} else { // reset match to start of 'from'
startmatch = 0;
curfrom = from;
}
}
curchar += used;
} while (curint > 0);
}

if (new_c == 0)
return c; // no substitution

if (new_c & 0xffe00000) {
// there is a second character to be inserted
// don't convert the case of the second character unless the next letter is also upper case
c2 = new_c >> 16;
if (upper_case && iswupper(next_in))
c2 = ucd_toupper(c2);
*insert = c2;
new_c &= 0xffff;
}

if (upper_case)
new_c = ucd_toupper(new_c);

*wordflags |= FLAG_CHAR_REPLACED;
if (option_phonemes & espeakPHONEMES_TRACE) {
char msg[21] = {'R','e','p','l','a','c','e',':',' '};
char *index = msg;
index += 9;
index += utf8_out(c, index);
*index++ = ' ';
*index++ = '>';
*index++ = ' ';
index += utf8_out(new_c, index);
index += utf8_out(c2, index);
*index = 0;
fprintf(f_trans, "%s\n", msg);
}
return new_c;
}

static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert)
{
// To allow language specific examination and replacement of characters

@@ -1930,8 +2007,7 @@ static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c,
}
break;
}
// handle .replace rule in xx_rules file
return SubstituteChar(tr, c, next_in, insert, wordflags);
return c;
}

static const char *UCase_ga[] = { "bp", "bhf", "dt", "gc", "hA", "mb", "nd", "ng", "ts", "tA", "nA", NULL };
@@ -2104,6 +2180,9 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
}
words[0].length = k;

// handle .replace rule of xx_rules file
TranslateChars(tr, source);

while (!finished && (ix < (int)sizeof(sbuf) - 1) && (n_ph_list2 < N_PHONEME_LIST-4)) {
prev_out2 = prev_out;
utf8_in2(&prev_out, &sbuf[ix-1], 1);
@@ -2206,7 +2285,7 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
word_flags |= FLAG_COMMA_AFTER;
}
// language specific character translations
c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted, &word_flags);
c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted);
if (c == 8)
continue; // ignore this character


+ 3
- 3
tests/language-phonemes.test View File

@@ -71,7 +71,7 @@ test_lang fr-CH 4e312a224c8976da90a7faec8edabbc2fe311310 "ma na n^a Na pa ta ka
test_lang ga 63433f113b6507eb2af79d03e0f28aa08f1d643c "ma m;a mja na n;a nja Na N;a pa p;a t[a t;a ka k;a ca ba b;a bja d[a d;a ga g;a fa f;a sa Sa xa Ca C;a ha h;a wa va v;a vja ra Ra Qa Q\"a Q\"ja ja la l;a lja _:_ m@ m@/ ma ma# mE mE# mI mi mO mO# mU mU# mu1 m0 m0# mo: mA: me: mi: mu: m@U mi@ mu@ maI maI# maU maU# meI"
test_lang gd 744f0ab9fb2661984f2bf2da9f9f150402f7118a "ma na n[a n^a pa #pa ta #ta t;a #t;a ka k;a #ka #k;a ba b;a #ba da d;a #da #d;a ga g;a #ga #g;a fa fja va vja sa Sa Ca J^a xa Qa ha hja dZa la l^a ja *a *;a Ra _:_ m& mI m@ mi: mi me: me mE: mE ma: ma ma2 mO: mO mo: mo mu: mu mu-: mu- mo-: mo- mi@ mia mu@ maI meI m@I mu-I maU mOU"
test_lang gn a4e5ea82c978c2f592b9086ff482157f46614ba5 "ma na n^a n.a Na pa Ba ta ka k:a ?a ba da dZa ga fa Ta sa Sa S;a J^a ha la ra R2a wa _:_ ma me mi mo mu my ma~ me~ mi~ mo~ mu~ my~"
test_lang grc ad3beb0255c55f385f4212111aea6de83dd9cb59 "ma na Na pa ta ka fa ba da ga sa za Ta va xa ha za Ra la ja wa _:_ my me mE mo mO mEU mOI mOU myI my: me: mE: mo: mO: mEU: mOI: mOU: myI:" # f = p_f
test_lang grc fa595d58e82556b8aca26c38027677bf402f37b7 "ma na Na pa ta ka fa ba da ga sa za Ta va xa ha za Ra la ja wa _:_ my me mE mo mO mEU mOI mOU myI my: me: mE: mo: mO: mEU: mOI: mOU: myI:" # f = p_f
test_lang gu be5801d73a5692e34ee1c30729b393170cc9414f "ma na n.a Na pa t#a t.a ca tSa ka ba d#a d.a Ja dZa ga p#a t.#a c#a k#a b#a d.#a g#a fa sa Sa za ha v#a la l.a ja *a Ra wa Qa _:_ mi mu mI mU mo m@ mE mV mO mi~ mu~ mI~ mV~ me~ mo~ mi: ma: me: me~: mo: maI maI~ maU maU~"
test_lang hak a2c353ce28a8399b657e6b603219c914e16e957b "na Na la ja pa pha fa ta tha ka kha tS;a tS;ha S;a s.a tsa tsha ts.a ts.ha N-a _:_ ma mA mai mAu m@ m@r mE mei mi mi[ mi. miA miAu miE mio miou mo mo- mou mong mu muA mua muai mu@ mei muo my myu my& myE my@ myi _:_ ma11 ma21 ma214 ma22 ma33 ma35 ma44 ma51 ma53 ma55"
test_lang hi ae76141a2b3fb3090795e74f6e4a0a6ebbb510e6 "ma na n.a n^a pa t#a t.a tSa ka qa p#a t.#a c#a k#a ba d#a d.a dZa ga b#a d.#a J#a g#a fa sa Sa xa va za Za Qa Ha ra r.a la ja _:_ mI mU me m@ mo mE mV mO ma mi: mu: me: mo: mE: mO: ma: m&: mI~ mi~ mU~ mu~ mU~ me~ mo~ mE~ mV~ mO~ ma~ mAI maU"
@@ -81,7 +81,7 @@ test_lang hy d3fc6d3778f8669abed91b397f28c0e2ecf3c1cc "ma na Na pa p#a ba ta t#a
test_lang hyw 72f392c05e74e2fce620b9e1b0dc440481f597e3 "ma na Na pa p#a ba ta t#a da ka k#a ga tsa ts#a dza tSa tS#a dZa fa va sa za Sa Za Xa ha ja la ra r\"a Ra *a _:_ m@ mi my me ma mo mu mW maI meI mi@"
test_lang ia 1be09d2d915404d00aacb7895edeed5ff947345a "ma na pa ba ta da ka ga tsa tSa dZa fa va sa za Sa ha ja wa la Ra _:_ ma me mo mi mu maI maU meI meU moI"
test_lang id 75a57a020af2b62e3448792d3f6a945a9b2c6b75 "ma na n^a Na pa ba ta da ka ga ?a tSa dZa fa va Ta Da sa za Sa xa Qa ha ja wa la Ra R2a _:_ ma mE mO m@ me mo mi mu maI meI mOI maU m@U"
test_lang is b27c88c4b4e47d80a80e690f3dfc2026821f39dd "m#a ma n#a na n^#a n^a N#a Na pa ta ca ka fa va Ta Da sa Ca J^a xa Qa ha l#a la tl#a r#a ra _:_ mi mi: mI mI: mE mE: ma ma: mO mO: mu mu: my my: mW mW: maI maI: meI meI: maU maU: moU moU: mYy mOI myI"
test_lang is feb18f89c3bb8b7844efab08395fdc4aeec47530 "m#a ma n#a na n^#a n^a N#a Na pa ta ca ka fa va Ta Da sa Ca J^a xa Qa ha l#a la tl#a r#a ra _:_ mi mi: mI mI: mE mE: ma ma: mO mO: mu mu: my my: mW mW: maI maI: meI meI: maU maU: moU moU: mYy mOI myI"
test_lang it 8780284c48a97327b120f554adb3c2dd5ffbc283 "ma na n^a pa ba ta da ka ga tsa dza tSa dZa fa va sa za Sa ja wa la l^a Ra *a ssa k~a tS~a dZ~a g~a Q~a S~a _:_ ma ma/ me me/ mE mi mi/ mi# mI mo mo/ mO mu mU maU maI moI"
test_lang ja 602ba8d6db3e45b0966f72d6404eadb0e0837930 "ma na n^a Na N\\a pa ba ta da ka ga t_sa d_za t_s\\a d_z\\a p\\a sa za s\\a z\\a Ca ha r\`a ja wa _:_ mi mu me mo ma mi~ mu~"
test_lang jbo 77a00420ef9c1b40f87800f5ffee921e68a48a32 "ma na Na pa ba ta da ka ga ?a fa va xa ha tSa dZa sa za Sa Za wa ja la Ra n-a r-a l-a _:_ ma me mi mo mu m@ maI maU meI moI"
@@ -97,5 +97,5 @@ test_lang lfn 044e27a5100528760a185e0773dccaca504b5bd4 "ma na Na pa ba ta da ka
test_lang lt 615e503b996ea5f7b267ebd77b91e77c5b874e18 "ma m;a na n;a pa p;a ta t;a ka k;a ba b;a da d;a ga g;a tsa ts;a tSa tS;a dza dz;a dZa dZ;a fa f;a sa s;a Sa S;a xa x;a va v;a za z;a Za Z;a la l;a ra r;a ja _:_ m@ ma mA ma: me mE me: mee meA mi mI mi: mo mO mo: mu mU mu: mw mW mai mei mau muo moi mui mie maU meU moU maI meI"
test_lang lv ebd5378940b27e39ac35cda90e027ee0fa625fde "ma na n^a Na pa ba ta da ca Ja ka ga tsa dza Dz\`a tSa dZa DZ\`a fa va sa za Sa Za xa ha ja la l^a Ra ra _:_ mi my mu mE me mo ma mi: my: mu: mE: me: mo: ma: mai mau mei mie miu mui muo muo\` moi"
test_lang mi b6e622de46c33181cdfea351b907f932da9a0a1a "ma na Na pa ta ka fa ha ra wa _:_ ma ma: me me: mi mi: mo mo: mu mu:"
test_lang mk 072d0a74acf54bea528e7dde427eb04808d38364 "ma na n^a Na pa ta xa k^a ka ba da Ja ga tsa tSa tS;a dza dZa dZ;a fa sa Sa xa va za Za l^a la ja Ra @-*a ra _:_ ma me mi mo mu mA mE ma: me: mi: mo: mu: moU"
test_lang mk b21aa811cf39be7102be35bd635a3eaf10625d70 "ma na n^a Na pa ta xa k^a ka ba da Ja ga tsa tSa tS;a dza dZa dZ;a fa sa Sa xa va za Za l^a la ja Ra @-*a ra _:_ ma me mi mo mu mA mE ma: me: mi: mo: mu: moU"
test_lang shn e568aca66c2f58fdaf5dda8a67f4d21f05710234 "ma na Ja Na pa p_ha ba ta t_ha da ka k_ha ga ?a fa sa za Ta ha tS;a Ra ja wa la _:_ mi mI mW mu me m@ mo mE ma ma: mO miu meu mEu mau ma:u mWi mui m@i moi mai ma:i mOi maW _:_ ma1 ma2 ma3 ma4 ma5 ma6"

+ 2
- 1
tests/translate.test View File

@@ -47,7 +47,8 @@ test_phonemes en " 'e@ri:z r'eInboU" "♈ 🌈"
test_phonemes en " Ekskla#m'eIS@n kw'EstS@n m'A@k" "⁉"
test_phonemes en " Ekskla#m'eIS@n kw'EstS@n m'A@k r'eInboU" "⁉ 🌈"
test_phonemes en " r'oUlIN 0nD@ fl'o@ l'aafIN" "🤣" # skip words

# Test replacement rules
test_phonemes en " 'i:T@ d'i:m@n Ens,aIkl@p'oUdi@r- ,0n@m,at@p'oUIk a#m'i:b@ f'i:t@s l'0ft h'Ist@ri" "Æther dæmon encyclopœdia onomatopœic amœba fœtus loſt history"
# bug: https://github.com/espeak-ng/espeak-ng/issues/471
test_phonemes sk " sm'eju:tsa s'a tv'a:R" "☺"
test_phonemes sk " bl'ax sm'eju:tsa s'a tv'a:R" "blah ☺"

Loading…
Cancel
Save