Browse Source

Part of issue #199 — extend .replace rule to allow using groups of characters

master
Valdis Vitolins 7 years ago
parent
commit
9f0667de86
4 changed files with 199 additions and 23 deletions
  1. 3
    3
      docs/dictionary.md
  2. 40
    17
      src/libespeak-ng/compiledict.c
  3. 1
    1
      src/libespeak-ng/dictionary.c
  4. 155
    2
      src/libespeak-ng/translate.c

+ 3
- 3
docs/dictionary.md View File

@@ -509,9 +509,9 @@ each language. The number fragments are given in the `*_list` file.
## Character Substitution

Character substitutions can be specified by using a `.replace` section
at the start of the `*_rules` file. In each line one character can be
replaced by one or two characters. (Source and target of replacement can consume
up to four bytes.) This substitution is done to a word _before_ word is searched
at the start of the `*_rules` file. In each line, a group of one or more characters can be
replaced by another group of characters. (The source and the target of a replacement can each consume
up to 40 bytes.) This substitution is applied to a word _before_ the word is looked up
in the `*_list` or `*_listx` file and translated using the spelling-to-phoneme rules.
Only the lower-case version of the characters needs to be specified. e.g.:


+ 40
- 17
src/libespeak-ng/compiledict.c View File

@@ -1448,30 +1448,53 @@ static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_t
break;
case 2: // .replace
{
int replace1;
int replace2;
int from[N_WORD_BYTES + 1] = { 0 }; // Size of N_WORD_BYTES + null byte
int to[N_WORD_BYTES + 1] = { 0 };
char *p;

p = buf;
replace1 = 0;
replace2 = 0;
while (isspace2(*p)) p++;

while (isspace2(*p)) // skip spaces in xx_rules file
p++;

ix = 0;
while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
p += utf8_in(&c, p);
replace1 += (c << ix);
ix += 16;
while ((unsigned char) (*p) > ' ') { // prepare 'from' characters
p += utf8_in(&from[ix], p);
ix++;
}
while (isspace2(*p)) p++;
while (isspace2(*p)) // skip spaces again
p++;
ix = 0;
while ((unsigned char)(*p) > 0x20) {
p += utf8_in(&c, p);
replace2 += (c << ix);
ix += 16;
while ((unsigned char) (*p) > ' ') { // prepare 'to' characters
p += utf8_in(&to[ix], p);
ix++;
}
if (replace1 != 0) {
Write4Bytes(f_out, replace1); // write as little-endian
Write4Bytes(f_out, replace2); // if big-endian, reverse the bytes in LoadDictionary()
// save into file
if (from[0] != 0) {
ix = 0;
while (from[ix] != 0) {
Write4Bytes(f_out, from[ix]);
ix++;
if (ix > N_WORD_BYTES / 4) {
fprintf(stderr,
"Too long .replace 'from' part (limit is: %d characters)\n",
N_WORD_BYTES / 4);
break;
}
}
Write4Bytes(f_out, ' '); // end of 'from'
ix = 0;
while (to[ix] != 0) {
Write4Bytes(f_out, to[ix]);
ix++;
if (ix > N_WORD_BYTES / 4) {
fprintf(stderr,
"Too long .replace 'to' part (limit is: %d characters)\n",
N_WORD_BYTES / 4);
break;
}
}
Write4Bytes(f_out, ' '); // end of 'to'
}
}
break;

+ 1
- 1
src/libespeak-ng/dictionary.c View File

@@ -152,7 +152,7 @@ static void InitGroups(Translator *tr)
pw = (unsigned int *)(((intptr_t)p+4) & ~3); // advance to next word boundary
tr->langopts.replace_chars = pw;
while (pw[0] != 0)
pw += 2; // find the end of the replacement list, each entry is 2 words.
pw ++; // find the end of the replacement list
p = (char *)(pw+1);

#ifdef ARCH_BIG

+ 155
- 2
src/libespeak-ng/translate.c View File

@@ -1864,6 +1864,157 @@ static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in,
return new_c;
}

static bool GetNextTranslateChars(const int **from, const int **to, bool next_step)
{
	// Position *from and *to on the next replacement pair of a .replace list.
	//
	// The list is a sequence of character codes laid out as
	//     <from chars> ' ' <to chars> ' ' <from chars> ' ' <to chars> ' ' ... 0
	// i.e. every group is closed by a single ' ' and the list ends with 0.
	// A 'to' group may be empty (two consecutive ' '), which means that the
	// 'from' group has to be removed.
	//
	// from       in: current 'from' group (or start of list); out: 'from' group to use
	// to         out: start of the matching 'to' group (points at ' ' if it is empty)
	// next_step  false for the first pair (*from is already in place),
	//            true to advance past the current pair first
	//
	// Returns true if a pair was found, false if the end of the list was reached.
	if (next_step) {
		// Skip the current pair: walk forward until two group terminators were seen.
		int spaces = 0;
		while (**from != 0 && spaces < 2) {
			(*from)++;
			if (**from == ' ')
				spaces++;
		}
		while (**from == ' ') // step off the terminator of the skipped 'to' group
			(*from)++;
	}
	if (**from == 0)
		return false;

	*to = *from; // start seeking 'to' from 'from' position
	while (**to != 0 && **to != ' ')
		(*to)++;

	// Step over exactly one terminator. Skipping every space here would run
	// through an empty 'to' group and leave *to on the next rule's 'from'
	// group (or on the end of the list).
	if (**to == ' ')
		(*to)++;

	return **to != 0;
}

static void ReplaceMatchingChars(char **matchstart, const int *from, const int *to, int upper_case)
{
// Replace characters from 'from' to 'to' group in source clause, starting
// with place indicated in matchstart.
// Rewrites remaining part of source, if length of 'from' and 'to' groups differs.

char tmpsource[N_TR_SOURCE + 40]; // Prepare working copy of source
memcpy(&tmpsource, *matchstart, N_TR_SOURCE + 40);

// prepare from part
char charfrom[N_WORD_BYTES + 1] = { 0 };
int usedfrom = 0;
int const *pintfrom = from;
char *pcharfrom = charfrom;
while (*pintfrom != ' ') {
usedfrom += utf8_out(*pintfrom, pcharfrom + usedfrom);
pintfrom++;
}

// prepare to part
char charto[N_WORD_BYTES + 1] = { 0 };
int usedto = 0;
int const *pintto = to;
char *pcharto = charto;
while (*pintto != 0 && *pintto != ' ') {
usedto += utf8_out(*pintto, pcharto + usedto);
pintto++;
}

// Set up replacement in temporary buffer
char *psource = *matchstart;
char *ptmpsource = tmpsource;
pcharto = charto;

// Do replacement writing 'to' part in buffer
while (*pcharto != 0 ) {
if(upper_case) { // TODO: should check for capital letter from source
*ptmpsource = toupper(*pcharto);
upper_case = 0; // capitalize only first character if necessary (will work in most cases)
}
else
*ptmpsource = *pcharto;
pcharto++;
ptmpsource++;
}

// If 'to' part is different than 'from' part, fix remaining part of the buffer.
// (Read after 'from' in source, write after 'to' in buffer.
// Don't touch last 40 bytes of source, which could contain control data.
if (usedfrom != usedto) {
psource = *matchstart + usedfrom;
while (*psource != 0 && psource < &source[N_TR_SOURCE]
&& ptmpsource < &tmpsource[N_TR_SOURCE]) {
*ptmpsource = *psource;
ptmpsource++;
psource++;
}
}

// Write temporary buffer back to source
psource = *matchstart;
ptmpsource = tmpsource;
while (*ptmpsource != 0 && ptmpsource < &tmpsource[N_TR_SOURCE]
&& psource < &source[N_TR_SOURCE]) {
*psource = *ptmpsource;
ptmpsource++;
psource++;
}

// if requested, print trace
if (option_phonemes & espeakPHONEMES_TRACE)
fprintf(f_trans, "Replace %s > %s\n", charfrom, charto);
}

static void TranslateChars(Translator *tr, char *source)
{
	// Apply every .replace pair of tr->langopts.replace_chars to the passed
	// source buffer. Each pair is applied over the whole buffer before the
	// next pair is tried. Matching is done on the lower-case form of the
	// source characters; control characters (< ' ') are skipped.
	//
	// tr      translator holding the replacement list (nothing is done if it has none)
	// source  NUL-terminated clause text, modified in place
	const int *from;
	const int *to;

	if ((from = tr->langopts.replace_chars) == NULL)
		return;

	bool next_step = false; // signal to jump to the next pair in the list
	while (GetNextTranslateChars(&from, &to, next_step)) {
		next_step = true;

		// Byte length of the 'to' group, needed to resume the scan right
		// behind the inserted text.
		int tobytes = 0;
		char scratch[8]; // room for one UTF-8 sequence
		for (const int *p = to; *p != 0 && *p != ' '; p++)
			tobytes += utf8_out(*p, scratch);

		const int *curfrom = from; // next character of 'from' to be matched
		char *curchar = source;    // current position in the source clause
		char *startmatch = NULL;   // start of the match in progress, if any
		bool upper_case = false;   // was the first matched character upper-case?
		int curint = 0;            // current Unicode code point
		do {
			int used = utf8_in(&curint, curchar); // bytes used by the current character
			if (curint < ' ') { // skip control characters (0 ends the loop below)
				curchar += used;
				continue;
			}
			bool is_upper = iswupper(curint) != 0;
			if (is_upper)
				curint = ucd_tolower(curint);

			if (curint == *curfrom) { // if matches
				if (startmatch == NULL) {
					startmatch = curchar;
					upper_case = is_upper; // case of the first character decides
				}
				curfrom++; // move check to next character
				curchar += used;
				if (*curfrom == ' ') { // if full match, replace
					size_t before = strlen(startmatch);
					ReplaceMatchingChars(&startmatch, from, to, upper_case);

					// Continue behind the inserted text, so that it is neither
					// matched again nor any shifted text is skipped. Never step
					// over the terminator in case the replacement was truncated.
					curchar = startmatch;
					for (int i = 0; i < tobytes && *curchar != 0; i++)
						curchar++;

					// Guard against an endless loop: if nothing was inserted and
					// nothing was removed, move on by one character.
					if (curchar == startmatch && *curchar != 0 && strlen(startmatch) >= before) {
						int skipped;
						curchar += utf8_in(&skipped, curchar);
					}

					startmatch = NULL; // reset match to start of 'from'
					curfrom = from;
				}
			} else if (startmatch != NULL) {
				// A partial match failed. Retry from the character following the
				// start of that match, otherwise overlapping candidates are lost
				// (e.g. 'ab' in 'aab').
				int first;
				curchar = startmatch + utf8_in(&first, startmatch);
				startmatch = NULL;
				curfrom = from;
			} else {
				curchar += used;
			}
		} while (curint > 0);
	}
}

static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{
// To allow language specific examination and replacement of characters
@@ -1930,8 +2081,7 @@ static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c,
}
break;
}
// handle .replace rule in xx_rules file
return SubstituteChar(tr, c, next_in, insert, wordflags);
return c;
}

static const char *UCase_ga[] = { "bp", "bhf", "dt", "gc", "hA", "mb", "nd", "ng", "ts", "tA", "nA", NULL };
@@ -2104,6 +2254,9 @@ void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
}
words[0].length = k;

// handle .replace rule of xx_rules file
TranslateChars(tr, source);

while (!finished && (ix < (int)sizeof(sbuf) - 1) && (n_ph_list2 < N_PHONEME_LIST-4)) {
prev_out2 = prev_out;
utf8_in2(&prev_out, &sbuf[ix-1], 1);

Loading…
Cancel
Save