Browse Source

Use UTF-8 strings in replace rules, instead of a packed UTF-16 pair.

master
Reece H. Dunn 7 years ago
parent
commit
55c64036e0

+ 15
- 24
src/libespeak-ng/compiledict.c View File

@@ -1369,6 +1369,7 @@ static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_t
if (compile_mode == 2) {
// end of the character replacements section
fwrite(&n_rules, 1, 4, f_out); // write a zero word to terminate the replacemenmt list
fputc(RULE_GROUP_END, f_out);
compile_mode = 0;
}

@@ -1447,33 +1448,23 @@ static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_t
}
break;
case 2: // .replace
{
int replace1;
int replace2;
char *p;
p = (unsigned char *)buf;

p = buf;
replace1 = 0;
replace2 = 0;
while (isspace2(*p)) p++;
ix = 0;
while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
p += utf8_in(&c, p);
replace1 += (c << ix);
ix += 16;
}
while (isspace2(*p)) p++;
ix = 0;
while ((unsigned char)(*p) > 0x20) {
p += utf8_in(&c, p);
replace2 += (c << ix);
ix += 16;
}
if (replace1 != 0) {
Write4Bytes(f_out, replace1); // write as little-endian
Write4Bytes(f_out, replace2); // if big-endian, reverse the bytes in LoadDictionary()
if ((unsigned char)(*p) > 0x20) {
while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
fputc(*p, f_out);
p++;
}
fputc(0, f_out);

while (isspace2(*p)) p++;
while ((unsigned char)(*p) > 0x20) {
fputc(*p, f_out);
p++;
}
fputc(0, f_out);
}
}
break;
}
}

+ 4
- 13
src/libespeak-ng/dictionary.c View File

@@ -149,20 +149,11 @@ static void InitGroups(Translator *tr)

if (p[0] == RULE_REPLACEMENTS) {
p = (char *)(((intptr_t)p+4) & ~3); // advance to next word boundary
tr->langopts.replace_chars = (unsigned int *)p;
tr->langopts.replace_chars = (unsigned char *)p;
while (*(unsigned int *)p != 0)
p += 8; // find the end of the replacement list, each entry is 2 words.
p += 4;

#ifdef ARCH_BIG
pw = (unsigned int *)(tr->langopts.replace_chars);
while (*pw != 0) {
*pw = Reverse4Bytes(*pw);
pw++;
*pw = Reverse4Bytes(*pw);
pw++;
}
#endif
p++;
while (*p != RULE_GROUP_END) p++;
p++;
continue;
}


+ 32
- 23
src/libespeak-ng/translate.c View File

@@ -1790,15 +1790,38 @@ static int EmbeddedCommand(unsigned int *source_index_out)
return 1;
}

// Look up a .replace rule for character 'c' in the translator's replacement list.
//
// The list at tr->langopts.replace_chars is a sequence of entries, each made of
// two consecutive NUL-terminated UTF-8 strings: the 'from' string followed by
// the 'to' string. The list ends with four zero bytes.
//
//   tr          - translator whose langopts.replace_chars list is searched;
//                 the caller is expected to have checked that it is not NULL.
//   c           - character to match against the first character of 'from'.
//   nextc       - character following 'c' in the input; it is passed through
//                 towlower2() before being compared with the second character
//                 of a two-character 'from' string.
//   ignore_next - set to true when a two-character 'from' string matched, so
//                 the caller can skip 'nextc'. It is left untouched otherwise.
//
// Returns a pointer to the start of the matching entry's 'to' string, or NULL
// if no entry matches.
static const char *
FindReplacementChars(Translator *tr, unsigned int c, unsigned int nextc, bool *ignore_next) {
	unsigned int uc = 0;
	const char *from = (const char *)tr->langopts.replace_chars;

	// Entries are variable-length strings, so 'from' is generally not aligned
	// to a word boundary. Test the four terminator bytes one at a time rather
	// than reading them through an 'unsigned int *', which would be a
	// misaligned (and type-punned) access.
	while (from[0] != 0 || from[1] != 0 || from[2] != 0 || from[3] != 0) {
		from += utf8_in((int *)&uc, from);
		if (c == uc) {
			// single-character 'from' string: the 'to' string follows its NUL
			if (*from == 0) return from + 1;

			// two-character 'from' string: the second character must match
			// the lower-cased next input character
			from += utf8_in((int *)&uc, from);
			if (*from == 0 && uc == (unsigned int)towlower2(nextc, tr)) {
				*ignore_next = true;
				return from + 1;
			}
		}

		// replacement 'from' string (skip the remaining part, if any)
		while (*from != '\0') from++;
		from++;

		// replacement 'to' string
		while (*from != '\0') from++;
		from++;
	}
	return NULL;
}

// handle .replace rule in xx_rules file
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{
int ix;
unsigned int word;
unsigned int new_c, c2 = ' ', c_lower;
int upper_case = 0;
static bool ignore_next = false;
const unsigned int *replace_chars;

if (ignore_next) {
ignore_next = false;
@@ -1806,7 +1829,7 @@ static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in,
}
if (c == 0) return 0;

if ((replace_chars = tr->langopts.replace_chars) == NULL)
if (tr->langopts.replace_chars == NULL)
return c;

// there is a list of character codes to be substituted with alternative codes
@@ -1816,32 +1839,18 @@ static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in,
upper_case = 1;
}

new_c = 0;
for (ix = 0; (word = replace_chars[ix]) != 0; ix += 2) {
if (c_lower == (word & 0xffff)) {
if ((word >> 16) == 0) {
new_c = replace_chars[ix+1];
break;
}
if ((word >> 16) == (unsigned int)towlower2(next_in, tr)) {
new_c = replace_chars[ix+1];
ignore_next = true;
break;
}
}
}

if (new_c == 0)
const char *to = FindReplacementChars(tr, c_lower, next_in, &ignore_next);
if (to == NULL)
return c; // no substitution

if (new_c & 0xffe00000) {
to += utf8_in((int *)&new_c, to);
if (*to != 0) {
// there is a second character to be inserted
// don't convert the case of the second character unless the next letter is also upper case
c2 = new_c >> 16;
to += utf8_in((int *)&c2, to);
if (upper_case && iswupper(next_in))
c2 = ucd_toupper(c2);
*insert = c2;
new_c &= 0xffff;
}

if (upper_case)

+ 1
- 1
src/libespeak-ng/translate.h View File

@@ -556,7 +556,7 @@ typedef struct {
bool textmode; // the meaning of FLAG_TEXTMODE is reversed (to save data when *_list file is compiled)
char dotless_i; // uses letter U+0131
int listx; // compile *_listx after *list
const unsigned int *replace_chars; // characters to be substitutes
const unsigned char *replace_chars; // characters to be substitutes
int our_alphabet; // offset for main alphabet (if not set in letter_bits_offset)
int alt_alphabet; // offset for another language to recognize
int alt_alphabet_lang; // language for the alt_alphabet

Loading…
Cancel
Save