if (compile_mode == 2) { | if (compile_mode == 2) { | ||||
// end of the character replacements section | // end of the character replacements section | ||||
fwrite(&n_rules, 1, 4, f_out); // write a zero word to terminate the replacemenmt list | fwrite(&n_rules, 1, 4, f_out); // write a zero word to terminate the replacemenmt list | ||||
fputc(RULE_GROUP_END, f_out); | |||||
compile_mode = 0; | compile_mode = 0; | ||||
} | } | ||||
} | } | ||||
break; | break; | ||||
case 2: // .replace | case 2: // .replace | ||||
{ | |||||
int replace1; | |||||
int replace2; | |||||
char *p; | |||||
p = (unsigned char *)buf; | |||||
p = buf; | |||||
replace1 = 0; | |||||
replace2 = 0; | |||||
while (isspace2(*p)) p++; | |||||
ix = 0; | |||||
while ((unsigned char)(*p) > 0x20) { // not space or zero-byte | |||||
p += utf8_in(&c, p); | |||||
replace1 += (c << ix); | |||||
ix += 16; | |||||
} | |||||
while (isspace2(*p)) p++; | while (isspace2(*p)) p++; | ||||
ix = 0; | |||||
while ((unsigned char)(*p) > 0x20) { | |||||
p += utf8_in(&c, p); | |||||
replace2 += (c << ix); | |||||
ix += 16; | |||||
} | |||||
if (replace1 != 0) { | |||||
Write4Bytes(f_out, replace1); // write as little-endian | |||||
Write4Bytes(f_out, replace2); // if big-endian, reverse the bytes in LoadDictionary() | |||||
if ((unsigned char)(*p) > 0x20) { | |||||
while ((unsigned char)(*p) > 0x20) { // not space or zero-byte | |||||
fputc(*p, f_out); | |||||
p++; | |||||
} | |||||
fputc(0, f_out); | |||||
while (isspace2(*p)) p++; | |||||
while ((unsigned char)(*p) > 0x20) { | |||||
fputc(*p, f_out); | |||||
p++; | |||||
} | |||||
fputc(0, f_out); | |||||
} | } | ||||
} | |||||
break; | break; | ||||
} | } | ||||
} | } |
if (p[0] == RULE_REPLACEMENTS) { | if (p[0] == RULE_REPLACEMENTS) { | ||||
p = (char *)(((intptr_t)p+4) & ~3); // advance to next word boundary | p = (char *)(((intptr_t)p+4) & ~3); // advance to next word boundary | ||||
tr->langopts.replace_chars = (unsigned int *)p; | |||||
tr->langopts.replace_chars = (unsigned char *)p; | |||||
while (*(unsigned int *)p != 0) | while (*(unsigned int *)p != 0) | ||||
p += 8; // find the end of the replacement list, each entry is 2 words. | |||||
p += 4; | |||||
#ifdef ARCH_BIG | |||||
pw = (unsigned int *)(tr->langopts.replace_chars); | |||||
while (*pw != 0) { | |||||
*pw = Reverse4Bytes(*pw); | |||||
pw++; | |||||
*pw = Reverse4Bytes(*pw); | |||||
pw++; | |||||
} | |||||
#endif | |||||
p++; | |||||
while (*p != RULE_GROUP_END) p++; | |||||
p++; | |||||
continue; | continue; | ||||
} | } | ||||
return 1; | return 1; | ||||
} | } | ||||
static const char * | |||||
FindReplacementChars(Translator *tr, unsigned int c, unsigned int nextc, bool *ignore_next) { | |||||
unsigned int uc = 0; | |||||
const char *from = (const char *)tr->langopts.replace_chars; | |||||
while (*(unsigned int *)from != 0) { | |||||
from += utf8_in((int *)&uc, from); | |||||
if (c == uc) { | |||||
if (*from == 0) return from + 1; | |||||
from += utf8_in((int *)&uc, from); | |||||
if (*from == 0 && uc == (unsigned int)towlower2(nextc, tr)) { | |||||
*ignore_next = true; | |||||
return from + 1; | |||||
} | |||||
} | |||||
// replacement 'from' string (skip the remaining part, if any) | |||||
while (*from != '\0') from++; | |||||
from++; | |||||
// replacement 'to' string | |||||
while (*from != '\0') from++; | |||||
from++; | |||||
} | |||||
return NULL; | |||||
} | |||||
// handle .replace rule in xx_rules file | // handle .replace rule in xx_rules file | ||||
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags) | static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags) | ||||
{ | { | ||||
int ix; | |||||
unsigned int word; | |||||
unsigned int new_c, c2 = ' ', c_lower; | unsigned int new_c, c2 = ' ', c_lower; | ||||
int upper_case = 0; | int upper_case = 0; | ||||
static bool ignore_next = false; | static bool ignore_next = false; | ||||
const unsigned int *replace_chars; | |||||
if (ignore_next) { | if (ignore_next) { | ||||
ignore_next = false; | ignore_next = false; | ||||
} | } | ||||
if (c == 0) return 0; | if (c == 0) return 0; | ||||
if ((replace_chars = tr->langopts.replace_chars) == NULL) | |||||
if (tr->langopts.replace_chars == NULL) | |||||
return c; | return c; | ||||
// there is a list of character codes to be substituted with alternative codes | // there is a list of character codes to be substituted with alternative codes | ||||
upper_case = 1; | upper_case = 1; | ||||
} | } | ||||
new_c = 0; | |||||
for (ix = 0; (word = replace_chars[ix]) != 0; ix += 2) { | |||||
if (c_lower == (word & 0xffff)) { | |||||
if ((word >> 16) == 0) { | |||||
new_c = replace_chars[ix+1]; | |||||
break; | |||||
} | |||||
if ((word >> 16) == (unsigned int)towlower2(next_in, tr)) { | |||||
new_c = replace_chars[ix+1]; | |||||
ignore_next = true; | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
if (new_c == 0) | |||||
const char *to = FindReplacementChars(tr, c_lower, next_in, &ignore_next); | |||||
if (to == NULL) | |||||
return c; // no substitution | return c; // no substitution | ||||
if (new_c & 0xffe00000) { | |||||
to += utf8_in((int *)&new_c, to); | |||||
if (*to != 0) { | |||||
// there is a second character to be inserted | // there is a second character to be inserted | ||||
// don't convert the case of the second character unless the next letter is also upper case | // don't convert the case of the second character unless the next letter is also upper case | ||||
c2 = new_c >> 16; | |||||
to += utf8_in((int *)&c2, to); | |||||
if (upper_case && iswupper(next_in)) | if (upper_case && iswupper(next_in)) | ||||
c2 = ucd_toupper(c2); | c2 = ucd_toupper(c2); | ||||
*insert = c2; | *insert = c2; | ||||
new_c &= 0xffff; | |||||
} | } | ||||
if (upper_case) | if (upper_case) |
bool textmode; // the meaning of FLAG_TEXTMODE is reversed (to save data when *_list file is compiled) | bool textmode; // the meaning of FLAG_TEXTMODE is reversed (to save data when *_list file is compiled) | ||||
char dotless_i; // uses letter U+0131 | char dotless_i; // uses letter U+0131 | ||||
int listx; // compile *_listx after *list | int listx; // compile *_listx after *list | ||||
const unsigned int *replace_chars; // characters to be substitutes | |||||
const unsigned char *replace_chars; // characters to be substitutes | |||||
int our_alphabet; // offset for main alphabet (if not set in letter_bits_offset) | int our_alphabet; // offset for main alphabet (if not set in letter_bits_offset) | ||||
int alt_alphabet; // offset for another language to recognize | int alt_alphabet; // offset for another language to recognize | ||||
int alt_alphabet_lang; // language for the alt_alphabet | int alt_alphabet_lang; // language for the alt_alphabet |