#include <stdio.h> | #include <stdio.h> | ||||
#include <stdlib.h> | #include <stdlib.h> | ||||
#include <string.h> | #include <string.h> | ||||
#include <wctype.h> | |||||
#include <espeak-ng/espeak_ng.h> | #include <espeak-ng/espeak_ng.h> | ||||
#include <espeak-ng/speak_lib.h> | #include <espeak-ng/speak_lib.h> | ||||
ix = utf8_in(&c2, p); | ix = utf8_in(&c2, p); | ||||
if (c2 == 0) | if (c2 == 0) | ||||
break; | break; | ||||
if (iswupper2(c2)) | |||||
if (iswupper(c2)) | |||||
utf8_out(towlower2(c2), p); | utf8_out(towlower2(c2), p); | ||||
else | else | ||||
all_upper_case = 0; | all_upper_case = 0; |
#include <stdio.h> | #include <stdio.h> | ||||
#include <stdlib.h> | #include <stdlib.h> | ||||
#include <string.h> | #include <string.h> | ||||
#include <wctype.h> | |||||
#include <wchar.h> | #include <wchar.h> | ||||
#include <espeak-ng/espeak_ng.h> | #include <espeak-ng/espeak_ng.h> | ||||
p += utf8_in(&c, p); | p += utf8_in(&c, p); | ||||
if (use_tie != 0) { | if (use_tie != 0) { | ||||
// look for non-inital alphabetic character, but not diacritic, superscript etc. | // look for non-inital alphabetic character, but not diacritic, superscript etc. | ||||
if ((count > 0) && !(flags & (1 << (count-1))) && ((c < 0x2b0) || (c > 0x36f)) && iswalpha2(c)) | |||||
if ((count > 0) && !(flags & (1 << (count-1))) && ((c < 0x2b0) || (c > 0x36f)) && iswalpha(c)) | |||||
buf += utf8_out(use_tie, buf); | buf += utf8_out(use_tie, buf); | ||||
} | } | ||||
buf += utf8_out(c, buf); | buf += utf8_out(c, buf); | ||||
break; | break; | ||||
} | } | ||||
if ((c != '\'') && !iswalpha2(c)) | |||||
if ((c != '\'') && !iswalpha(c)) | |||||
return 0; | return 0; | ||||
} | } | ||||
failed = 1; | failed = 1; | ||||
break; | break; | ||||
case RULE_NONALPHA: | case RULE_NONALPHA: | ||||
if (!iswalpha2(letter_w)) { | |||||
if (!iswalpha(letter_w)) { | |||||
add_points = (21-distance_right); | add_points = (21-distance_right); | ||||
post_ptr += letter_xbytes; | post_ptr += letter_xbytes; | ||||
} else | } else | ||||
failed = 1; | failed = 1; | ||||
break; | break; | ||||
case RULE_NONALPHA: | case RULE_NONALPHA: | ||||
if (!iswalpha2(letter_w)) { | |||||
if (!iswalpha(letter_w)) { | |||||
add_points = (21-distance_right); | add_points = (21-distance_right); | ||||
pre_ptr -= letter_xbytes; | pre_ptr -= letter_xbytes; | ||||
} else | } else | ||||
if (tr->letter_bits_offset > 0) { | if (tr->letter_bits_offset > 0) { | ||||
// not a Latin alphabet, switch to the default Latin alphabet language | // not a Latin alphabet, switch to the default Latin alphabet language | ||||
if ((letter <= 0x241) && iswalpha2(letter)) { | |||||
if ((letter <= 0x241) && iswalpha(letter)) { | |||||
sprintf(phonemes, "%c%s", phonSWITCH, tr->langopts.ascii_language); | sprintf(phonemes, "%c%s", phonSWITCH, tr->langopts.ascii_language); | ||||
return 0; | return 0; | ||||
} | } |
if (control & 2) { | if (control & 2) { | ||||
// include CAPITAL information | // include CAPITAL information | ||||
if (iswupper2(letter)) | |||||
if (iswupper(letter)) | |||||
Lookup(tr, "_cap", capital); | Lookup(tr, "_cap", capital); | ||||
} | } | ||||
letter = towlower2(letter); | letter = towlower2(letter); | ||||
if (ph_buf[0] == 0) { | if (ph_buf[0] == 0) { | ||||
speak_letter_number = 1; | speak_letter_number = 1; | ||||
if (!(al_flags & AL_NO_SYMBOL)) { | if (!(al_flags & AL_NO_SYMBOL)) { | ||||
if (iswalpha2(letter)) | |||||
if (iswalpha(letter)) | |||||
Lookup(translator, "_?A", ph_buf); | Lookup(translator, "_?A", ph_buf); | ||||
if ((ph_buf[0] == 0) && !iswspace(letter)) | if ((ph_buf[0] == 0) && !iswspace(letter)) | ||||
if ((tr->langopts.numbers & NUM_NOPAUSE) && (next_char == ' ')) | if ((tr->langopts.numbers & NUM_NOPAUSE) && (next_char == ' ')) | ||||
utf8_in(&next_char, p); | utf8_in(&next_char, p); | ||||
if (!iswalpha2(next_char) && (thousands_exact == 0)) | |||||
if (!iswalpha(next_char) && (thousands_exact == 0)) | |||||
strcat(ph_out, str_pause); // don't add pause for 100s, 6th, etc. | strcat(ph_out, str_pause); // don't add pause for 100s, 6th, etc. | ||||
} | } | ||||
0, 0 | 0, 0 | ||||
}; | }; | ||||
// use internal data for iswalpha up to U+024F | |||||
// iswalpha() on Windows is unreliable (U+AA, U+BA). | |||||
int iswalpha2(int c) | |||||
{ | |||||
if (c < 0x80) | |||||
return isalpha(c); | |||||
if ((c > 0x3040) && (c <= 0xa700)) | |||||
return 1; // japanese, chinese characters | |||||
if (c > MAX_WALPHA) | |||||
return iswalpha(c); | |||||
return walpha_tab[c-0x80]; | |||||
} | |||||
int iswlower2(int c) | |||||
{ | |||||
if (c < 0x80) | |||||
return islower(c); | |||||
if (c > MAX_WALPHA) | |||||
return iswlower(c); | |||||
if (walpha_tab[c-0x80] == 0xff) | |||||
return 1; | |||||
return 0; | |||||
} | |||||
int iswupper2(int c) | |||||
{ | |||||
int x; | |||||
if (c < 0x80) | |||||
return isupper(c); | |||||
if (c > MAX_WALPHA) | |||||
return iswupper(c); | |||||
if (((x = walpha_tab[c-0x80]) > 0) && (x < 0xfe)) | |||||
return 1; | |||||
return 0; | |||||
} | |||||
int towlower2(unsigned int c) | int towlower2(unsigned int c) | ||||
{ | { | ||||
int x; | |||||
int ix; | |||||
// check for non-standard upper to lower case conversions | // check for non-standard upper to lower case conversions | ||||
if (c == 'I') { | |||||
if (translator->langopts.dotless_i) | |||||
c = 0x131; // I -> ı | |||||
} | |||||
if (c < 0x80) | |||||
return tolower(c); | |||||
if (c == 'I' && translator->langopts.dotless_i) | |||||
return 0x131; // I -> ı | |||||
if (c > MAX_WALPHA) | |||||
return towlower(c); | |||||
if ((x = walpha_tab[c-0x80]) >= 0xfe) | |||||
return c; // this is not an upper case letter | |||||
if (x == 0xfd) { | |||||
// special cases, lookup translation table | |||||
for (ix = 0; wchar_tolower[ix] != 0; ix += 2) { | |||||
if (wchar_tolower[ix] == (int)c) | |||||
return wchar_tolower[ix+1]; | |||||
} | |||||
} | |||||
return c + x; // convert to lower case | |||||
} | |||||
int towupper2(unsigned int c) | |||||
{ | |||||
int ix; | |||||
if (c > MAX_WALPHA) | |||||
return towupper(c); | |||||
// check whether a previous character code is the upper-case equivalent of this character | |||||
if (towlower2(c-32) == (int)c) | |||||
return c-32; // yes, use it | |||||
if (towlower2(c-1) == (int)c) | |||||
return c-1; | |||||
for (ix = 0; wchar_toupper[ix] != 0; ix += 2) { | |||||
if (wchar_toupper[ix] == (int)c) | |||||
return wchar_toupper[ix+1]; | |||||
} | |||||
return c; // no | |||||
return towlower(c); | |||||
} | } | ||||
static int IsRomanU(unsigned int c) | static int IsRomanU(unsigned int c) | ||||
while (!Eof() && (c1 != '>')) | while (!Eof() && (c1 != '>')) | ||||
c1 = GetC(); | c1 = GetC(); | ||||
c2 = ' '; | c2 = ' '; | ||||
} else if ((c2 == '/') || iswalpha2(c2)) { | |||||
} else if ((c2 == '/') || iswalpha(c2)) { | |||||
// check for space in the output buffer for embedded commands produced by the SSML tag | // check for space in the output buffer for embedded commands produced by the SSML tag | ||||
if (ix > (n_buf - 20)) { | if (ix > (n_buf - 20)) { | ||||
// Perhaps not enough room, end the clause before the SSML tag | // Perhaps not enough room, end the clause before the SSML tag | ||||
} | } | ||||
} | } | ||||
if (iswupper2(c1)) { | |||||
if (iswupper(c1)) { | |||||
tr->clause_upper_count++; | tr->clause_upper_count++; | ||||
if ((option_capitals == 2) && (sayas_mode == 0) && !iswupper2(cprev)) { | |||||
if ((option_capitals == 2) && (sayas_mode == 0) && !iswupper(cprev)) { | |||||
char text_buf[40]; | char text_buf[40]; | ||||
char text_buf2[30]; | char text_buf2[30]; | ||||
if (LookupSpecial(tr, "_cap", text_buf2) != NULL) { | if (LookupSpecial(tr, "_cap", text_buf2) != NULL) { | ||||
} | } | ||||
} | } | ||||
} | } | ||||
} else if (iswalpha2(c1)) | |||||
} else if (iswalpha(c1)) | |||||
tr->clause_lower_count++; | tr->clause_lower_count++; | ||||
if (option_phoneme_input) { | if (option_phoneme_input) { | ||||
// i.e. is dot followed by an upper-case letter? | // i.e. is dot followed by an upper-case letter? | ||||
if (!iswspace(c1)) { | if (!iswspace(c1)) { | ||||
if (!IsAlpha(c1) || !iswlower2(c1)) { | |||||
if (!IsAlpha(c1) || !iswlower(c1)) { | |||||
UngetC(c2); | UngetC(c2); | ||||
ungot_char2 = c1; | ungot_char2 = c1; | ||||
buf[end_clause_index] = ' '; // delete the end-clause punctuation | buf[end_clause_index] = ' '; // delete the end-clause punctuation | ||||
punct_data |= CLAUSE_DOT; | punct_data |= CLAUSE_DOT; | ||||
if (nl_count == 0) { | if (nl_count == 0) { | ||||
if ((c1 == ',') && (cprev == '.') && (tr->translator_name == L('h', 'u')) && iswdigit(cprev2) && (iswdigit(c_next) || (iswlower2(c_next)))) { | |||||
if ((c1 == ',') && (cprev == '.') && (tr->translator_name == L('h', 'u')) && iswdigit(cprev2) && (iswdigit(c_next) || (iswlower(c_next)))) { | |||||
// lang=hu, fix for ordinal numbers, eg: "december 2., szerda", ignore ',' after ordinal number | // lang=hu, fix for ordinal numbers, eg: "december 2., szerda", ignore ',' after ordinal number | ||||
c1 = CHAR_COMMA_BREAK; | c1 = CHAR_COMMA_BREAK; | ||||
is_end_clause = 0; | is_end_clause = 0; | ||||
// dot after a number indicates an ordinal number | // dot after a number indicates an ordinal number | ||||
if (!iswdigit(cprev)) | if (!iswdigit(cprev)) | ||||
is_end_clause = 0; // Roman number followed by dot | is_end_clause = 0; // Roman number followed by dot | ||||
else if (iswlower2(c_next) || (c_next == '-')) // hyphen is needed for lang-hu (eg. 2.-kal) | |||||
else if (iswlower(c_next) || (c_next == '-')) // hyphen is needed for lang-hu (eg. 2.-kal) | |||||
is_end_clause = 0; // only if followed by lower-case, (or if there is a XML tag) | is_end_clause = 0; // only if followed by lower-case, (or if there is a XML tag) | ||||
} else if (c_next == '\'') | } else if (c_next == '\'') | ||||
is_end_clause = 0; // eg. u.s.a.'s | is_end_clause = 0; // eg. u.s.a.'s | ||||
if (iswlower2(c_next)) { | |||||
if (iswlower(c_next)) { | |||||
// next word has no capital letter, this dot is probably from an abbreviation | // next word has no capital letter, this dot is probably from an abbreviation | ||||
is_end_clause = 0; | is_end_clause = 0; | ||||
} | } |
0 | 0 | ||||
}; | }; | ||||
if (iswalpha2(c)) | |||||
if (iswalpha(c)) | |||||
return 1; | return 1; | ||||
if (c < 0x300) | if (c < 0x300) | ||||
int c; | int c; | ||||
while ((*word != 0) && !isspace2(*word)) { | while ((*word != 0) && !isspace2(*word)) { | ||||
word += utf8_in(&c, word); | word += utf8_in(&c, word); | ||||
if (!iswupper2(c)) | |||||
if (!iswupper(c)) | |||||
return 0; | return 0; | ||||
} | } | ||||
return 1; | return 1; | ||||
} | } | ||||
} | } | ||||
if ((wflags & FLAG_ALL_UPPER) && (word_length > 1) && iswalpha2(first_char)) { | |||||
if ((wflags & FLAG_ALL_UPPER) && (word_length > 1) && iswalpha(first_char)) { | |||||
if ((option_tone_flags & OPTION_EMPHASIZE_ALLCAPS) && !(dictionary_flags[0] & FLAG_ABBREV)) { | if ((option_tone_flags & OPTION_EMPHASIZE_ALLCAPS) && !(dictionary_flags[0] & FLAG_ABBREV)) { | ||||
// emphasize words which are in capitals | // emphasize words which are in capitals | ||||
emphasize_allcaps = FLAG_EMPHASIZED; | emphasize_allcaps = FLAG_EMPHASIZED; | ||||
tr->expect_past--; | tr->expect_past--; | ||||
} | } | ||||
if ((word_length == 1) && (tr->translator_name == L('e', 'n')) && iswalpha2(first_char) && (first_char != 'i')) { | |||||
if ((word_length == 1) && (tr->translator_name == L('e', 'n')) && iswalpha(first_char) && (first_char != 'i')) { | |||||
// English Specific !!!! | // English Specific !!!! | ||||
// any single letter before a dot is an abbreviation, except 'I' | // any single letter before a dot is an abbreviation, except 'I' | ||||
dictionary_flags[0] |= FLAG_ALLOW_DOT; | dictionary_flags[0] |= FLAG_ALLOW_DOT; | ||||
while (*p2 != ' ') p2++; | while (*p2 != ' ') p2++; | ||||
utf8_in(&c_word2, p2+1); // first character of the next word; | utf8_in(&c_word2, p2+1); // first character of the next word; | ||||
if (!iswalpha2(c_word2)) | |||||
if (!iswalpha(c_word2)) | |||||
ok = 0; | ok = 0; | ||||
if (ok != 0) { | if (ok != 0) { | ||||
// there is a list of character codes to be substituted with alternative codes | // there is a list of character codes to be substituted with alternative codes | ||||
if (iswupper2(c_lower = c)) { | |||||
if (iswupper(c_lower = c)) { | |||||
c_lower = towlower2(c); | c_lower = towlower2(c); | ||||
upper_case = 1; | upper_case = 1; | ||||
} | } | ||||
// there is a second character to be inserted | // there is a second character to be inserted | ||||
// don't convert the case of the second character unless the next letter is also upper case | // don't convert the case of the second character unless the next letter is also upper case | ||||
c2 = new_c >> 16; | c2 = new_c >> 16; | ||||
if (upper_case && iswupper2(next_in)) | |||||
c2 = towupper2(c2); | |||||
if (upper_case && iswupper(next_in)) | |||||
c2 = towupper(c2); | |||||
*insert = c2; | *insert = c2; | ||||
new_c &= 0xffff; | new_c &= 0xffff; | ||||
} | } | ||||
if (upper_case) | if (upper_case) | ||||
new_c = towupper2(new_c); | |||||
new_c = towupper(new_c); | |||||
*wordflags |= FLAG_CHAR_REPLACED; | *wordflags |= FLAG_CHAR_REPLACED; | ||||
return new_c; | return new_c; | ||||
case L('n', 'l'): | case L('n', 'l'): | ||||
// look for 'n and replace by a special character (unicode: schwa) | // look for 'n and replace by a special character (unicode: schwa) | ||||
if (!iswalpha2(prev_in)) { | |||||
if (!iswalpha(prev_in)) { | |||||
utf8_in(&next2, &ptr[1]); | utf8_in(&next2, &ptr[1]); | ||||
if ((c == '\'') && IsSpace(next2)) { | if ((c == '\'') && IsSpace(next2)) { | ||||
if (!IsBracket(prev_out)) // ?? perhaps only set FLAG_NOSPACE for . - / (hyphenated words, URLs, etc) | if (!IsBracket(prev_out)) // ?? perhaps only set FLAG_NOSPACE for . - / (hyphenated words, URLs, etc) | ||||
next_word_flags |= FLAG_NOSPACE; | next_word_flags |= FLAG_NOSPACE; | ||||
} else { | } else { | ||||
if (iswupper2(c)) | |||||
if (iswupper(c)) | |||||
word_flags |= FLAG_FIRST_UPPER; | word_flags |= FLAG_FIRST_UPPER; | ||||
if ((prev_out == ' ') && iswdigit(sbuf[ix-2]) && !iswdigit(prev_in)) { | if ((prev_out == ' ') && iswdigit(sbuf[ix-2]) && !iswdigit(prev_in)) { | ||||
} | } | ||||
} | } | ||||
if (iswupper2(c)) { | |||||
if (iswupper(c)) { | |||||
c = towlower2(c); | c = towlower2(c); | ||||
if ((j = tr->langopts.param[LOPT_CAPS_IN_WORD]) > 0) { | if ((j = tr->langopts.param[LOPT_CAPS_IN_WORD]) > 0) { | ||||
syllable_marked = 1; | syllable_marked = 1; | ||||
} | } | ||||
} else { | } else { | ||||
if (iswlower2(prev_in)) { | |||||
if (iswlower(prev_in)) { | |||||
// lower case followed by upper case in a word | // lower case followed by upper case in a word | ||||
if (UpperCaseInWord(tr, &sbuf[ix], c) == 1) { | if (UpperCaseInWord(tr, &sbuf[ix], c) == 1) { | ||||
// convert to lower case and continue | // convert to lower case and continue | ||||
space_inserted = 1; | space_inserted = 1; | ||||
prev_in_save = c; | prev_in_save = c; | ||||
} | } | ||||
} else if ((c != ' ') && iswupper2(prev_in) && iswlower2(next_in)) { | |||||
} else if ((c != ' ') && iswupper(prev_in) && iswlower(next_in)) { | |||||
int next2_in; | int next2_in; | ||||
utf8_in(&next2_in, &source[source_index + next_in_nbytes]); | utf8_in(&next2_in, &source[source_index + next_in_nbytes]); | ||||
int IsAlpha(unsigned int c); | int IsAlpha(unsigned int c); | ||||
int IsVowel(Translator *tr, int c); | int IsVowel(Translator *tr, int c); | ||||
int IsSuperscript(int letter); | int IsSuperscript(int letter); | ||||
int iswalpha2(int c); | |||||
int isspace2(unsigned int c); | int isspace2(unsigned int c); | ||||
int iswlower2(int c); | |||||
int iswupper2(int c); | |||||
int towlower2(unsigned int c); | |||||
int towupper2(unsigned int c); | |||||
int towlower2(unsigned int c); // Supports Turkish I | |||||
const char *GetTranslatedPhonemeString(int phoneme_mode); | const char *GetTranslatedPhonemeString(int phoneme_mode); | ||||
const char *WordToString2(unsigned int word); | const char *WordToString2(unsigned int word); | ||||
ALPHABET *AlphabetFromChar(int c); | ALPHABET *AlphabetFromChar(int c); |