| @@ -602,6 +602,19 @@ string_decoder_getc_wchar(espeak_ng_TEXT_DECODER *decoder) | |||
| return c; | |||
| } | |||
| static uint32_t | |||
| string_decoder_getc_auto(espeak_ng_TEXT_DECODER *decoder) | |||
| { | |||
| const uint8_t *ptr = decoder->current; | |||
| uint32_t c = string_decoder_getc_utf_8(decoder); | |||
| if (c == 0xFFFD) { | |||
| decoder->get = string_decoder_getc_codepage; | |||
| decoder->current = ptr; | |||
| c = decoder->get(decoder); | |||
| } | |||
| return c; | |||
| } | |||
| typedef struct | |||
| { | |||
| uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder); | |||
| @@ -672,6 +685,26 @@ text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder, | |||
| return ENS_OK; | |||
| } | |||
| espeak_ng_STATUS | |||
| text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder, | |||
| const char *string, | |||
| int length, | |||
| espeak_ng_ENCODING encoding) | |||
| { | |||
| if (encoding > ESPEAKNG_ENCODING_ISO_10646_UCS_2) | |||
| return ENS_UNKNOWN_TEXT_ENCODING; | |||
| const encoding_t *enc = string_decoders + encoding; | |||
| if (enc->get == NULL) | |||
| return ENS_UNKNOWN_TEXT_ENCODING; | |||
| decoder->get = string_decoder_getc_auto; | |||
| decoder->codepage = enc->codepage; | |||
| decoder->current = (const uint8_t *)string; | |||
| decoder->end = (const uint8_t *)(string + length); | |||
| return ENS_OK; | |||
| } | |||
| espeak_ng_STATUS | |||
| text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder, | |||
| const wchar_t *string, | |||
| @@ -36,6 +36,12 @@ text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder, | |||
| int length, | |||
| espeak_ng_ENCODING encoding); | |||
| espeak_ng_STATUS | |||
| text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder, | |||
| const char *string, | |||
| int length, | |||
| espeak_ng_ENCODING encoding); | |||
| espeak_ng_STATUS | |||
| text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder, | |||
| const wchar_t *string, | |||
| @@ -1916,16 +1916,21 @@ int UpperCaseInWord(Translator *tr, char *word, int c) | |||
| return 0; | |||
| } | |||
| static espeak_ng_STATUS init_wstring_decoder(const wchar_t *text) | |||
| static inline espeak_ng_STATUS init_wstring_decoder(const wchar_t *text) | |||
| { | |||
| return text_decoder_decode_wstring(p_decoder, text, wcslen(text) + 1); | |||
| } | |||
| static espeak_ng_STATUS init_string_decoder(const char *text, espeak_ng_ENCODING encoding) | |||
| static inline espeak_ng_STATUS init_string_decoder(const char *text, espeak_ng_ENCODING encoding) | |||
| { | |||
| return text_decoder_decode_string(p_decoder, text, strlen(text) + 1, encoding); | |||
| } | |||
| static inline espeak_ng_STATUS init_string_decoder_auto(const char *text, espeak_ng_ENCODING encoding) | |||
| { | |||
| return text_decoder_decode_string_auto(p_decoder, text, strlen(text) + 1, encoding); | |||
| } | |||
| const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out, char **voice_change) | |||
| { | |||
| int ix; | |||
| @@ -1985,7 +1990,9 @@ const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out, | |||
| case espeakCHARS_WCHAR: | |||
| init_wstring_decoder((const wchar_t *)vp_input); | |||
| break; | |||
| case espeakCHARS_AUTO: // TODO: Implement UTF-8 => 8BIT fallback on 0xFFFD UTF-8 characters. | |||
| case espeakCHARS_AUTO: | |||
| init_string_decoder_auto((const char *)vp_input, tr->encoding); | |||
| break; | |||
| case espeakCHARS_UTF8: | |||
| init_string_decoder((const char *)vp_input, ESPEAKNG_ENCODING_UTF_8); | |||
| break; | |||
| @@ -749,6 +749,40 @@ test_wchar_decoder() | |||
| destroy_text_decoder(decoder); | |||
| } | |||
| void | |||
| test_auto_decoder() | |||
| { | |||
| printf("testing auto decoder (UTF-8 + codepage-based fallback)\n"); | |||
| espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||
| // UTF-8 | |||
| assert(text_decoder_decode_string_auto(decoder, "aG\xC2\xA0 ", 5, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == 'a'); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == 'G'); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == 0xA0); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == ' '); | |||
| assert(text_decoder_eof(decoder) == 1); | |||
| // ISO-8859-1 | |||
| assert(text_decoder_decode_string_auto(decoder, "aG\240f", 4, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == 'a'); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == 'G'); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == 0xA0); | |||
| assert(text_decoder_eof(decoder) == 0); | |||
| assert(text_decoder_getc(decoder) == 'f'); | |||
| assert(text_decoder_eof(decoder) == 1); | |||
| destroy_text_decoder(decoder); | |||
| } | |||
| int | |||
| main(int argc, char **argv) | |||
| { | |||
| @@ -780,6 +814,7 @@ main(int argc, char **argv) | |||
| test_iso_10646_ucs_2_encoding(); | |||
| test_wchar_decoder(); | |||
| test_auto_decoder(); | |||
| printf("done\n"); | |||