@@ -602,6 +602,19 @@ string_decoder_getc_wchar(espeak_ng_TEXT_DECODER *decoder) | |||
return c; | |||
} | |||
static uint32_t | |||
string_decoder_getc_auto(espeak_ng_TEXT_DECODER *decoder) | |||
{ | |||
const uint8_t *ptr = decoder->current; | |||
uint32_t c = string_decoder_getc_utf_8(decoder); | |||
if (c == 0xFFFD) { | |||
decoder->get = string_decoder_getc_codepage; | |||
decoder->current = ptr; | |||
c = decoder->get(decoder); | |||
} | |||
return c; | |||
} | |||
typedef struct | |||
{ | |||
uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder); | |||
@@ -672,6 +685,26 @@ text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder, | |||
return ENS_OK; | |||
} | |||
espeak_ng_STATUS | |||
text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder, | |||
const char *string, | |||
int length, | |||
espeak_ng_ENCODING encoding) | |||
{ | |||
if (encoding > ESPEAKNG_ENCODING_ISO_10646_UCS_2) | |||
return ENS_UNKNOWN_TEXT_ENCODING; | |||
const encoding_t *enc = string_decoders + encoding; | |||
if (enc->get == NULL) | |||
return ENS_UNKNOWN_TEXT_ENCODING; | |||
decoder->get = string_decoder_getc_auto; | |||
decoder->codepage = enc->codepage; | |||
decoder->current = (const uint8_t *)string; | |||
decoder->end = (const uint8_t *)(string + length); | |||
return ENS_OK; | |||
} | |||
espeak_ng_STATUS | |||
text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder, | |||
const wchar_t *string, |
@@ -36,6 +36,12 @@ text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder, | |||
int length, | |||
espeak_ng_ENCODING encoding); | |||
/* Prepare `decoder` to read `length` bytes of `string` in "auto" mode:
 * characters are read as UTF-8 until one decodes to 0xFFFD, after which the
 * decoder switches to the codepage associated with `encoding`.
 * Returns ENS_OK on success, or ENS_UNKNOWN_TEXT_ENCODING when `encoding`
 * is out of range or has no decoder registered. */
espeak_ng_STATUS | |||
text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder, | |||
const char *string, | |||
int length, | |||
espeak_ng_ENCODING encoding); | |||
espeak_ng_STATUS | |||
text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder, | |||
const wchar_t *string, |
@@ -1916,16 +1916,21 @@ int UpperCaseInWord(Translator *tr, char *word, int c) | |||
return 0; | |||
} | |||
static espeak_ng_STATUS init_wstring_decoder(const wchar_t *text) | |||
static inline espeak_ng_STATUS init_wstring_decoder(const wchar_t *text) | |||
{ | |||
return text_decoder_decode_wstring(p_decoder, text, wcslen(text) + 1); | |||
} | |||
// Point the shared decoder (p_decoder) at a NUL-terminated byte string that
// is to be interpreted in `encoding`.
// The count passed on is strlen(text) + 1, so the terminating '\0' is part
// of the range handed to the decoder.
// Returns whatever text_decoder_decode_string() reports.
static inline espeak_ng_STATUS init_string_decoder(const char *text, espeak_ng_ENCODING encoding)
{
	return text_decoder_decode_string(p_decoder, text, strlen(text) + 1, encoding);
}
// Point the shared decoder (p_decoder) at a NUL-terminated byte string in
// "auto" mode, with `encoding` naming the codepage handed to
// text_decoder_decode_string_auto().
// Returns whatever text_decoder_decode_string_auto() reports.
static inline espeak_ng_STATUS init_string_decoder_auto(const char *text, espeak_ng_ENCODING encoding)
{
	// +1 keeps the terminating '\0' inside the range given to the decoder.
	size_t byte_count = strlen(text) + 1;
	return text_decoder_decode_string_auto(p_decoder, text, byte_count, encoding);
}
const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out, char **voice_change) | |||
{ | |||
int ix; | |||
@@ -1985,7 +1990,9 @@ const void *TranslateClause(Translator *tr, const void *vp_input, int *tone_out, | |||
case espeakCHARS_WCHAR: | |||
init_wstring_decoder((const wchar_t *)vp_input); | |||
break; | |||
case espeakCHARS_AUTO: // TODO: Implement UTF-8 => 8BIT fallback on 0xFFFD UTF-8 characters. | |||
case espeakCHARS_AUTO: | |||
init_string_decoder_auto((const char *)vp_input, tr->encoding); | |||
break; | |||
case espeakCHARS_UTF8: | |||
init_string_decoder((const char *)vp_input, ESPEAKNG_ENCODING_UTF_8); | |||
break; |
@@ -749,6 +749,40 @@ test_wchar_decoder() | |||
destroy_text_decoder(decoder); | |||
} | |||
void | |||
test_auto_decoder() | |||
{ | |||
printf("testing auto decoder (UTF-8 + codepage-based fallback)\n"); | |||
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||
// UTF-8 | |||
assert(text_decoder_decode_string_auto(decoder, "aG\xC2\xA0 ", 5, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == 'a'); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == 'G'); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == 0xA0); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == ' '); | |||
assert(text_decoder_eof(decoder) == 1); | |||
// ISO-8859-1 | |||
assert(text_decoder_decode_string_auto(decoder, "aG\240f", 4, ESPEAKNG_ENCODING_ISO_8859_1) == ENS_OK); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == 'a'); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == 'G'); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == 0xA0); | |||
assert(text_decoder_eof(decoder) == 0); | |||
assert(text_decoder_getc(decoder) == 'f'); | |||
assert(text_decoder_eof(decoder) == 1); | |||
destroy_text_decoder(decoder); | |||
} | |||
int | |||
main(int argc, char **argv) | |||
{ | |||
@@ -780,6 +814,7 @@ main(int argc, char **argv) | |||
test_iso_10646_ucs_2_encoding(); | |||
test_wchar_decoder(); | |||
test_auto_decoder(); | |||
printf("done\n"); | |||