| @@ -227,6 +227,8 @@ tests_tokenizer_test_CFLAGS = \ | |||
| -D _POSIX_C_SOURCE=200112L ${AM_CFLAGS} | |||
| tests_tokenizer_test_SOURCES = \ | |||
| $(UCD_TOOLS_SOURCES) \ | |||
| src/libespeak-ng/encoding.c \ | |||
| src/libespeak-ng/mnemonics.c \ | |||
| src/libespeak-ng/tokenizer.c \ | |||
| tests/tokenizer.c | |||
| @@ -30,6 +30,40 @@ | |||
| #define LEADING_2_BITS 0xC0 // 0b11000000 | |||
| #define UTF8_TAIL_BITS 0x80 // 0b10000000 | |||
// Encode the Unicode code point `c` into `buf` as UTF-8.
// Returns the number of bytes written (1..4). Code points at or above
// 0x110000 are out of range and are replaced by a single space.
int utf8_out(unsigned int c, char *buf)
{
	// Leading-byte markers, indexed by the number of continuation bytes.
	static unsigned char lead[4] = { 0x00, 0xc0, 0xe0, 0xf0 };
	int tail; // number of 10xxxxxx continuation bytes
	int bits; // shift needed to reach the bits carried by the lead byte

	if (c < 0x80) { // ASCII encodes as itself
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) { // beyond the Unicode code space
		buf[0] = ' ';
		return 1;
	}

	tail = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;
	bits = 6 * tail;
	buf[0] = lead[tail] | (c >> bits);
	for (int i = 1; i <= tail; i++) {
		bits -= 6;
		buf[i] = 0x80 + ((c >> bits) & 0x3f);
	}
	return tail + 1;
}
| // http://www.iana.org/assignments/character-sets/character-sets.xhtml | |||
| MNEM_TAB mnem_encoding[] = { | |||
| { "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII }, | |||
| @@ -86,8 +86,47 @@ int clause_type_from_codepoint(uint32_t c) | |||
// Internal state of a tokenizer instance.
struct espeak_ng_TOKENIZER_
{
	espeak_ng_TEXT_DECODER *decoder; // source of code points; NULL when unbound
	char token[256];                 // UTF-8 text of the most recently read token
	// Current state handler; reads the next token and may transition state
	// by reassigning this pointer.
	espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
};
| static espeak_ng_TOKEN_TYPE | |||
| tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer) | |||
| { | |||
| *tokenizer->token = '\0'; | |||
| return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||
| } | |||
| static espeak_ng_TOKEN_TYPE | |||
| tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | |||
| { | |||
| if (text_decoder_eof(tokenizer->decoder)) { | |||
| tokenizer->read = tokenizer_state_end_of_buffer; | |||
| return tokenizer_state_end_of_buffer(tokenizer); | |||
| } | |||
| uint32_t c; | |||
| char *current = tokenizer->token; | |||
| switch (c = text_decoder_getc(tokenizer->decoder)) | |||
| { | |||
| case '\n': | |||
| current += utf8_out(c, current); | |||
| *current = '\0'; | |||
| return ESPEAKNG_TOKEN_NEWLINE; | |||
| case '\0': | |||
| tokenizer->read = tokenizer_state_end_of_buffer; | |||
| return tokenizer_state_end_of_buffer(tokenizer); | |||
| default: | |||
| current += utf8_out(c, current); | |||
| *current = '\0'; | |||
| return ESPEAKNG_TOKEN_UNKNOWN; | |||
| } | |||
| return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||
| } | |||
| espeak_ng_TOKENIZER * | |||
| create_tokenizer(void) | |||
| { | |||
| @@ -95,6 +134,9 @@ create_tokenizer(void) | |||
| if (!tokenizer) return NULL; | |||
| tokenizer->decoder = NULL; | |||
| tokenizer->read = tokenizer_state_end_of_buffer; | |||
| *tokenizer->token = '\0'; | |||
| return tokenizer; | |||
| } | |||
| @@ -108,20 +150,21 @@ int | |||
| tokenizer_reset(espeak_ng_TOKENIZER *tokenizer, | |||
| espeak_ng_TEXT_DECODER *decoder) | |||
| { | |||
| if (!tokenizer || !decoder) return 0; | |||
| if (!tokenizer) return 0; | |||
| tokenizer->decoder = decoder; | |||
| tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer; | |||
| return 1; | |||
| } | |||
| espeak_ng_TOKEN_TYPE | |||
| tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer) | |||
| { | |||
| return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||
| return tokenizer->read(tokenizer); | |||
| } | |||
| const char * | |||
| tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer) | |||
| { | |||
| return ""; | |||
| return tokenizer->token; | |||
| } | |||
| @@ -38,6 +38,7 @@ typedef enum | |||
| { | |||
| ESPEAKNG_TOKEN_END_OF_BUFFER, | |||
| ESPEAKNG_TOKEN_UNKNOWN, | |||
| ESPEAKNG_TOKEN_NEWLINE, | |||
| } espeak_ng_TOKEN_TYPE; | |||
| espeak_ng_TOKEN_TYPE | |||
| @@ -305,40 +305,6 @@ int IsBracket(int c) | |||
| return lookupwchar(brackets, c); | |||
| } | |||
int utf8_out(unsigned int c, char *buf)
{
	// write a unicode character into a buffer as utf8
	// returns the number of bytes written
	int n_bytes; // number of continuation (trailing) bytes
	int j;
	int shift;
	// leading-byte markers indexed by the number of continuation bytes
	static char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 };
	if (c < 0x80) {
		// ASCII encodes as itself in a single byte
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}
	if (c < 0x0800)
		n_bytes = 1;
	else if (c < 0x10000)
		n_bytes = 2;
	else
		n_bytes = 3;
	shift = 6*n_bytes;
	buf[0] = code[n_bytes] | (c >> shift); // lead byte carries the top bits
	for (j = 0; j < n_bytes; j++) {
		shift -= 6;
		buf[j+1] = 0x80 + ((c >> shift) & 0x3f); // 10xxxxxx continuation bytes
	}
	return n_bytes+1;
}
| int utf8_nbytes(const char *buf) | |||
| { | |||
| // Returns the number of bytes for the first UTF-8 character in buf | |||
| @@ -21,6 +21,7 @@ | |||
| #include <assert.h> | |||
| #include <stdint.h> | |||
| #include <stdlib.h> | |||
| #include <string.h> | |||
| #include <stdio.h> | |||
| #include <espeak-ng/espeak_ng.h> | |||
| @@ -169,7 +170,7 @@ test_unbound_tokenizer() | |||
| assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
| assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||
| assert(tokenizer_reset(tokenizer, NULL) == 0); | |||
| assert(tokenizer_reset(tokenizer, NULL) == 1); | |||
| assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||
| assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
| @@ -178,6 +179,33 @@ test_unbound_tokenizer() | |||
| destroy_tokenizer(tokenizer); | |||
| } | |||
// Verify that each "\n" in the input yields one ESPEAKNG_TOKEN_NEWLINE
// whose token text is "\n", followed by end-of-buffer with an empty token.
void
test_linux_newline_tokens()
{
	printf("testing linux newline tokens\n");
	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
	// "\n\n" decoded as US-ASCII; length -1 means NUL-terminated input
	assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder) == 1);
	// first newline token
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
	// second newline token
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
	// input exhausted: end-of-buffer with empty token text
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');
	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}
| int | |||
| main(int argc, char **argv) | |||
| { | |||
| @@ -194,6 +222,7 @@ main(int argc, char **argv) | |||
| test_fullwidth(); | |||
| test_unbound_tokenizer(); | |||
| test_linux_newline_tokens(); | |||
| printf("done\n"); | |||