	-D _POSIX_C_SOURCE=200112L ${AM_CFLAGS}
tests_tokenizer_test_SOURCES = \
	$(UCD_TOOLS_SOURCES) \
	src/libespeak-ng/encoding.c \
	src/libespeak-ng/mnemonics.c \
	src/libespeak-ng/tokenizer.c \
	tests/tokenizer.c
#define LEADING_2_BITS 0xC0 // 0b11000000
#define UTF8_TAIL_BITS 0x80 // 0b10000000
int utf8_out(unsigned int c, char *buf)
{
	// Encode the Unicode code point `c` into `buf` as UTF-8.
	// Returns the number of bytes written (1..4).  Code points at or
	// above 0x110000 are outside the Unicode range and are replaced
	// by a single space character.
	static const unsigned char lead_marker[4] = { 0x00, 0xc0, 0xe0, 0xf0 };

	if (c < 0x80) {
		// 7-bit ASCII: one byte, emitted verbatim.
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}

	int tail; // number of 10xxxxxx continuation bytes
	if (c < 0x0800)
		tail = 1;
	else if (c < 0x10000)
		tail = 2;
	else
		tail = 3;

	int bits = 6 * tail; // bit position of the next 6-bit group
	buf[0] = lead_marker[tail] | (c >> bits);
	for (int i = 1; i <= tail; i++) {
		bits -= 6;
		buf[i] = 0x80 | ((c >> bits) & 0x3f);
	}
	return tail + 1;
}
// http://www.iana.org/assignments/character-sets/character-sets.xhtml | // http://www.iana.org/assignments/character-sets/character-sets.xhtml | ||||
MNEM_TAB mnem_encoding[] = { | MNEM_TAB mnem_encoding[] = { | ||||
{ "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII }, | { "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII }, |
// Tokenizer state: the attached input decoder, the text of the most
// recently read token, and the current state handler.
struct espeak_ng_TOKENIZER_
{
	espeak_ng_TEXT_DECODER *decoder; // input source; NULL when unbound — not owned here
	char token[256]; // UTF-8 text of the current token, always NUL-terminated
	// Current state handler; reads the next token and returns its type.
	espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
};
static espeak_ng_TOKEN_TYPE | |||||
tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer) | |||||
{ | |||||
*tokenizer->token = '\0'; | |||||
return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||||
} | |||||
static espeak_ng_TOKEN_TYPE | |||||
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | |||||
{ | |||||
if (text_decoder_eof(tokenizer->decoder)) { | |||||
tokenizer->read = tokenizer_state_end_of_buffer; | |||||
return tokenizer_state_end_of_buffer(tokenizer); | |||||
} | |||||
uint32_t c; | |||||
char *current = tokenizer->token; | |||||
switch (c = text_decoder_getc(tokenizer->decoder)) | |||||
{ | |||||
case '\n': | |||||
current += utf8_out(c, current); | |||||
*current = '\0'; | |||||
return ESPEAKNG_TOKEN_NEWLINE; | |||||
case '\0': | |||||
tokenizer->read = tokenizer_state_end_of_buffer; | |||||
return tokenizer_state_end_of_buffer(tokenizer); | |||||
default: | |||||
current += utf8_out(c, current); | |||||
*current = '\0'; | |||||
return ESPEAKNG_TOKEN_UNKNOWN; | |||||
} | |||||
return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||||
} | |||||
espeak_ng_TOKENIZER * | espeak_ng_TOKENIZER * | ||||
create_tokenizer(void) | create_tokenizer(void) | ||||
{ | { | ||||
if (!tokenizer) return NULL; | if (!tokenizer) return NULL; | ||||
tokenizer->decoder = NULL; | tokenizer->decoder = NULL; | ||||
tokenizer->read = tokenizer_state_end_of_buffer; | |||||
*tokenizer->token = '\0'; | |||||
return tokenizer; | return tokenizer; | ||||
} | } | ||||
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer, | tokenizer_reset(espeak_ng_TOKENIZER *tokenizer, | ||||
espeak_ng_TEXT_DECODER *decoder) | espeak_ng_TEXT_DECODER *decoder) | ||||
{ | { | ||||
if (!tokenizer || !decoder) return 0; | |||||
if (!tokenizer) return 0; | |||||
tokenizer->decoder = decoder; | tokenizer->decoder = decoder; | ||||
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer; | |||||
return 1; | return 1; | ||||
} | } | ||||
espeak_ng_TOKEN_TYPE | espeak_ng_TOKEN_TYPE | ||||
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer) | tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer) | ||||
{ | { | ||||
return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||||
return tokenizer->read(tokenizer); | |||||
} | } | ||||
const char * | const char * | ||||
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer) | tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer) | ||||
{ | { | ||||
return ""; | |||||
return tokenizer->token; | |||||
} | } |
{ | { | ||||
ESPEAKNG_TOKEN_END_OF_BUFFER, | ESPEAKNG_TOKEN_END_OF_BUFFER, | ||||
ESPEAKNG_TOKEN_UNKNOWN, | ESPEAKNG_TOKEN_UNKNOWN, | ||||
ESPEAKNG_TOKEN_NEWLINE, | |||||
} espeak_ng_TOKEN_TYPE; | } espeak_ng_TOKEN_TYPE; | ||||
espeak_ng_TOKEN_TYPE | espeak_ng_TOKEN_TYPE |
return lookupwchar(brackets, c); | return lookupwchar(brackets, c); | ||||
} | } | ||||
int utf8_out(unsigned int c, char *buf)
{
	// write a unicode character into a buffer as utf8
	// returns the number of bytes written
	int n_bytes; // number of continuation (10xxxxxx) bytes, 1..3
	int j;
	int shift; // bit position of the next 6-bit group to emit
	// Leading-byte markers indexed by continuation-byte count:
	// 1 tail byte -> 110xxxxx, 2 -> 1110xxxx, 3 -> 11110xxx.
	static char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 };
	if (c < 0x80) {
		// 7-bit ASCII: a single byte, emitted verbatim.
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}
	if (c < 0x0800)
		n_bytes = 1;
	else if (c < 0x10000)
		n_bytes = 2;
	else
		n_bytes = 3;
	shift = 6*n_bytes;
	// Leading byte: length marker plus the highest-order payload bits.
	buf[0] = code[n_bytes] | (c >> shift);
	for (j = 0; j < n_bytes; j++) {
		shift -= 6;
		// Continuation byte: 0x80 plus the next 6 payload bits.
		buf[j+1] = 0x80 + ((c >> shift) & 0x3f);
	}
	return n_bytes+1;
}
int utf8_nbytes(const char *buf) | int utf8_nbytes(const char *buf) | ||||
{ | { | ||||
// Returns the number of bytes for the first UTF-8 character in buf | // Returns the number of bytes for the first UTF-8 character in buf |
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <espeak-ng/espeak_ng.h>
assert(tokenizer_get_token_text(tokenizer) != NULL); | assert(tokenizer_get_token_text(tokenizer) != NULL); | ||||
assert(*tokenizer_get_token_text(tokenizer) == '\0'); | assert(*tokenizer_get_token_text(tokenizer) == '\0'); | ||||
	assert(tokenizer_reset(tokenizer, NULL) == 1);
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | ||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | assert(tokenizer_get_token_text(tokenizer) != NULL); | ||||
destroy_tokenizer(tokenizer); | destroy_tokenizer(tokenizer); | ||||
} | } | ||||
// Verify that each LF in "\n\n" is returned as its own NEWLINE token
// with "\n" as its text, followed by END_OF_BUFFER with empty text.
void
test_linux_newline_tokens()
{
	printf("testing linux newline tokens\n");
	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
	assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder) == 1);
	// First newline.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
	// Second newline.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
	// Exhausted input: END_OF_BUFFER with empty token text.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');
	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}
int | int | ||||
main(int argc, char **argv) | main(int argc, char **argv) | ||||
{ | { | ||||
test_fullwidth(); | test_fullwidth(); | ||||
test_unbound_tokenizer(); | test_unbound_tokenizer(); | ||||
test_linux_newline_tokens(); | |||||
printf("done\n"); | printf("done\n"); | ||||