@@ -227,6 +227,8 @@ tests_tokenizer_test_CFLAGS = \ | |||
-D _POSIX_C_SOURCE=200112L ${AM_CFLAGS} | |||
tests_tokenizer_test_SOURCES = \ | |||
$(UCD_TOOLS_SOURCES) \ | |||
src/libespeak-ng/encoding.c \ | |||
src/libespeak-ng/mnemonics.c \ | |||
src/libespeak-ng/tokenizer.c \ | |||
tests/tokenizer.c | |||
@@ -30,6 +30,40 @@ | |||
#define LEADING_2_BITS 0xC0 // 0b11000000 | |||
#define UTF8_TAIL_BITS 0x80 // 0b10000000 | |||
int utf8_out(unsigned int c, char *buf)
{
	// Encode the Unicode code point c into buf as UTF-8.
	// Returns the number of bytes written (1..4).
	// Code points above U+10FFFF are replaced by a single space.

	if (c < 0x80) { // 7-bit ASCII: stored verbatim as one byte
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) { // outside the Unicode range: substitute a space
		buf[0] = ' ';
		return 1;
	}

	// How many continuation (10xxxxxx) bytes follow the lead byte.
	int tail = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;

	// Lead byte = length-marker bits followed by the top payload bits.
	static const unsigned char lead_marker[4] = { 0x00, 0xc0, 0xe0, 0xf0 };
	buf[0] = lead_marker[tail] | (c >> (6 * tail));

	// Emit continuation bytes, most significant 6-bit group first.
	for (int i = 1; i <= tail; i++)
		buf[i] = 0x80 | ((c >> (6 * (tail - i))) & 0x3f);

	return tail + 1;
}
// http://www.iana.org/assignments/character-sets/character-sets.xhtml | |||
MNEM_TAB mnem_encoding[] = { | |||
{ "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII }, |
@@ -86,8 +86,47 @@ int clause_type_from_codepoint(uint32_t c) | |||
// Tokenizer state: a bound text decoder, the text of the most recently
// read token, and the current state handler invoked by
// tokenizer_read_next_token.
struct espeak_ng_TOKENIZER_
{
	espeak_ng_TEXT_DECODER *decoder; // source of code points; NULL until bound via tokenizer_reset
	char token[256]; // UTF-8 text of the last token read, NUL-terminated
	espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer); // current state handler (state machine via function pointer)
};
static espeak_ng_TOKEN_TYPE | |||
tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer) | |||
{ | |||
*tokenizer->token = '\0'; | |||
return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||
} | |||
static espeak_ng_TOKEN_TYPE | |||
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | |||
{ | |||
if (text_decoder_eof(tokenizer->decoder)) { | |||
tokenizer->read = tokenizer_state_end_of_buffer; | |||
return tokenizer_state_end_of_buffer(tokenizer); | |||
} | |||
uint32_t c; | |||
char *current = tokenizer->token; | |||
switch (c = text_decoder_getc(tokenizer->decoder)) | |||
{ | |||
case '\n': | |||
current += utf8_out(c, current); | |||
*current = '\0'; | |||
return ESPEAKNG_TOKEN_NEWLINE; | |||
case '\0': | |||
tokenizer->read = tokenizer_state_end_of_buffer; | |||
return tokenizer_state_end_of_buffer(tokenizer); | |||
default: | |||
current += utf8_out(c, current); | |||
*current = '\0'; | |||
return ESPEAKNG_TOKEN_UNKNOWN; | |||
} | |||
return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||
} | |||
espeak_ng_TOKENIZER * | |||
create_tokenizer(void) | |||
{ | |||
@@ -95,6 +134,9 @@ create_tokenizer(void) | |||
if (!tokenizer) return NULL; | |||
tokenizer->decoder = NULL; | |||
tokenizer->read = tokenizer_state_end_of_buffer; | |||
*tokenizer->token = '\0'; | |||
return tokenizer; | |||
} | |||
@@ -108,20 +150,21 @@ int | |||
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer, | |||
espeak_ng_TEXT_DECODER *decoder) | |||
{ | |||
if (!tokenizer || !decoder) return 0; | |||
if (!tokenizer) return 0; | |||
tokenizer->decoder = decoder; | |||
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer; | |||
return 1; | |||
} | |||
espeak_ng_TOKEN_TYPE | |||
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer) | |||
{ | |||
return ESPEAKNG_TOKEN_END_OF_BUFFER; | |||
return tokenizer->read(tokenizer); | |||
} | |||
const char * | |||
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer) | |||
{ | |||
return ""; | |||
return tokenizer->token; | |||
} |
@@ -38,6 +38,7 @@ typedef enum | |||
{ | |||
ESPEAKNG_TOKEN_END_OF_BUFFER, | |||
ESPEAKNG_TOKEN_UNKNOWN, | |||
ESPEAKNG_TOKEN_NEWLINE, | |||
} espeak_ng_TOKEN_TYPE; | |||
espeak_ng_TOKEN_TYPE |
@@ -305,40 +305,6 @@ int IsBracket(int c) | |||
return lookupwchar(brackets, c); | |||
} | |||
// NOTE(review): this definition duplicates the utf8_out copy added to
// src/libespeak-ng/encoding.c by this patch; the diff removes this one.
int utf8_out(unsigned int c, char *buf)
{
	// write a unicode character into a buffer as utf8
	// returns the number of bytes written
	int n_bytes; // number of continuation (0x80-prefixed) bytes after the lead byte
	int j;
	int shift;
	// lead-byte markers indexed by continuation count: 0, 110xxxxx, 1110xxxx, 11110xxx
	static char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 };
	if (c < 0x80) {
		// 7-bit ASCII: a single byte, stored verbatim
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}
	if (c < 0x0800)
		n_bytes = 1;
	else if (c < 0x10000)
		n_bytes = 2;
	else
		n_bytes = 3;
	// lead byte carries the marker bits plus the top payload bits
	shift = 6*n_bytes;
	buf[0] = code[n_bytes] | (c >> shift);
	// continuation bytes, most significant 6-bit group first
	for (j = 0; j < n_bytes; j++) {
		shift -= 6;
		buf[j+1] = 0x80 + ((c >> shift) & 0x3f);
	}
	return n_bytes+1;
}
int utf8_nbytes(const char *buf) | |||
{ | |||
// Returns the number of bytes for the first UTF-8 character in buf |
@@ -21,6 +21,7 @@ | |||
#include <assert.h> | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
#include <stdio.h> | |||
#include <espeak-ng/espeak_ng.h> | |||
@@ -169,7 +170,7 @@ test_unbound_tokenizer() | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||
assert(tokenizer_reset(tokenizer, NULL) == 0); | |||
assert(tokenizer_reset(tokenizer, NULL) == 1); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
@@ -178,6 +179,33 @@ test_unbound_tokenizer() | |||
destroy_tokenizer(tokenizer); | |||
} | |||
void | |||
test_linux_newline_tokens() | |||
{ | |||
printf("testing linux newline tokens\n"); | |||
espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); | |||
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||
assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK); | |||
assert(tokenizer_reset(tokenizer, decoder) == 1); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||
destroy_text_decoder(decoder); | |||
destroy_tokenizer(tokenizer); | |||
} | |||
int | |||
main(int argc, char **argv) | |||
{ | |||
@@ -194,6 +222,7 @@ main(int argc, char **argv) | |||
test_fullwidth(); | |||
test_unbound_tokenizer(); | |||
test_linux_newline_tokens(); | |||
printf("done\n"); | |||