Browse Source

tokenizer.c: Support linux newlines.

master
Reece H. Dunn 8 years ago
parent
commit
7602c9ac18

+ 2
- 0
Makefile.am View File

-D _POSIX_C_SOURCE=200112L ${AM_CFLAGS} -D _POSIX_C_SOURCE=200112L ${AM_CFLAGS}
tests_tokenizer_test_SOURCES = \ tests_tokenizer_test_SOURCES = \
$(UCD_TOOLS_SOURCES) \ $(UCD_TOOLS_SOURCES) \
src/libespeak-ng/encoding.c \
src/libespeak-ng/mnemonics.c \
src/libespeak-ng/tokenizer.c \ src/libespeak-ng/tokenizer.c \
tests/tokenizer.c tests/tokenizer.c



+ 34
- 0
src/libespeak-ng/encoding.c View File

#define LEADING_2_BITS 0xC0 // 0b11000000 #define LEADING_2_BITS 0xC0 // 0b11000000
#define UTF8_TAIL_BITS 0x80 // 0b10000000 #define UTF8_TAIL_BITS 0x80 // 0b10000000


int utf8_out(unsigned int c, char *buf)
{
	// Encode the Unicode codepoint c into buf as UTF-8.
	// Returns the number of bytes written (1..4). Codepoints at or
	// above 0x110000 are out of range and are replaced by a single
	// space character.

	static const unsigned char lead[4] = { 0x00, 0xc0, 0xe0, 0xf0 };

	if (c < 0x80) {
		// Plain ASCII: one byte, no continuation bytes.
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}

	int tail; // number of continuation (0b10xxxxxx) bytes
	if (c < 0x0800)
		tail = 1;
	else if (c < 0x10000)
		tail = 2;
	else
		tail = 3;

	// Leading byte carries the topmost bits; each continuation byte
	// takes the next 6 bits, most significant group first.
	int bits = 6 * tail;
	buf[0] = lead[tail] | (c >> bits);
	for (int i = 1; i <= tail; i++) {
		bits -= 6;
		buf[i] = 0x80 + ((c >> bits) & 0x3f);
	}
	return tail + 1;
}

// http://www.iana.org/assignments/character-sets/character-sets.xhtml // http://www.iana.org/assignments/character-sets/character-sets.xhtml
MNEM_TAB mnem_encoding[] = { MNEM_TAB mnem_encoding[] = {
{ "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII }, { "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII },

+ 46
- 3
src/libespeak-ng/tokenizer.c View File

struct espeak_ng_TOKENIZER_ struct espeak_ng_TOKENIZER_
{ {
espeak_ng_TEXT_DECODER *decoder; espeak_ng_TEXT_DECODER *decoder;
char token[256];

espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
}; };


static espeak_ng_TOKEN_TYPE
tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer)
{
	// Terminal state: clear the token text and report end of buffer.
	tokenizer->token[0] = '\0';
	return ESPEAKNG_TOKEN_END_OF_BUFFER;
}

static espeak_ng_TOKEN_TYPE
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
{
	// Default state: read one character from the decoder and classify it.
	if (text_decoder_eof(tokenizer->decoder)) {
		tokenizer->read = tokenizer_state_end_of_buffer;
		return tokenizer_state_end_of_buffer(tokenizer);
	}

	uint32_t c = text_decoder_getc(tokenizer->decoder);

	if (c == '\0') {
		// An embedded NUL ends the buffer; the token text is cleared
		// by the end-of-buffer handler.
		tokenizer->read = tokenizer_state_end_of_buffer;
		return tokenizer_state_end_of_buffer(tokenizer);
	}

	// Store the character as the token text (UTF-8 encoded).
	char *end = tokenizer->token;
	end += utf8_out(c, end);
	*end = '\0';

	return (c == '\n') ? ESPEAKNG_TOKEN_NEWLINE : ESPEAKNG_TOKEN_UNKNOWN;
}

espeak_ng_TOKENIZER * espeak_ng_TOKENIZER *
create_tokenizer(void) create_tokenizer(void)
{ {
if (!tokenizer) return NULL; if (!tokenizer) return NULL;


tokenizer->decoder = NULL; tokenizer->decoder = NULL;
tokenizer->read = tokenizer_state_end_of_buffer;

*tokenizer->token = '\0';
return tokenizer; return tokenizer;
} }


tokenizer_reset(espeak_ng_TOKENIZER *tokenizer, tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
espeak_ng_TEXT_DECODER *decoder) espeak_ng_TEXT_DECODER *decoder)
{ {
if (!tokenizer || !decoder) return 0;
if (!tokenizer) return 0;


tokenizer->decoder = decoder; tokenizer->decoder = decoder;
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer;
return 1; return 1;
} }


espeak_ng_TOKEN_TYPE espeak_ng_TOKEN_TYPE
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer) tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
{ {
return ESPEAKNG_TOKEN_END_OF_BUFFER;
return tokenizer->read(tokenizer);
} }


const char * const char *
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer) tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
{ {
return "";
return tokenizer->token;
} }

+ 1
- 0
src/libespeak-ng/tokenizer.h View File

{ {
ESPEAKNG_TOKEN_END_OF_BUFFER, ESPEAKNG_TOKEN_END_OF_BUFFER,
ESPEAKNG_TOKEN_UNKNOWN, ESPEAKNG_TOKEN_UNKNOWN,
ESPEAKNG_TOKEN_NEWLINE,
} espeak_ng_TOKEN_TYPE; } espeak_ng_TOKEN_TYPE;


espeak_ng_TOKEN_TYPE espeak_ng_TOKEN_TYPE

+ 0
- 34
src/libespeak-ng/translate.c View File

return lookupwchar(brackets, c); return lookupwchar(brackets, c);
} }


// NOTE(review): this is a byte-identical duplicate of utf8_out in
// src/libespeak-ng/encoding.c; the diff shown here removes it from
// translate.c so a single copy remains.
int utf8_out(unsigned int c, char *buf)
{
// write a unicode character into a buffer as utf8
// returns the number of bytes written (1..4); codepoints >= 0x110000
// are replaced by a single space

int n_bytes; // number of continuation bytes after the lead byte
int j;
int shift;
// lead-byte prefixes indexed by the number of continuation bytes
static char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 };

if (c < 0x80) {
// plain ASCII: single byte, no continuation
buf[0] = c;
return 1;
}
if (c >= 0x110000) {
buf[0] = ' '; // out of range character code
return 1;
}
if (c < 0x0800)
n_bytes = 1;
else if (c < 0x10000)
n_bytes = 2;
else
n_bytes = 3;

// lead byte carries the topmost bits; each continuation byte takes
// the next 6 bits, most significant group first
shift = 6*n_bytes;
buf[0] = code[n_bytes] | (c >> shift);
for (j = 0; j < n_bytes; j++) {
shift -= 6;
buf[j+1] = 0x80 + ((c >> shift) & 0x3f);
}
return n_bytes+1;
}

int utf8_nbytes(const char *buf) int utf8_nbytes(const char *buf)
{ {
// Returns the number of bytes for the first UTF-8 character in buf // Returns the number of bytes for the first UTF-8 character in buf

+ 30
- 1
tests/tokenizer.c View File

#include <assert.h> #include <assert.h>
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <stdio.h> #include <stdio.h>


#include <espeak-ng/espeak_ng.h> #include <espeak-ng/espeak_ng.h>
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0'); assert(*tokenizer_get_token_text(tokenizer) == '\0');


assert(tokenizer_reset(tokenizer, NULL) == 0);
assert(tokenizer_reset(tokenizer, NULL) == 1);


assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);
destroy_tokenizer(tokenizer); destroy_tokenizer(tokenizer);
} }


void
test_linux_newline_tokens()
{
	// Two consecutive LF characters must each produce a NEWLINE token
	// whose text is "\n", followed by END_OF_BUFFER with empty text.
	printf("testing linux newline tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder) == 1);

	for (int i = 0; i < 2; i++) {
		assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
		assert(tokenizer_get_token_text(tokenizer) != NULL);
		assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
	}

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
test_fullwidth(); test_fullwidth();


test_unbound_tokenizer(); test_unbound_tokenizer();
test_linux_newline_tokens();


printf("done\n"); printf("done\n");



Loading…
Cancel
Save