Browse Source

tokenizer.c: Support Linux newlines.

master
Reece H. Dunn 8 years ago
parent
commit
7602c9ac18

+ 2
- 0
Makefile.am View File

@@ -227,6 +227,8 @@ tests_tokenizer_test_CFLAGS = \
-D _POSIX_C_SOURCE=200112L ${AM_CFLAGS}
tests_tokenizer_test_SOURCES = \
$(UCD_TOOLS_SOURCES) \
src/libespeak-ng/encoding.c \
src/libespeak-ng/mnemonics.c \
src/libespeak-ng/tokenizer.c \
tests/tokenizer.c


+ 34
- 0
src/libespeak-ng/encoding.c View File

@@ -30,6 +30,40 @@
#define LEADING_2_BITS 0xC0 // 0b11000000
#define UTF8_TAIL_BITS 0x80 // 0b10000000

int utf8_out(unsigned int c, char *buf)
{
	// Encode the code point c as UTF-8 into buf and return the number of
	// bytes stored. No terminating NUL is appended; buf must have room
	// for up to 4 bytes.

	static const unsigned char lead_marker[4] = { 0x00, 0xc0, 0xe0, 0xf0 };
	int tail_count;
	int idx;

	if (c < 0x80) {
		// 7-bit values are stored unchanged as a single byte
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		// beyond the last Unicode code point: substitute a space
		buf[0] = ' ';
		return 1;
	}

	// number of continuation bytes that follow the lead byte
	tail_count = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;

	// lead byte: length marker plus the most significant payload bits
	buf[0] = lead_marker[tail_count] | (c >> (6 * tail_count));

	// continuation bytes: 10xxxxxx, six payload bits each, high bits first
	for (idx = 1; idx <= tail_count; idx++)
		buf[idx] = 0x80 + ((c >> (6 * (tail_count - idx))) & 0x3f);

	return tail_count + 1;
}

// http://www.iana.org/assignments/character-sets/character-sets.xhtml
MNEM_TAB mnem_encoding[] = {
{ "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII },

+ 46
- 3
src/libespeak-ng/tokenizer.c View File

@@ -86,8 +86,47 @@ int clause_type_from_codepoint(uint32_t c)
// State of a tokenizer: the decoder it reads from, the text of the most
// recently read token, and the handler used to read the next token.
struct espeak_ng_TOKENIZER_
{
// Source of code points; NULL while the tokenizer is not bound to a decoder.
espeak_ng_TEXT_DECODER *decoder;
// NUL-terminated UTF-8 text of the most recently read token.
char token[256];

// Current state handler; tokenizer_read_next_token dispatches through it.
espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
};

static espeak_ng_TOKEN_TYPE
tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer)
{
	// Terminal state: there is nothing left to read, so the token text is
	// cleared and end-of-buffer is reported.
	tokenizer->token[0] = '\0';

	return ESPEAKNG_TOKEN_END_OF_BUFFER;
}

static espeak_ng_TOKEN_TYPE
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
{
	// Default state: consume one code point from the decoder and report it
	// as a token. The code point is stored in tokenizer->token as
	// NUL-terminated UTF-8.

	uint32_t codepoint = 0;
	int exhausted = 1;

	// Only pull from the decoder when it still has input; a NUL code point
	// is treated the same as running out of input.
	if (!text_decoder_eof(tokenizer->decoder)) {
		codepoint = text_decoder_getc(tokenizer->decoder);
		exhausted = (codepoint == '\0');
	}

	if (exhausted) {
		// Switch state so every later read reports end of buffer as well.
		tokenizer->read = tokenizer_state_end_of_buffer;
		return tokenizer_state_end_of_buffer(tokenizer);
	}

	int length = utf8_out(codepoint, tokenizer->token);
	tokenizer->token[length] = '\0';

	// '\n' is the only character classified so far; everything else is UNKNOWN.
	return (codepoint == '\n') ? ESPEAKNG_TOKEN_NEWLINE : ESPEAKNG_TOKEN_UNKNOWN;
}

espeak_ng_TOKENIZER *
create_tokenizer(void)
{
@@ -95,6 +134,9 @@ create_tokenizer(void)
if (!tokenizer) return NULL;

tokenizer->decoder = NULL;
tokenizer->read = tokenizer_state_end_of_buffer;

*tokenizer->token = '\0';
return tokenizer;
}

@@ -108,20 +150,21 @@ int
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
                espeak_ng_TEXT_DECODER *decoder)
{
	// Bind the tokenizer to decoder, or unbind it when decoder is NULL.
	// Returns 1 on success and 0 when tokenizer itself is NULL.
	//
	// A NULL decoder is accepted on purpose: the stale
	// "!tokenizer || !decoder" guard rejected it, which left the
	// end-of-buffer fallback below unreachable.
	if (!tokenizer) return 0;

	tokenizer->decoder = decoder;
	// Without a decoder there is nothing to read, so every read reports
	// end of buffer.
	tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer;
	return 1;
}

espeak_ng_TOKEN_TYPE
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
{
	// Read the next token by dispatching to the tokenizer's current state
	// handler and return the token type that handler reports.
	//
	// The leftover stub "return ESPEAKNG_TOKEN_END_OF_BUFFER;" is removed:
	// it ran first and made this dispatch unreachable.
	return tokenizer->read(tokenizer);
}

const char *
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
{
	// Return the NUL-terminated text of the most recently read token.
	// The pointer refers to the tokenizer's own buffer, which is
	// overwritten by the next read; it is an empty string once end of
	// buffer has been reported.
	//
	// The leftover stub "return "";" is removed: it ran first, so the real
	// token text was never returned.
	return tokenizer->token;
}

+ 1
- 0
src/libespeak-ng/tokenizer.h View File

@@ -38,6 +38,7 @@ typedef enum
{
// No more input: the decoder is unbound, at end of input, or produced a NUL.
ESPEAKNG_TOKEN_END_OF_BUFFER,
// Any character that has no more specific token type.
ESPEAKNG_TOKEN_UNKNOWN,
// A single '\n' character.
ESPEAKNG_TOKEN_NEWLINE,
} espeak_ng_TOKEN_TYPE;

espeak_ng_TOKEN_TYPE

+ 0
- 34
src/libespeak-ng/translate.c View File

@@ -305,40 +305,6 @@ int IsBracket(int c)
return lookupwchar(brackets, c);
}

int utf8_out(unsigned int c, char *buf)
{
	// Encode the code point c as UTF-8 into buf and return the number of
	// bytes stored. No terminating NUL is appended; buf must have room
	// for up to 4 bytes.

	static const unsigned char lead_marker[4] = { 0x00, 0xc0, 0xe0, 0xf0 };
	int tail_count;
	int idx;

	if (c < 0x80) {
		// 7-bit values are stored unchanged as a single byte
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		// beyond the last Unicode code point: substitute a space
		buf[0] = ' ';
		return 1;
	}

	// number of continuation bytes that follow the lead byte
	tail_count = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;

	// lead byte: length marker plus the most significant payload bits
	buf[0] = lead_marker[tail_count] | (c >> (6 * tail_count));

	// continuation bytes: 10xxxxxx, six payload bits each, high bits first
	for (idx = 1; idx <= tail_count; idx++)
		buf[idx] = 0x80 + ((c >> (6 * (tail_count - idx))) & 0x3f);

	return tail_count + 1;
}

int utf8_nbytes(const char *buf)
{
// Returns the number of bytes for the first UTF-8 character in buf

+ 30
- 1
tests/tokenizer.c View File

@@ -21,6 +21,7 @@
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include <espeak-ng/espeak_ng.h>
@@ -169,7 +170,7 @@ test_unbound_tokenizer()
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

assert(tokenizer_reset(tokenizer, NULL) == 0);
assert(tokenizer_reset(tokenizer, NULL) == 1);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
@@ -178,6 +179,33 @@ test_unbound_tokenizer()
destroy_tokenizer(tokenizer);
}

void
test_linux_newline_tokens()
{
	// Feeds "\n\n" through the tokenizer and checks that each '\n' comes
	// back as its own NEWLINE token, followed by end of buffer.
	printf("testing linux newline tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder) == 1);

	// two newlines in, two NEWLINE tokens out, each carrying "\n" as text
	for (int newline = 0; newline < 2; newline++) {
		assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
		assert(tokenizer_get_token_text(tokenizer) != NULL);
		assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
	}

	// once the input is used up the token text is empty
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

int
main(int argc, char **argv)
{
@@ -194,6 +222,7 @@ main(int argc, char **argv)
test_fullwidth();

test_unbound_tokenizer();
test_linux_newline_tokens();

printf("done\n");


Loading…
Cancel
Save