Browse Source

Remove the tokenizer code. This is not currently used, and should be designed from espeak's behaviour.

master
Reece H. Dunn 7 years ago
parent
commit
cec0e773a4

+ 1
- 10
Makefile.am View File

@@ -131,8 +131,7 @@ espeak_ng_includedir = $(includedir)/espeak-ng
espeak_ng_include_HEADERS = \
src/include/espeak-ng/encoding.h \
src/include/espeak-ng/espeak_ng.h \
src/include/espeak-ng/speak_lib.h \
src/include/espeak-ng/tokenizer.h
src/include/espeak-ng/speak_lib.h

lib_LTLIBRARIES += src/libespeak-ng.la

@@ -171,7 +170,6 @@ src_libespeak_ng_la_SOURCES = \
src/libespeak-ng/synthdata.c \
src/libespeak-ng/synthesize.c \
src/libespeak-ng/synth_mbrola.c \
src/libespeak-ng/tokenizer.c \
src/libespeak-ng/translate.c \
src/libespeak-ng/tr_languages.c \
src/libespeak-ng/voices.c \
@@ -233,12 +231,6 @@ noinst_bin_PROGRAMS += tests/encoding.test
tests_encoding_test_LDADD = src/libespeak-ng.la
tests_encoding_test_SOURCES = tests/encoding.c

noinst_bin_PROGRAMS += tests/tokenizer.test

tests_tokenizer_test_CFLAGS = -Isrc/libespeak-ng ${AM_CFLAGS}
tests_tokenizer_test_LDADD = src/libespeak-ng.la
tests_tokenizer_test_SOURCES = tests/tokenizer.c

noinst_bin_PROGRAMS += tests/readclause.test

tests_readclause_test_CFLAGS = -Isrc/libespeak-ng ${AM_CFLAGS}
@@ -256,7 +248,6 @@ tests_api_test_SOURCES = tests/api.c
@ESPEAK_DATA_PATH=$(PWD) $< && echo " PASSED $<"

check: tests/encoding.check \
tests/tokenizer.check \
tests/readclause.check \
tests/api.check \
tests/languages.check

+ 0
- 1
android/jni/Android.mk View File

@@ -33,7 +33,6 @@ ESPEAK_SOURCES := \
src/libespeak-ng/synthdata.c \
src/libespeak-ng/synthesize.c \
src/libespeak-ng/synth_mbrola.c \
src/libespeak-ng/tokenizer.c \
src/libespeak-ng/translate.c \
src/libespeak-ng/tr_languages.c \
src/libespeak-ng/voices.c \

+ 0
- 75
src/include/espeak-ng/tokenizer.h View File

@@ -1,75 +0,0 @@
/*
* Copyright (C) 2017 Reece H. Dunn
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see: <http://www.gnu.org/licenses/>.
*/
#ifndef ESPEAK_NG_TOKENIZER_H
#define ESPEAK_NG_TOKENIZER_H

#ifdef __cplusplus
extern "C"
{
#endif

/* Opaque tokenizer handle; the structure is defined by the implementation.
 *
 * NOTE(review): this header uses ESPEAK_NG_API and espeak_ng_TEXT_DECODER but
 * does not include anything itself, so the including file has to pull in the
 * headers that declare them first -- confirm which headers those are.
 */
typedef struct espeak_ng_TOKENIZER_ espeak_ng_TOKENIZER;

/* Allocate a tokenizer that is not bound to any text decoder.
 *
 * Returns NULL if the allocation fails. Until tokenizer_reset binds a decoder,
 * reading tokens yields ESPEAKNG_TOKEN_END_OF_BUFFER with empty token text.
 */
ESPEAK_NG_API espeak_ng_TOKENIZER *
create_tokenizer(void);

/* Release a tokenizer returned by create_tokenizer. Passing NULL is allowed. */
ESPEAK_NG_API void
destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer);

/* Options accepted by tokenizer_reset. */
typedef enum
{
ESPEAKNG_TOKENIZER_OPTION_TEXT = 0,
} espeak_ng_TOKENIZER_OPTIONS;

/* Bind the tokenizer to a text decoder and discard any pushed-back codepoint.
 *
 * A NULL decoder leaves the tokenizer in the end-of-buffer state.
 * Returns 1 on success, or 0 if tokenizer is NULL.
 */
ESPEAK_NG_API int
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
espeak_ng_TEXT_DECODER *decoder,
espeak_ng_TOKENIZER_OPTIONS options);

/* Kinds of token reported by tokenizer_read_next_token. */
typedef enum
{
ESPEAKNG_TOKEN_END_OF_BUFFER, /* no more input (or no decoder bound) */
ESPEAKNG_TOKEN_UNKNOWN, /* a codepoint that fits none of the other kinds */
ESPEAKNG_TOKEN_NEWLINE, /* CR, LF, CR+LF, FF, NEL or a line separator */
ESPEAKNG_TOKEN_PARAGRAPH, /* a paragraph separator */
ESPEAKNG_TOKEN_WHITESPACE, /* a run of white space codepoints */
ESPEAKNG_TOKEN_WORD_UPPERCASE, /* word made only of uppercase letters */
ESPEAKNG_TOKEN_WORD_LOWERCASE, /* word made only of lowercase letters */
ESPEAKNG_TOKEN_WORD_MIXEDCASE, /* word with any other mix of letter case */
ESPEAKNG_TOKEN_WORD_CAPITALIZED, /* one uppercase letter followed by lowercase letters */
ESPEAKNG_TOKEN_FULL_STOP,
ESPEAKNG_TOKEN_QUESTION_MARK,
ESPEAKNG_TOKEN_EXCLAMATION_MARK,
ESPEAKNG_TOKEN_COMMA,
ESPEAKNG_TOKEN_COLON,
ESPEAKNG_TOKEN_SEMICOLON,
ESPEAKNG_TOKEN_ELLIPSIS, /* "..." or a single ellipsis codepoint */
ESPEAKNG_TOKEN_PUNCTUATION, /* any other punctuation codepoint */
ESPEAKNG_TOKEN_SYMBOL, /* a symbol codepoint */
} espeak_ng_TOKEN_TYPE;

/* Read the next token from the bound decoder and return its kind. The token
 * text is then available from tokenizer_get_token_text.
 */
ESPEAK_NG_API espeak_ng_TOKEN_TYPE
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer);

/* Return the NUL-terminated UTF-8 text of the most recently read token. The
 * pointer refers to storage inside the tokenizer, so the text is replaced by
 * the next read.
 */
ESPEAK_NG_API const char *
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer);

#ifdef __cplusplus
}
#endif

#endif

+ 49
- 0
src/libespeak-ng/readclause.c View File

@@ -89,6 +89,55 @@ PARAM_STACK param_stack[N_PARAM_STACK];
static int speech_parameters[N_SPEECH_PARAM]; // current values, from param_stack
int saved_parameters[N_SPEECH_PARAM]; // Parameters saved on synthesis start

#define ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK 0xFFF0000000000000ull

// Translate the punctuation properties of a codepoint into a CLAUSE_* value.
//
// Only the property bits selected by ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK are
// considered, and they have to match one of the combinations below exactly;
// any other combination yields CLAUSE_NONE.
int clause_type_from_codepoint(uint32_t c)
{
	ucd_category category = ucd_lookup_category(c);
	ucd_property properties = ucd_properties(c, category);
	int clause = CLAUSE_NONE;

	switch (properties & ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK)
	{
	case ESPEAKNG_PROPERTY_FULL_STOP:
		clause = CLAUSE_PERIOD;
		break;
	case ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_QUESTION_MARK:
		clause = CLAUSE_QUESTION;
		break;
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
		clause = CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD;
		break;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
		clause = CLAUSE_EXCLAMATION;
		break;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
		clause = CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD;
		break;
	case ESPEAKNG_PROPERTY_COMMA:
		clause = CLAUSE_COMMA;
		break;
	case ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_COLON:
		clause = CLAUSE_COLON;
		break;
	case ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	// An extended dash is given the same clause type as a semi-colon.
	case ESPEAKNG_PROPERTY_SEMI_COLON:
	case ESPEAKNG_PROPERTY_EXTENDED_DASH:
		clause = CLAUSE_SEMICOLON;
		break;
	// Inverted question/exclamation marks are given the semi-colon clause type.
	case ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
		clause = CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_ELLIPSIS:
		clause = CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR:
		clause = CLAUSE_PARAGRAPH;
		break;
	}

	return clause;
}

const int param_defaults[N_SPEECH_PARAM] = {
0, // silence (internal use)
175, // rate wpm

+ 0
- 410
src/libespeak-ng/tokenizer.c View File

@@ -1,410 +0,0 @@
/* Tokenizer APIs.
*
* Copyright (C) 2017 Reece H. Dunn
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see: <http://www.gnu.org/licenses/>.
*/

#include "config.h"

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/encoding.h>
#include <espeak-ng/tokenizer.h>
#include <ucd/ucd.h>

#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "translate.h"

#define ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK 0xFFF0000000000000ull

// Translate the punctuation properties of a codepoint into a CLAUSE_* value.
//
// Only the property bits selected by ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK are
// considered, and they have to match one of the combinations below exactly;
// any other combination yields CLAUSE_NONE.
int clause_type_from_codepoint(uint32_t c)
{
	ucd_category category = ucd_lookup_category(c);
	ucd_property properties = ucd_properties(c, category);
	int clause = CLAUSE_NONE;

	switch (properties & ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK)
	{
	case ESPEAKNG_PROPERTY_FULL_STOP:
		clause = CLAUSE_PERIOD;
		break;
	case ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_QUESTION_MARK:
		clause = CLAUSE_QUESTION;
		break;
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
		clause = CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD;
		break;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
		clause = CLAUSE_EXCLAMATION;
		break;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
		clause = CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD;
		break;
	case ESPEAKNG_PROPERTY_COMMA:
		clause = CLAUSE_COMMA;
		break;
	case ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_COLON:
		clause = CLAUSE_COLON;
		break;
	case ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		clause = CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	// An extended dash is given the same clause type as a semi-colon.
	case ESPEAKNG_PROPERTY_SEMI_COLON:
	case ESPEAKNG_PROPERTY_EXTENDED_DASH:
		clause = CLAUSE_SEMICOLON;
		break;
	// Inverted question/exclamation marks are given the semi-colon clause type.
	case ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
		clause = CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_ELLIPSIS:
		clause = CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER;
		break;
	case ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR:
		clause = CLAUSE_PARAGRAPH;
		break;
	}

	return clause;
}

// Character classes produced by codepoint_type and used by the tokenizer to
// decide which kind of token a codepoint starts or continues.
typedef enum {
ESPEAKNG_CTYPE_OTHER, // anything not matched by the classes below
ESPEAKNG_CTYPE_CARRIAGE_RETURN, // U+000D; kept separate so CR+LF can be joined
ESPEAKNG_CTYPE_NEWLINE, // LF, FF, NEL and General Category Zl
ESPEAKNG_CTYPE_END_OF_STRING, // U+0000
ESPEAKNG_CTYPE_PARAGRAPH, // General Category Zp
ESPEAKNG_CTYPE_WHITESPACE, // White_Space property
ESPEAKNG_CTYPE_LOWERCASE, // Other_Lowercase property or General Category Ll
ESPEAKNG_CTYPE_UPPERCASE, // Other_Uppercase property or General Category Lu
ESPEAKNG_CTYPE_FULL_STOP,
ESPEAKNG_CTYPE_QUESTION_MARK,
ESPEAKNG_CTYPE_EXCLAMATION_MARK,
ESPEAKNG_CTYPE_COMMA,
ESPEAKNG_CTYPE_COLON,
ESPEAKNG_CTYPE_SEMICOLON,
ESPEAKNG_CTYPE_ELLIPSIS,
ESPEAKNG_CTYPE_PUNCTUATION, // General Categories Pc, Pd, Pe, Pf, Pi, Po and Ps
ESPEAKNG_CTYPE_SYMBOL, // General Categories Sc, Sk, Sm and So
} espeakng_CTYPE;

#define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFE0000000000C001ull

// Classify a codepoint into the character class used by the tokenizer.
//
// The checks run from most to least specific: hard-coded control characters,
// then the line/paragraph separator categories, then the Unicode/eSpeak NG
// properties selected by ESPEAKNG_CTYPE_PROPERTY_MASK, and finally the
// Unicode General Category. Anything left over is ESPEAKNG_CTYPE_OTHER.
//
// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
static espeakng_CTYPE codepoint_type(uint32_t c)
{
	// 1. Control characters that are handled by value.

	switch (c)
	{
	case 0x0000: // NULL
		return ESPEAKNG_CTYPE_END_OF_STRING;
	case 0x000D: // CARRIAGE RETURN (CR)
		return ESPEAKNG_CTYPE_CARRIAGE_RETURN;
	case 0x000A: // LINE FEED (LF)
	case 0x000C: // FORM FEED (FF)
	case 0x0085: // NEW LINE (NEL)
		return ESPEAKNG_CTYPE_NEWLINE;
	}

	// 2. Separator categories take priority over the property lookup.

	ucd_category category = ucd_lookup_category(c);
	if (category == UCD_CATEGORY_Zl)
		return ESPEAKNG_CTYPE_NEWLINE;
	if (category == UCD_CATEGORY_Zp)
		return ESPEAKNG_CTYPE_PARAGRAPH;

	// 3. Properties: the masked value has to equal exactly one property.

	ucd_property properties = ucd_properties(c, category);
	switch (properties & ESPEAKNG_CTYPE_PROPERTY_MASK)
	{
	case UCD_PROPERTY_WHITE_SPACE:         return ESPEAKNG_CTYPE_WHITESPACE;
	case UCD_PROPERTY_OTHER_LOWERCASE:     return ESPEAKNG_CTYPE_LOWERCASE;
	case UCD_PROPERTY_OTHER_UPPERCASE:     return ESPEAKNG_CTYPE_UPPERCASE;
	case ESPEAKNG_PROPERTY_FULL_STOP:      return ESPEAKNG_CTYPE_FULL_STOP;
	case ESPEAKNG_PROPERTY_QUESTION_MARK:  return ESPEAKNG_CTYPE_QUESTION_MARK;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK: return ESPEAKNG_CTYPE_EXCLAMATION_MARK;
	case ESPEAKNG_PROPERTY_COMMA:          return ESPEAKNG_CTYPE_COMMA;
	case ESPEAKNG_PROPERTY_COLON:          return ESPEAKNG_CTYPE_COLON;
	case ESPEAKNG_PROPERTY_SEMI_COLON:     return ESPEAKNG_CTYPE_SEMICOLON;
	case ESPEAKNG_PROPERTY_ELLIPSIS:       return ESPEAKNG_CTYPE_ELLIPSIS;
	}

	// 4. Fall back to the Unicode General Category.

	switch (category)
	{
	case UCD_CATEGORY_Lu:
		return ESPEAKNG_CTYPE_UPPERCASE;
	case UCD_CATEGORY_Ll:
		return ESPEAKNG_CTYPE_LOWERCASE;
	case UCD_CATEGORY_Pc:
	case UCD_CATEGORY_Pd:
	case UCD_CATEGORY_Pe:
	case UCD_CATEGORY_Pf:
	case UCD_CATEGORY_Pi:
	case UCD_CATEGORY_Po:
	case UCD_CATEGORY_Ps:
		return ESPEAKNG_CTYPE_PUNCTUATION;
	case UCD_CATEGORY_Sc:
	case UCD_CATEGORY_Sk:
	case UCD_CATEGORY_Sm:
	case UCD_CATEGORY_So:
		return ESPEAKNG_CTYPE_SYMBOL;
	}

	// 5. Everything else.

	return ESPEAKNG_CTYPE_OTHER;
}

// Sentinel stored in keepc when no codepoint has been pushed back.
#define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF

// Tokenizer state.
struct espeak_ng_TOKENIZER_
{
// Source of codepoints; assigned by tokenizer_reset and may be NULL.
espeak_ng_TEXT_DECODER *decoder;
// NUL-terminated UTF-8 text of the most recently read token.
char token[256];
// Codepoint that was read from the decoder while scanning the previous token
// but does not belong to it, or ESPEAKNG_CODEPOINT_INVALID if there is none.
uint32_t keepc;

// Current state function; tokenizer_read_next_token calls through it.
espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
};

// State function used once the input is exhausted (or when no decoder is
// bound): empties the token text and reports end of buffer.
static espeak_ng_TOKEN_TYPE
tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer)
{
	tokenizer->token[0] = '\0';
	return ESPEAKNG_TOKEN_END_OF_BUFFER;
}

// Append the remaining letters of a word to the token buffer and work out
// the word's case pattern.
//
// current -- write position in tokenizer->token, just after the first letter.
// type    -- ESPEAKNG_TOKEN_WORD_LOWERCASE or ESPEAKNG_TOKEN_WORD_UPPERCASE,
//            according to the case of that first letter.
//
// Scanning stops at the first codepoint that is not a letter (which is saved
// in keepc for the next token), at the end of the input, or when the token
// buffer is nearly full. Returns the final word token type.
static espeak_ng_TOKEN_TYPE
tokenizer_read_word_token(espeak_ng_TOKENIZER *tokenizer, char *current, espeak_ng_TOKEN_TYPE type)
{
	char *limit = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes
	int at_second_letter = 1;

	for (;;) {
		if (current >= limit || text_decoder_eof(tokenizer->decoder))
			break;

		uint32_t c = text_decoder_getc(tokenizer->decoder);
		espeakng_CTYPE ctype = codepoint_type(c);

		if (ctype != ESPEAKNG_CTYPE_LOWERCASE && ctype != ESPEAKNG_CTYPE_UPPERCASE) {
			// Not a letter: hand it back for the next token.
			tokenizer->keepc = c;
			break;
		}

		current += utf8_out(c, current);

		if (ctype == ESPEAKNG_CTYPE_LOWERCASE) {
			// "Ab..." is capitalized; "ABc..." is mixed case.
			if (type == ESPEAKNG_TOKEN_WORD_UPPERCASE) {
				if (at_second_letter)
					type = ESPEAKNG_TOKEN_WORD_CAPITALIZED;
				else
					type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
			}
		} else if (type == ESPEAKNG_TOKEN_WORD_LOWERCASE ||
		           type == ESPEAKNG_TOKEN_WORD_CAPITALIZED) {
			// An uppercase letter after any lowercase letter is mixed case.
			type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
		}

		at_second_letter = 0;
	}

	*current = '\0';
	return type;
}

// State function used while a decoder is bound: read one token into
// tokenizer->token and return its type.
//
// The first codepoint of the token is the one pushed back by the previous
// token (keepc) if there is one, otherwise the next codepoint from the
// decoder. A pushed-back codepoint is consumed even when the decoder itself
// has no more input, so that the last codepoint of the text is not lost
// (e.g. the "." in "a."). Once the input is exhausted the tokenizer switches
// permanently to the end-of-buffer state.
static espeak_ng_TOKEN_TYPE
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
{
	char *current = tokenizer->token;
	char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes

	uint32_t c;
	if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) {
		// A pending codepoint has priority over the end-of-input check.
		c = tokenizer->keepc;
		tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
	} else if (text_decoder_eof(tokenizer->decoder)) {
		tokenizer->read = tokenizer_state_end_of_buffer;
		return tokenizer_state_end_of_buffer(tokenizer);
	} else {
		c = text_decoder_getc(tokenizer->decoder);
	}

	// Token type for the classes whose token is the single codepoint c; those
	// cases break out of the switch and share the code after it.
	espeak_ng_TOKEN_TYPE type;

	switch (codepoint_type(c))
	{
	case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
		tokenizer->read = tokenizer_state_end_of_buffer;
		return tokenizer_state_end_of_buffer(tokenizer);
	case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
		// Join "\r\n" into one newline token: emit the '\r' here and let the
		// shared code below emit the '\n'.
		if (text_decoder_peekc(tokenizer->decoder) == '\n') {
			current += utf8_out(c, current);
			c = text_decoder_getc(tokenizer->decoder);
		}
		type = ESPEAKNG_TOKEN_NEWLINE;
		break;
	case ESPEAKNG_CTYPE_NEWLINE:
		type = ESPEAKNG_TOKEN_NEWLINE;
		break;
	case ESPEAKNG_CTYPE_PARAGRAPH:
		type = ESPEAKNG_TOKEN_PARAGRAPH;
		break;
	case ESPEAKNG_CTYPE_WHITESPACE:
		current += utf8_out(c, current);
		while (current < end && !text_decoder_eof(tokenizer->decoder)) {
			c = text_decoder_getc(tokenizer->decoder);
			if (codepoint_type(c) != ESPEAKNG_CTYPE_WHITESPACE) {
				// Only a codepoint that was read but not emitted is pushed
				// back; stopping on end of input or a full buffer must not
				// re-queue white space that is already in the token.
				tokenizer->keepc = c;
				break;
			}
			current += utf8_out(c, current);
		}
		*current = '\0';
		return ESPEAKNG_TOKEN_WHITESPACE;
	case ESPEAKNG_CTYPE_LOWERCASE:
		current += utf8_out(c, current);
		return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_LOWERCASE);
	case ESPEAKNG_CTYPE_UPPERCASE:
		current += utf8_out(c, current);
		return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
	case ESPEAKNG_CTYPE_FULL_STOP:
		current += utf8_out(c, current);
		if (c == '.' && text_decoder_peekc(tokenizer->decoder) == '.') {
			c = text_decoder_getc(tokenizer->decoder);
			if (text_decoder_peekc(tokenizer->decoder) == '.') {
				// "..." -- three ASCII full stops form an ellipsis token.
				(void)text_decoder_getc(tokenizer->decoder);
				current += utf8_out('.', current);
				current += utf8_out('.', current);
				*current = '\0';
				return ESPEAKNG_TOKEN_ELLIPSIS;
			}
			// ".." -- the second full stop becomes its own token.
			tokenizer->keepc = c;
		}
		*current = '\0';
		return ESPEAKNG_TOKEN_FULL_STOP;
	case ESPEAKNG_CTYPE_QUESTION_MARK:
		type = ESPEAKNG_TOKEN_QUESTION_MARK;
		break;
	case ESPEAKNG_CTYPE_EXCLAMATION_MARK:
		type = ESPEAKNG_TOKEN_EXCLAMATION_MARK;
		break;
	case ESPEAKNG_CTYPE_COMMA:
		type = ESPEAKNG_TOKEN_COMMA;
		break;
	case ESPEAKNG_CTYPE_COLON:
		type = ESPEAKNG_TOKEN_COLON;
		break;
	case ESPEAKNG_CTYPE_SEMICOLON:
		type = ESPEAKNG_TOKEN_SEMICOLON;
		break;
	case ESPEAKNG_CTYPE_ELLIPSIS:
		type = ESPEAKNG_TOKEN_ELLIPSIS;
		break;
	case ESPEAKNG_CTYPE_PUNCTUATION:
		type = ESPEAKNG_TOKEN_PUNCTUATION;
		break;
	case ESPEAKNG_CTYPE_SYMBOL:
		type = ESPEAKNG_TOKEN_SYMBOL;
		break;
	default:
		type = ESPEAKNG_TOKEN_UNKNOWN;
		break;
	}

	// Single-codepoint tokens (and the trailing '\n' of "\r\n").
	current += utf8_out(c, current);
	*current = '\0';
	return type;
}

#pragma GCC visibility push(default)

// Allocate a tokenizer with no decoder bound, no pushed-back codepoint and an
// empty token. Returns NULL if the allocation fails.
espeak_ng_TOKENIZER *
create_tokenizer(void)
{
	espeak_ng_TOKENIZER *self = malloc(sizeof *self);

	if (self != NULL) {
		self->decoder = NULL;
		self->keepc = ESPEAKNG_CODEPOINT_INVALID;
		// Without a decoder every read reports end of buffer.
		self->read = tokenizer_state_end_of_buffer;
		self->token[0] = '\0';
	}

	return self;
}

// Release a tokenizer returned by create_tokenizer. The bound decoder is not
// touched. A NULL tokenizer is accepted: free(NULL) is a no-op, so no guard
// is needed.
void
destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer)
{
	free(tokenizer);
}

// Bind the tokenizer to a decoder (or unbind it when decoder is NULL) and
// discard any pushed-back codepoint. The current token text is left as it is.
//
// options is accepted but not used.
// Returns 0 if tokenizer is NULL, otherwise 1.
int
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
                espeak_ng_TEXT_DECODER *decoder,
                espeak_ng_TOKENIZER_OPTIONS options)
{
	(void)options;

	if (!tokenizer)
		return 0;

	tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
	tokenizer->decoder = decoder;
	if (decoder)
		tokenizer->read = tokenizer_state_default;
	else
		tokenizer->read = tokenizer_state_end_of_buffer;

	return 1;
}

// Read the next token through the tokenizer's current state function and
// return its type.
//
// A NULL tokenizer (e.g. an unchecked create_tokenizer failure) reports
// ESPEAKNG_TOKEN_END_OF_BUFFER instead of being dereferenced, matching the
// NULL tolerance of tokenizer_reset and destroy_tokenizer.
espeak_ng_TOKEN_TYPE
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
{
	if (!tokenizer)
		return ESPEAKNG_TOKEN_END_OF_BUFFER;

	return tokenizer->read(tokenizer);
}

// Return the NUL-terminated UTF-8 text of the most recently read token. The
// pointer refers to the tokenizer's own buffer, which the next read
// overwrites.
//
// A NULL tokenizer yields an empty string instead of being dereferenced, so
// the result is never NULL.
const char *
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
{
	if (!tokenizer)
		return "";

	return tokenizer->token;
}

#pragma GCC visibility pop

+ 1
- 2
src/windows/libespeak-ng.vcxproj View File

@@ -186,7 +186,6 @@
<ClCompile Include="..\libespeak-ng\synthdata.c" />
<ClCompile Include="..\libespeak-ng\synthesize.c" />
<ClCompile Include="..\libespeak-ng\synth_mbrola.c" />
<ClCompile Include="..\libespeak-ng\tokenizer.c" />
<ClCompile Include="..\libespeak-ng\translate.c" />
<ClCompile Include="..\libespeak-ng\tr_languages.c" />
<ClCompile Include="..\libespeak-ng\voices.c" />
@@ -221,4 +220,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

+ 1
- 4
src/windows/libespeak-ng.vcxproj.filters View File

@@ -135,9 +135,6 @@
<ClCompile Include="..\libespeak-ng\phoneme.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\libespeak-ng\tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="config.h">
@@ -194,4 +191,4 @@
<Filter>Source Files</Filter>
</None>
</ItemGroup>
</Project>
</Project>

+ 0
- 704
tests/tokenizer.c View File

@@ -1,704 +0,0 @@
/*
* Copyright (C) 2017 Reece H. Dunn
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see:
* <http://www.gnu.org/licenses/>.
*/

#include "config.h"

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <sys/stat.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/encoding.h>
#include <espeak-ng/tokenizer.h>

#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "translate.h"

// TODO: Find a better place for this than speech.c, so it can be implemented
// in one place without having to include all of speech.c.
// Return the size in bytes of the file at filename.
//
// On failure a negated errno value is returned: the error reported by stat()
// if the file cannot be examined, or -EISDIR if the path is a directory.
int GetFileLength(const char *filename)
{
	struct stat info;
	int status = stat(filename, &info);

	if (status != 0) {
		return -errno;
	}
	if (S_ISDIR(info.st_mode)) {
		return -EISDIR;
	}

	return info.st_size;
}

// A tokenizer without a decoder -- freshly created, or reset with a NULL
// decoder -- has empty token text and only ever reports end of buffer.
void
test_unbound_tokenizer()
{
	printf("testing unbound tokenizer\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	assert(tokenizer != NULL);

	const char *text = tokenizer_get_token_text(tokenizer);
	assert(text != NULL);
	assert(text[0] == '\0');

	// Pass 0: as created. Pass 1: after an explicit reset to "no decoder".
	for (int pass = 0; pass < 2; ++pass) {
		if (pass == 1)
			assert(tokenizer_reset(tokenizer, NULL, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

		assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
		text = tokenizer_get_token_text(tokenizer);
		assert(text != NULL);
		assert(text[0] == '\0');
	}

	destroy_tokenizer(tokenizer);
}

void
test_linux_newline_tokens()
{
printf("testing linux newline tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_mac_newline_tokens()
{
printf("testing mac newline tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\r\r", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_windows_newline_tokens()
{
printf("testing windows newline tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\r\n\r\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_unicode_newline_tokens()
{
printf("testing unicode newline tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\x0C\x0C\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

// FORM FEED (FF) -- Used as a page (not paragraph) break.
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

// NEXT LINE (NEL) [U+0085] -- Used in EBCDIC systems as a combined CR+LF character.
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);

// General Category: Zl -- LINE SEPARATOR [U+2028]
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_paragraph_tokens()
{
printf("testing paragraph tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\xE2\x80\xA9\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

// General Category: Zp, PARAGRAPH SEPARATOR [U+2029]
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_whitespace_tokens()
{
printf("testing whitespace tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\t\t\n\x0B\x0B\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

// General Category: Cc, Property: White_Space
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Cc, Property: White_Space, VERTICAL TAB (VT) -- Not treated as newline tokens.
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0B\x0B") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zs, Property: White_Space
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zs, Property: White_Space, Decomposition: <noBreak>, NO-BREAK SPACE [U+00A0]
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_Latn_word_tokens()
{
printf("testing Latin (Latn) script word tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "One one ONE OneTwo ONETwo", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_CAPITALIZED);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "One") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_LOWERCASE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "one") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_UPPERCASE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "ONE") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "OneTwo") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "ONETwo") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_Latn_punctuation_tokens()
{
printf("testing Latin (Latn) script punctuation tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, ". ? .. ! ... , .... : ; \xE2\x80\xA6", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_QUESTION_MARK);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "?") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_EXCLAMATION_MARK);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "!") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COMMA);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ",") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COLON);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ":") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SEMICOLON);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ";") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// HORIZONTAL ELLIPSIS [U+2026]
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA6") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_Latn_general_punctuation_tokens()
{
printf("testing Latin (Latn) script general punctuation tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\" () - _ \xC2\xAB\xC2\xBB", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

// General Category: Po
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\"") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// General Category: Ps
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "(") == 0);

// General Category: Pe
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ")") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// General Category: Pd
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "-") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// General Category: Pc
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "_") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// General Category: Pi, LEFT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00AB]
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xAB") == 0);

// General Category: Pf, RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00BB]
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xBB") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
test_Latn_symbol_tokens()
{
printf("testing Latin (Latn) script symbol tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "$ ^ + \xC2\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

// General Category: Sc
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "$") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// General Category: Sk
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "^") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// General Category: Sm
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "+") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

// General Category: So, COPYRIGHT SIGN [U+00A9]
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void
run_tests()
{
test_unbound_tokenizer();

test_linux_newline_tokens();
test_mac_newline_tokens();
test_windows_newline_tokens();
test_unicode_newline_tokens();
test_paragraph_tokens();
test_whitespace_tokens();

test_Latn_word_tokens();
test_Latn_punctuation_tokens();
test_Latn_general_punctuation_tokens();
test_Latn_symbol_tokens();
}

// Writes the string s to stdout, replacing each carriage return with the two
// characters "\r" and each line feed with "\n" so the text stays on one line.
// All other characters are written unchanged.
void
escape_newline(const char *s)
{
	while (*s != '\0') {
		const char ch = *s++;
		if (ch == '\r')
			fputs("\\r", stdout);
		else if (ch == '\n')
			fputs("\\n", stdout);
		else
			fputc(ch, stdout);
	}
}

// Tokenizes the text held by decoder and prints one "<kind> : <text>" line to
// stdout per token, until the tokenizer reports the end of the buffer.
//
// decoder: text decoder that has already been given the input to tokenize.
//
// If the tokenizer cannot be reset onto the decoder, nothing is printed.
// The tokenizer created here is always destroyed before returning; the
// decoder is left for the caller to destroy.
void
print_tokens(espeak_ng_TEXT_DECODER *decoder)
{
	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	if (!tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT)) {
		destroy_tokenizer(tokenizer);
		return;
	}

	while (1) switch (tokenizer_read_next_token(tokenizer))
	{
	case ESPEAKNG_TOKEN_END_OF_BUFFER:
		destroy_tokenizer(tokenizer);
		return;
	case ESPEAKNG_TOKEN_UNKNOWN:
		printf("unknown : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_NEWLINE:
		// Newline text is escaped so that each token stays on one output line.
		printf("newline : ");
		escape_newline(tokenizer_get_token_text(tokenizer));
		putc('\n', stdout);
		break;
	case ESPEAKNG_TOKEN_PARAGRAPH:
		printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WHITESPACE:
		printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_UPPERCASE:
		printf("word (upper case) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_LOWERCASE:
		printf("word (lower case) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
		printf("word (mixed case) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
		printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_FULL_STOP:
		printf("full stop : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_QUESTION_MARK:
		printf("question mark : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_EXCLAMATION_MARK:
		printf("exclamation mark : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_COMMA:
		printf("comma : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_COLON:
		printf("colon : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_SEMICOLON:
		printf("semicolon : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	// The three kinds below are produced by the tokenizer (the tests in this
	// file assert on them) but were previously dropped from the output.
	case ESPEAKNG_TOKEN_ELLIPSIS:
		printf("ellipsis : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_PUNCTUATION:
		printf("punctuation : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_SYMBOL:
		printf("symbol : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	default:
		// Any token kind not listed above is still shown rather than skipped.
		printf("other : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	}
}

// Reads the whole of filename into memory, decodes it using the encoding named
// by encoding_name, and prints its tokens via print_tokens().
//
// filename:      path of the file to tokenize.
// encoding_name: encoding name, resolved with espeak_ng_EncodingFromName().
//
// On any failure (unknown encoding, empty or unopenable file, allocation
// failure, short read) a message is printed to stdout and the function
// returns without printing tokens. If decoding does not return ENS_OK the
// function returns silently.
void
print_tokens_from_file(const char *filename, const char *encoding_name)
{
	espeak_ng_ENCODING encoding = espeak_ng_EncodingFromName(encoding_name);
	if (encoding == ESPEAKNG_ENCODING_UNKNOWN) {
		printf("Unknown encoding \"%s\".\n", encoding_name);
		return;
	}

	// A non-positive length is reported the same way as a failed fopen.
	int length = GetFileLength(filename);
	FILE *f = (length > 0) ? fopen(filename, "rb") : NULL;
	if (!f) {
		printf("Cannot open file: %s\n", filename);
		return;
	}

	char *buffer = malloc((size_t)length);
	if (!buffer) {
		fclose(f);
		printf("Out of memory!\n");
		return;
	}

	// A short read would leave the tail of the buffer uninitialised, so it is
	// treated as an error instead of being handed to the decoder.
	size_t bytes_read = fread(buffer, 1, (size_t)length, f);
	fclose(f);
	if (bytes_read != (size_t)length) {
		printf("Cannot read file: %s\n", filename);
		free(buffer);
		return;
	}

	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
	if (!decoder) {
		printf("Out of memory!\n");
		free(buffer);
		return;
	}

	if (text_decoder_decode_string(decoder, buffer, length, encoding) == ENS_OK)
		print_tokens(decoder);

	// The buffer is released only after the decoder is gone, in case the
	// decoder keeps pointers into it rather than copying the text.
	destroy_text_decoder(decoder);
	free(buffer);
}

// Prints the two supported command-line forms to stdout, each prefixed with
// the program name: no arguments runs the tests, ENCODING FILENAME prints the
// tokens of a file.
void
usage(const char *program)
{
	printf("%s -- Run the tokenizer tests.\n"
	       "%s ENCODING FILENAME -- Print the tokens for FILENAME.\n",
	       program, program);
}

// Entry point. With no arguments the test suite is run; with exactly two
// arguments (ENCODING FILENAME) the tokens of that file are printed. Any other
// argument count prints the usage text and exits with EXIT_FAILURE.
int
main(int argc, char **argv)
{
	if (argc == 1) {
		run_tests();
		return EXIT_SUCCESS;
	}

	if (argc == 3) {
		// argv[1] is the encoding name, argv[2] the file to tokenize.
		print_tokens_from_file(argv[2], argv[1]);
		return EXIT_SUCCESS;
	}

	usage(argv[0]);
	return EXIT_FAILURE;
}

Loading…
Cancel
Save