Browse Source

tokenizer.c: create and use a clause_type_from_codepoint function, with tests.

master
Reece H. Dunn 8 years ago
parent
commit
1c4ce3dcd3

+ 14
- 3
Makefile.am View File

@@ -112,9 +112,6 @@ docs: docs/index.html \
src/speak-ng.1.html \
README.html

check: tests/encoding.test
tests/encoding.test

##### build targets:

espeak_includedir = $(includedir)/espeak
@@ -163,6 +160,7 @@ src_libespeak_ng_la_SOURCES = \
src/libespeak-ng/synthdata.c \
src/libespeak-ng/synthesize.c \
src/libespeak-ng/synth_mbrola.c \
src/libespeak-ng/tokenizer.c \
src/libespeak-ng/translate.c \
src/libespeak-ng/tr_languages.c \
src/libespeak-ng/voices.c \
@@ -217,6 +215,19 @@ tests_encoding_test_SOURCES = \
src/libespeak-ng/mnemonics.c \
tests/encoding.c

noinst_bin_PROGRAMS += tests/tokenizer.test

tests_tokenizer_test_CFLAGS = \
-Isrc/libespeak-ng -Isrc/include -Isrc/include/compat -I src/ucd-tools/src/include \
-D _POSIX_C_SOURCE=200112L ${AM_CFLAGS}
tests_tokenizer_test_SOURCES = \
src/libespeak-ng/tokenizer.c \
tests/tokenizer.c

check: tests/encoding.test tests/tokenizer.test
tests/encoding.test
tests/tokenizer.test

##### phoneme data:

espeak-ng-data/phondata: phsource/phonemes.stamp

+ 1
- 0
android/jni/Android.mk View File

@@ -33,6 +33,7 @@ ESPEAK_SOURCES := \
src/libespeak-ng/synthdata.c \
src/libespeak-ng/synthesize.c \
src/libespeak-ng/synth_mbrola.c \
src/libespeak-ng/tokenizer.c \
src/libespeak-ng/translate.c \
src/libespeak-ng/tr_languages.c \
src/libespeak-ng/voices.c \

+ 4
- 119
src/libespeak-ng/readclause.c View File

@@ -63,117 +63,6 @@ static int sayas_mode;
static int sayas_start;
static int ssml_ignore_l_angle = 0;

// punctuations symbols that can end a clause
static const unsigned short punct_chars[] = {
',', '.', '?', '!', ':', ';',

0x00a1, // inverted exclamation
0x00bf, // inverted question
0x2013, // en-dash
0x2014, // em-dash
0x2026, // elipsis

0x037e, // Greek question mark (looks like semicolon)
0x0387, // Greek semicolon, ano teleia
0x0964, // Devanagari Danda (fullstop)

0x0589, // Armenian period
0x055d, // Armenian comma
0x055c, // Armenian exclamation
0x055e, // Armenian question
0x055b, // Armenian emphasis mark

0x060c, // Arabic ,
0x061b, // Arabic ;
0x061f, // Arabic ?
0x06d4, // Arabic .

0x0df4, // Singhalese Kunddaliya
0x0f0d, // Tibet Shad
0x0f0e,

0x1362, // Ethiopic period
0x1363,
0x1364,
0x1365,
0x1366,
0x1367,
0x1368,
0x10fb, // Georgian paragraph

0x3001, // ideograph comma
0x3002, // ideograph period

0xff01, // fullwidth exclamation
0xff0c, // fullwidth comma
0xff0e, // fullwidth period
0xff1a, // fullwidth colon
0xff1b, // fullwidth semicolon
0xff1f, // fullwidth question mark

0
};

// indexed by (entry num. in punct_chars) + 1
// bits 0-7 pause x 10mS, bits 12-14 intonation type, bit 15 don't need following space or bracket
static const unsigned int punct_attributes[] = {
0,

CLAUSE_COMMA,
CLAUSE_PERIOD,
CLAUSE_QUESTION,
CLAUSE_EXCLAMATION,
CLAUSE_COLON,
CLAUSE_SEMICOLON,

CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question
CLAUSE_SEMICOLON, // en-dash
CLAUSE_SEMICOLON, // em-dash
CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // elipsis

CLAUSE_QUESTION, // Greek question mark
CLAUSE_SEMICOLON, // Greek semicolon
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop)

CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period
CLAUSE_COMMA, // Armenian comma
CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation
CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question
CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark

CLAUSE_COMMA, // Arabic ,
CLAUSE_SEMICOLON, // Arabic ;
CLAUSE_QUESTION, // Arabic question mark
CLAUSE_PERIOD, // Arabic full stop

CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period
CLAUSE_PARAGRAPH,

CLAUSE_PERIOD, // Ethiopic period
CLAUSE_COMMA, // Ethiopic comma
CLAUSE_SEMICOLON, // Ethiopic semicolon
CLAUSE_COLON, // Ethiopic colon
CLAUSE_COLON, // Ethiopic preface colon
CLAUSE_QUESTION, // Ethiopic question mark
CLAUSE_PARAGRAPH, // Ethiopic paragraph
CLAUSE_PARAGRAPH, // Georgian paragraph

CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period

CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth
CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER,

CLAUSE_SEMICOLON, // spare
0
};

// stack for language and voice properties
// frame 0 is for the defaults, before any ssml tags.
typedef struct {
@@ -604,7 +493,7 @@ static int AnnouncePunctuation(Translator *tr, int c1, int *c2_ptr, char *output
if (c1 == '-')
return CLAUSE_NONE; // no pause

attributes = punct_attributes[lookupwchar(punct_chars, c1)];
attributes = clause_type_from_codepoint(c1);

short_pause = CLAUSE_SHORTFALL;
if ((attributes & CLAUSE_INTONATION_TYPE) == 0x1000)
@@ -1624,7 +1513,6 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_
int phoneme_mode = 0;
int n_xml_buf;
int terminator;
int punct;
int found;
int any_alnum = 0;
int self_closing;
@@ -1797,13 +1685,12 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_

if ((c2 == '\n') && (option_linelength == -1)) {
// single-line mode, return immediately on NL
if ((punct = lookupwchar(punct_chars, c1)) == 0) {
if ((terminator = clause_type_from_codepoint(c1)) == CLAUSE_NONE) {
charix[ix] = count_characters - clause_start_char;
*charix_top = ix;
ix += utf8_out(c1, &buf[ix]);
terminator = CLAUSE_PERIOD; // line doesn't end in punctuation, assume period
} else
terminator = punct_attributes[punct];
}
buf[ix] = ' ';
buf[ix+1] = 0;
return terminator;
@@ -1990,9 +1877,7 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_
}

punct_data = 0;
if ((punct = lookupwchar(punct_chars, c1)) != 0) {
punct_data = punct_attributes[punct];

if ((punct_data = clause_type_from_codepoint(c1)) != CLAUSE_NONE) {
if (punct_data & CLAUSE_PUNCTUATION_IN_WORD) {
// Armenian punctuation inside a word
stressed_word = 1;

+ 151
- 0
src/libespeak-ng/tokenizer.c View File

@@ -0,0 +1,151 @@
/* Tokenizer APIs.
*
* Copyright (C) 2005 to 2015 by Jonathan Duddington
* email: [email protected]
* Copyright (C) 2017 Reece H. Dunn
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see: <http://www.gnu.org/licenses/>.
*/

#include "config.h"

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <espeak-ng/espeak_ng.h>

#include "encoding.h"
#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "translate.h"

// Punctuation symbols that can end a clause.
//
// Zero-terminated list of code points that clause_type_from_codepoint()
// searches linearly. Entry N of this table is described by entry N of
// punct_attributes[], so the two tables must be kept in the same order when
// characters are added or removed. The element type is unsigned short; every
// code point listed here fits in 16 bits.
static const unsigned short punct_chars[] = {
',', '.', '?', '!', ':', ';',

0x00a1, // inverted exclamation
0x00bf, // inverted question
0x2013, // en-dash
0x2014, // em-dash
0x2026, // ellipsis

0x037e, // Greek question mark (looks like semicolon)
0x0387, // Greek semicolon, ano teleia
0x0964, // Devanagari Danda (fullstop)

0x0589, // Armenian period
0x055d, // Armenian comma
0x055c, // Armenian exclamation
0x055e, // Armenian question
0x055b, // Armenian emphasis mark

0x060c, // Arabic ,
0x061b, // Arabic ;
0x061f, // Arabic ?
0x06d4, // Arabic .

0x0df4, // Singhalese Kunddaliya
0x0f0d, // Tibet Shad
0x0f0e, // Tibetan mark classified as a paragraph break

0x1362, // Ethiopic period
0x1363, // Ethiopic comma
0x1364, // Ethiopic semicolon
0x1365, // Ethiopic colon
0x1366, // Ethiopic preface colon
0x1367, // Ethiopic question mark
0x1368, // Ethiopic paragraph
0x10fb, // Georgian paragraph

0x3001, // ideograph comma
0x3002, // ideograph period

0xff01, // fullwidth exclamation
0xff0c, // fullwidth comma
0xff0e, // fullwidth period
0xff1a, // fullwidth colon
0xff1b, // fullwidth semicolon
0xff1f, // fullwidth question mark

0 // terminator: ends the search loop in clause_type_from_codepoint()
};

// Clause attributes for the characters in punct_chars[], indexed by entry
// number in punct_chars: element N here describes punct_chars[N].
//
// Each value is one of the CLAUSE_* clause types, optionally OR-ed with
// modifier flags (CLAUSE_OPTIONAL_SPACE_AFTER, CLAUSE_PUNCTUATION_IN_WORD,
// CLAUSE_SPEAK_PUNCTUATION_NAME). Keep the order in step with punct_chars[].
static const unsigned int punct_attributes[] = {
CLAUSE_COMMA, // ,
CLAUSE_PERIOD, // .
CLAUSE_QUESTION, // ?
CLAUSE_EXCLAMATION, // !
CLAUSE_COLON, // :
CLAUSE_SEMICOLON, // ;

CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question
CLAUSE_SEMICOLON, // en-dash
CLAUSE_SEMICOLON, // em-dash
CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // ellipsis

CLAUSE_QUESTION, // Greek question mark
CLAUSE_SEMICOLON, // Greek semicolon
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop)

CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period
CLAUSE_COMMA, // Armenian comma
CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation
CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question
CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark

CLAUSE_COMMA, // Arabic ,
CLAUSE_SEMICOLON, // Arabic ;
CLAUSE_QUESTION, // Arabic question mark
CLAUSE_PERIOD, // Arabic full stop

CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period
CLAUSE_PARAGRAPH, // Tibetan 0x0f0e

CLAUSE_PERIOD, // Ethiopic period
CLAUSE_COMMA, // Ethiopic comma
CLAUSE_SEMICOLON, // Ethiopic semicolon
CLAUSE_COLON, // Ethiopic colon
CLAUSE_COLON, // Ethiopic preface colon
CLAUSE_QUESTION, // Ethiopic question mark
CLAUSE_PARAGRAPH, // Ethiopic paragraph
CLAUSE_PARAGRAPH, // Georgian paragraph

CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period

CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth exclamation
CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth comma
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth period
CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth colon
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth semicolon
CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth question mark

0 // lines up with the 0 terminator of punct_chars[]; the lookup loop never returns it
};

// Look up the clause attributes of a Unicode code point.
//
// c: the code point to classify.
//
// Returns the punct_attributes[] entry paired with c when c is listed in
// punct_chars[], otherwise CLAUSE_NONE. A value of 0 matches the table
// terminator rather than an entry, so it also yields CLAUSE_NONE.
int clause_type_from_codepoint(uint32_t c)
{
	const unsigned short *entry = punct_chars;

	// Walk the zero-terminated table; the distance from the start of
	// punct_chars[] is the index of the matching attributes.
	while (*entry != 0) {
		if (*entry == c)
			return punct_attributes[entry - punct_chars];
		entry++;
	}

	return CLAUSE_NONE;
}

+ 2
- 0
src/libespeak-ng/translate.h View File

@@ -235,6 +235,8 @@ extern "C"
#define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE)
#define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE)

int clause_type_from_codepoint(uint32_t c);

//@}

#define SAYAS_CHARS 0x12

+ 1
- 0
src/windows/libespeak-ng.vcxproj View File

@@ -186,6 +186,7 @@
<ClCompile Include="..\libespeak-ng\synthdata.c" />
<ClCompile Include="..\libespeak-ng\synthesize.c" />
<ClCompile Include="..\libespeak-ng\synth_mbrola.c" />
<ClCompile Include="..\libespeak-ng\tokenizer.c" />
<ClCompile Include="..\libespeak-ng\translate.c" />
<ClCompile Include="..\libespeak-ng\tr_languages.c" />
<ClCompile Include="..\libespeak-ng\voices.c" />

+ 174
- 0
tests/tokenizer.c View File

@@ -0,0 +1,174 @@
/*
* Copyright (C) 2017 Reece H. Dunn
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see:
* <http://www.gnu.org/licenses/>.
*/

#include "config.h"

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#include <espeak-ng/espeak_ng.h>

#include "encoding.h"
#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "translate.h"

// Checks the clause types reported for ASCII punctuation, the inverted
// exclamation/question marks, and the dash and ellipsis characters, and that
// an ordinary letter is not treated as clause-ending punctuation.
void
test_latin_common(void)
{
	printf("testing Latin/Common (Latn/Zyyy) script classification\n");

	// A plain letter is not in the punctuation table.
	assert(clause_type_from_codepoint('a') == CLAUSE_NONE);

	// ASCII punctuation.
	assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD);
	assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION);
	assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION);
	assert(clause_type_from_codepoint(',') == CLAUSE_COMMA);
	assert(clause_type_from_codepoint(':') == CLAUSE_COLON);
	assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON);

	// Inverted exclamation and inverted question marks.
	assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
	assert(clause_type_from_codepoint(0x00BF) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));

	// En-dash, em-dash and ellipsis.
	assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON);
	assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON);
	assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER));
}

// Checks the clause types reported for the Greek question mark and the
// Greek semicolon (ano teleia).
void
test_greek(void)
{
	printf("testing Greek (Grek) script classification\n");

	assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION);  // Greek question mark
	assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON); // Greek semicolon
}

// Checks the clause types reported for Armenian punctuation, including the
// marks that carry CLAUSE_PUNCTUATION_IN_WORD.
void
test_armenian(void)
{
	printf("testing Armenian (Armn) script classification\n");

	assert(clause_type_from_codepoint(0x055B) == (CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD));      // emphasis mark
	assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD)); // exclamation
	assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA);                                      // comma
	assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD));    // question
	assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));     // period
}

// Checks the clause types reported for the Arabic comma, semicolon,
// question mark and full stop.
void
test_arabic(void)
{
	printf("testing Arabic (Arab) script classification\n");

	assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA);     // Arabic ,
	assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON); // Arabic ;
	assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION);  // Arabic ?
	assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD);    // Arabic .
}

// Checks the clause type reported for the Devanagari Danda (full stop).
void
test_devanagari(void)
{
	printf("testing Devanagari (Deva) script classification\n");

	assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
}

// Checks the clause types reported for the two Tibetan marks in the
// punctuation table: one classified as a period, one as a paragraph break.
void
test_tibetan(void)
{
	printf("testing Tibetan (Tibt) script classification\n");

	assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // Shad
	assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH);
}

// Checks the clause type reported for the Sinhala Kunddaliya.
void
test_sinhala(void)
{
	printf("testing Sinhala (Sinh) script classification\n");

	assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
}

// Checks the clause type reported for the Georgian paragraph mark.
void
test_georgian(void)
{
	printf("testing Georgian (Geor) script classification\n");

	assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH);
}

// Checks the clause types reported for the Ethiopic punctuation marks
// 0x1362 through 0x1368.
void
test_ethiopic(void)
{
	printf("testing Ethiopic (Ethi) script classification\n");

	assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD);    // period
	assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA);     // comma
	assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON); // semicolon
	assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON);     // colon
	assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON);     // preface colon
	assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION);  // question mark
	assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH); // paragraph
}

// Checks the clause types reported for the ideographic comma and period.
void
test_ideographic(void)
{
	printf("testing Ideographic (Hani) script classification\n");

	assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));  // comma
	assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // period
}

// Checks the clause types reported for the fullwidth punctuation forms; each
// is expected to carry CLAUSE_OPTIONAL_SPACE_AFTER in addition to its type.
void
test_fullwidth(void)
{
	printf("testing Full Width/Common (Zyyy) script classification\n");

	assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER)); // exclamation
	assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));       // comma
	assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));      // period
	assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER));       // colon
	assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));   // semicolon
	assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER));    // question mark
}

// Test driver: runs every per-script classification test in turn, then
// prints "done" and exits successfully. The checks rely on assert(), so a
// failing expectation aborts the program before "done" is printed; they are
// compiled out if NDEBUG is defined.
int
main(int argc, char **argv)
{
	// The command line is not used.
	(void)argc;
	(void)argv;

	// Latin/Common and European scripts.
	test_latin_common();
	test_greek();
	test_armenian();

	// Arabic and Indic/Tibetan scripts.
	test_arabic();
	test_devanagari();
	test_tibetan();
	test_sinhala();

	// Georgian and Ethiopic.
	test_georgian();
	test_ethiopic();

	// East Asian punctuation.
	test_ideographic();
	test_fullwidth();

	fputs("done\n", stdout);

	return EXIT_SUCCESS;
}

Loading…
Cancel
Save