8 years ago · 1c4ce3dcd3
--- a/Makefile.am
+++ b/Makefile.am
 	src/speak-ng.1.html \
 	README.html
 check:	tests/encoding.test
 	tests/encoding.test
 ##### build targets:
 espeak_includedir = $(includedir)/espeak
 	src/libespeak-ng/synthdata.c \
 	src/libespeak-ng/synthesize.c \
 	src/libespeak-ng/synth_mbrola.c \
 	src/libespeak-ng/tokenizer.c \
 	src/libespeak-ng/translate.c \
 	src/libespeak-ng/tr_languages.c \
 	src/libespeak-ng/voices.c \
 	src/libespeak-ng/mnemonics.c \
 	tests/encoding.c
 noinst_bin_PROGRAMS += tests/tokenizer.test
 tests_tokenizer_test_CFLAGS = \
 	-Isrc/libespeak-ng -Isrc/include -Isrc/include/compat -I src/ucd-tools/src/include \
 	-D _POSIX_C_SOURCE=200112L ${AM_CFLAGS}
 tests_tokenizer_test_SOURCES = \
 	src/libespeak-ng/tokenizer.c \
 	tests/tokenizer.c
 check:	tests/encoding.test tests/tokenizer.test
 	tests/encoding.test
 	tests/tokenizer.test
 ##### phoneme data:
 espeak-ng-data/phondata: phsource/phonemes.stamp
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
  src/libespeak-ng/synthdata.c \
  src/libespeak-ng/synthesize.c \
  src/libespeak-ng/synth_mbrola.c \
  src/libespeak-ng/tokenizer.c \
  src/libespeak-ng/translate.c \
  src/libespeak-ng/tr_languages.c \
  src/libespeak-ng/voices.c \
--- a/src/libespeak-ng/readclause.c
+++ b/src/libespeak-ng/readclause.c
 static int sayas_start;
 static int ssml_ignore_l_angle = 0;
 // Punctuation symbols that can end a clause.
 // The attributes for each symbol are stored in punct_attributes at
 // (position in this table) + 1, so the two tables must keep the same order.
 static const unsigned short punct_chars[] = {
 	',', '.', '?', '!', ':', ';',
 	0x00a1, // inverted exclamation
 	0x00bf, // inverted question
 	0x2013, // en-dash
 	0x2014, // em-dash
 	0x2026, // ellipsis
 	0x037e, // Greek question mark (looks like semicolon)
 	0x0387, // Greek semicolon, ano teleia
 	0x0964, // Devanagari Danda (fullstop)
 	0x0589, // Armenian period
 	0x055d, // Armenian comma
 	0x055c, // Armenian exclamation
 	0x055e, // Armenian question
 	0x055b, // Armenian emphasis mark
 	0x060c, // Arabic ,
 	0x061b, // Arabic ;
 	0x061f, // Arabic ?
 	0x06d4, // Arabic .
 	0x0df4, // Sinhala Kunddaliya
 	0x0f0d, // Tibetan Shad
 	0x0f0e, // Tibetan (treated as a paragraph break in punct_attributes)
 	0x1362, // Ethiopic period
 	0x1363, // Ethiopic comma
 	0x1364, // Ethiopic semicolon
 	0x1365, // Ethiopic colon
 	0x1366, // Ethiopic preface colon
 	0x1367, // Ethiopic question mark
 	0x1368, // Ethiopic paragraph
 	0x10fb, // Georgian paragraph
 	0x3001, // ideograph comma
 	0x3002, // ideograph period
 	0xff01, // fullwidth exclamation
 	0xff0c, // fullwidth comma
 	0xff0e, // fullwidth period
 	0xff1a, // fullwidth colon
 	0xff1b, // fullwidth semicolon
 	0xff1f, // fullwidth question mark
 	0
 };
 // Clause attributes for each symbol in punct_chars.
 // Indexed by (entry num. in punct_chars) + 1; index 0 is the slot used when
 // the lookup of a character in punct_chars returns 0 (no match).
 // bits 0-7 pause x 10 ms, bits 12-14 intonation type, bit 15 don't need following space or bracket
 static const unsigned int punct_attributes[] = {
 	0,  // no match in punct_chars
 	CLAUSE_COMMA,
 	CLAUSE_PERIOD,
 	CLAUSE_QUESTION,
 	CLAUSE_EXCLAMATION,
 	CLAUSE_COLON,
 	CLAUSE_SEMICOLON,
 	CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // inverted exclamation
 	CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // inverted question
 	CLAUSE_SEMICOLON,  // en-dash
 	CLAUSE_SEMICOLON,  // em-dash
 	CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER,  // ellipsis
 	CLAUSE_QUESTION,  // Greek question mark
 	CLAUSE_SEMICOLON,  // Greek semicolon
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Devanagari Danda (fullstop)
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Armenian period
 	CLAUSE_COMMA,  // Armenian comma
 	CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD,  // Armenian exclamation
 	CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD,  // Armenian question
 	CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD,  // Armenian emphasis mark
 	CLAUSE_COMMA,  // Arabic ,
 	CLAUSE_SEMICOLON,  // Arabic ;
 	CLAUSE_QUESTION,  // Arabic question mark
 	CLAUSE_PERIOD,  // Arabic full stop
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Sinhala period
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Tibetan period
 	CLAUSE_PARAGRAPH,  // Tibetan 0x0f0e
 	CLAUSE_PERIOD,  // Ethiopic period
 	CLAUSE_COMMA,  // Ethiopic comma
 	CLAUSE_SEMICOLON,  // Ethiopic semicolon
 	CLAUSE_COLON,  // Ethiopic colon
 	CLAUSE_COLON,  // Ethiopic preface colon
 	CLAUSE_QUESTION,  // Ethiopic question mark
 	CLAUSE_PARAGRAPH,  // Ethiopic paragraph
 	CLAUSE_PARAGRAPH,  // Georgian paragraph
 	CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER,  // ideograph comma
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // ideograph period
 	CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth exclamation
 	CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth comma
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth period
 	CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth colon
 	CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth semicolon
 	CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth question mark
 	CLAUSE_SEMICOLON,  // spare
 	0
 };
 // stack for language and voice properties
 // frame 0 is for the defaults, before any ssml tags.
 typedef struct {
 	if (c1 == '-')
 		return CLAUSE_NONE; // no pause
 	attributes = punct_attributes[lookupwchar(punct_chars, c1)];
 	attributes = clause_type_from_codepoint(c1);
 	short_pause = CLAUSE_SHORTFALL;
 	if ((attributes & CLAUSE_INTONATION_TYPE) == 0x1000)
 	int phoneme_mode = 0;
 	int n_xml_buf;
 	int terminator;
 	int punct;
 	int found;
 	int any_alnum = 0;
 	int self_closing;
 		if ((c2 == '\n') && (option_linelength == -1)) {
 			// single-line mode, return immediately on NL
 			if ((punct = lookupwchar(punct_chars, c1)) == 0) {
 			if ((terminator = clause_type_from_codepoint(c1)) == CLAUSE_NONE) {
 				charix[ix] = count_characters - clause_start_char;
 				*charix_top = ix;
 				ix += utf8_out(c1, &buf[ix]);
 				terminator = CLAUSE_PERIOD; // line doesn't end in punctuation, assume period
 			} else
 				terminator = punct_attributes[punct];
 			}
 			buf[ix] = ' ';
 			buf[ix+1] = 0;
 			return terminator;
 			}
 			punct_data = 0;
 			if ((punct = lookupwchar(punct_chars, c1)) != 0) {
 				punct_data = punct_attributes[punct];
 			if ((punct_data = clause_type_from_codepoint(c1)) != CLAUSE_NONE) {
 				if (punct_data & CLAUSE_PUNCTUATION_IN_WORD) {
 					// Armenian punctuation inside a word
 					stressed_word = 1;
--- a/src/libespeak-ng/tokenizer.c
+++ b/src/libespeak-ng/tokenizer.c
 /* Tokenizer APIs.
 *
 * Copyright (C) 2005 to 2015 by Jonathan Duddington
 * email: [email protected]
 * Copyright (C) 2017 Reece H. Dunn
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
 */
 #include "config.h"
 #include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <espeak-ng/espeak_ng.h>
 #include "encoding.h"
 #include "speech.h"
 #include "phoneme.h"
 #include "synthesize.h"
 #include "translate.h"
 // Punctuation symbols that can end a clause.
 // The table is terminated by a 0 entry. clause_type_from_codepoint() uses the
 // position of a match here as the index into punct_attributes, so both tables
 // must list their entries in the same order.
 static const unsigned short punct_chars[] = {
 	',', '.', '?', '!', ':', ';',
 	0x00a1, // inverted exclamation
 	0x00bf, // inverted question
 	0x2013, // en-dash
 	0x2014, // em-dash
 	0x2026, // ellipsis
 	0x037e, // Greek question mark (looks like semicolon)
 	0x0387, // Greek semicolon, ano teleia
 	0x0964, // Devanagari Danda (fullstop)
 	0x0589, // Armenian period
 	0x055d, // Armenian comma
 	0x055c, // Armenian exclamation
 	0x055e, // Armenian question
 	0x055b, // Armenian emphasis mark
 	0x060c, // Arabic ,
 	0x061b, // Arabic ;
 	0x061f, // Arabic ?
 	0x06d4, // Arabic .
 	0x0df4, // Sinhala Kunddaliya
 	0x0f0d, // Tibetan Shad
 	0x0f0e, // Tibetan (treated as a paragraph break in punct_attributes)
 	0x1362, // Ethiopic period
 	0x1363, // Ethiopic comma
 	0x1364, // Ethiopic semicolon
 	0x1365, // Ethiopic colon
 	0x1366, // Ethiopic preface colon
 	0x1367, // Ethiopic question mark
 	0x1368, // Ethiopic paragraph
 	0x10fb, // Georgian paragraph
 	0x3001, // ideograph comma
 	0x3002, // ideograph period
 	0xff01, // fullwidth exclamation
 	0xff0c, // fullwidth comma
 	0xff0e, // fullwidth period
 	0xff1a, // fullwidth colon
 	0xff1b, // fullwidth semicolon
 	0xff1f, // fullwidth question mark
 	0 // end-of-table marker
 };
 // Clause attributes for each symbol in punct_chars.
 // Indexed by entry num. in punct_chars (0-based, same order as that table).
 static const unsigned int punct_attributes[] = {
 	CLAUSE_COMMA,
 	CLAUSE_PERIOD,
 	CLAUSE_QUESTION,
 	CLAUSE_EXCLAMATION,
 	CLAUSE_COLON,
 	CLAUSE_SEMICOLON,
 	CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // inverted exclamation
 	CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // inverted question
 	CLAUSE_SEMICOLON,  // en-dash
 	CLAUSE_SEMICOLON,  // em-dash
 	CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER,  // ellipsis
 	CLAUSE_QUESTION,  // Greek question mark
 	CLAUSE_SEMICOLON,  // Greek semicolon
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Devanagari Danda (fullstop)
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Armenian period
 	CLAUSE_COMMA,  // Armenian comma
 	CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD,  // Armenian exclamation
 	CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD,  // Armenian question
 	CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD,  // Armenian emphasis mark
 	CLAUSE_COMMA,  // Arabic ,
 	CLAUSE_SEMICOLON,  // Arabic ;
 	CLAUSE_QUESTION,  // Arabic question mark
 	CLAUSE_PERIOD,  // Arabic full stop
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Sinhala period
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // Tibetan period
 	CLAUSE_PARAGRAPH,  // Tibetan 0x0f0e
 	CLAUSE_PERIOD,  // Ethiopic period
 	CLAUSE_COMMA,  // Ethiopic comma
 	CLAUSE_SEMICOLON,  // Ethiopic semicolon
 	CLAUSE_COLON,  // Ethiopic colon
 	CLAUSE_COLON,  // Ethiopic preface colon
 	CLAUSE_QUESTION,  // Ethiopic question mark
 	CLAUSE_PARAGRAPH,  // Ethiopic paragraph
 	CLAUSE_PARAGRAPH,  // Georgian paragraph
 	CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER,  // ideograph comma
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // ideograph period
 	CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth exclamation
 	CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth comma
 	CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth period
 	CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth colon
 	CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth semicolon
 	CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER,  // fullwidth question mark
 	0
 };
 // Look up the clause attributes of a codepoint.
 //
 // Walks punct_chars up to its 0 terminator looking for c. On a match, the
 // entry at the same position in punct_attributes is returned; otherwise the
 // result is CLAUSE_NONE.
 int clause_type_from_codepoint(uint32_t c)
 {
 	int pos = 0;
 	while (punct_chars[pos] != 0) {
 		if (c == punct_chars[pos])
 			return punct_attributes[pos];
 		pos++;
 	}
 	return CLAUSE_NONE;
 }
--- a/src/libespeak-ng/translate.h
+++ b/src/libespeak-ng/translate.h
 #define CLAUSE_COLON       (30 | CLAUSE_INTONATION_FULL_STOP   | CLAUSE_TYPE_CLAUSE)
 #define CLAUSE_SEMICOLON   (30 | CLAUSE_INTONATION_COMMA       | CLAUSE_TYPE_CLAUSE)
 // Returns the CLAUSE_* attributes for the clause-ending punctuation
 // codepoint c, or CLAUSE_NONE when c is not such a character.
 int clause_type_from_codepoint(uint32_t c);
 //@}
 #define SAYAS_CHARS        0x12
--- a/src/windows/libespeak-ng.vcxproj
+++ b/src/windows/libespeak-ng.vcxproj
    <ClCompile Include="..\libespeak-ng\synthdata.c" />
    <ClCompile Include="..\libespeak-ng\synthesize.c" />
    <ClCompile Include="..\libespeak-ng\synth_mbrola.c" />
    <ClCompile Include="..\libespeak-ng\tokenizer.c" />
    <ClCompile Include="..\libespeak-ng\translate.c" />
    <ClCompile Include="..\libespeak-ng\tr_languages.c" />
    <ClCompile Include="..\libespeak-ng\voices.c" />
--- a/tests/tokenizer.c
+++ b/tests/tokenizer.c
 /*
 * Copyright (C) 2017 Reece H. Dunn
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see:
 *             <http://www.gnu.org/licenses/>.
 */
 #include "config.h"
 #include <assert.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <espeak-ng/espeak_ng.h>
 #include "encoding.h"
 #include "speech.h"
 #include "phoneme.h"
 #include "synthesize.h"
 #include "translate.h"
 // Checks the clause attributes of the ASCII, Latin-1 and dash/ellipsis
 // terminators, and that a plain letter yields CLAUSE_NONE.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_latin_common(void)
 {
 	printf("testing Latin/Common (Latn/Zyyy) script classification\n");
 	assert(clause_type_from_codepoint('a') == CLAUSE_NONE);
 	assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD);
 	assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION);
 	assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION);
 	assert(clause_type_from_codepoint(',') == CLAUSE_COMMA);
 	assert(clause_type_from_codepoint(':') == CLAUSE_COLON);
 	assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON);
 	assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0x00Bf) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON);
 	assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON);
 	assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER));
 }
 // Checks the clause attributes of the Greek question mark and ano teleia.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_greek(void)
 {
 	printf("testing Greek (Grek) script classification\n");
 	assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION);
 	assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON);
 }
 // Checks the clause attributes of the Armenian punctuation marks, including
 // the ones flagged CLAUSE_PUNCTUATION_IN_WORD.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_armenian(void)
 {
 	printf("testing Armenian (Armn) script classification\n");
 	assert(clause_type_from_codepoint(0x055B) == (CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD));
 	assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD));
 	assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA);
 	assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD));
 	assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
 }
 // Checks the clause attributes of the Arabic comma, semicolon, question mark
 // and full stop.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_arabic(void)
 {
 	printf("testing Arabic (Arab) script classification\n");
 	assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA);
 	assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON);
 	assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION);
 	assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD);
 }
 // Checks the clause attributes of the Devanagari danda.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_devanagari(void)
 {
 	printf("testing Devanagari (Deva) script classification\n");
 	assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
 }
 // Checks the clause attributes of the two Tibetan marks in the table
 // (0x0F0D as a period, 0x0F0E as a paragraph break).
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_tibetan(void)
 {
 	printf("testing Tibetan (Tibt) script classification\n");
 	assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH);
 }
 // Checks the clause attributes of the Sinhala kunddaliya.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_sinhala(void)
 {
 	printf("testing Sinhala (Sinh) script classification\n");
 	assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
 }
 // Checks the clause attributes of the Georgian paragraph separator.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_georgian(void)
 {
 	printf("testing Georgian (Geor) script classification\n");
 	assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH);
 }
 // Checks the clause attributes of the Ethiopic punctuation marks
 // 0x1362 through 0x1368.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_ethiopic(void)
 {
 	printf("testing Ethiopic (Ethi) script classification\n");
 	assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD);
 	assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA);
 	assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON);
 	assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON);
 	assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON);
 	assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION);
 	assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH);
 }
 // Checks the clause attributes of the ideographic comma and full stop.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_ideographic(void)
 {
 	printf("testing Ideographic (Hani) script classification\n");
 	assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
 }
 // Checks the clause attributes of the fullwidth punctuation forms; each one
 // carries CLAUSE_OPTIONAL_SPACE_AFTER on top of its base clause type.
 // Declared with (void) so the definition is a real prototype rather than an
 // old-style empty parameter list.
 void
 test_fullwidth(void)
 {
 	printf("testing Full Width/Common (Zyyy) script classification\n");
 	assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
 	assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER));
 }
 // Test driver: runs each per-script classification test in sequence, then
 // prints "done" and exits with EXIT_SUCCESS. The tests report failure through
 // assert(), so this function has no failure path of its own.
 // argc and argv are not used.
 int
 main(int argc, char **argv)
 {
 	test_latin_common();
 	test_greek();
 	test_armenian();
 	test_arabic();
 	test_devanagari();
 	test_tibetan();
 	test_sinhala();
 	test_georgian();
 	test_ethiopic();
 	test_ideographic();
 	test_fullwidth();
 	printf("done\n");
 	return EXIT_SUCCESS;
 }