When set in a language file, a period followed by a lowercase letter is detected as the end of a sentence. The default behavior is to require a capital letter.

4 years ago · c4740b3053
--- a/.gitignore
+++ b/.gitignore
@@ -118,6 +118,7 @@ tests/*.check
 !tests/cmd_options.test
 !tests/windows-data.test
 !tests/windows-installer.test
 !tests/voices.test

 espeak-ng.pc

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ The espeak-ng project is a fork of the espeak project.
 *  Add more tests to check the various parts of espeak-ng.
 *  Various changes to clean up the codebase.
 *  Restructure "option brackets" language option to "brackets" and "bracketsAnnounced".
 *  New Language option: "lowercaseSentence" for ending a sentence if a period is followed by a lower case letter.
 *  Added voice variants

 documentation:
@@ -29,6 +30,7 @@ updated languages:
 *  grc (Ancient Greek) -- Reece Dunn (support for variant Greek letter forms)
 *  hak (Hakka Chinese) -- Juho Hiltunen
 *  haw (Hawaiian) -- Juho Hiltunen
 *  ka (Georgian) -- Juho Hiltunen
 *  kok (Konkani) -- Juho Hiltunen
 *  nb (Norwegian Bokmål) -- Juho Hiltunen
 *  nci (Classical Nahuatl) -- Juho Hiltunen
--- a/Makefile.am
+++ b/Makefile.am
@@ -275,6 +275,7 @@ check:	tests/encoding.check \
 	tests/variants.check \
 	tests/windows-installer.check \
 	tests/bom.check \
 	tests/voices.check \
 	tests/cmd_options.check

 ##### fuzzer:
--- a/docs/voices.md
+++ b/docs/voices.md
@@ -26,6 +26,7 @@
  - [phonemes](#phonemes)
  - [dictionary](#dictionary)
  - [dictrules](#dictrules)
  - [lowercaseSentence](#lowercasesentence)
  - [replace](#replace)
  - [stressRule](#stressrule)
  - [stressLength](#stresslength)
@@ -385,6 +386,16 @@ language dictionary. They apply to rules in the language's `*_rules`
 dictionary file and also its `*_list` exceptions list. See
 [Text to Phoneme Translation](dictionary.md#conditional-rules).

 ### lowercaseSentence

 	lowercaseSentence <no arguments>

 By default, a sentence end is detected if a period `.` is followed by an uppercase letter.
 When lowercaseSentence is set, a period followed by a lowercase letter is also handled as end of sentence.

 Note that other conditions, such as abbreviations, might override this setting.


 ### replace

 	replace <flags> <phoneme> <replacement phoneme>
--- a/espeak-ng-data/lang/ccs/ka
+++ b/espeak-ng-data/lang/ccs/ka
@@ -1,2 +1,3 @@
 name Georgian
 language ka
 lowercaseSentence	// A period followed by a lowercase letter is considered the end of a sentence
--- a/src/libespeak-ng/readclause.c
+++ b/src/libespeak-ng/readclause.c
@@ -876,9 +876,9 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_
 							else if (iswlower(c_next) || (c_next == '-')) // hyphen is needed for lang-hu (eg. 2.-kal)
 								is_end_clause = false; // only if followed by lower-case, (or if there is a XML tag)
 						} 
 						if (iswlower(c_next)) {
 						if (iswlower(c_next) && tr->langopts.lowercase_sentence == false) {
 							// next word has no capital letter, this dot is probably from an abbreviation
 							is_end_clause = 0;
 							is_end_clause = false;
 						}
 						if (any_alnum == false) {
 							// no letters or digits yet, so probably not a sentence terminator
--- a/src/libespeak-ng/tr_languages.c
+++ b/src/libespeak-ng/tr_languages.c
@@ -295,6 +295,7 @@ static Translator *NewTranslator(void)
 	tr->langopts.replace_chars = NULL;
 	tr->langopts.alt_alphabet_lang = L('e', 'n');
 	tr->langopts.roman_suffix = utf8_null;
 	tr->langopts.lowercase_sentence = false;

 	SetLengthMods(tr, 201);

--- a/src/libespeak-ng/translate.h
+++ b/src/libespeak-ng/translate.h
@@ -540,6 +540,7 @@ typedef struct {
 	int max_lengthmod;
 	int lengthen_tonic;   // lengthen the tonic syllable
 	int suffix_add_e;      // replace a suffix (which has the SUFX_E flag) with this character
 	bool lowercase_sentence;	// when true, a period . causes a sentence stop even if next character is lowercase
 } LANGUAGE_OPTIONS;

 typedef struct {
@@ -608,6 +609,7 @@ typedef struct {
 	int end_stressed_vowel;  // word ends with stressed vowel
 	int prev_dict_flags[2];     // dictionary flags from previous word
 	int clause_terminator;

 } Translator;

 #define OPTION_EMPHASIZE_ALLCAPS  0x100
--- a/src/libespeak-ng/voices.c
+++ b/src/libespeak-ng/voices.c
@@ -92,6 +92,7 @@ enum {
 	V_BREATHW,

 	// these override defaults set by the translator
 	V_LOWERCASE_SENTENCE,
 	V_WORDGAP,
 	V_INTONATION,
 	V_TUNES,
@@ -122,6 +123,8 @@ static MNEM_TAB keyword_tab[] = {
 	{ "maintainer",   V_MAINTAINER },
 	{ "status",       V_STATUS },


 	{ "lowercaseSentence",	V_LOWERCASE_SENTENCE },
 	{ "variants",     V_VARIANTS },
 	{ "formant",      V_FORMANT },
 	{ "pitch",        V_PITCH },
@@ -654,6 +657,14 @@ voice_t *LoadVoice(const char *vname, int control)
 		case V_FORMANT:
 			VoiceFormant(p);
 			break;
 		case V_LOWERCASE_SENTENCE: {
 			if (langopts)
 				langopts->lowercase_sentence = true;
 			else
 				fprintf(stderr, "Cannot set lowercaseSentence: language not set, or is invalid.\n");
 			break;
 			}

 		case V_PITCH:
 			// default is  pitch 82 118
 			if (sscanf(p, "%d %d", &pitch1, &pitch2) == 2) {
--- a/tests/voices.test
+++ b/tests/voices.test
@@ -0,0 +1,27 @@
 #!/bin/sh

 # Run src/espeak-ng for one voice and compare its output with the expected text.
 #   $1  voice/language name, passed to -v
 #   $2  label shown in the progress line, or "-" to print the voice name only
 #   $3  expected output (written to expected.txt)
 #   $4  input text handed to espeak-ng
 #   $5  "broken" to report a mismatch but keep going (defaults to "false")
 # Exits the script with status 1 on a mismatch unless $5 is "broken".
 # ${OPTIONS} is not set here (presumably supplied by the caller's environment)
 # and stays unquoted so that it can expand to several options or to nothing.
 test_voices() {
 	TEST_LANG=$1
 	MESSAGE=$2
 	EXPECTED=$3
 	TEST_TEXT=$4
 	TEST_BROKEN=${5:-false}

 	if [ "x$MESSAGE" = x- ] ; then
 		echo "testing ${TEST_LANG}"
 	else
 		echo "testing ${TEST_LANG} ($MESSAGE)"
 	fi
 	# Capture espeak-ng's stdout so it can be diffed against the expected text.
 	ESPEAK_DATA_PATH=$(pwd) LD_LIBRARY_PATH=src:${LD_LIBRARY_PATH} \
 		src/espeak-ng ${OPTIONS} -xq -v "${TEST_LANG}" "${TEST_TEXT}" > actual.txt
 	echo "${EXPECTED}" > expected.txt
 	# Quoted so that an empty or multi-word $5 cannot break the test expression.
 	if [ "x$TEST_BROKEN" = xbroken ] ; then
 		diff expected.txt actual.txt || (echo "... ignoring error (broken)" && true)
 	else
 		diff expected.txt actual.txt || exit 1
 	fi
 }

 ##### Voice options
 # Exercises the lowercaseSentence option through the ka voice. The input text
 # contains one period and the expected output spans two lines, presumably one
 # per detected clause.
 # NOTE(review): the trailing "#872" is a shell comment, not a fifth argument;
 # it looks like an issue/PR reference — confirm.
 test_voices ka "lowercaseSentence" "s'asi,amovn,oa t#k#v'eni g'ats#noba
 r'ogor brdz'andebit#" "სასიამოვნოა თქვენი გაცნობა. როგორ ბრძანდებით" #872