When set in a language file, a period followed by a lowercase letter is detected as the end of a sentence. Normal behavior is to require a capital letter.
@@ -118,6 +118,7 @@ tests/*.check | |||
!tests/cmd_options.test | |||
!tests/windows-data.test | |||
!tests/windows-installer.test | |||
!tests/voices.test | |||
espeak-ng.pc | |||
@@ -13,6 +13,7 @@ The espeak-ng project is a fork of the espeak project. | |||
* Add more tests to check the various parts of espeak-ng. | |||
* Various changes to clean up the codebase. | |||
* Restructure "option brackets" language option to "brackets" and "bracketsAnnounced". | |||
* New Language option: "lowercaseSentence" for ending a sentence if a period is followed by a lower case letter. | |||
* Added voice variants | |||
documentation: | |||
@@ -29,6 +30,7 @@ updated languages: | |||
* grc (Ancient Greek) -- Reece Dunn (support for variant Greek letter forms) | |||
* hak (Hakka Chinese) -- Juho Hiltunen | |||
* haw (Hawaiian) -- Juho Hiltunen | |||
* ka (Georgian) -- Juho Hiltunen | |||
* kok (Konkani) -- Juho Hiltunen | |||
* nb (Norwegian Bokmål) -- Juho Hiltunen | |||
* nci (Classical Nahuatl) -- Juho Hiltunen |
@@ -275,6 +275,7 @@ check: tests/encoding.check \ | |||
tests/variants.check \ | |||
tests/windows-installer.check \ | |||
tests/bom.check \ | |||
tests/voices.check \ | |||
tests/cmd_options.check | |||
##### fuzzer: |
@@ -26,6 +26,7 @@ | |||
- [phonemes](#phonemes) | |||
- [dictionary](#dictionary) | |||
- [dictrules](#dictrules) | |||
- [lowercaseSentence](#lowercasesentence) | |||
- [replace](#replace) | |||
- [stressRule](#stressrule) | |||
- [stressLength](#stresslength) | |||
@@ -385,6 +386,16 @@ language dictionary. They apply to rules in the language's `*_rules` | |||
dictionary file and also its `*_list` exceptions list. See | |||
[Text to Phoneme Translation](dictionary.md#conditional-rules). | |||
### lowercaseSentence | |||
lowercaseSentence <no arguments> | |||
By default, a sentence end is detected if a period `.` is followed by an uppercase letter. | |||
When `lowercaseSentence` is set, a period followed by a lowercase letter is also treated as the end of a sentence. | |||
Note that other conditions, such as abbreviations, might override this setting. | |||
### replace | |||
replace <flags> <phoneme> <replacement phoneme> |
@@ -1,2 +1,3 @@ | |||
name Georgian | |||
language ka | |||
lowercaseSentence // A period followed by a lowercase letter is considered the end of a sentence |
@@ -876,9 +876,9 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_ | |||
else if (iswlower(c_next) || (c_next == '-')) // hyphen is needed for lang-hu (eg. 2.-kal) | |||
is_end_clause = false; // only if followed by lower-case, (or if there is a XML tag) | |||
} | |||
if (iswlower(c_next)) { | |||
if (iswlower(c_next) && tr->langopts.lowercase_sentence == false) { | |||
// next word has no capital letter, this dot is probably from an abbreviation | |||
is_end_clause = 0; | |||
is_end_clause = false; | |||
} | |||
if (any_alnum == false) { | |||
// no letters or digits yet, so probably not a sentence terminator |
@@ -295,6 +295,7 @@ static Translator *NewTranslator(void) | |||
tr->langopts.replace_chars = NULL; | |||
tr->langopts.alt_alphabet_lang = L('e', 'n'); | |||
tr->langopts.roman_suffix = utf8_null; | |||
tr->langopts.lowercase_sentence = false; | |||
SetLengthMods(tr, 201); | |||
@@ -540,6 +540,7 @@ typedef struct { | |||
int max_lengthmod; | |||
int lengthen_tonic; // lengthen the tonic syllable | |||
int suffix_add_e; // replace a suffix (which has the SUFX_E flag) with this character | |||
bool lowercase_sentence; // when true, a period . causes a sentence stop even if next character is lowercase | |||
} LANGUAGE_OPTIONS; | |||
typedef struct { | |||
@@ -608,6 +609,7 @@ typedef struct { | |||
int end_stressed_vowel; // word ends with stressed vowel | |||
int prev_dict_flags[2]; // dictionary flags from previous word | |||
int clause_terminator; | |||
} Translator; | |||
#define OPTION_EMPHASIZE_ALLCAPS 0x100 |
@@ -92,6 +92,7 @@ enum { | |||
V_BREATHW, | |||
// these override defaults set by the translator | |||
V_LOWERCASE_SENTENCE, | |||
V_WORDGAP, | |||
V_INTONATION, | |||
V_TUNES, | |||
@@ -122,6 +123,8 @@ static MNEM_TAB keyword_tab[] = { | |||
{ "maintainer", V_MAINTAINER }, | |||
{ "status", V_STATUS }, | |||
{ "lowercaseSentence", V_LOWERCASE_SENTENCE }, | |||
{ "variants", V_VARIANTS }, | |||
{ "formant", V_FORMANT }, | |||
{ "pitch", V_PITCH }, | |||
@@ -654,6 +657,14 @@ voice_t *LoadVoice(const char *vname, int control) | |||
case V_FORMANT: | |||
VoiceFormant(p); | |||
break; | |||
case V_LOWERCASE_SENTENCE: { | |||
if (langopts) | |||
langopts->lowercase_sentence = true; | |||
else | |||
fprintf(stderr, "Cannot set lowercaseSentence: language not set, or is invalid.\n"); | |||
break; | |||
} | |||
case V_PITCH: | |||
// default is pitch 82 118 | |||
if (sscanf(p, "%d %d", &pitch1, &pitch2) == 2) { |
@@ -0,0 +1,27 @@ | |||
#!/bin/sh | |||
# Run src/espeak-ng (-xq) on a piece of text with a given voice and compare
# what it prints against the expected output.
#
#   $1  voice name handed to -v
#   $2  label shown in the progress line; "-" prints the voice name only
#   $3  expected output (written to expected.txt)
#   $4  input text
#   $5  optional; "broken" reports a mismatch but does not abort the script
#       (defaults to "false")
#
# Any extra command-line flags are taken from ${OPTIONS}.
# On a mismatch for a test not marked broken, the script exits with status 1.
test_voices() {
	TEST_LANG=$1
	MESSAGE=$2
	EXPECTED=$3
	TEST_TEXT=$4
	TEST_BROKEN=${5:-false}

	# Progress line: only mention the label when one was supplied.
	case "$MESSAGE" in
	-)
		echo "testing ${TEST_LANG}"
		;;
	*)
		echo "testing ${TEST_LANG} ($MESSAGE)"
		;;
	esac

	# Use the freshly built binary and the data in the current directory.
	ESPEAK_DATA_PATH=$(pwd) LD_LIBRARY_PATH=src:${LD_LIBRARY_PATH} \
		src/espeak-ng ${OPTIONS} -xq -v ${TEST_LANG} "${TEST_TEXT}" > actual.txt
	echo "${EXPECTED}" > expected.txt

	if [ x$TEST_BROKEN = xbroken ] ; then
		# Known-broken test: show the diff, but keep going.
		if ! diff expected.txt actual.txt ; then
			echo "... ignoring error (broken)"
		fi
	else
		diff expected.txt actual.txt || exit 1
	fi
}
##### Voice options | |||
test_voices ka "lowercaseSentence" "s'asi,amovn,oa t#k#v'eni g'ats#noba | |||
r'ogor brdz'andebit#" "სასიამოვნოა თქვენი გაცნობა. როგორ ბრძანდებით" #872 |