Browse Source

New language option lowercaseSentence

When set in a language file, a period followed by a lower case letter is
detected as end of sentence. Normal behavior is to require a capital
letter.
master
Juho Hiltunen 4 years ago
parent
commit
c4740b3053

+ 1
- 0
.gitignore View File

@@ -118,6 +118,7 @@ tests/*.check
!tests/cmd_options.test
!tests/windows-data.test
!tests/windows-installer.test
!tests/voices.test

espeak-ng.pc


+ 2
- 0
CHANGELOG.md View File

@@ -13,6 +13,7 @@ The espeak-ng project is a fork of the espeak project.
* Add more tests to check the various parts of espeak-ng.
* Various changes to clean up the codebase.
* Restructure "option brackets" language option to "brackets" and "bracketsAnnounced".
* New Language option: "lowercaseSentence" for ending a sentence if a period is followed by a lower case letter.
* Added voice variants

documentation:
@@ -29,6 +30,7 @@ updated languages:
* grc (Ancient Greek) -- Reece Dunn (support for variant Greek letter forms)
* hak (Hakka Chinese) -- Juho Hiltunen
* haw (Hawaiian) -- Juho Hiltunen
* ka (Georgian) -- Juho Hiltunen
* kok (Konkani) -- Juho Hiltunen
* nb (Norwegian Bokmål) -- Juho Hiltunen
* nci (Classical Nahuatl) -- Juho Hiltunen

+ 1
- 0
Makefile.am View File

@@ -275,6 +275,7 @@ check: tests/encoding.check \
tests/variants.check \
tests/windows-installer.check \
tests/bom.check \
tests/voices.check \
tests/cmd_options.check

##### fuzzer:

+ 11
- 0
docs/voices.md View File

@@ -26,6 +26,7 @@
- [phonemes](#phonemes)
- [dictionary](#dictionary)
- [dictrules](#dictrules)
- [lowercaseSentence](#lowercasesentence)
- [replace](#replace)
- [stressRule](#stressrule)
- [stressLength](#stresslength)
@@ -385,6 +386,16 @@ language dictionary. They apply to rules in the language's `*_rules`
dictionary file and also its `*_list` exceptions list. See
[Text to Phoneme Translation](dictionary.md#conditional-rules).

### lowercaseSentence

lowercaseSentence <no arguments>

By default, a sentence end is detected if a period `.` is followed by an uppercase letter.
When `lowercaseSentence` is set, a period followed by a lowercase letter is also treated as the end of a sentence.

Note that other conditions, such as abbreviations, might override this setting.


### replace

replace <flags> <phoneme> <replacement phoneme>

+ 1
- 0
espeak-ng-data/lang/ccs/ka View File

@@ -1,2 +1,3 @@
name Georgian
language ka
lowercaseSentence // A period followed by a lowercase letter is treated as the end of a sentence

+ 2
- 2
src/libespeak-ng/readclause.c View File

@@ -876,9 +876,9 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_
else if (iswlower(c_next) || (c_next == '-')) // hyphen is needed for lang-hu (eg. 2.-kal)
is_end_clause = false; // only if followed by lower-case, (or if there is a XML tag)
}
if (iswlower(c_next)) {
if (iswlower(c_next) && tr->langopts.lowercase_sentence == false) {
// next word has no capital letter, this dot is probably from an abbreviation
is_end_clause = 0;
is_end_clause = false;
}
if (any_alnum == false) {
// no letters or digits yet, so probably not a sentence terminator

+ 1
- 0
src/libespeak-ng/tr_languages.c View File

@@ -295,6 +295,7 @@ static Translator *NewTranslator(void)
tr->langopts.replace_chars = NULL;
tr->langopts.alt_alphabet_lang = L('e', 'n');
tr->langopts.roman_suffix = utf8_null;
tr->langopts.lowercase_sentence = false;

SetLengthMods(tr, 201);


+ 2
- 0
src/libespeak-ng/translate.h View File

@@ -540,6 +540,7 @@ typedef struct {
int max_lengthmod;
int lengthen_tonic; // lengthen the tonic syllable
int suffix_add_e; // replace a suffix (which has the SUFX_E flag) with this character
bool lowercase_sentence; // when true, a period . causes a sentence stop even if next character is lowercase
} LANGUAGE_OPTIONS;

typedef struct {
@@ -608,6 +609,7 @@ typedef struct {
int end_stressed_vowel; // word ends with stressed vowel
int prev_dict_flags[2]; // dictionary flags from previous word
int clause_terminator;

} Translator;

#define OPTION_EMPHASIZE_ALLCAPS 0x100

+ 11
- 0
src/libespeak-ng/voices.c View File

@@ -92,6 +92,7 @@ enum {
V_BREATHW,

// these override defaults set by the translator
V_LOWERCASE_SENTENCE,
V_WORDGAP,
V_INTONATION,
V_TUNES,
@@ -122,6 +123,8 @@ static MNEM_TAB keyword_tab[] = {
{ "maintainer", V_MAINTAINER },
{ "status", V_STATUS },


{ "lowercaseSentence", V_LOWERCASE_SENTENCE },
{ "variants", V_VARIANTS },
{ "formant", V_FORMANT },
{ "pitch", V_PITCH },
@@ -654,6 +657,14 @@ voice_t *LoadVoice(const char *vname, int control)
case V_FORMANT:
VoiceFormant(p);
break;
case V_LOWERCASE_SENTENCE: {
if (langopts)
langopts->lowercase_sentence = true;
else
fprintf(stderr, "Cannot set lowercaseSentence: language not set, or is invalid.\n");
break;
}

case V_PITCH:
// default is pitch 82 118
if (sscanf(p, "%d %d", &pitch1, &pitch2) == 2) {

+ 27
- 0
tests/voices.test View File

@@ -0,0 +1,27 @@
#!/bin/sh

# Run src/espeak-ng on TEST_TEXT with voice TEST_LANG (flags -xq), capture its
# standard output in actual.txt and compare it with EXPECTED via diff.
#
# Arguments:
#   $1  TEST_LANG    voice/language name passed to -v
#   $2  MESSAGE      label printed next to the language; "-" prints no label
#   $3  EXPECTED     exact expected output (written to expected.txt)
#   $4  TEST_TEXT    input text handed to espeak-ng
#   $5  TEST_BROKEN  optional, defaults to "false"; when "broken", a mismatch
#                    is reported but does not fail the script
#
# On a mismatch the whole script exits with status 1, unless the test is
# marked as broken.
test_voices() {
    TEST_LANG=$1
    MESSAGE=$2
    EXPECTED=$3
    TEST_TEXT=$4
    TEST_BROKEN=${5:-false}

    if [ "x$MESSAGE" = x- ] ; then
        echo "testing ${TEST_LANG}"
    else
        echo "testing ${TEST_LANG} ($MESSAGE)"
    fi

    # ${OPTIONS} is deliberately left unquoted so it can expand to several
    # whitespace-separated command-line options, or to nothing at all.
    ESPEAK_DATA_PATH="$(pwd)" LD_LIBRARY_PATH="src:${LD_LIBRARY_PATH}" \
        src/espeak-ng ${OPTIONS} -xq -v "${TEST_LANG}" "${TEST_TEXT}" > actual.txt
    echo "${EXPECTED}" > expected.txt

    # Quote the flag so an empty or multi-word value cannot break the test.
    if [ "x$TEST_BROKEN" = xbroken ] ; then
        diff expected.txt actual.txt || echo "... ignoring error (broken)"
    else
        diff expected.txt actual.txt || exit 1
    fi
}

##### Voice options
# The Georgian (ka) language file sets lowercaseSentence, so the period in the
# input is expected to end the sentence even though the next word does not
# start with a capital letter. The expected output therefore spans two lines
# (presumably one per clause -- confirm against the espeak-ng -x output).
test_voices ka "lowercaseSentence" "s'asi,amovn,oa t#k#v'eni g'ats#noba
r'ogor brdz'andebit#" "სასიამოვნოა თქვენი გაცნობა. როგორ ბრძანდებით" #872

Loading…
Cancel
Save