Browse Source

move ProcessSsmlTag() from readclause.c to ssml.c

use parameters instead of globals
master
Juho Hiltunen 7 years ago
parent
commit
a47ff4f41d
3 changed files with 394 additions and 392 deletions
  1. 3
    387
      src/libespeak-ng/readclause.c
  2. 388
    1
      src/libespeak-ng/ssml.c
  3. 3
    4
      src/libespeak-ng/ssml.h

+ 3
- 387
src/libespeak-ng/readclause.c View File

@@ -416,7 +416,7 @@ static int LookupSoundicon(int c)
return -1;
}

static int LoadSoundFile2(const char *fname)
int LoadSoundFile2(const char *fname)
{
// Load a sound file into one of the reserved slots in the sound icon table
// (if it's not already loaded)
@@ -549,7 +549,7 @@ static int AnnouncePunctuation(Translator *tr, int c1, int *c2_ptr, char *output
return short_pause;
}

static int AddNameData(const char *name, int wide)
int AddNameData(const char *name, int wide)
{
// Add the name to the namedata and return its position
// (Used by the Windows SAPI wrapper)
@@ -601,390 +601,6 @@ void SetVoiceStack(espeak_VOICE *v, const char *variant_name)
memcpy(&base_voice, &current_voice_selected, sizeof(base_voice));
}

static int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, bool self_closing)
{
// xml_buf is the tag and attributes with a zero terminator in place of the original '>'
// returns a clause terminator value.

unsigned int ix;
int index;
int c;
int tag_type;
int value;
int value2;
int value3;
int voice_change_flag;
wchar_t *px;
wchar_t *attr1;
wchar_t *attr2;
wchar_t *attr3;
int terminator;
char *uri;
int param_type;
char tag_name[40];
char buf[80];
PARAM_STACK *sp;
SSML_STACK *ssml_sp;

static const MNEM_TAB mnem_phoneme_alphabet[] = {
{ "espeak", 1 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_punct[] = {
{ "none", 1 },
{ "all", 2 },
{ "some", 3 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_capitals[] = {
{ "no", 0 },
{ "icon", 1 },
{ "spelling", 2 },
{ "pitch", 20 }, // this is the amount by which to raise the pitch
{ NULL, -1 }
};

static const MNEM_TAB mnem_interpret_as[] = {
{ "characters", SAYAS_CHARS },
{ "tts:char", SAYAS_SINGLE_CHARS },
{ "tts:key", SAYAS_KEY },
{ "tts:digits", SAYAS_DIGITS },
{ "telephone", SAYAS_DIGITS1 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_sayas_format[] = {
{ "glyphs", 1 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_break[] = {
{ "none", 0 },
{ "x-weak", 1 },
{ "weak", 2 },
{ "medium", 3 },
{ "strong", 4 },
{ "x-strong", 5 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_emphasis[] = {
{ "none", 1 },
{ "reduced", 2 },
{ "moderate", 3 },
{ "strong", 4 },
{ "x-strong", 5 },
{ NULL, -1 }
};

static const char *prosody_attr[5] = {
NULL, "rate", "volume", "pitch", "range"
};

for (ix = 0; ix < (sizeof(tag_name)-1); ix++) {
if (((c = xml_buf[ix]) == 0) || iswspace(c))
break;
tag_name[ix] = tolower((char)c);
}
tag_name[ix] = 0;

px = &xml_buf[ix]; // the tag's attributes

if (tag_name[0] == '/') {
// closing tag
if ((tag_type = LookupMnem(ssmltags, &tag_name[1])) != HTML_NOSPACE)
outbuf[(*outix)++] = ' ';
tag_type += SSML_CLOSE;
} else {
if ((tag_type = LookupMnem(ssmltags, tag_name)) != HTML_NOSPACE) {
// separate SSML tags from the previous word (but not HMTL tags such as <b> <font> which can occur inside a word)
outbuf[(*outix)++] = ' ';
}

if (self_closing && ignore_if_self_closing[tag_type])
return 0;
}

voice_change_flag = 0;
ssml_sp = &ssml_stack[n_ssml_stack-1];

switch (tag_type)
{
case SSML_STYLE:
sp = PushParamStack(tag_type, &n_param_stack, (PARAM_STACK *) param_stack);
attr1 = GetSsmlAttribute(px, "field");
attr2 = GetSsmlAttribute(px, "mode");


if (attrcmp(attr1, "punctuation") == 0) {
value = attrlookup(attr2, mnem_punct);
sp->parameter[espeakPUNCTUATION] = value;
} else if (attrcmp(attr1, "capital_letters") == 0) {
value = attrlookup(attr2, mnem_capitals);
sp->parameter[espeakCAPITALS] = value;
}
ProcessParamStack(outbuf, outix, n_param_stack, param_stack, speech_parameters);
break;
case SSML_PROSODY:
sp = PushParamStack(tag_type, &n_param_stack, (PARAM_STACK *) param_stack);

// look for attributes: rate, volume, pitch, range
for (param_type = espeakRATE; param_type <= espeakRANGE; param_type++) {
if ((attr1 = GetSsmlAttribute(px, prosody_attr[param_type])) != NULL)
SetProsodyParameter(param_type, attr1, sp, &param_stack, &speech_parameters);
}

ProcessParamStack(outbuf, outix, n_param_stack, param_stack, speech_parameters);
break;
case SSML_EMPHASIS:
sp = PushParamStack(tag_type, &n_param_stack, (PARAM_STACK *) param_stack);
value = 3; // default is "moderate"
if ((attr1 = GetSsmlAttribute(px, "level")) != NULL)
value = attrlookup(attr1, mnem_emphasis);

if (translator->langopts.tone_language == 1) {
static unsigned char emphasis_to_pitch_range[] = { 50, 50, 40, 70, 90, 100 };
static unsigned char emphasis_to_volume[] = { 100, 100, 70, 110, 135, 150 };
// tone language (eg.Chinese) do emphasis by increasing the pitch range.
sp->parameter[espeakRANGE] = emphasis_to_pitch_range[value];
sp->parameter[espeakVOLUME] = emphasis_to_volume[value];
} else {
static unsigned char emphasis_to_volume2[] = { 100, 100, 75, 100, 120, 150 };
sp->parameter[espeakVOLUME] = emphasis_to_volume2[value];
sp->parameter[espeakEMPHASIS] = value;
}
ProcessParamStack(outbuf, outix, n_param_stack, param_stack, speech_parameters);
break;
case SSML_STYLE + SSML_CLOSE:
case SSML_PROSODY + SSML_CLOSE:
case SSML_EMPHASIS + SSML_CLOSE:
PopParamStack(tag_type, outbuf, outix, &n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
break;
case SSML_PHONEME:
attr1 = GetSsmlAttribute(px, "alphabet");
attr2 = GetSsmlAttribute(px, "ph");
value = attrlookup(attr1, mnem_phoneme_alphabet);
if (value == 1) { // alphabet="espeak"
outbuf[(*outix)++] = '[';
outbuf[(*outix)++] = '[';
*outix += attrcopy_utf8(&outbuf[*outix], attr2, n_outbuf-*outix);
outbuf[(*outix)++] = ']';
outbuf[(*outix)++] = ']';
}
break;
case SSML_SAYAS:
attr1 = GetSsmlAttribute(px, "interpret-as");
attr2 = GetSsmlAttribute(px, "format");
attr3 = GetSsmlAttribute(px, "detail");
value = attrlookup(attr1, mnem_interpret_as);
value2 = attrlookup(attr2, mnem_sayas_format);
if (value2 == 1)
value = SAYAS_GLYPHS;

value3 = attrnumber(attr3, 0, 0);

if (value == SAYAS_DIGITS) {
if (value3 <= 1)
value = SAYAS_DIGITS1;
else
value = SAYAS_DIGITS + value3;
}

sprintf(buf, "%c%dY", CTRL_EMBEDDED, value);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);

sayas_start = *outix;
sayas_mode = value; // punctuation doesn't end clause during SAY-AS
break;
case SSML_SAYAS + SSML_CLOSE:
if (sayas_mode == SAYAS_KEY) {
outbuf[*outix] = 0;
ReplaceKeyName(outbuf, sayas_start, outix);
}

outbuf[(*outix)++] = CTRL_EMBEDDED;
outbuf[(*outix)++] = 'Y';
sayas_mode = 0;
break;
case SSML_SUB:
if ((attr1 = GetSsmlAttribute(px, "alias")) != NULL) {
// use the alias rather than the text
ignore_text = true;
*outix += attrcopy_utf8(&outbuf[*outix], attr1, n_outbuf-*outix);
}
break;
case SSML_IGNORE_TEXT:
ignore_text = true;
break;
case SSML_SUB + SSML_CLOSE:
case SSML_IGNORE_TEXT + SSML_CLOSE:
ignore_text = false;
break;
case SSML_MARK:
if ((attr1 = GetSsmlAttribute(px, "name")) != NULL) {
// add name to circular buffer of marker names
attrcopy_utf8(buf, attr1, sizeof(buf));

if (strcmp(skip_marker, buf) == 0) {
// This is the marker we are waiting for before starting to speak
clear_skipping_text = true;
skip_marker[0] = 0;
return CLAUSE_NONE;
}

if ((index = AddNameData(buf, 0)) >= 0) {
sprintf(buf, "%c%dM", CTRL_EMBEDDED, index);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);
}
}
break;
case SSML_AUDIO:
sp = PushParamStack(tag_type, &n_param_stack, (PARAM_STACK *)param_stack);

if ((attr1 = GetSsmlAttribute(px, "src")) != NULL) {
char fname[256];
attrcopy_utf8(buf, attr1, sizeof(buf));

if (uri_callback == NULL) {
if ((xmlbase != NULL) && (buf[0] != '/')) {
sprintf(fname, "%s/%s", xmlbase, buf);
index = LoadSoundFile2(fname);
} else
index = LoadSoundFile2(buf);
if (index >= 0) {
sprintf(buf, "%c%dI", CTRL_EMBEDDED, index);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);
sp->parameter[espeakSILENCE] = 1;
}
} else {
if ((index = AddNameData(buf, 0)) >= 0) {
uri = &namedata[index];
if (uri_callback(1, uri, xmlbase) == 0) {
sprintf(buf, "%c%dU", CTRL_EMBEDDED, index);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);
sp->parameter[espeakSILENCE] = 1;
}
}
}
}
ProcessParamStack(outbuf, outix, n_param_stack, param_stack, speech_parameters);

if (self_closing)
PopParamStack(tag_type, outbuf, outix, &n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
else
audio_text = true;
return CLAUSE_NONE;
case SSML_AUDIO + SSML_CLOSE:
PopParamStack(tag_type, outbuf, outix, &n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
audio_text = false;
return CLAUSE_NONE;
case SSML_BREAK:
value = 21;
terminator = CLAUSE_NONE;

if ((attr1 = GetSsmlAttribute(px, "strength")) != NULL) {
static int break_value[6] = { 0, 7, 14, 21, 40, 80 }; // *10mS
value = attrlookup(attr1, mnem_break);
if (value < 3) {
// adjust prepause on the following word
sprintf(&outbuf[*outix], "%c%dB", CTRL_EMBEDDED, value);
*outix += 3;
terminator = 0;
}
value = break_value[value];
}
if ((attr2 = GetSsmlAttribute(px, "time")) != NULL) {
value2 = attrnumber(attr2, 0, 1); // pause in mS

// compensate for speaking speed to keep constant pause length, see function PauseLength()
// 'value' here is x 10mS
value = (value2 * 256) / (speed.clause_pause_factor * 10);
if (value < 200)
value = (value2 * 256) / (speed.pause_factor * 10);

if (terminator == 0)
terminator = CLAUSE_NONE;
}
if (terminator) {
if (value > 0xfff) {
// scale down the value and set a scaling indicator bit
value = value / 32;
if (value > 0xfff)
value = 0xfff;
terminator |= CLAUSE_PAUSE_LONG;
}
return terminator + value;
}
break;
case SSML_SPEAK:
if ((attr1 = GetSsmlAttribute(px, "xml:base")) != NULL) {
attrcopy_utf8(buf, attr1, sizeof(buf));
if ((index = AddNameData(buf, 0)) >= 0)
xmlbase = &namedata[index];
}
if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name) == 0)
return 0; // no voice change
return CLAUSE_VOICE;
case SSML_VOICE:
if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name) == 0)
return 0; // no voice change
return CLAUSE_VOICE;
case SSML_SPEAK + SSML_CLOSE:
// unwind stack until the previous <voice> or <speak> tag
while ((n_ssml_stack > 1) && (ssml_stack[n_ssml_stack-1].tag_type != SSML_SPEAK))
n_ssml_stack--;
return CLAUSE_PERIOD + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
case SSML_VOICE + SSML_CLOSE:
// unwind stack until the previous <voice> or <speak> tag
while ((n_ssml_stack > 1) && (ssml_stack[n_ssml_stack-1].tag_type != SSML_VOICE))
n_ssml_stack--;

terminator = 0; // ?? Sentence intonation, but no pause ??
return terminator + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
case HTML_BREAK:
case HTML_BREAK + SSML_CLOSE:
return CLAUSE_COLON;
case SSML_SENTENCE:
if (ssml_sp->tag_type == SSML_SENTENCE) {
// new sentence implies end-of-sentence
voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
}
voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
return CLAUSE_PARAGRAPH + voice_change_flag;
case SSML_PARAGRAPH:
if (ssml_sp->tag_type == SSML_SENTENCE) {
// new paragraph implies end-of-sentence or end-of-paragraph
voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
}
if (ssml_sp->tag_type == SSML_PARAGRAPH) {
// new paragraph implies end-of-sentence or end-of-paragraph
voice_change_flag |= GetVoiceAttributes(px, SSML_PARAGRAPH+SSML_CLOSE, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
}
voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
return CLAUSE_PARAGRAPH + voice_change_flag;
case SSML_SENTENCE + SSML_CLOSE:
if (ssml_sp->tag_type == SSML_SENTENCE) {
// end of a sentence which specified a language
voice_change_flag = GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name);
}
return CLAUSE_PERIOD + voice_change_flag;
case SSML_PARAGRAPH + SSML_CLOSE:
if ((ssml_sp->tag_type == SSML_SENTENCE) || (ssml_sp->tag_type == SSML_PARAGRAPH)) {
// End of a paragraph which specified a language.
// (End-of-paragraph also implies end-of-sentence)
return GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, n_ssml_stack, current_voice_id, &base_voice, base_voice_variant_name) + CLAUSE_PARAGRAPH;
}
return CLAUSE_PARAGRAPH;
}
return 0;
}

static void RemoveChar(char *p)
{
// Replace a UTF-8 character by spaces
@@ -1176,7 +792,7 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_
self_closing = true;
}

terminator = ProcessSsmlTag(xml_buf, buf, &ix, n_buf, self_closing);
terminator = ProcessSsmlTag(xml_buf, buf, &ix, n_buf, self_closing, xmlbase, &audio_text, current_voice_id, &base_voice, base_voice_variant_name, &ignore_text, &clear_skipping_text, &sayas_mode, &sayas_start, ssml_stack, &n_ssml_stack, &n_param_stack, (int *)speech_parameters);

if (terminator != 0) {
buf[ix] = ' ';

+ 388
- 1
src/libespeak-ng/ssml.c View File

@@ -46,6 +46,7 @@
#include "translate.h"
#include "ssml.h"


int attrcmp(const wchar_t *string1, const char *string2)
{
int ix;
@@ -158,7 +159,7 @@ int attr_prosody_value(int param_type, const wchar_t *pw, int *value_out)
return sign; // -1, 0, or 1
}

int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char base_voice_variant_name[40])
static int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char *base_voice_variant_name)
{
// Determines whether voice attribute are specified in this tag, and if so, whether this means
// a voice change.
@@ -507,3 +508,389 @@ void SetProsodyParameter(int param_type, wchar_t *attr1, PARAM_STACK *sp, PARAM_
}
}

// Process one SSML or HTML tag and apply its effect to the clause being
// built: push/pop entries on the speech-parameter stack, write embedded
// control sequences (CTRL_EMBEDDED commands) into outbuf, and/or signal a
// clause break to the caller (ReadClause). Moved here from readclause.c;
// the former file-scope state is now passed in as parameters (though some
// globals, e.g. param_stack, skip_marker, namedata, uri_callback, speed and
// translator, are still referenced directly).
//
// xml_buf        tag name and attributes, zero-terminated in place of the
//                original '>'; a leading '/' marks a closing tag
// outbuf/outix   clause output buffer and in/out write index
// n_outbuf       size of outbuf, used to bound attribute copies
// self_closing   true for <tag ... /> forms
// xmlbase        base URI for relative <audio src=...> paths, may be NULL
// audio_text     out: true while inside an <audio> element
// current_voice_id, base_voice, base_voice_variant_name
//                voice-selection state shared with GetVoiceAttributes()
// ignore_text    out: true while element content should not be spoken
// clear_skipping_text  out: set when the awaited <mark> name is reached
// sayas_mode, sayas_start  in/out: say-as interpretation state
// ssml_stack, n_ssml_stack  open-element stack and its depth (in/out)
// n_param_stack  in/out: depth of the speech-parameter stack
// speech_parameters  current speech parameter values
//
// Returns a clause terminator value (CLAUSE_*), possibly with a pause
// length added, or 0 when the tag causes no clause break.
int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, bool self_closing, const char *xmlbase, bool *audio_text, char *current_voice_id, espeak_VOICE *base_voice, char *base_voice_variant_name, bool *ignore_text, bool *clear_skipping_text, int *sayas_mode, int *sayas_start, SSML_STACK *ssml_stack, int *n_ssml_stack, int *n_param_stack, int *speech_parameters)
{
// xml_buf is the tag and attributes with a zero terminator in place of the original '>'
// returns a clause terminator value.

unsigned int ix;
int index;
int c;
int tag_type;
int value;
int value2;
int value3;
int voice_change_flag;
wchar_t *px;
wchar_t *attr1;
wchar_t *attr2;
wchar_t *attr3;
int terminator;
char *uri;
int param_type;
char tag_name[40];
char buf[80];
PARAM_STACK *sp;
SSML_STACK *ssml_sp;

// these tags have no effect if they are self-closing, eg. <voice />
static char ignore_if_self_closing[] = { 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0 };

// Attribute-value lookup tables; attrlookup() returns -1 for a string
// that is not present in the table.
static const MNEM_TAB mnem_phoneme_alphabet[] = {
{ "espeak", 1 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_punct[] = {
{ "none", 1 },
{ "all", 2 },
{ "some", 3 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_capitals[] = {
{ "no", 0 },
{ "icon", 1 },
{ "spelling", 2 },
{ "pitch", 20 }, // this is the amount by which to raise the pitch
{ NULL, -1 }
};

static const MNEM_TAB mnem_interpret_as[] = {
{ "characters", SAYAS_CHARS },
{ "tts:char", SAYAS_SINGLE_CHARS },
{ "tts:key", SAYAS_KEY },
{ "tts:digits", SAYAS_DIGITS },
{ "telephone", SAYAS_DIGITS1 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_sayas_format[] = {
{ "glyphs", 1 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_break[] = {
{ "none", 0 },
{ "x-weak", 1 },
{ "weak", 2 },
{ "medium", 3 },
{ "strong", 4 },
{ "x-strong", 5 },
{ NULL, -1 }
};

static const MNEM_TAB mnem_emphasis[] = {
{ "none", 1 },
{ "reduced", 2 },
{ "moderate", 3 },
{ "strong", 4 },
{ "x-strong", 5 },
{ NULL, -1 }
};

// <prosody> attribute names, indexed by espeakRATE..espeakRANGE
static const char *prosody_attr[5] = {
NULL, "rate", "volume", "pitch", "range"
};

// extract the tag name, lower-cased, up to the first whitespace or end
for (ix = 0; ix < (sizeof(tag_name)-1); ix++) {
if (((c = xml_buf[ix]) == 0) || iswspace(c))
break;
tag_name[ix] = tolower((char)c);
}
tag_name[ix] = 0;

px = &xml_buf[ix]; // the tag's attributes

if (tag_name[0] == '/') {
// closing tag
if ((tag_type = LookupMnem(ssmltags, &tag_name[1])) != HTML_NOSPACE)
outbuf[(*outix)++] = ' ';
tag_type += SSML_CLOSE;
} else {
if ((tag_type = LookupMnem(ssmltags, tag_name)) != HTML_NOSPACE) {
// separate SSML tags from the previous word (but not HTML tags such as <b> <font> which can occur inside a word)
outbuf[(*outix)++] = ' ';
}

// eg. a self-closing <voice /> changes nothing, so skip it entirely
if (self_closing && ignore_if_self_closing[tag_type])
return 0;
}

voice_change_flag = 0;
// current top of the open-element stack (the innermost enclosing element)
ssml_sp = &ssml_stack[*n_ssml_stack-1];

switch (tag_type)
{
case SSML_STYLE:
// <tts:style field="..." mode="...">: set punctuation or capitals mode
sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
attr1 = GetSsmlAttribute(px, "field");
attr2 = GetSsmlAttribute(px, "mode");


if (attrcmp(attr1, "punctuation") == 0) {
value = attrlookup(attr2, mnem_punct);
sp->parameter[espeakPUNCTUATION] = value;
} else if (attrcmp(attr1, "capital_letters") == 0) {
value = attrlookup(attr2, mnem_capitals);
sp->parameter[espeakCAPITALS] = value;
}
ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
break;
case SSML_PROSODY:
sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);

// look for attributes: rate, volume, pitch, range
for (param_type = espeakRATE; param_type <= espeakRANGE; param_type++) {
if ((attr1 = GetSsmlAttribute(px, prosody_attr[param_type])) != NULL)
SetProsodyParameter(param_type, attr1, sp, param_stack, speech_parameters);
}

ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
break;
case SSML_EMPHASIS:
sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
value = 3; // default is "moderate"
if ((attr1 = GetSsmlAttribute(px, "level")) != NULL)
value = attrlookup(attr1, mnem_emphasis);

if (translator->langopts.tone_language == 1) {
static unsigned char emphasis_to_pitch_range[] = { 50, 50, 40, 70, 90, 100 };
static unsigned char emphasis_to_volume[] = { 100, 100, 70, 110, 135, 150 };
// tone languages (e.g. Chinese) do emphasis by increasing the pitch range.
sp->parameter[espeakRANGE] = emphasis_to_pitch_range[value];
sp->parameter[espeakVOLUME] = emphasis_to_volume[value];
} else {
static unsigned char emphasis_to_volume2[] = { 100, 100, 75, 100, 120, 150 };
sp->parameter[espeakVOLUME] = emphasis_to_volume2[value];
sp->parameter[espeakEMPHASIS] = value;
}
ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
break;
case SSML_STYLE + SSML_CLOSE:
case SSML_PROSODY + SSML_CLOSE:
case SSML_EMPHASIS + SSML_CLOSE:
PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
break;
case SSML_PHONEME:
// inline phonemes: emit [[...]] so the translator reads them literally
attr1 = GetSsmlAttribute(px, "alphabet");
attr2 = GetSsmlAttribute(px, "ph");
value = attrlookup(attr1, mnem_phoneme_alphabet);
if (value == 1) { // alphabet="espeak"
outbuf[(*outix)++] = '[';
outbuf[(*outix)++] = '[';
*outix += attrcopy_utf8(&outbuf[*outix], attr2, n_outbuf-*outix);
outbuf[(*outix)++] = ']';
outbuf[(*outix)++] = ']';
}
break;
case SSML_SAYAS:
attr1 = GetSsmlAttribute(px, "interpret-as");
attr2 = GetSsmlAttribute(px, "format");
attr3 = GetSsmlAttribute(px, "detail");
value = attrlookup(attr1, mnem_interpret_as);
value2 = attrlookup(attr2, mnem_sayas_format);
if (value2 == 1)
value = SAYAS_GLYPHS;

value3 = attrnumber(attr3, 0, 0);

if (value == SAYAS_DIGITS) {
if (value3 <= 1)
value = SAYAS_DIGITS1;
else
value = SAYAS_DIGITS + value3;
}

// embedded command %c%dY: enter say-as interpretation mode 'value'
sprintf(buf, "%c%dY", CTRL_EMBEDDED, value);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);

*sayas_start = *outix;
*sayas_mode = value; // punctuation doesn't end clause during SAY-AS
break;
case SSML_SAYAS + SSML_CLOSE:
if (*sayas_mode == SAYAS_KEY) {
// replace the collected key name with its spoken form
outbuf[*outix] = 0;
ReplaceKeyName(outbuf, *sayas_start, outix);
}

outbuf[(*outix)++] = CTRL_EMBEDDED;
outbuf[(*outix)++] = 'Y';
*sayas_mode = 0;
break;
case SSML_SUB:
if ((attr1 = GetSsmlAttribute(px, "alias")) != NULL) {
// use the alias rather than the text
*ignore_text = true;
*outix += attrcopy_utf8(&outbuf[*outix], attr1, n_outbuf-*outix);
}
break;
case SSML_IGNORE_TEXT:
*ignore_text = true;
break;
case SSML_SUB + SSML_CLOSE:
case SSML_IGNORE_TEXT + SSML_CLOSE:
*ignore_text = false;
break;
case SSML_MARK:
if ((attr1 = GetSsmlAttribute(px, "name")) != NULL) {
// add name to circular buffer of marker names
attrcopy_utf8(buf, attr1, sizeof(buf));

if (strcmp(skip_marker, buf) == 0) {
// This is the marker we are waiting for before starting to speak
*clear_skipping_text = true;
skip_marker[0] = 0;
return CLAUSE_NONE;
}

// embedded command %c%dM: report a mark event during synthesis
if ((index = AddNameData(buf, 0)) >= 0) {
sprintf(buf, "%c%dM", CTRL_EMBEDDED, index);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);
}
}
break;
case SSML_AUDIO:
sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *)param_stack);

if ((attr1 = GetSsmlAttribute(px, "src")) != NULL) {
char fname[256];
attrcopy_utf8(buf, attr1, sizeof(buf));

if (uri_callback == NULL) {
// no application callback: load the sound file ourselves,
// resolving a relative path against xmlbase if one was set
if ((xmlbase != NULL) && (buf[0] != '/')) {
sprintf(fname, "%s/%s", xmlbase, buf);
index = LoadSoundFile2(fname);
} else
index = LoadSoundFile2(buf);
if (index >= 0) {
// embedded command %c%dI: play pre-loaded sound icon 'index'
sprintf(buf, "%c%dI", CTRL_EMBEDDED, index);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);
sp->parameter[espeakSILENCE] = 1;
}
} else {
// hand the URI to the application; %c%dU refers to it by index
if ((index = AddNameData(buf, 0)) >= 0) {
uri = &namedata[index];
if (uri_callback(1, uri, xmlbase) == 0) {
sprintf(buf, "%c%dU", CTRL_EMBEDDED, index);
strcpy(&outbuf[*outix], buf);
*outix += strlen(buf);
sp->parameter[espeakSILENCE] = 1;
}
}
}
}
ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);

if (self_closing)
PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
else
*audio_text = true;
return CLAUSE_NONE;
case SSML_AUDIO + SSML_CLOSE:
PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
*audio_text = false;
return CLAUSE_NONE;
case SSML_BREAK:
value = 21;
terminator = CLAUSE_NONE;

if ((attr1 = GetSsmlAttribute(px, "strength")) != NULL) {
static int break_value[6] = { 0, 7, 14, 21, 40, 80 }; // *10mS
// NOTE(review): attrlookup() returns -1 for an unrecognised strength
// string; -1 passes the (value < 3) test and then indexes
// break_value[-1] below — confirm inputs are validated upstream,
// or guard here.
value = attrlookup(attr1, mnem_break);
if (value < 3) {
// adjust prepause on the following word
// (the embedded command is 3 chars only while 0 <= value <= 9)
sprintf(&outbuf[*outix], "%c%dB", CTRL_EMBEDDED, value);
*outix += 3;
terminator = 0;
}
value = break_value[value];
}
if ((attr2 = GetSsmlAttribute(px, "time")) != NULL) {
value2 = attrnumber(attr2, 0, 1); // pause in mS

// compensate for speaking speed to keep constant pause length, see function PauseLength()
// 'value' here is x 10mS
value = (value2 * 256) / (speed.clause_pause_factor * 10);
if (value < 200)
value = (value2 * 256) / (speed.pause_factor * 10);

if (terminator == 0)
terminator = CLAUSE_NONE;
}
if (terminator) {
if (value > 0xfff) {
// scale down the value and set a scaling indicator bit
value = value / 32;
if (value > 0xfff)
value = 0xfff;
terminator |= CLAUSE_PAUSE_LONG;
}
return terminator + value;
}
break;
case SSML_SPEAK:
if ((attr1 = GetSsmlAttribute(px, "xml:base")) != NULL) {
attrcopy_utf8(buf, attr1, sizeof(buf));
// NOTE(review): xmlbase is now a by-value parameter, so this
// assignment only affects the remainder of this call; the old
// global persisted across calls (and across later <audio> tags).
// Confirm the caller re-supplies the updated base URI.
if ((index = AddNameData(buf, 0)) >= 0)
xmlbase = &namedata[index];
}
if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
return 0; // no voice change
return CLAUSE_VOICE;
case SSML_VOICE:
if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
return 0; // no voice change
return CLAUSE_VOICE;
case SSML_SPEAK + SSML_CLOSE:
// unwind stack until the previous <voice> or <speak> tag
while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_SPEAK))
*n_ssml_stack = *n_ssml_stack -1;
return CLAUSE_PERIOD + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
case SSML_VOICE + SSML_CLOSE:
// unwind stack until the previous <voice> or <speak> tag
while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_VOICE))
*n_ssml_stack = *n_ssml_stack -1;

terminator = 0; // ?? Sentence intonation, but no pause ??
return terminator + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
case HTML_BREAK:
case HTML_BREAK + SSML_CLOSE:
return CLAUSE_COLON;
case SSML_SENTENCE:
if (ssml_sp->tag_type == SSML_SENTENCE) {
// new sentence implies end-of-sentence
voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
}
voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
return CLAUSE_PARAGRAPH + voice_change_flag;
case SSML_PARAGRAPH:
if (ssml_sp->tag_type == SSML_SENTENCE) {
// new paragraph implies end-of-sentence or end-of-paragraph
voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
}
if (ssml_sp->tag_type == SSML_PARAGRAPH) {
// new paragraph implies end-of-sentence or end-of-paragraph
voice_change_flag |= GetVoiceAttributes(px, SSML_PARAGRAPH+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
}
voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
return CLAUSE_PARAGRAPH + voice_change_flag;
case SSML_SENTENCE + SSML_CLOSE:
if (ssml_sp->tag_type == SSML_SENTENCE) {
// end of a sentence which specified a language
voice_change_flag = GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
}
return CLAUSE_PERIOD + voice_change_flag;
case SSML_PARAGRAPH + SSML_CLOSE:
if ((ssml_sp->tag_type == SSML_SENTENCE) || (ssml_sp->tag_type == SSML_PARAGRAPH)) {
// End of a paragraph which specified a language.
// (End-of-paragraph also implies end-of-sentence)
return GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) + CLAUSE_PARAGRAPH;
}
return CLAUSE_PARAGRAPH;
}
return 0;
}

+ 3
- 4
src/libespeak-ng/ssml.h View File

@@ -29,9 +29,6 @@ typedef struct {
#define HTML_NOSPACE 16 // don't insert a space for this element, so it doesn't break a word
#define SSML_CLOSE 0x20 // for a closing tag, OR this with the tag type

// these tags have no effect if they are self-closing, eg. <voice />
static char ignore_if_self_closing[] = { 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0 };

static MNEM_TAB ssmltags[] = {
{ "speak", SSML_SPEAK },
{ "voice", SSML_VOICE },
@@ -76,10 +73,12 @@ int attrlookup(const wchar_t *string1, const MNEM_TAB *mtab);
int attrnumber(const wchar_t *pw, int default_value, int type);
int attr_prosody_value(int param_type, const wchar_t *pw, int *value_out);
wchar_t *GetSsmlAttribute(wchar_t *pw, const char *name);
int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char base_voice_variant_name[40]);
void ProcessParamStack(char *outbuf, int *outix, int n_param_stack, PARAM_STACK *param_stack, int *speech_parameters);
PARAM_STACK *PushParamStack(int tag_type, int *n_param_stack, PARAM_STACK *param_stack);
const char *VoiceFromStack(SSML_STACK *ssml_stack, int n_ssml_stack, espeak_VOICE *base_voice, char base_voice_variant_name[40]);
void PopParamStack(int tag_type, char *outbuf, int *outix, int *n_param_stack, PARAM_STACK *param_stack, int *speech_parameters);
int ReplaceKeyName(char *outbuf, int index, int *outix);
void SetProsodyParameter(int param_type, wchar_t *attr1, PARAM_STACK *sp, PARAM_STACK *param_stack, int *speech_parameters);
int LoadSoundFile2(const char *fname);
int AddNameData(const char *name, int wide);
int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, bool self_closing, const char *xmlbase, bool *audio_text, char *current_voice_id, espeak_VOICE *base_voice, char *base_voice_variant_name, bool *ignore_text, bool *clear_skipping_text, int *sayas_mode, int *sayas_start, SSML_STACK *ssml_stack, int *n_ssml_stack, int *n_param_stack, int *speech_parameters);

Loading…
Cancel
Save