eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.

ssml.c 29KB
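
For context before the listing: ssml.c implements the SSML and HTML tag handling used when espeak-ng reads clauses of input text. Below is a minimal usage sketch (not part of the file) showing how SSML-marked-up text reaches this code through the public speak_lib.h API; the espeakSSML flag is what enables the tag processing implemented here. The voice name, prosody values and sample text are illustrative placeholders, and error handling is kept to a minimum.

// ssml_demo.c - minimal sketch of driving espeak-ng with SSML input.
// Build (typical): cc ssml_demo.c -lespeak-ng -o ssml_demo
#include <stdio.h>
#include <string.h>
#include <espeak-ng/speak_lib.h>

int main(void)
{
    const char *ssml =
        "<speak>"
        "Plain text first. "
        "<prosody rate=\"slow\" pitch=\"+10%\">Slower and a little higher.</prosody>"
        "<break time=\"500ms\"/>"
        "After a half-second pause."
        "</speak>";

    // Synchronous playback; returns the sample rate, or EE_INTERNAL_ERROR on failure.
    if (espeak_Initialize(AUDIO_OUTPUT_SYNCH_PLAYBACK, 0, NULL, 0) == EE_INTERNAL_ERROR) {
        fprintf(stderr, "espeak_Initialize failed\n");
        return 1;
    }
    espeak_SetVoiceByName("en"); // placeholder base voice

    // espeakSSML routes the text through the SSML tag handling in ssml.c;
    // espeakCHARS_AUTO lets the library detect the character encoding.
    espeak_Synth(ssml, strlen(ssml) + 1, 0, POS_CHARACTER, 0,
                 espeakSSML | espeakCHARS_AUTO, NULL, NULL);

    espeak_Terminate();
    return 0;
}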

/*
 * Copyright (C) 2005 to 2015 by Jonathan Duddington
 * email: [email protected]
 * Copyright (C) 2015-2017 Reece H. Dunn
 * Copyright (C) 2018 Juho Hiltunen
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
 */

#include "config.h"

#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <wchar.h>
#include <wctype.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>

#include <ucd/ucd.h>

#include "ssml.h"
#include "common.h" // for strncpy0
#include "mnemonics.h" // for LookupMnemName, MNEM_TAB,
#include "readclause.h" // for PARAM_STACK, param_stack, AddNameData
#include "soundicon.h" // for LoadSoundFile2
#include "synthesize.h" // for SPEED_FACTORS, speed
#include "translate.h" // for CTRL_EMBEDDED
#include "voice.h" // for SelectVoice, SelectVoiceByName
#include "speech.h" // for MAKE_MEM_UNDEFINED
static const MNEM_TAB ssmltags[] = {
    { "speak", SSML_SPEAK },
    { "voice", SSML_VOICE },
    { "prosody", SSML_PROSODY },
    { "say-as", SSML_SAYAS },
    { "mark", SSML_MARK },
    { "s", SSML_SENTENCE },
    { "p", SSML_PARAGRAPH },
    { "phoneme", SSML_PHONEME },
    { "sub", SSML_SUB },
    { "tts:style", SSML_STYLE },
    { "audio", SSML_AUDIO },
    { "emphasis", SSML_EMPHASIS },
    { "break", SSML_BREAK },
    { "metadata", SSML_IGNORE_TEXT },

    { "br", HTML_BREAK },
    { "li", HTML_BREAK },
    { "dd", HTML_BREAK },
    { "img", HTML_BREAK },
    { "td", HTML_BREAK },
    { "h1", SSML_PARAGRAPH },
    { "h2", SSML_PARAGRAPH },
    { "h3", SSML_PARAGRAPH },
    { "h4", SSML_PARAGRAPH },
    { "hr", SSML_PARAGRAPH },
    { "script", SSML_IGNORE_TEXT },
    { "style", SSML_IGNORE_TEXT },
    { "font", HTML_NOSPACE },
    { "b", HTML_NOSPACE },
    { "i", HTML_NOSPACE },
    { "strong", HTML_NOSPACE },
    { "em", HTML_NOSPACE },
    { "code", HTML_NOSPACE },

    { NULL, 0 }
};
static int attrcmp(const wchar_t *string1, const char *string2)
{
    int ix;

    if (string1 == NULL)
        return 1;

    for (ix = 0; (string1[ix] == string2[ix]) && (string1[ix] != 0); ix++)
        ;
    if (((string1[ix] == '"') || (string1[ix] == '\'')) && (string2[ix] == 0))
        return 0;
    return 1;
}

static int attrlookup(const wchar_t *string1, const MNEM_TAB *mtab)
{
    int ix;

    for (ix = 0; mtab[ix].mnem != NULL; ix++) {
        if (attrcmp(string1, mtab[ix].mnem) == 0)
            return mtab[ix].value;
    }
    return mtab[ix].value;
}
static int attrnumber(const wchar_t *pw, int default_value, int type)
{
    int value = 0;

    if ((pw == NULL) || !IsDigit09(*pw))
        return default_value;

    while (IsDigit09(*pw))
        value = value*10 + *pw++ - '0';
    if ((type == 1) && (ucd_tolower(*pw) == 's')) {
        // time: seconds rather than ms
        value *= 1000;
    }
    return value;
}

static int attrcopy_utf8(char *buf, const wchar_t *pw, int len)
{
    // Convert attribute string into utf8, write to buf, and return its utf8 length
    int ix = 0;

    if (pw != NULL) {
        wchar_t quote = pw[-1];
        if ((quote != '"') && (quote != '\'')) quote = 0;

        unsigned int c;
        int prev_c = 0;
        while ((ix < (len-4)) && ((c = *pw++) != 0)) {
            if ((quote == 0) && (isspace(c) || (c == '/')))
                break;
            if ((quote != 0) && (c == quote) && (prev_c != '\\'))
                break; // the quote character ends the attribute, unless preceded by a backslash
            int n = utf8_out(c, &buf[ix]);
            ix += n;
            prev_c = c;
        }
    }
    buf[ix] = 0;
    return ix;
}
static int attr_prosody_value(int param_type, const wchar_t *pw, int *value_out)
{
    int sign = 0;
    wchar_t *tail;
    double value;

    while (iswspace(*pw)) pw++;
    if (*pw == '+') {
        pw++;
        sign = 1;
    }
    if (*pw == '-') {
        pw++;
        sign = -1;
    }
    value = (double)wcstod(pw, &tail);
    if (tail == pw) {
        // failed to find a number, return 100%
        *value_out = 100;
        return 2;
    }
    if (*tail == '%') {
        if (sign != 0)
            value = 100 + (sign * value);
        *value_out = (int)value;
        return 2; // percentage
    }
    if ((tail[0] == 's') && (tail[1] == 't')) {
        double x;
        // convert from semitones to a frequency percentage
        x = pow((double)2.0, (double)((value*sign)/12)) * 100;
        *value_out = (int)x;
        return 2; // percentage
    }
    if (param_type == espeakRATE) {
        if (sign == 0)
            *value_out = (int)(value * 100);
        else
            *value_out = 100 + (int)(sign * value * 100);
        return 2; // percentage
    }
    *value_out = (int)value;
    return sign; // -1, 0, or 1
}
static const char *VoiceFromStack(SSML_STACK *ssml_stack, int n_ssml_stack, espeak_VOICE *base_voice, char base_voice_variant_name[40])
{
    // Use the voice properties from the SSML stack to choose a voice, and switch
    // to that voice if it's not the current voice

    int ix;
    const char *p;
    SSML_STACK *sp;
    const char *v_id;
    int voice_found;
    espeak_VOICE voice_select;
    static char voice_name[40];
    char language[40];

    MAKE_MEM_UNDEFINED(&voice_name, sizeof(voice_name));

    strcpy(voice_name, ssml_stack[0].voice_name);
    strcpy(language, ssml_stack[0].language);
    voice_select.age = ssml_stack[0].voice_age;
    voice_select.gender = ssml_stack[0].voice_gender;
    voice_select.variant = ssml_stack[0].voice_variant_number;
    voice_select.identifier = NULL;

    for (ix = 0; ix < n_ssml_stack; ix++) {
        sp = &ssml_stack[ix];
        int voice_name_specified = 0;

        if ((sp->voice_name[0] != 0) && (SelectVoiceByName(NULL, sp->voice_name) != NULL)) {
            voice_name_specified = 1;
            strcpy(voice_name, sp->voice_name);
            language[0] = 0;
            voice_select.gender = ENGENDER_UNKNOWN;
            voice_select.age = 0;
            voice_select.variant = 0;
        }
        if (sp->language[0] != 0) {
            strcpy(language, sp->language);

            // is this language provided by the base voice?
            p = base_voice->languages;
            while (*p++ != 0) {
                if (strcmp(p, language) == 0) {
                    // yes, change the language to the main language of the base voice
                    strcpy(language, &base_voice->languages[1]);
                    break;
                }
                p += (strlen(p) + 1);
            }

            if (voice_name_specified == 0)
                voice_name[0] = 0; // forget a previous voice name if a language is specified
        }
        if (sp->voice_gender != ENGENDER_UNKNOWN)
            voice_select.gender = sp->voice_gender;
        if (sp->voice_age != 0)
            voice_select.age = sp->voice_age;
        if (sp->voice_variant_number != 0)
            voice_select.variant = sp->voice_variant_number;
    }

    voice_select.name = voice_name;
    voice_select.languages = language;
    v_id = SelectVoice(&voice_select, &voice_found);
    if (v_id == NULL)
        return "default";

    if ((strchr(v_id, '+') == NULL) && ((voice_select.gender == ENGENDER_UNKNOWN) || (voice_select.gender == base_voice->gender)) && (base_voice_variant_name[0] != 0)) {
        // a voice variant has not been selected, use the original voice variant
        char buf[80];
        sprintf(buf, "%s+%s", v_id, base_voice_variant_name);
        strncpy0(voice_name, buf, sizeof(voice_name));
        return voice_name;
    }
    return v_id;
}
static const wchar_t *GetSsmlAttribute(wchar_t *pw, const char *name)
{
    // Gets the value string for an attribute.
    // Returns NULL if the attribute is not present

    int ix;
    static const wchar_t empty[1] = { 0 };

    while (*pw != 0) {
        if (iswspace(pw[-1])) {
            ix = 0;
            while (*pw == name[ix]) {
                pw++;
                ix++;
            }
            if (name[ix] == 0) {
                // found the attribute, now get the value
                while (iswspace(*pw)) pw++;
                if (*pw == '=') pw++;
                while (iswspace(*pw)) pw++;
                if ((*pw == '"') || (*pw == '\'')) // allow single-quotes ?
                    return pw+1;
                else if (iswspace(*pw) || (*pw == '/')) // end of attribute
                    return empty;
                else
                    return pw;
            }
        }
        pw++;
    }
    return NULL;
}
static int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char *base_voice_variant_name)
{
    // Determines whether voice attributes are specified in this tag, and if so, whether this means
    // a voice change.
    // If it's a closing tag, delete the top frame of the stack and determine whether this implies
    // a voice change.
    // Returns CLAUSE_TYPE_VOICE_CHANGE if there is a voice change

    const char *new_voice_id;

    static const MNEM_TAB mnem_gender[] = {
        { "male", ENGENDER_MALE },
        { "female", ENGENDER_FEMALE },
        { "neutral", ENGENDER_NEUTRAL },
        { NULL, ENGENDER_UNKNOWN }
    };

    if (tag_type & SSML_CLOSE) {
        // delete a stack frame
        if (n_ssml_stack > 1)
            n_ssml_stack--;
    } else {
        const wchar_t *lang;
        const wchar_t *gender;
        const wchar_t *name;
        const wchar_t *age;
        const wchar_t *variant;

        // add a stack frame if any voice details are specified
        lang = GetSsmlAttribute(pw, "xml:lang");

        if (tag_type != SSML_VOICE) {
            // only expect an xml:lang attribute
            name = NULL;
            variant = NULL;
            age = NULL;
            gender = NULL;
        } else {
            name = GetSsmlAttribute(pw, "name");
            variant = GetSsmlAttribute(pw, "variant");
            age = GetSsmlAttribute(pw, "age");
            gender = GetSsmlAttribute(pw, "gender");
        }

        if ((tag_type != SSML_VOICE) && (lang == NULL))
            return 0; // <s> or <p> without language spec, nothing to do

        ssml_sp = &ssml_stack[n_ssml_stack++];

        int value;
        attrcopy_utf8(ssml_sp->language, lang, sizeof(ssml_sp->language));
        attrcopy_utf8(ssml_sp->voice_name, name, sizeof(ssml_sp->voice_name));
        if ((value = attrnumber(variant, 1, 0)) > 0)
            value--; // variant='0' and variant='1' are the same
        ssml_sp->voice_variant_number = value;
        ssml_sp->voice_age = attrnumber(age, 0, 0);
        ssml_sp->voice_gender = attrlookup(gender, mnem_gender);
        ssml_sp->tag_type = tag_type;
    }

    new_voice_id = VoiceFromStack(ssml_stack, n_ssml_stack, base_voice, base_voice_variant_name);
    if (strcmp(new_voice_id, current_voice_id) != 0) {
        // add an embedded command to change the voice
        strcpy(current_voice_id, new_voice_id);
        return CLAUSE_TYPE_VOICE_CHANGE;
    }
    return 0;
}
static void ProcessParamStack(char *outbuf, int *outix, int n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
{
    // Set the speech parameters from the parameter stack
    int param;
    int ix;
    char buf[20];
    int new_parameters[N_SPEECH_PARAM];
    static const char cmd_letter[N_SPEECH_PARAM] = { 0, 'S', 'A', 'P', 'R', 0, 'C', 0, 0, 0, 0, 0, 'F' }; // embedded command letters

    for (param = 0; param < N_SPEECH_PARAM; param++)
        new_parameters[param] = -1;

    for (ix = 0; ix < n_param_stack; ix++) {
        for (param = 0; param < N_SPEECH_PARAM; param++) {
            if (param_stack[ix].parameter[param] >= 0)
                new_parameters[param] = param_stack[ix].parameter[param];
        }
    }

    for (param = 0; param < N_SPEECH_PARAM; param++) {
        int value;
        if ((value = new_parameters[param]) != speech_parameters[param]) {
            buf[0] = 0;

            switch (param)
            {
            case espeakPUNCTUATION:
                option_punctuation = value-1;
                break;
            case espeakCAPITALS:
                option_capitals = value;
                break;
            case espeakRATE:
            case espeakVOLUME:
            case espeakPITCH:
            case espeakRANGE:
            case espeakEMPHASIS:
                sprintf(buf, "%c%d%c", CTRL_EMBEDDED, value, cmd_letter[param]);
                break;
            }

            speech_parameters[param] = new_parameters[param];
            strcpy(&outbuf[*outix], buf);
            *outix += strlen(buf);
        }
    }
}
static PARAM_STACK *PushParamStack(int tag_type, int *n_param_stack, PARAM_STACK *param_stack)
{
    int ix;
    PARAM_STACK *sp;

    sp = &param_stack[*n_param_stack];
    if (*n_param_stack < (N_PARAM_STACK-1))
        (*n_param_stack)++;

    sp->type = tag_type;
    for (ix = 0; ix < N_SPEECH_PARAM; ix++)
        sp->parameter[ix] = -1;
    return sp;
}

static void PopParamStack(int tag_type, char *outbuf, int *outix, int *n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
{
    // unwind the stack up to and including the previous tag of this type
    int ix;
    int top = 0;

    if (tag_type >= SSML_CLOSE)
        tag_type -= SSML_CLOSE;

    for (ix = 0; ix < *n_param_stack; ix++) {
        if (param_stack[ix].type == tag_type)
            top = ix;
    }
    if (top > 0)
        *n_param_stack = top;
    ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
}
static int ReplaceKeyName(char *outbuf, int index, int *outix)
{
    // Replace some key-names by single characters, so they can be pronounced in different languages
    static const MNEM_TAB keynames[] = {
        { "space ", 0xe020 },
        { "tab ", 0xe009 },
        { "underscore ", 0xe05f },
        { "double-quote ", '"' },
        { NULL, 0 }
    };

    int letter;
    char *p;

    p = &outbuf[index];
    if ((letter = LookupMnem(keynames, p)) != 0) {
        int ix;
        ix = utf8_out(letter, p);
        *outix = index + ix;
        return letter;
    }
    return 0;
}
static void SetProsodyParameter(int param_type, const wchar_t *attr1, PARAM_STACK *sp, PARAM_STACK *param_stack, int *speech_parameters)
{
    int value;

    static const MNEM_TAB mnem_volume[] = {
        { "default", 100 },
        { "silent", 0 },
        { "x-soft", 30 },
        { "soft", 65 },
        { "medium", 100 },
        { "loud", 150 },
        { "x-loud", 230 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_rate[] = {
        { "default", 100 },
        { "x-slow", 60 },
        { "slow", 80 },
        { "medium", 100 },
        { "fast", 125 },
        { "x-fast", 160 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_pitch[] = {
        { "default", 100 },
        { "x-low", 70 },
        { "low", 85 },
        { "medium", 100 },
        { "high", 110 },
        { "x-high", 120 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_range[] = {
        { "default", 100 },
        { "x-low", 20 },
        { "low", 50 },
        { "medium", 100 },
        { "high", 140 },
        { "x-high", 180 },
        { NULL, -1 }
    };

    static const MNEM_TAB * const mnem_tabs[5] = {
        NULL, mnem_rate, mnem_volume, mnem_pitch, mnem_range
    };

    if ((value = attrlookup(attr1, mnem_tabs[param_type])) >= 0) {
        // mnemonic specifies a value as a percentage of the base pitch/range/rate/volume
        sp->parameter[param_type] = (param_stack[0].parameter[param_type] * value)/100;
    } else {
        int sign = attr_prosody_value(param_type, attr1, &value);

        if (sign == 0)
            sp->parameter[param_type] = value; // absolute value in Hz
        else if (sign == 2) {
            // change specified as percentage or in semitones
            sp->parameter[param_type] = (speech_parameters[param_type] * value)/100;
        } else {
            // change specified as plus or minus Hz
            sp->parameter[param_type] = speech_parameters[param_type] + (value*sign);
        }
    }
}
int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, const char *xmlbase, bool *audio_text, char *current_voice_id, espeak_VOICE *base_voice, char *base_voice_variant_name, bool *ignore_text, bool *clear_skipping_text, int *sayas_mode, int *sayas_start, SSML_STACK *ssml_stack, int *n_ssml_stack, int *n_param_stack, int *speech_parameters)
{
    // xml_buf is the tag and attributes with a zero terminator in place of the original '>'
    // returns a clause terminator value.

    unsigned int ix;
    int index;
    int tag_type;
    int value;
    int value2;
    int value3;
    int voice_change_flag;
    wchar_t *px;
    const wchar_t *attr1;
    const wchar_t *attr2;
    const wchar_t *attr3;
    int terminator;
    int param_type;
    char tag_name[40];
    char buf[160];
    PARAM_STACK *sp;
    SSML_STACK *ssml_sp;

    // don't process comments and xml declarations
    if (wcsncmp(xml_buf, L"!--", 3) == 0 || wcsncmp(xml_buf, L"?xml", 4) == 0) {
        return 0;
    }

    // these tags have no effect if they are self-closing, eg. <voice />
    static const char ignore_if_self_closing[] = { 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0 };

    bool self_closing = false;
    int len;
    len = wcslen(xml_buf);
    if (xml_buf[len - 1] == '/') {
        // a self-closing tag
        xml_buf[len - 1] = ' ';
        self_closing = true;
    }

    static const MNEM_TAB mnem_phoneme_alphabet[] = {
        { "espeak", 1 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_punct[] = {
        { "none", 1 },
        { "all", 2 },
        { "some", 3 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_capitals[] = {
        { "no", 0 },
        { "icon", 1 },
        { "spelling", 2 },
        { "pitch", 20 }, // this is the amount by which to raise the pitch
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_interpret_as[] = {
        { "characters", SAYAS_CHARS },
        { "tts:char", SAYAS_SINGLE_CHARS },
        { "tts:key", SAYAS_KEY },
        { "tts:digits", SAYAS_DIGITS },
        { "telephone", SAYAS_DIGITS1 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_sayas_format[] = {
        { "glyphs", 1 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_break[] = {
        { "none", 0 },
        { "x-weak", 1 },
        { "weak", 2 },
        { "medium", 3 },
        { "strong", 4 },
        { "x-strong", 5 },
        { NULL, -1 }
    };

    static const MNEM_TAB mnem_emphasis[] = {
        { "none", 1 },
        { "reduced", 2 },
        { "moderate", 3 },
        { "strong", 4 },
        { "x-strong", 5 },
        { NULL, -1 }
    };

    static const char * const prosody_attr[5] = {
        NULL, "rate", "volume", "pitch", "range"
    };
    for (ix = 0; ix < (sizeof(tag_name)-1); ix++) {
        int c;
        if (((c = xml_buf[ix]) == 0) || iswspace(c))
            break;
        tag_name[ix] = tolower((char)c);
    }
    tag_name[ix] = 0;

    px = &xml_buf[ix]; // the tag's attributes

    if (tag_name[0] == '/') {
        // closing tag
        if ((tag_type = LookupMnem(ssmltags, &tag_name[1])) != HTML_NOSPACE)
            outbuf[(*outix)++] = ' ';
        tag_type += SSML_CLOSE;
    } else {
        if ((tag_type = LookupMnem(ssmltags, tag_name)) != HTML_NOSPACE) {
            // separate SSML tags from the previous word (but not HTML tags such as <b> <font> which can occur inside a word)
            outbuf[(*outix)++] = ' ';
        }

        if (self_closing && ignore_if_self_closing[tag_type])
            return 0;
    }

    voice_change_flag = 0;
    ssml_sp = &ssml_stack[*n_ssml_stack-1];

    switch (tag_type)
    {
    case SSML_STYLE:
        sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
        attr1 = GetSsmlAttribute(px, "field");
        attr2 = GetSsmlAttribute(px, "mode");

        if (attrcmp(attr1, "punctuation") == 0) {
            value = attrlookup(attr2, mnem_punct);
            sp->parameter[espeakPUNCTUATION] = value;
        } else if (attrcmp(attr1, "capital_letters") == 0) {
            value = attrlookup(attr2, mnem_capitals);
            sp->parameter[espeakCAPITALS] = value;
        }
        ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
        break;
    case SSML_PROSODY:
        sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);

        // look for attributes: rate, volume, pitch, range
        for (param_type = espeakRATE; param_type <= espeakRANGE; param_type++) {
            if ((attr1 = GetSsmlAttribute(px, prosody_attr[param_type])) != NULL)
                SetProsodyParameter(param_type, attr1, sp, param_stack, speech_parameters);
        }
        ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
        break;
    case SSML_EMPHASIS:
        sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
        value = 3; // default is "moderate"
        if ((attr1 = GetSsmlAttribute(px, "level")) != NULL)
            value = attrlookup(attr1, mnem_emphasis);

        if (translator->langopts.tone_language == 1) {
            static const unsigned char emphasis_to_pitch_range[] = { 50, 50, 40, 70, 90, 100 };
            static const unsigned char emphasis_to_volume[] = { 100, 100, 70, 110, 135, 150 };

            // tone languages (eg. Chinese) do emphasis by increasing the pitch range
            sp->parameter[espeakRANGE] = emphasis_to_pitch_range[value];
            sp->parameter[espeakVOLUME] = emphasis_to_volume[value];
        } else {
            static const unsigned char emphasis_to_volume2[] = { 100, 100, 75, 100, 120, 150 };
            sp->parameter[espeakVOLUME] = emphasis_to_volume2[value];
            sp->parameter[espeakEMPHASIS] = value;
        }
        ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
        break;
    case SSML_STYLE + SSML_CLOSE:
    case SSML_PROSODY + SSML_CLOSE:
    case SSML_EMPHASIS + SSML_CLOSE:
        PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
        break;
    case SSML_PHONEME:
        attr1 = GetSsmlAttribute(px, "alphabet");
        attr2 = GetSsmlAttribute(px, "ph");
        value = attrlookup(attr1, mnem_phoneme_alphabet);
        if (value == 1) { // alphabet="espeak"
            outbuf[(*outix)++] = '[';
            outbuf[(*outix)++] = '[';
            *outix += attrcopy_utf8(&outbuf[*outix], attr2, n_outbuf-*outix);
            outbuf[(*outix)++] = ']';
            outbuf[(*outix)++] = ']';
        }
        break;
    case SSML_SAYAS:
        attr1 = GetSsmlAttribute(px, "interpret-as");
        attr2 = GetSsmlAttribute(px, "format");
        attr3 = GetSsmlAttribute(px, "detail");
        value = attrlookup(attr1, mnem_interpret_as);
        value2 = attrlookup(attr2, mnem_sayas_format);
        if (value2 == 1)
            value = SAYAS_GLYPHS;

        value3 = attrnumber(attr3, 0, 0);
        if (value == SAYAS_DIGITS) {
            if (value3 <= 1)
                value = SAYAS_DIGITS1;
            else
                value = SAYAS_DIGITS + value3;
        }

        sprintf(buf, "%c%dY", CTRL_EMBEDDED, value);
        strcpy(&outbuf[*outix], buf);
        *outix += strlen(buf);

        *sayas_start = *outix;
        *sayas_mode = value; // punctuation doesn't end clause during SAY-AS
        break;
    case SSML_SAYAS + SSML_CLOSE:
        if (*sayas_mode == SAYAS_KEY) {
            outbuf[*outix] = 0;
            ReplaceKeyName(outbuf, *sayas_start, outix);
        }

        outbuf[(*outix)++] = CTRL_EMBEDDED;
        outbuf[(*outix)++] = 'Y';
        *sayas_mode = 0;
        break;
    case SSML_SUB:
        if ((attr1 = GetSsmlAttribute(px, "alias")) != NULL) {
            // use the alias rather than the text
            *ignore_text = true;
            *outix += attrcopy_utf8(&outbuf[*outix], attr1, n_outbuf-*outix);
        }
        break;
    case SSML_IGNORE_TEXT:
        *ignore_text = true;
        break;
    case SSML_SUB + SSML_CLOSE:
    case SSML_IGNORE_TEXT + SSML_CLOSE:
        *ignore_text = false;
        break;
    case SSML_MARK:
        if ((attr1 = GetSsmlAttribute(px, "name")) != NULL) {
            // add name to circular buffer of marker names
            attrcopy_utf8(buf, attr1, sizeof(buf));

            if (strcmp(skip_marker, buf) == 0) {
                // This is the marker we are waiting for before starting to speak
                *clear_skipping_text = true;
                skip_marker[0] = 0;
                return CLAUSE_NONE;
            }

            if ((index = AddNameData(buf, 0)) >= 0) {
                sprintf(buf, "%c%dM", CTRL_EMBEDDED, index);
                strcpy(&outbuf[*outix], buf);
                *outix += strlen(buf);
            }
        }
        break;
    case SSML_AUDIO:
        sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *)param_stack);

        if ((attr1 = GetSsmlAttribute(px, "src")) != NULL) {
            attrcopy_utf8(buf, attr1, sizeof(buf));

            if (uri_callback == NULL) {
                if ((xmlbase != NULL) && (buf[0] != '/')) {
                    char fname[256];
                    sprintf(fname, "%s/%s", xmlbase, buf);
                    index = LoadSoundFile2(fname);
                } else
                    index = LoadSoundFile2(buf);
                if (index >= 0) {
                    sprintf(buf, "%c%dI", CTRL_EMBEDDED, index);
                    strcpy(&outbuf[*outix], buf);
                    *outix += strlen(buf);
                    sp->parameter[espeakSILENCE] = 1;
                }
            } else {
                if ((index = AddNameData(buf, 0)) >= 0) {
                    char *uri;
                    uri = &namedata[index];
                    if (uri_callback(1, uri, xmlbase) == 0) {
                        sprintf(buf, "%c%dU", CTRL_EMBEDDED, index);
                        strcpy(&outbuf[*outix], buf);
                        *outix += strlen(buf);
                        sp->parameter[espeakSILENCE] = 1;
                    }
                }
            }
        }
        ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);

        if (self_closing)
            PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
        else
            *audio_text = true;
        return CLAUSE_NONE;
    case SSML_AUDIO + SSML_CLOSE:
        PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
        *audio_text = false;
        return CLAUSE_NONE;
    case SSML_BREAK:
        value = 21;
        terminator = CLAUSE_NONE;

        if ((attr1 = GetSsmlAttribute(px, "strength")) != NULL) {
            static const int break_value[6] = { 0, 7, 14, 21, 40, 80 }; // *10mS
            value = attrlookup(attr1, mnem_break);
            if (value < 3) {
                // adjust prepause on the following word
                sprintf(&outbuf[*outix], "%c%dB", CTRL_EMBEDDED, value);
                *outix += 3;
                terminator = 0;
            }
            value = break_value[value];
        }
        if ((attr2 = GetSsmlAttribute(px, "time")) != NULL) {
            value2 = attrnumber(attr2, 0, 1); // pause in mS

            // compensate for speaking speed to keep constant pause length, see function PauseLength()
            // 'value' here is x 10mS
            value = (value2 * 256) / (speed.clause_pause_factor * 10);
            if (value < 200)
                value = (value2 * 256) / (speed.pause_factor * 10);

            if (terminator == 0)
                terminator = CLAUSE_NONE;
        }
        if (terminator) {
            if (value > 0xfff) {
                // scale down the value and set a scaling indicator bit
                value = value / 32;
                if (value > 0xfff)
                    value = 0xfff;
                terminator |= CLAUSE_PAUSE_LONG;
            }
            return terminator + value;
        }
        break;
    case SSML_SPEAK:
        if ((attr1 = GetSsmlAttribute(px, "xml:base")) != NULL) {
            attrcopy_utf8(buf, attr1, sizeof(buf));
            if ((index = AddNameData(buf, 0)) >= 0)
                xmlbase = &namedata[index];
        }
        if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
            return 0; // no voice change
        return CLAUSE_VOICE;
    case SSML_VOICE:
        if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
            return 0; // no voice change
        return CLAUSE_VOICE;
    case SSML_SPEAK + SSML_CLOSE:
        // unwind stack until the previous <voice> or <speak> tag
        while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_SPEAK))
            (*n_ssml_stack)--;
        return CLAUSE_PERIOD + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
    case SSML_VOICE + SSML_CLOSE:
        // unwind stack until the previous <voice> or <speak> tag
        while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_VOICE))
            (*n_ssml_stack)--;

        terminator = 0; // ?? Sentence intonation, but no pause ??
        return terminator + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
    case HTML_BREAK:
    case HTML_BREAK + SSML_CLOSE:
        return CLAUSE_COLON;
    case SSML_SENTENCE:
        if (ssml_sp->tag_type == SSML_SENTENCE) {
            // new sentence implies end-of-sentence
            voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
        }
        voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
        return CLAUSE_PARAGRAPH + voice_change_flag;
    case SSML_PARAGRAPH:
        if (ssml_sp->tag_type == SSML_SENTENCE) {
            // new paragraph implies end-of-sentence or end-of-paragraph
            voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
        }
        if (ssml_sp->tag_type == SSML_PARAGRAPH) {
            // new paragraph implies end-of-sentence or end-of-paragraph
            voice_change_flag |= GetVoiceAttributes(px, SSML_PARAGRAPH+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
        }
        voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
        return CLAUSE_PARAGRAPH + voice_change_flag;
    case SSML_SENTENCE + SSML_CLOSE:
        if (ssml_sp->tag_type == SSML_SENTENCE) {
            // end of a sentence which specified a language
            voice_change_flag = GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
        }
        return CLAUSE_PERIOD + voice_change_flag;
    case SSML_PARAGRAPH + SSML_CLOSE:
        if ((ssml_sp->tag_type == SSML_SENTENCE) || (ssml_sp->tag_type == SSML_PARAGRAPH)) {
            // End of a paragraph which specified a language.
            // (End-of-paragraph also implies end-of-sentence)
            return GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) + CLAUSE_PARAGRAPH;
        }
        return CLAUSE_PARAGRAPH;
    }
    return 0;
}
static const MNEM_TAB xml_entity_mnemonics[] = {
    { "gt", '>' },
    { "lt", 0xe000 + '<' }, // private usage area, to avoid confusion with XML tag
    { "amp", '&' },
    { "quot", '"' },
    { "nbsp", ' ' },
    { "apos", '\'' },
    { NULL, -1 }
};

int ParseSsmlReference(char *ref, int *c1, int *c2)
{
    // Check if buffer *ref contains an XML character or entity reference
    // if found, set *c1 to the replacement char
    // change *c2 for entity references
    // returns >= 0 on success

    if (ref[0] == '#') {
        // character reference
        if (ref[1] == 'x')
            return sscanf(&ref[2], "%x", c1);
        else
            return sscanf(&ref[1], "%d", c1);
    } else {
        // entity reference
        int found;
        if ((found = LookupMnem(xml_entity_mnemonics, ref)) != -1) {
            *c1 = found;
            if (*c2 == 0)
                *c2 = ' ';
            return found;
        }
    }
    return -1;
}