eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ssml.c 29KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991
  1. /*
  2. * Copyright (C) 2005 to 2015 by Jonathan Duddington
  3. * email: [email protected]
  4. * Copyright (C) 2015-2017 Reece H. Dunn
  5. * Copyright (C) 2018 Juho Hiltunen
  6. *
  7. * This program is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 3 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * This program is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, see: <http://www.gnu.org/licenses/>.
  19. */
  20. #include "config.h"
  21. #include <ctype.h>
  22. #include <errno.h>
  23. #include <locale.h>
  24. #include <math.h>
  25. #include <stdint.h>
  26. #include <stdio.h>
  27. #include <stdlib.h>
  28. #include <string.h>
  29. #include <unistd.h>
  30. #include <wchar.h>
  31. #include <wctype.h>
  32. #include <espeak-ng/espeak_ng.h>
  33. #include <espeak-ng/speak_lib.h>
  34. #include <espeak-ng/encoding.h>
  35. #include <ucd/ucd.h>
  36. #include "ssml.h"
  37. #include "common.h" // for strncpy0
  38. #include "mnemonics.h" // for LookupMnemName, MNEM_TAB,
  39. #include "readclause.h" // for PARAM_STACK, param_stack, AddNameData
  40. #include "soundicon.h" // for LoadSoundFile2
  41. #include "synthesize.h" // for SPEED_FACTORS, speed
  42. #include "translate.h" // for CTRL_EMBEDDED
  43. #include "voice.h" // for SelectVoice, SelectVoiceByName
  44. #include "speech.h" // for MAKE_MEM_UNDEFINED
// Maps tag names to internal tag-type codes. Both SSML tags and a few
// HTML tags are recognized so that simple HTML input can be spoken:
// block-level HTML tags act as breaks or paragraphs, inline tags
// (HTML_NOSPACE) do not insert a space, and the content of
// <script>/<style>/<metadata> is skipped entirely (SSML_IGNORE_TEXT).
static const MNEM_TAB ssmltags[] = {
	// SSML tags
	{ "speak", SSML_SPEAK },
	{ "voice", SSML_VOICE },
	{ "prosody", SSML_PROSODY },
	{ "say-as", SSML_SAYAS },
	{ "mark", SSML_MARK },
	{ "s", SSML_SENTENCE },
	{ "p", SSML_PARAGRAPH },
	{ "phoneme", SSML_PHONEME },
	{ "sub", SSML_SUB },
	{ "tts:style", SSML_STYLE },
	{ "audio", SSML_AUDIO },
	{ "emphasis", SSML_EMPHASIS },
	{ "break", SSML_BREAK },
	{ "metadata", SSML_IGNORE_TEXT },

	// HTML tags that force a clause break
	{ "br", HTML_BREAK },
	{ "li", HTML_BREAK },
	{ "dd", HTML_BREAK },
	{ "img", HTML_BREAK },
	{ "td", HTML_BREAK },

	// HTML headings / rules are treated as paragraphs
	{ "h1", SSML_PARAGRAPH },
	{ "h2", SSML_PARAGRAPH },
	{ "h3", SSML_PARAGRAPH },
	{ "h4", SSML_PARAGRAPH },
	{ "hr", SSML_PARAGRAPH },

	// content to skip entirely
	{ "script", SSML_IGNORE_TEXT },
	{ "style", SSML_IGNORE_TEXT },

	// inline HTML tags which can occur inside a word: no space inserted
	{ "font", HTML_NOSPACE },
	{ "b", HTML_NOSPACE },
	{ "i", HTML_NOSPACE },
	{ "strong", HTML_NOSPACE },
	{ "em", HTML_NOSPACE },
	{ "code", HTML_NOSPACE },

	{ NULL, 0 }
};
  80. static int attrcmp(const wchar_t *string1, const char *string2)
  81. {
  82. int ix;
  83. if (string1 == NULL)
  84. return 1;
  85. for (ix = 0; (string1[ix] == string2[ix]) && (string1[ix] != 0); ix++)
  86. ;
  87. if (((string1[ix] == '"') || (string1[ix] == '\'')) && (string2[ix] == 0))
  88. return 0;
  89. return 1;
  90. }
  91. static int attrlookup(const wchar_t *string1, const MNEM_TAB *mtab)
  92. {
  93. int ix;
  94. for (ix = 0; mtab[ix].mnem != NULL; ix++) {
  95. if (attrcmp(string1, mtab[ix].mnem) == 0)
  96. return mtab[ix].value;
  97. }
  98. return mtab[ix].value;
  99. }
  100. static int attrnumber(const wchar_t *pw, int default_value, int type)
  101. {
  102. int value = 0;
  103. if ((pw == NULL) || !IsDigit09(*pw))
  104. return default_value;
  105. while (IsDigit09(*pw))
  106. value = value*10 + *pw++ - '0';
  107. if ((type == 1) && (ucd_tolower(*pw) == 's')) {
  108. // time: seconds rather than ms
  109. value *= 1000;
  110. }
  111. return value;
  112. }
  113. static int attrcopy_utf8(char *buf, const wchar_t *pw, int len)
  114. {
  115. // Convert attribute string into utf8, write to buf, and return its utf8 length
  116. int ix = 0;
  117. if (pw != NULL) {
  118. unsigned int c;
  119. int prev_c = 0;
  120. while ((ix < (len-4)) && ((c = *pw++) != 0)) {
  121. if ((c == '"') && (prev_c != '\\'))
  122. break; // " indicates end of attribute, unless preceded by backstroke
  123. int n = utf8_out(c, &buf[ix]);
  124. ix += n;
  125. prev_c = c;
  126. }
  127. }
  128. buf[ix] = 0;
  129. return ix;
  130. }
  131. static int attr_prosody_value(int param_type, const wchar_t *pw, int *value_out)
  132. {
  133. int sign = 0;
  134. wchar_t *tail;
  135. double value;
  136. while (iswspace(*pw)) pw++;
  137. if (*pw == '+') {
  138. pw++;
  139. sign = 1;
  140. }
  141. if (*pw == '-') {
  142. pw++;
  143. sign = -1;
  144. }
  145. value = (double)wcstod(pw, &tail);
  146. if (tail == pw) {
  147. // failed to find a number, return 100%
  148. *value_out = 100;
  149. return 2;
  150. }
  151. if (*tail == '%') {
  152. if (sign != 0)
  153. value = 100 + (sign * value);
  154. *value_out = (int)value;
  155. return 2; // percentage
  156. }
  157. if ((tail[0] == 's') && (tail[1] == 't')) {
  158. double x;
  159. // convert from semitones to a frequency percentage
  160. x = pow((double)2.0, (double)((value*sign)/12)) * 100;
  161. *value_out = (int)x;
  162. return 2; // percentage
  163. }
  164. if (param_type == espeakRATE) {
  165. if (sign == 0)
  166. *value_out = (int)(value * 100);
  167. else
  168. *value_out = 100 + (int)(sign * value * 100);
  169. return 2; // percentage
  170. }
  171. *value_out = (int)value;
  172. return sign; // -1, 0, or 1
  173. }
// Use the voice properties from the SSML stack to choose a voice, and switch
// to that voice if it's not the current voice.
// Frame 0 supplies the base settings; later frames (pushed by <voice>,
// xml:lang, etc.) override them, bottom to top. Returns the selected voice
// id, possibly with the original "+variant" suffix re-appended, or
// "default" if SelectVoice fails.
// NOTE: the result may point at the static voice_name buffer, so it is
// only valid until the next call.
static const char *VoiceFromStack(SSML_STACK *ssml_stack, int n_ssml_stack, espeak_VOICE *base_voice, char base_voice_variant_name[40])
{
	int ix;
	const char *p;
	SSML_STACK *sp;
	const char *v_id;
	int voice_found;
	espeak_VOICE voice_select;
	static char voice_name[40]; // also the returned buffer when a variant is appended
	char language[40];

	MAKE_MEM_UNDEFINED(&voice_name, sizeof(voice_name));

	// start from the base (bottom) stack frame
	strcpy(voice_name, ssml_stack[0].voice_name);
	strcpy(language, ssml_stack[0].language);
	voice_select.age = ssml_stack[0].voice_age;
	voice_select.gender = ssml_stack[0].voice_gender;
	voice_select.variant = ssml_stack[0].voice_variant_number;
	voice_select.identifier = NULL;

	// apply overrides from each stack frame in turn
	for (ix = 0; ix < n_ssml_stack; ix++) {
		sp = &ssml_stack[ix];
		int voice_name_specified = 0;

		// an explicit, valid voice name overrides all other properties
		if ((sp->voice_name[0] != 0) && (SelectVoiceByName(NULL, sp->voice_name) != NULL)) {
			voice_name_specified = 1;
			strcpy(voice_name, sp->voice_name);
			language[0] = 0;
			voice_select.gender = ENGENDER_UNKNOWN;
			voice_select.age = 0;
			voice_select.variant = 0;
		}
		if (sp->language[0] != 0) {
			strcpy(language, sp->language);

			// is this language provided by the base voice?
			// (base_voice->languages appears to be a packed list of
			//  priority byte + name entries -- confirm against espeak_VOICE)
			p = base_voice->languages;
			while (*p++ != 0) {
				if (strcmp(p, language) == 0) {
					// yes, change the language to the main language of the base voice
					strcpy(language, &base_voice->languages[1]);
					break;
				}
				p += (strlen(p) + 1);
			}
			if (voice_name_specified == 0)
				voice_name[0] = 0; // forget a previous voice name if a language is specified
		}
		if (sp->voice_gender != ENGENDER_UNKNOWN)
			voice_select.gender = sp->voice_gender;
		if (sp->voice_age != 0)
			voice_select.age = sp->voice_age;
		if (sp->voice_variant_number != 0)
			voice_select.variant = sp->voice_variant_number;
	}

	voice_select.name = voice_name;
	voice_select.languages = language;
	v_id = SelectVoice(&voice_select, &voice_found);
	if (v_id == NULL)
		return "default";

	if ((strchr(v_id, '+') == NULL) && ((voice_select.gender == ENGENDER_UNKNOWN) || (voice_select.gender == base_voice->gender)) && (base_voice_variant_name[0] != 0)) {
		// a voice variant has not been selected, use the original voice variant
		char buf[80];
		sprintf(buf, "%s+%s", v_id, base_voice_variant_name);
		strncpy0(voice_name, buf, sizeof(voice_name));
		return voice_name;
	}
	return v_id;
}
  240. static const wchar_t *GetSsmlAttribute(wchar_t *pw, const char *name)
  241. {
  242. // Gets the value string for an attribute.
  243. // Returns NULL if the attribute is not present
  244. int ix;
  245. static const wchar_t empty[1] = { 0 };
  246. while (*pw != 0) {
  247. if (iswspace(pw[-1])) {
  248. ix = 0;
  249. while (*pw == name[ix]) {
  250. pw++;
  251. ix++;
  252. }
  253. if (name[ix] == 0) {
  254. // found the attribute, now get the value
  255. while (iswspace(*pw)) pw++;
  256. if (*pw == '=') pw++;
  257. while (iswspace(*pw)) pw++;
  258. if ((*pw == '"') || (*pw == '\'')) // allow single-quotes ?
  259. return pw+1;
  260. else
  261. return empty;
  262. }
  263. }
  264. pw++;
  265. }
  266. return NULL;
  267. }
// Determines whether voice attributes are specified in this tag, and if so, whether this means
// a voice change.
// If it's a closing tag, delete the top frame of the stack and determine whether this implies
// a voice change.
// Returns CLAUSE_TYPE_VOICE_CHANGE if there is a voice change, 0 otherwise.
// NOTE(review): n_ssml_stack is passed by value, so the push/pop performed
// here only affects the VoiceFromStack() call below and is not visible to
// the caller -- confirm the caller maintains the stack depth itself.
static int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char *base_voice_variant_name)
{
	const char *new_voice_id;

	// recognized values for the "gender" attribute
	static const MNEM_TAB mnem_gender[] = {
		{ "male", ENGENDER_MALE },
		{ "female", ENGENDER_FEMALE },
		{ "neutral", ENGENDER_NEUTRAL },
		{ NULL, ENGENDER_UNKNOWN }
	};

	if (tag_type & SSML_CLOSE) {
		// delete a stack frame
		if (n_ssml_stack > 1)
			n_ssml_stack--;
	} else {
		const wchar_t *lang;
		const wchar_t *gender;
		const wchar_t *name;
		const wchar_t *age;
		const wchar_t *variant;

		// add a stack frame if any voice details are specified
		lang = GetSsmlAttribute(pw, "xml:lang");

		if (tag_type != SSML_VOICE) {
			// only expect an xml:lang attribute
			name = NULL;
			variant = NULL;
			age = NULL;
			gender = NULL;
		} else {
			name = GetSsmlAttribute(pw, "name");
			variant = GetSsmlAttribute(pw, "variant");
			age = GetSsmlAttribute(pw, "age");
			gender = GetSsmlAttribute(pw, "gender");
		}

		if ((tag_type != SSML_VOICE) && (lang == NULL))
			return 0; // <s> or <p> without language spec, nothing to do

		// push a new frame and fill it from the attributes
		ssml_sp = &ssml_stack[n_ssml_stack++];

		int value;
		attrcopy_utf8(ssml_sp->language, lang, sizeof(ssml_sp->language));
		attrcopy_utf8(ssml_sp->voice_name, name, sizeof(ssml_sp->voice_name));
		if ((value = attrnumber(variant, 1, 0)) > 0)
			value--; // variant='0' and variant='1' the same
		ssml_sp->voice_variant_number = value;
		ssml_sp->voice_age = attrnumber(age, 0, 0);
		ssml_sp->voice_gender = attrlookup(gender, mnem_gender);
		ssml_sp->tag_type = tag_type;
	}

	// re-evaluate the voice from the (locally adjusted) stack
	new_voice_id = VoiceFromStack(ssml_stack, n_ssml_stack, base_voice, base_voice_variant_name);
	if (strcmp(new_voice_id, current_voice_id) != 0) {
		// add an embedded command to change the voice
		strcpy(current_voice_id, new_voice_id);
		return CLAUSE_TYPE_VOICE_CHANGE;
	}
	return 0;
}
// Set the speech parameters from the parameter stack.
// Folds all stack frames into one effective value per parameter (later
// frames override earlier ones; -1 means "no change"), then for every
// parameter whose value differs from speech_parameters[] either updates
// the corresponding global option directly or appends an embedded
// command (CTRL_EMBEDDED sequence) to outbuf, advancing *outix.
static void ProcessParamStack(char *outbuf, int *outix, int n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
{
	int param;
	int ix;
	char buf[20];
	int new_parameters[N_SPEECH_PARAM];

	// embedded command letters, indexed by parameter type (0 = no command letter)
	static const char cmd_letter[N_SPEECH_PARAM] = { 0, 'S', 'A', 'P', 'R', 0, 'C', 0, 0, 0, 0, 0, 'F' }; // embedded command letters

	for (param = 0; param < N_SPEECH_PARAM; param++)
		new_parameters[param] = -1;

	// fold the stack, bottom to top
	for (ix = 0; ix < n_param_stack; ix++) {
		for (param = 0; param < N_SPEECH_PARAM; param++) {
			if (param_stack[ix].parameter[param] >= 0)
				new_parameters[param] = param_stack[ix].parameter[param];
		}
	}

	for (param = 0; param < N_SPEECH_PARAM; param++) {
		int value;
		if ((value = new_parameters[param]) != speech_parameters[param]) {
			buf[0] = 0;

			switch (param)
			{
			case espeakPUNCTUATION:
				// applied via global option, no embedded command
				option_punctuation = value-1;
				break;
			case espeakCAPITALS:
				option_capitals = value;
				break;
			case espeakRATE:
			case espeakVOLUME:
			case espeakPITCH:
			case espeakRANGE:
			case espeakEMPHASIS:
				// emit an embedded command, eg. <CTRL_EMBEDDED>100S for rate
				sprintf(buf, "%c%d%c", CTRL_EMBEDDED, value, cmd_letter[param]);
				break;
			}
			speech_parameters[param] = new_parameters[param];
			strcpy(&outbuf[*outix], buf);
			*outix += strlen(buf);
		}
	}
}
  369. static PARAM_STACK *PushParamStack(int tag_type, int *n_param_stack, PARAM_STACK *param_stack)
  370. {
  371. int ix;
  372. PARAM_STACK *sp;
  373. sp = &param_stack[*n_param_stack];
  374. if (*n_param_stack < (N_PARAM_STACK-1))
  375. (*n_param_stack)++;
  376. sp->type = tag_type;
  377. for (ix = 0; ix < N_SPEECH_PARAM; ix++)
  378. sp->parameter[ix] = -1;
  379. return sp;
  380. }
  381. static void PopParamStack(int tag_type, char *outbuf, int *outix, int *n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
  382. {
  383. // unwind the stack up to and including the previous tag of this type
  384. int ix;
  385. int top = 0;
  386. if (tag_type >= SSML_CLOSE)
  387. tag_type -= SSML_CLOSE;
  388. for (ix = 0; ix < *n_param_stack; ix++) {
  389. if (param_stack[ix].type == tag_type)
  390. top = ix;
  391. }
  392. if (top > 0)
  393. *n_param_stack = top;
  394. ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
  395. }
  396. static int ReplaceKeyName(char *outbuf, int index, int *outix)
  397. {
  398. // Replace some key-names by single characters, so they can be pronounced in different languages
  399. static const MNEM_TAB keynames[] = {
  400. { "space ", 0xe020 },
  401. { "tab ", 0xe009 },
  402. { "underscore ", 0xe05f },
  403. { "double-quote ", '"' },
  404. { NULL, 0 }
  405. };
  406. int letter;
  407. char *p;
  408. p = &outbuf[index];
  409. if ((letter = LookupMnem(keynames, p)) != 0) {
  410. int ix;
  411. ix = utf8_out(letter, p);
  412. *outix = index + ix;
  413. return letter;
  414. }
  415. return 0;
  416. }
// Set one prosody parameter (rate/volume/pitch/range) on the stack frame
// sp from an SSML attribute value, which may be a named level ("x-slow",
// "loud", ...), a percentage, a value in semitones, an absolute value, or
// a signed relative change.
static void SetProsodyParameter(int param_type, const wchar_t *attr1, PARAM_STACK *sp, PARAM_STACK *param_stack, int *speech_parameters)
{
	int value;

	// named levels, as a percentage of the base setting
	static const MNEM_TAB mnem_volume[] = {
		{ "default", 100 },
		{ "silent", 0 },
		{ "x-soft", 30 },
		{ "soft", 65 },
		{ "medium", 100 },
		{ "loud", 150 },
		{ "x-loud", 230 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_rate[] = {
		{ "default", 100 },
		{ "x-slow", 60 },
		{ "slow", 80 },
		{ "medium", 100 },
		{ "fast", 125 },
		{ "x-fast", 160 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_pitch[] = {
		{ "default", 100 },
		{ "x-low", 70 },
		{ "low", 85 },
		{ "medium", 100 },
		{ "high", 110 },
		{ "x-high", 120 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_range[] = {
		{ "default", 100 },
		{ "x-low", 20 },
		{ "low", 50 },
		{ "medium", 100 },
		{ "high", 140 },
		{ "x-high", 180 },
		{ NULL, -1 }
	};

	// tables indexed by param_type (espeakRATE .. espeakRANGE)
	static const MNEM_TAB * const mnem_tabs[5] = {
		NULL, mnem_rate, mnem_volume, mnem_pitch, mnem_range
	};

	if ((value = attrlookup(attr1, mnem_tabs[param_type])) >= 0) {
		// mnemonic specifies a value as a percentage of the base pitch/range/rate/volume
		sp->parameter[param_type] = (param_stack[0].parameter[param_type] * value)/100;
	} else {
		// otherwise parse and classify the numeric value
		int sign = attr_prosody_value(param_type, attr1, &value);

		if (sign == 0)
			sp->parameter[param_type] = value; // absolute value in Hz
		else if (sign == 2) {
			// change specified as percentage or in semitones
			sp->parameter[param_type] = (speech_parameters[param_type] * value)/100;
		} else {
			// change specified as plus or minus Hz
			sp->parameter[param_type] = speech_parameters[param_type] + (value*sign);
		}
	}
}
  476. int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, const char *xmlbase, bool *audio_text, char *current_voice_id, espeak_VOICE *base_voice, char *base_voice_variant_name, bool *ignore_text, bool *clear_skipping_text, int *sayas_mode, int *sayas_start, SSML_STACK *ssml_stack, int *n_ssml_stack, int *n_param_stack, int *speech_parameters)
  477. {
  478. // xml_buf is the tag and attributes with a zero terminator in place of the original '>'
  479. // returns a clause terminator value.
  480. unsigned int ix;
  481. int index;
  482. int tag_type;
  483. int value;
  484. int value2;
  485. int value3;
  486. int voice_change_flag;
  487. wchar_t *px;
  488. const wchar_t *attr1;
  489. const wchar_t *attr2;
  490. const wchar_t *attr3;
  491. int terminator;
  492. int param_type;
  493. char tag_name[40];
  494. char buf[160];
  495. PARAM_STACK *sp;
  496. SSML_STACK *ssml_sp;
  497. // don't process comments and xml declarations
  498. if (wcsncmp(xml_buf, (wchar_t *) "!--", 3) == 0 || wcsncmp(xml_buf, (wchar_t *) "?xml", 4) == 0) {
  499. return 0;
  500. }
  501. // these tags have no effect if they are self-closing, eg. <voice />
  502. static const char ignore_if_self_closing[] = { 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0 };
  503. bool self_closing = false;
  504. int len;
  505. len = wcslen(xml_buf);
  506. if (xml_buf[len - 1] == '/') {
  507. // a self-closing tag
  508. xml_buf[len - 1] = ' ';
  509. self_closing = true;
  510. }
  511. static const MNEM_TAB mnem_phoneme_alphabet[] = {
  512. { "espeak", 1 },
  513. { NULL, -1 }
  514. };
  515. static const MNEM_TAB mnem_punct[] = {
  516. { "none", 1 },
  517. { "all", 2 },
  518. { "some", 3 },
  519. { NULL, -1 }
  520. };
  521. static const MNEM_TAB mnem_capitals[] = {
  522. { "no", 0 },
  523. { "icon", 1 },
  524. { "spelling", 2 },
  525. { "pitch", 20 }, // this is the amount by which to raise the pitch
  526. { NULL, -1 }
  527. };
  528. static const MNEM_TAB mnem_interpret_as[] = {
  529. { "characters", SAYAS_CHARS },
  530. { "tts:char", SAYAS_SINGLE_CHARS },
  531. { "tts:key", SAYAS_KEY },
  532. { "tts:digits", SAYAS_DIGITS },
  533. { "telephone", SAYAS_DIGITS1 },
  534. { NULL, -1 }
  535. };
  536. static const MNEM_TAB mnem_sayas_format[] = {
  537. { "glyphs", 1 },
  538. { NULL, -1 }
  539. };
  540. static const MNEM_TAB mnem_break[] = {
  541. { "none", 0 },
  542. { "x-weak", 1 },
  543. { "weak", 2 },
  544. { "medium", 3 },
  545. { "strong", 4 },
  546. { "x-strong", 5 },
  547. { NULL, -1 }
  548. };
  549. static const MNEM_TAB mnem_emphasis[] = {
  550. { "none", 1 },
  551. { "reduced", 2 },
  552. { "moderate", 3 },
  553. { "strong", 4 },
  554. { "x-strong", 5 },
  555. { NULL, -1 }
  556. };
  557. static const char * const prosody_attr[5] = {
  558. NULL, "rate", "volume", "pitch", "range"
  559. };
  560. for (ix = 0; ix < (sizeof(tag_name)-1); ix++) {
  561. int c;
  562. if (((c = xml_buf[ix]) == 0) || iswspace(c))
  563. break;
  564. tag_name[ix] = tolower((char)c);
  565. }
  566. tag_name[ix] = 0;
  567. px = &xml_buf[ix]; // the tag's attributes
  568. if (tag_name[0] == '/') {
  569. // closing tag
  570. if ((tag_type = LookupMnem(ssmltags, &tag_name[1])) != HTML_NOSPACE)
  571. outbuf[(*outix)++] = ' ';
  572. tag_type += SSML_CLOSE;
  573. } else {
  574. if ((tag_type = LookupMnem(ssmltags, tag_name)) != HTML_NOSPACE) {
  575. // separate SSML tags from the previous word (but not HMTL tags such as <b> <font> which can occur inside a word)
  576. outbuf[(*outix)++] = ' ';
  577. }
  578. if (self_closing && ignore_if_self_closing[tag_type])
  579. return 0;
  580. }
  581. voice_change_flag = 0;
  582. ssml_sp = &ssml_stack[*n_ssml_stack-1];
  583. switch (tag_type)
  584. {
  585. case SSML_STYLE:
  586. sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
  587. attr1 = GetSsmlAttribute(px, "field");
  588. attr2 = GetSsmlAttribute(px, "mode");
  589. if (attrcmp(attr1, "punctuation") == 0) {
  590. value = attrlookup(attr2, mnem_punct);
  591. sp->parameter[espeakPUNCTUATION] = value;
  592. } else if (attrcmp(attr1, "capital_letters") == 0) {
  593. value = attrlookup(attr2, mnem_capitals);
  594. sp->parameter[espeakCAPITALS] = value;
  595. }
  596. ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
  597. break;
  598. case SSML_PROSODY:
  599. sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
  600. // look for attributes: rate, volume, pitch, range
  601. for (param_type = espeakRATE; param_type <= espeakRANGE; param_type++) {
  602. if ((attr1 = GetSsmlAttribute(px, prosody_attr[param_type])) != NULL)
  603. SetProsodyParameter(param_type, attr1, sp, param_stack, speech_parameters);
  604. }
  605. ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
  606. break;
  607. case SSML_EMPHASIS:
  608. sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
  609. value = 3; // default is "moderate"
  610. if ((attr1 = GetSsmlAttribute(px, "level")) != NULL)
  611. value = attrlookup(attr1, mnem_emphasis);
  612. if (translator->langopts.tone_language == 1) {
  613. static const unsigned char emphasis_to_pitch_range[] = { 50, 50, 40, 70, 90, 100 };
  614. static const unsigned char emphasis_to_volume[] = { 100, 100, 70, 110, 135, 150 };
  615. // tone language (eg.Chinese) do emphasis by increasing the pitch range.
  616. sp->parameter[espeakRANGE] = emphasis_to_pitch_range[value];
  617. sp->parameter[espeakVOLUME] = emphasis_to_volume[value];
  618. } else {
  619. static const unsigned char emphasis_to_volume2[] = { 100, 100, 75, 100, 120, 150 };
  620. sp->parameter[espeakVOLUME] = emphasis_to_volume2[value];
  621. sp->parameter[espeakEMPHASIS] = value;
  622. }
  623. ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
  624. break;
  625. case SSML_STYLE + SSML_CLOSE:
  626. case SSML_PROSODY + SSML_CLOSE:
  627. case SSML_EMPHASIS + SSML_CLOSE:
  628. PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
  629. break;
  630. case SSML_PHONEME:
  631. attr1 = GetSsmlAttribute(px, "alphabet");
  632. attr2 = GetSsmlAttribute(px, "ph");
  633. value = attrlookup(attr1, mnem_phoneme_alphabet);
  634. if (value == 1) { // alphabet="espeak"
  635. outbuf[(*outix)++] = '[';
  636. outbuf[(*outix)++] = '[';
  637. *outix += attrcopy_utf8(&outbuf[*outix], attr2, n_outbuf-*outix);
  638. outbuf[(*outix)++] = ']';
  639. outbuf[(*outix)++] = ']';
  640. }
  641. break;
  642. case SSML_SAYAS:
  643. attr1 = GetSsmlAttribute(px, "interpret-as");
  644. attr2 = GetSsmlAttribute(px, "format");
  645. attr3 = GetSsmlAttribute(px, "detail");
  646. value = attrlookup(attr1, mnem_interpret_as);
  647. value2 = attrlookup(attr2, mnem_sayas_format);
  648. if (value2 == 1)
  649. value = SAYAS_GLYPHS;
  650. value3 = attrnumber(attr3, 0, 0);
  651. if (value == SAYAS_DIGITS) {
  652. if (value3 <= 1)
  653. value = SAYAS_DIGITS1;
  654. else
  655. value = SAYAS_DIGITS + value3;
  656. }
  657. sprintf(buf, "%c%dY", CTRL_EMBEDDED, value);
  658. strcpy(&outbuf[*outix], buf);
  659. *outix += strlen(buf);
  660. *sayas_start = *outix;
  661. *sayas_mode = value; // punctuation doesn't end clause during SAY-AS
  662. break;
  663. case SSML_SAYAS + SSML_CLOSE:
  664. if (*sayas_mode == SAYAS_KEY) {
  665. outbuf[*outix] = 0;
  666. ReplaceKeyName(outbuf, *sayas_start, outix);
  667. }
  668. outbuf[(*outix)++] = CTRL_EMBEDDED;
  669. outbuf[(*outix)++] = 'Y';
  670. *sayas_mode = 0;
  671. break;
  672. case SSML_SUB:
  673. if ((attr1 = GetSsmlAttribute(px, "alias")) != NULL) {
  674. // use the alias rather than the text
  675. *ignore_text = true;
  676. *outix += attrcopy_utf8(&outbuf[*outix], attr1, n_outbuf-*outix);
  677. }
  678. break;
  679. case SSML_IGNORE_TEXT:
  680. *ignore_text = true;
  681. break;
  682. case SSML_SUB + SSML_CLOSE:
  683. case SSML_IGNORE_TEXT + SSML_CLOSE:
  684. *ignore_text = false;
  685. break;
  686. case SSML_MARK:
  687. if ((attr1 = GetSsmlAttribute(px, "name")) != NULL) {
  688. // add name to circular buffer of marker names
  689. attrcopy_utf8(buf, attr1, sizeof(buf));
  690. if (strcmp(skip_marker, buf) == 0) {
  691. // This is the marker we are waiting for before starting to speak
  692. *clear_skipping_text = true;
  693. skip_marker[0] = 0;
  694. return CLAUSE_NONE;
  695. }
  696. if ((index = AddNameData(buf, 0)) >= 0) {
  697. sprintf(buf, "%c%dM", CTRL_EMBEDDED, index);
  698. strcpy(&outbuf[*outix], buf);
  699. *outix += strlen(buf);
  700. }
  701. }
  702. break;
  703. case SSML_AUDIO:
  704. sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *)param_stack);
  705. if ((attr1 = GetSsmlAttribute(px, "src")) != NULL) {
  706. attrcopy_utf8(buf, attr1, sizeof(buf));
  707. if (uri_callback == NULL) {
  708. if ((xmlbase != NULL) && (buf[0] != '/')) {
  709. char fname[256];
  710. sprintf(fname, "%s/%s", xmlbase, buf);
  711. index = LoadSoundFile2(fname);
  712. } else
  713. index = LoadSoundFile2(buf);
  714. if (index >= 0) {
  715. sprintf(buf, "%c%dI", CTRL_EMBEDDED, index);
  716. strcpy(&outbuf[*outix], buf);
  717. *outix += strlen(buf);
  718. sp->parameter[espeakSILENCE] = 1;
  719. }
  720. } else {
  721. if ((index = AddNameData(buf, 0)) >= 0) {
  722. char *uri;
  723. uri = &namedata[index];
  724. if (uri_callback(1, uri, xmlbase) == 0) {
  725. sprintf(buf, "%c%dU", CTRL_EMBEDDED, index);
  726. strcpy(&outbuf[*outix], buf);
  727. *outix += strlen(buf);
  728. sp->parameter[espeakSILENCE] = 1;
  729. }
  730. }
  731. }
  732. }
  733. ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
  734. if (self_closing)
  735. PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
  736. else
  737. *audio_text = true;
  738. return CLAUSE_NONE;
  739. case SSML_AUDIO + SSML_CLOSE:
  740. PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
  741. *audio_text = false;
  742. return CLAUSE_NONE;
  743. case SSML_BREAK:
  744. value = 21;
  745. terminator = CLAUSE_NONE;
  746. if ((attr1 = GetSsmlAttribute(px, "strength")) != NULL) {
  747. static const int break_value[6] = { 0, 7, 14, 21, 40, 80 }; // *10mS
  748. value = attrlookup(attr1, mnem_break);
  749. if (value < 3) {
  750. // adjust prepause on the following word
  751. sprintf(&outbuf[*outix], "%c%dB", CTRL_EMBEDDED, value);
  752. *outix += 3;
  753. terminator = 0;
  754. }
  755. value = break_value[value];
  756. }
  757. if ((attr2 = GetSsmlAttribute(px, "time")) != NULL) {
  758. value2 = attrnumber(attr2, 0, 1); // pause in mS
  759. // compensate for speaking speed to keep constant pause length, see function PauseLength()
  760. // 'value' here is x 10mS
  761. value = (value2 * 256) / (speed.clause_pause_factor * 10);
  762. if (value < 200)
  763. value = (value2 * 256) / (speed.pause_factor * 10);
  764. if (terminator == 0)
  765. terminator = CLAUSE_NONE;
  766. }
  767. if (terminator) {
  768. if (value > 0xfff) {
  769. // scale down the value and set a scaling indicator bit
  770. value = value / 32;
  771. if (value > 0xfff)
  772. value = 0xfff;
  773. terminator |= CLAUSE_PAUSE_LONG;
  774. }
  775. return terminator + value;
  776. }
  777. break;
  778. case SSML_SPEAK:
  779. if ((attr1 = GetSsmlAttribute(px, "xml:base")) != NULL) {
  780. attrcopy_utf8(buf, attr1, sizeof(buf));
  781. if ((index = AddNameData(buf, 0)) >= 0)
  782. xmlbase = &namedata[index];
  783. }
  784. if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
  785. return 0; // no voice change
  786. return CLAUSE_VOICE;
  787. case SSML_VOICE:
  788. if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
  789. return 0; // no voice change
  790. return CLAUSE_VOICE;
  791. case SSML_SPEAK + SSML_CLOSE:
  792. // unwind stack until the previous <voice> or <speak> tag
  793. while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_SPEAK))
  794. (*n_ssml_stack)--;
  795. return CLAUSE_PERIOD + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  796. case SSML_VOICE + SSML_CLOSE:
  797. // unwind stack until the previous <voice> or <speak> tag
  798. while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_VOICE))
  799. (*n_ssml_stack)--;
  800. terminator = 0; // ?? Sentence intonation, but no pause ??
  801. return terminator + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  802. case HTML_BREAK:
  803. case HTML_BREAK + SSML_CLOSE:
  804. return CLAUSE_COLON;
  805. case SSML_SENTENCE:
  806. if (ssml_sp->tag_type == SSML_SENTENCE) {
  807. // new sentence implies end-of-sentence
  808. voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  809. }
  810. voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  811. return CLAUSE_PARAGRAPH + voice_change_flag;
  812. case SSML_PARAGRAPH:
  813. if (ssml_sp->tag_type == SSML_SENTENCE) {
  814. // new paragraph implies end-of-sentence or end-of-paragraph
  815. voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  816. }
  817. if (ssml_sp->tag_type == SSML_PARAGRAPH) {
  818. // new paragraph implies end-of-sentence or end-of-paragraph
  819. voice_change_flag |= GetVoiceAttributes(px, SSML_PARAGRAPH+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  820. }
  821. voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  822. return CLAUSE_PARAGRAPH + voice_change_flag;
  823. case SSML_SENTENCE + SSML_CLOSE:
  824. if (ssml_sp->tag_type == SSML_SENTENCE) {
  825. // end of a sentence which specified a language
  826. voice_change_flag = GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
  827. }
  828. return CLAUSE_PERIOD + voice_change_flag;
  829. case SSML_PARAGRAPH + SSML_CLOSE:
  830. if ((ssml_sp->tag_type == SSML_SENTENCE) || (ssml_sp->tag_type == SSML_PARAGRAPH)) {
  831. // End of a paragraph which specified a language.
  832. // (End-of-paragraph also implies end-of-sentence)
  833. return GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) + CLAUSE_PARAGRAPH;
  834. }
  835. return CLAUSE_PARAGRAPH;
  836. }
  837. return 0;
  838. }
// Predefined XML entity references and the characters they expand to.
// Looked up by name via LookupMnem() in ParseSsmlReference() below;
// the { NULL, -1 } entry terminates the table for that linear search.
static const MNEM_TAB xml_entity_mnemonics[] = {
	{ "gt", '>' },
	// '<' is offset into the Unicode private use area (0xe000) so that a
	// literal "&lt;" in the text is not later mistaken for the start of
	// a real XML tag by the clause reader
	{ "lt", 0xe000 + '<' }, // private usage area, to avoid confusion with XML tag
	{ "amp", '&' },
	{ "quot", '"' },
	{ "nbsp", ' ' },
	{ "apos", '\'' },
	{ NULL, -1 } // sentinel: end of table
};
  848. int ParseSsmlReference(char *ref, int *c1, int *c2) {
  849. // Check if buffer *ref contains an XML character or entity reference
  850. // if found, set *c1 to the replacement char
  851. // change *c2 for entity references
  852. // returns >= 0 on success
  853. if (ref[0] == '#') {
  854. // character reference
  855. if (ref[1] == 'x')
  856. return sscanf(&ref[2], "%x", c1);
  857. else
  858. return sscanf(&ref[1], "%d", c1);
  859. } else {
  860. // entity reference
  861. int found;
  862. if ((found = LookupMnem(xml_entity_mnemonics, ref)) != -1) {
  863. *c1 = found;
  864. if (*c2 == 0)
  865. *c2 = ' ';
  866. return found;
  867. }
  868. }
  869. return -1;
  870. }