eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.

ssml.c 29KB

/*
 * Copyright (C) 2005 to 2015 by Jonathan Duddington
 * email: [email protected]
 * Copyright (C) 2015-2017 Reece H. Dunn
 * Copyright (C) 2018 Juho Hiltunen
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
 */

#include "config.h"

#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <wchar.h>
#include <wctype.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>
#include <ucd/ucd.h>

#include "ssml.h"
#include "common.h"     // for strncpy0
#include "mnemonics.h"  // for LookupMnemName, MNEM_TAB
#include "readclause.h" // for PARAM_STACK, param_stack, AddNameData
#include "soundicon.h"  // for LoadSoundFile2
#include "synthesize.h" // for SPEED_FACTORS, speed
#include "translate.h"  // for CTRL_EMBEDDED
#include "voice.h"      // for SelectVoice, SelectVoiceByName
#include "speech.h"     // for MAKE_MEM_UNDEFINED
static const MNEM_TAB ssmltags[] = {
	{ "speak", SSML_SPEAK },
	{ "voice", SSML_VOICE },
	{ "prosody", SSML_PROSODY },
	{ "say-as", SSML_SAYAS },
	{ "mark", SSML_MARK },
	{ "s", SSML_SENTENCE },
	{ "p", SSML_PARAGRAPH },
	{ "phoneme", SSML_PHONEME },
	{ "sub", SSML_SUB },
	{ "tts:style", SSML_STYLE },
	{ "audio", SSML_AUDIO },
	{ "emphasis", SSML_EMPHASIS },
	{ "break", SSML_BREAK },
	{ "metadata", SSML_IGNORE_TEXT },

	{ "br", HTML_BREAK },
	{ "li", HTML_BREAK },
	{ "dd", HTML_BREAK },
	{ "img", HTML_BREAK },
	{ "td", HTML_BREAK },
	{ "h1", SSML_PARAGRAPH },
	{ "h2", SSML_PARAGRAPH },
	{ "h3", SSML_PARAGRAPH },
	{ "h4", SSML_PARAGRAPH },
	{ "hr", SSML_PARAGRAPH },
	{ "script", SSML_IGNORE_TEXT },
	{ "style", SSML_IGNORE_TEXT },
	{ "font", HTML_NOSPACE },
	{ "b", HTML_NOSPACE },
	{ "i", HTML_NOSPACE },
	{ "strong", HTML_NOSPACE },
	{ "em", HTML_NOSPACE },
	{ "code", HTML_NOSPACE },

	{ NULL, 0 }
};
static int (*uri_callback)(int, const char *, const char *) = NULL;

// Compare a (possibly quoted) wide-character attribute value against an ASCII string.
// Returns 0 if they match, 1 otherwise.
static int attrcmp(const wchar_t *string1, const char *string2)
{
	int ix;

	if (string1 == NULL)
		return 1;

	for (ix = 0; (string1[ix] == string2[ix]) && (string1[ix] != 0); ix++)
		;
	if (((string1[ix] == '"') || (string1[ix] == '\'')) && (string2[ix] == 0))
		return 0;
	return 1;
}

// Look up a wide-character attribute value in a mnemonic table.
// Returns the value of the table's terminating entry if no match is found.
static int attrlookup(const wchar_t *string1, const MNEM_TAB *mtab)
{
	int ix;

	for (ix = 0; mtab[ix].mnem != NULL; ix++) {
		if (attrcmp(string1, mtab[ix].mnem) == 0)
			return mtab[ix].value;
	}
	return mtab[ix].value;
}
// Parse a decimal number from an attribute value.
// type == 1: the value is a time; a trailing 's' means seconds rather than ms.
static int attrnumber(const wchar_t *pw, int default_value, int type)
{
	int value = 0;

	if ((pw == NULL) || !IsDigit09(*pw))
		return default_value;

	while (IsDigit09(*pw))
		value = value*10 + *pw++ - '0';
	if ((type == 1) && (ucd_tolower(*pw) == 's')) {
		// time: seconds rather than ms
		value *= 1000;
	}
	return value;
}

static int attrcopy_utf8(char *buf, const wchar_t *pw, int len)
{
	// Convert attribute string into utf8, write to buf, and return its utf8 length
	int ix = 0;

	if (pw != NULL) {
		wchar_t quote = pw[-1];
		if ((quote != '"') && (quote != '\'')) quote = 0;

		unsigned int c;
		int prev_c = 0;
		while ((ix < (len-4)) && ((c = *pw++) != 0)) {
			if ((quote == 0) && (isspace(c) || (c == '/')))
				break;
			if ((quote != 0) && (c == quote) && (prev_c != '\\'))
				break; // the quote character ends the attribute, unless preceded by a backslash
			int n = utf8_out(c, &buf[ix]);
			ix += n;
			prev_c = c;
		}
	}
	buf[ix] = 0;
	return ix;
}
static int attr_prosody_value(int param_type, const wchar_t *pw, int *value_out)
{
	int sign = 0;
	wchar_t *tail;
	double value;

	while (iswspace(*pw)) pw++;
	if (*pw == '+') {
		pw++;
		sign = 1;
	}
	if (*pw == '-') {
		pw++;
		sign = -1;
	}
	value = (double)wcstod(pw, &tail);
	if (tail == pw) {
		// failed to find a number, return 100%
		*value_out = 100;
		return 2;
	}
	if (*tail == '%') {
		if (sign != 0)
			value = 100 + (sign * value);
		*value_out = (int)value;
		return 2; // percentage
	}
	if ((tail[0] == 's') && (tail[1] == 't')) {
		double x;
		// convert from semitones to a frequency percentage
		x = pow((double)2.0, (double)((value*sign)/12)) * 100;
		*value_out = (int)x;
		return 2; // percentage
	}
	if (param_type == espeakRATE) {
		if (sign == 0)
			*value_out = (int)(value * 100);
		else
			*value_out = 100 + (int)(sign * value * 100);
		return 2; // percentage
	}
	*value_out = (int)value;
	return sign; // -1, 0, or 1
}
static const char *VoiceFromStack(SSML_STACK *ssml_stack, int n_ssml_stack, espeak_VOICE *base_voice, char base_voice_variant_name[40])
{
	// Use the voice properties from the SSML stack to choose a voice, and switch
	// to that voice if it's not the current voice

	int ix;
	const char *p;
	SSML_STACK *sp;
	const char *v_id;
	int voice_found;
	espeak_VOICE voice_select;
	static char voice_name[40];
	char language[40];

	MAKE_MEM_UNDEFINED(&voice_name, sizeof(voice_name));

	strcpy(voice_name, ssml_stack[0].voice_name);
	strcpy(language, ssml_stack[0].language);
	voice_select.age = ssml_stack[0].voice_age;
	voice_select.gender = ssml_stack[0].voice_gender;
	voice_select.variant = ssml_stack[0].voice_variant_number;
	voice_select.identifier = NULL;

	for (ix = 0; ix < n_ssml_stack; ix++) {
		sp = &ssml_stack[ix];
		int voice_name_specified = 0;

		if ((sp->voice_name[0] != 0) && (SelectVoiceByName(NULL, sp->voice_name) != NULL)) {
			voice_name_specified = 1;
			strcpy(voice_name, sp->voice_name);
			language[0] = 0;
			voice_select.gender = ENGENDER_UNKNOWN;
			voice_select.age = 0;
			voice_select.variant = 0;
		}
		if (sp->language[0] != 0) {
			strcpy(language, sp->language);

			// is this language provided by the base voice?
			p = base_voice->languages;
			while (*p++ != 0) {
				if (strcmp(p, language) == 0) {
					// yes, change the language to the main language of the base voice
					strcpy(language, &base_voice->languages[1]);
					break;
				}
				p += (strlen(p) + 1);
			}
			if (voice_name_specified == 0)
				voice_name[0] = 0; // forget a previous voice name if a language is specified
		}
		if (sp->voice_gender != ENGENDER_UNKNOWN)
			voice_select.gender = sp->voice_gender;
		if (sp->voice_age != 0)
			voice_select.age = sp->voice_age;
		if (sp->voice_variant_number != 0)
			voice_select.variant = sp->voice_variant_number;
	}

	voice_select.name = voice_name;
	voice_select.languages = language;
	v_id = SelectVoice(&voice_select, &voice_found);
	if (v_id == NULL)
		return "default";

	if ((strchr(v_id, '+') == NULL) && ((voice_select.gender == ENGENDER_UNKNOWN) || (voice_select.gender == base_voice->gender)) && (base_voice_variant_name[0] != 0)) {
		// a voice variant has not been selected, use the original voice variant
		char buf[80];
		sprintf(buf, "%s+%s", v_id, base_voice_variant_name);
		strncpy0(voice_name, buf, sizeof(voice_name));
		return voice_name;
	}
	return v_id;
}
static const wchar_t *GetSsmlAttribute(wchar_t *pw, const char *name)
{
	// Gets the value string for an attribute.
	// Returns NULL if the attribute is not present

	int ix;
	static const wchar_t empty[1] = { 0 };

	while (*pw != 0) {
		if (iswspace(pw[-1])) {
			ix = 0;
			while (*pw == name[ix]) {
				pw++;
				ix++;
			}
			if (name[ix] == 0) {
				// found the attribute, now get the value
				while (iswspace(*pw)) pw++;
				if (*pw == '=') pw++;
				while (iswspace(*pw)) pw++;
				if ((*pw == '"') || (*pw == '\'')) // allow single-quotes ?
					return pw+1;
				else if (iswspace(*pw) || (*pw == '/')) // end of attribute
					return empty;
				else
					return pw;
			}
		}
		pw++;
	}
	return NULL;
}
static int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char *base_voice_variant_name)
{
	// Determines whether voice attributes are specified in this tag, and if so, whether this means
	// a voice change.
	// If it's a closing tag, delete the top frame of the stack and determine whether this implies
	// a voice change.
	// Returns CLAUSE_TYPE_VOICE_CHANGE if there is a voice change

	const char *new_voice_id;

	static const MNEM_TAB mnem_gender[] = {
		{ "male", ENGENDER_MALE },
		{ "female", ENGENDER_FEMALE },
		{ "neutral", ENGENDER_NEUTRAL },
		{ NULL, ENGENDER_UNKNOWN }
	};

	if (tag_type & SSML_CLOSE) {
		// delete a stack frame
		if (n_ssml_stack > 1)
			n_ssml_stack--;
	} else {
		const wchar_t *lang;
		const wchar_t *gender;
		const wchar_t *name;
		const wchar_t *age;
		const wchar_t *variant;

		// add a stack frame if any voice details are specified
		lang = GetSsmlAttribute(pw, "xml:lang");

		if (tag_type != SSML_VOICE) {
			// only expect an xml:lang attribute
			name = NULL;
			variant = NULL;
			age = NULL;
			gender = NULL;
		} else {
			name = GetSsmlAttribute(pw, "name");
			variant = GetSsmlAttribute(pw, "variant");
			age = GetSsmlAttribute(pw, "age");
			gender = GetSsmlAttribute(pw, "gender");
		}

		if ((tag_type != SSML_VOICE) && (lang == NULL))
			return 0; // <s> or <p> without language spec, nothing to do

		ssml_sp = &ssml_stack[n_ssml_stack++];

		int value;
		attrcopy_utf8(ssml_sp->language, lang, sizeof(ssml_sp->language));
		attrcopy_utf8(ssml_sp->voice_name, name, sizeof(ssml_sp->voice_name));
		if ((value = attrnumber(variant, 1, 0)) > 0)
			value--; // variant='0' and variant='1' the same
		ssml_sp->voice_variant_number = value;
		ssml_sp->voice_age = attrnumber(age, 0, 0);
		ssml_sp->voice_gender = attrlookup(gender, mnem_gender);
		ssml_sp->tag_type = tag_type;
	}

	new_voice_id = VoiceFromStack(ssml_stack, n_ssml_stack, base_voice, base_voice_variant_name);
	if (strcmp(new_voice_id, current_voice_id) != 0) {
		// add an embedded command to change the voice
		strcpy(current_voice_id, new_voice_id);
		return CLAUSE_TYPE_VOICE_CHANGE;
	}

	return 0;
}
static void ProcessParamStack(char *outbuf, int *outix, int n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
{
	// Set the speech parameters from the parameter stack
	int param;
	int ix;
	char buf[20];
	int new_parameters[N_SPEECH_PARAM];

	static const char cmd_letter[N_SPEECH_PARAM] = { 0, 'S', 'A', 'P', 'R', 0, 'C', 0, 0, 0, 0, 0, 'F' }; // embedded command letters

	for (param = 0; param < N_SPEECH_PARAM; param++)
		new_parameters[param] = -1;

	for (ix = 0; ix < n_param_stack; ix++) {
		for (param = 0; param < N_SPEECH_PARAM; param++) {
			if (param_stack[ix].parameter[param] >= 0)
				new_parameters[param] = param_stack[ix].parameter[param];
		}
	}

	for (param = 0; param < N_SPEECH_PARAM; param++) {
		int value;
		if ((value = new_parameters[param]) != speech_parameters[param]) {
			buf[0] = 0;

			switch (param)
			{
			case espeakPUNCTUATION:
				option_punctuation = value-1;
				break;
			case espeakCAPITALS:
				option_capitals = value;
				break;
			case espeakRATE:
			case espeakVOLUME:
			case espeakPITCH:
			case espeakRANGE:
			case espeakEMPHASIS:
				sprintf(buf, "%c%d%c", CTRL_EMBEDDED, value, cmd_letter[param]);
				break;
			}

			speech_parameters[param] = new_parameters[param];
			strcpy(&outbuf[*outix], buf);
			*outix += strlen(buf);
		}
	}
}
static PARAM_STACK *PushParamStack(int tag_type, int *n_param_stack, PARAM_STACK *param_stack)
{
	int ix;
	PARAM_STACK *sp;

	sp = &param_stack[*n_param_stack];
	if (*n_param_stack < (N_PARAM_STACK-1))
		(*n_param_stack)++;

	sp->type = tag_type;
	for (ix = 0; ix < N_SPEECH_PARAM; ix++)
		sp->parameter[ix] = -1;
	return sp;
}

static void PopParamStack(int tag_type, char *outbuf, int *outix, int *n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
{
	// unwind the stack up to and including the previous tag of this type
	int ix;
	int top = 0;

	if (tag_type >= SSML_CLOSE)
		tag_type -= SSML_CLOSE;

	for (ix = 0; ix < *n_param_stack; ix++) {
		if (param_stack[ix].type == tag_type)
			top = ix;
	}
	if (top > 0)
		*n_param_stack = top;
	ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
}
static int ReplaceKeyName(char *outbuf, int index, int *outix)
{
	// Replace some key-names by single characters, so they can be pronounced in different languages
	static const MNEM_TAB keynames[] = {
		{ "space ", 0xe020 },
		{ "tab ", 0xe009 },
		{ "underscore ", 0xe05f },
		{ "double-quote ", '"' },
		{ NULL, 0 }
	};

	int letter;
	char *p;

	p = &outbuf[index];
	if ((letter = LookupMnem(keynames, p)) != 0) {
		int ix;
		ix = utf8_out(letter, p);
		*outix = index + ix;
		return letter;
	}
	return 0;
}
static void SetProsodyParameter(int param_type, const wchar_t *attr1, PARAM_STACK *sp, PARAM_STACK *param_stack, int *speech_parameters)
{
	int value;

	static const MNEM_TAB mnem_volume[] = {
		{ "default", 100 },
		{ "silent", 0 },
		{ "x-soft", 30 },
		{ "soft", 65 },
		{ "medium", 100 },
		{ "loud", 150 },
		{ "x-loud", 230 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_rate[] = {
		{ "default", 100 },
		{ "x-slow", 60 },
		{ "slow", 80 },
		{ "medium", 100 },
		{ "fast", 125 },
		{ "x-fast", 160 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_pitch[] = {
		{ "default", 100 },
		{ "x-low", 70 },
		{ "low", 85 },
		{ "medium", 100 },
		{ "high", 110 },
		{ "x-high", 120 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_range[] = {
		{ "default", 100 },
		{ "x-low", 20 },
		{ "low", 50 },
		{ "medium", 100 },
		{ "high", 140 },
		{ "x-high", 180 },
		{ NULL, -1 }
	};

	static const MNEM_TAB * const mnem_tabs[5] = {
		NULL, mnem_rate, mnem_volume, mnem_pitch, mnem_range
	};

	if ((value = attrlookup(attr1, mnem_tabs[param_type])) >= 0) {
		// mnemonic specifies a value as a percentage of the base pitch/range/rate/volume
		sp->parameter[param_type] = (param_stack[0].parameter[param_type] * value)/100;
	} else {
		int sign = attr_prosody_value(param_type, attr1, &value);

		if (sign == 0)
			sp->parameter[param_type] = value; // absolute value in Hz
		else if (sign == 2) {
			// change specified as percentage or in semitones
			sp->parameter[param_type] = (speech_parameters[param_type] * value)/100;
		} else {
			// change specified as plus or minus Hz
			sp->parameter[param_type] = speech_parameters[param_type] + (value*sign);
		}
	}
}
int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, const char *xmlbase, bool *audio_text, char *current_voice_id, espeak_VOICE *base_voice, char *base_voice_variant_name, bool *ignore_text, bool *clear_skipping_text, int *sayas_mode, int *sayas_start, SSML_STACK *ssml_stack, int *n_ssml_stack, int *n_param_stack, int *speech_parameters)
{
	// xml_buf is the tag and attributes with a zero terminator in place of the original '>'
	// returns a clause terminator value.

	unsigned int ix;
	int index;
	int tag_type;
	int value;
	int value2;
	int value3;
	int voice_change_flag;
	wchar_t *px;
	const wchar_t *attr1;
	const wchar_t *attr2;
	const wchar_t *attr3;
	int terminator;
	int param_type;
	char tag_name[40];
	char buf[160];
	PARAM_STACK *sp;
	SSML_STACK *ssml_sp;

	// don't process comments and xml declarations
	if (wcsncmp(xml_buf, L"!--", 3) == 0 || wcsncmp(xml_buf, L"?xml", 4) == 0) {
		return 0;
	}

	// these tags have no effect if they are self-closing, eg. <voice />
	static const char ignore_if_self_closing[] = { 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0 };

	bool self_closing = false;
	int len;
	len = wcslen(xml_buf);
	if (xml_buf[len - 1] == '/') {
		// a self-closing tag
		xml_buf[len - 1] = ' ';
		self_closing = true;
	}
	static const MNEM_TAB mnem_phoneme_alphabet[] = {
		{ "espeak", 1 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_punct[] = {
		{ "none", 1 },
		{ "all", 2 },
		{ "some", 3 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_capitals[] = {
		{ "no", 0 },
		{ "icon", 1 },
		{ "spelling", 2 },
		{ "pitch", 20 }, // this is the amount by which to raise the pitch
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_interpret_as[] = {
		{ "characters", SAYAS_CHARS },
		{ "tts:char", SAYAS_SINGLE_CHARS },
		{ "tts:key", SAYAS_KEY },
		{ "tts:digits", SAYAS_DIGITS },
		{ "telephone", SAYAS_DIGITS1 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_sayas_format[] = {
		{ "glyphs", 1 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_break[] = {
		{ "none", 0 },
		{ "x-weak", 1 },
		{ "weak", 2 },
		{ "medium", 3 },
		{ "strong", 4 },
		{ "x-strong", 5 },
		{ NULL, -1 }
	};

	static const MNEM_TAB mnem_emphasis[] = {
		{ "none", 1 },
		{ "reduced", 2 },
		{ "moderate", 3 },
		{ "strong", 4 },
		{ "x-strong", 5 },
		{ NULL, -1 }
	};

	static const char * const prosody_attr[5] = {
		NULL, "rate", "volume", "pitch", "range"
	};
	for (ix = 0; ix < (sizeof(tag_name)-1); ix++) {
		int c;
		if (((c = xml_buf[ix]) == 0) || iswspace(c))
			break;
		tag_name[ix] = tolower((char)c);
	}
	tag_name[ix] = 0;

	px = &xml_buf[ix]; // the tag's attributes

	if (tag_name[0] == '/') {
		// closing tag
		if ((tag_type = LookupMnem(ssmltags, &tag_name[1])) != HTML_NOSPACE)
			outbuf[(*outix)++] = ' ';
		tag_type += SSML_CLOSE;
	} else {
		if ((tag_type = LookupMnem(ssmltags, tag_name)) != HTML_NOSPACE) {
			// separate SSML tags from the previous word (but not HTML tags such as <b> <font> which can occur inside a word)
			outbuf[(*outix)++] = ' ';
		}

		if (self_closing && ignore_if_self_closing[tag_type])
			return 0;
	}
	voice_change_flag = 0;
	ssml_sp = &ssml_stack[*n_ssml_stack-1];

	switch (tag_type)
	{
	case SSML_STYLE:
		sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
		attr1 = GetSsmlAttribute(px, "field");
		attr2 = GetSsmlAttribute(px, "mode");

		if (attrcmp(attr1, "punctuation") == 0) {
			value = attrlookup(attr2, mnem_punct);
			sp->parameter[espeakPUNCTUATION] = value;
		} else if (attrcmp(attr1, "capital_letters") == 0) {
			value = attrlookup(attr2, mnem_capitals);
			sp->parameter[espeakCAPITALS] = value;
		}
		ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
		break;
	case SSML_PROSODY:
		sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);

		// look for attributes: rate, volume, pitch, range
		for (param_type = espeakRATE; param_type <= espeakRANGE; param_type++) {
			if ((attr1 = GetSsmlAttribute(px, prosody_attr[param_type])) != NULL)
				SetProsodyParameter(param_type, attr1, sp, param_stack, speech_parameters);
		}
		ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
		break;
	case SSML_EMPHASIS:
		sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
		value = 3; // default is "moderate"
		if ((attr1 = GetSsmlAttribute(px, "level")) != NULL)
			value = attrlookup(attr1, mnem_emphasis);

		if (translator->langopts.tone_language == 1) {
			static const unsigned char emphasis_to_pitch_range[] = { 50, 50, 40, 70, 90, 100 };
			static const unsigned char emphasis_to_volume[] = { 100, 100, 70, 110, 135, 150 };

			// tone languages (e.g. Chinese) do emphasis by increasing the pitch range.
			sp->parameter[espeakRANGE] = emphasis_to_pitch_range[value];
			sp->parameter[espeakVOLUME] = emphasis_to_volume[value];
		} else {
			static const unsigned char emphasis_to_volume2[] = { 100, 100, 75, 100, 120, 150 };

			sp->parameter[espeakVOLUME] = emphasis_to_volume2[value];
			sp->parameter[espeakEMPHASIS] = value;
		}
		ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
		break;
	case SSML_STYLE + SSML_CLOSE:
	case SSML_PROSODY + SSML_CLOSE:
	case SSML_EMPHASIS + SSML_CLOSE:
		PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
		break;
	case SSML_PHONEME:
		attr1 = GetSsmlAttribute(px, "alphabet");
		attr2 = GetSsmlAttribute(px, "ph");
		value = attrlookup(attr1, mnem_phoneme_alphabet);

		if (value == 1) { // alphabet="espeak"
			outbuf[(*outix)++] = '[';
			outbuf[(*outix)++] = '[';
			*outix += attrcopy_utf8(&outbuf[*outix], attr2, n_outbuf-*outix);
			outbuf[(*outix)++] = ']';
			outbuf[(*outix)++] = ']';
		}
		break;
	case SSML_SAYAS:
		attr1 = GetSsmlAttribute(px, "interpret-as");
		attr2 = GetSsmlAttribute(px, "format");
		attr3 = GetSsmlAttribute(px, "detail");
		value = attrlookup(attr1, mnem_interpret_as);
		value2 = attrlookup(attr2, mnem_sayas_format);
		if (value2 == 1)
			value = SAYAS_GLYPHS;

		value3 = attrnumber(attr3, 0, 0);

		if (value == SAYAS_DIGITS) {
			if (value3 <= 1)
				value = SAYAS_DIGITS1;
			else
				value = SAYAS_DIGITS + value3;
		}

		sprintf(buf, "%c%dY", CTRL_EMBEDDED, value);
		strcpy(&outbuf[*outix], buf);
		*outix += strlen(buf);

		*sayas_start = *outix;
		*sayas_mode = value; // punctuation doesn't end clause during SAY-AS
		break;
	case SSML_SAYAS + SSML_CLOSE:
		if (*sayas_mode == SAYAS_KEY) {
			outbuf[*outix] = 0;
			ReplaceKeyName(outbuf, *sayas_start, outix);
		}

		outbuf[(*outix)++] = CTRL_EMBEDDED;
		outbuf[(*outix)++] = 'Y';
		*sayas_mode = 0;
		break;
	case SSML_SUB:
		if ((attr1 = GetSsmlAttribute(px, "alias")) != NULL) {
			// use the alias rather than the text
			*ignore_text = true;
			*outix += attrcopy_utf8(&outbuf[*outix], attr1, n_outbuf-*outix);
		}
		break;
	case SSML_IGNORE_TEXT:
		*ignore_text = true;
		break;
	case SSML_SUB + SSML_CLOSE:
	case SSML_IGNORE_TEXT + SSML_CLOSE:
		*ignore_text = false;
		break;
	case SSML_MARK:
		if ((attr1 = GetSsmlAttribute(px, "name")) != NULL) {
			// add name to circular buffer of marker names
			attrcopy_utf8(buf, attr1, sizeof(buf));

			if ((buf[0] != 0) && (strcmp(skip_marker, buf) == 0)) {
				// This is the marker we are waiting for before starting to speak
				*clear_skipping_text = true;
				skip_marker[0] = 0;
				return CLAUSE_NONE;
			}

			if ((index = AddNameData(buf, 0)) >= 0) {
				sprintf(buf, "%c%dM", CTRL_EMBEDDED, index);
				strcpy(&outbuf[*outix], buf);
				*outix += strlen(buf);
			}
		}
		break;
	case SSML_AUDIO:
		sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *)param_stack);

		if ((attr1 = GetSsmlAttribute(px, "src")) != NULL) {
			attrcopy_utf8(buf, attr1, sizeof(buf));

			if (uri_callback == NULL) {
				if ((xmlbase != NULL) && (buf[0] != '/')) {
					char fname[256];
					sprintf(fname, "%s/%s", xmlbase, buf);
					index = LoadSoundFile2(fname);
				} else
					index = LoadSoundFile2(buf);
				if (index >= 0) {
					sprintf(buf, "%c%dI", CTRL_EMBEDDED, index);
					strcpy(&outbuf[*outix], buf);
					*outix += strlen(buf);
					sp->parameter[espeakSILENCE] = 1;
				}
			} else {
				if ((index = AddNameData(buf, 0)) >= 0) {
					char *uri;
					uri = &namedata[index];
					if (uri_callback(1, uri, xmlbase) == 0) {
						sprintf(buf, "%c%dU", CTRL_EMBEDDED, index);
						strcpy(&outbuf[*outix], buf);
						*outix += strlen(buf);
						sp->parameter[espeakSILENCE] = 1;
					}
				}
			}
		}
		ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);

		if (self_closing)
			PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
		else
			*audio_text = true;
		return CLAUSE_NONE;
	case SSML_AUDIO + SSML_CLOSE:
		PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
		*audio_text = false;
		return CLAUSE_NONE;
	case SSML_BREAK:
		value = 21;
		terminator = CLAUSE_NONE;

		if ((attr1 = GetSsmlAttribute(px, "strength")) != NULL) {
			static const int break_value[6] = { 0, 7, 14, 21, 40, 80 }; // *10mS

			value = attrlookup(attr1, mnem_break);
			if (value < 0) value = 2;

			if (value < 3) {
				// adjust prepause on the following word
				sprintf(&outbuf[*outix], "%c%dB", CTRL_EMBEDDED, value);
				*outix += 3;
				terminator = 0;
			}
			value = break_value[value];
		}
		if ((attr2 = GetSsmlAttribute(px, "time")) != NULL) {
			value2 = attrnumber(attr2, 0, 1); // pause in mS
			value2 = value2 * speech_parameters[espeakSSML_BREAK_MUL] / 100;

			int wpm = speech_parameters[espeakRATE];
			espeak_SetParameter(espeakRATE, wpm, 0);

#if USE_LIBSONIC
			if (wpm >= espeakRATE_MAXIMUM) {
				// Compensate speedup with libsonic, see function SetSpeed()
				double sonic = ((double)wpm)/espeakRATE_NORMAL;
				value2 = value2 * sonic;
			}
#endif

			// compensate for speaking speed to keep constant pause length, see function PauseLength()
			// 'value' here is x 10mS
			value = (value2 * 256) / (speed.clause_pause_factor * 10);
			if (value < 200)
				value = (value2 * 256) / (speed.pause_factor * 10);

			if (terminator == 0)
				terminator = CLAUSE_NONE;
		}
		if (terminator) {
			if (value > 0xfff) {
				// scale down the value and set a scaling indicator bit
				value = value / 32;
				if (value > 0xfff)
					value = 0xfff;
				terminator |= CLAUSE_PAUSE_LONG;
			}
			return terminator + value;
		}
		break;
	case SSML_SPEAK:
		if ((attr1 = GetSsmlAttribute(px, "xml:base")) != NULL) {
			attrcopy_utf8(buf, attr1, sizeof(buf));
			if ((index = AddNameData(buf, 0)) >= 0)
				xmlbase = &namedata[index];
		}
		if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
			return 0; // no voice change
		return CLAUSE_VOICE;
	case SSML_VOICE:
		if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
			return 0; // no voice change
		return CLAUSE_VOICE;
	case SSML_SPEAK + SSML_CLOSE:
		// unwind stack until the previous <voice> or <speak> tag
		while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_SPEAK))
			(*n_ssml_stack)--;
		return CLAUSE_PERIOD + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
	case SSML_VOICE + SSML_CLOSE:
		// unwind stack until the previous <voice> or <speak> tag
		while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_VOICE))
			(*n_ssml_stack)--;

		terminator = 0; // ?? Sentence intonation, but no pause ??
		return terminator + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
	case HTML_BREAK:
	case HTML_BREAK + SSML_CLOSE:
		return CLAUSE_COLON;
	case SSML_SENTENCE:
		if (ssml_sp->tag_type == SSML_SENTENCE) {
			// new sentence implies end-of-sentence
			voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
		}
		voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
		return CLAUSE_PARAGRAPH + voice_change_flag;
	case SSML_PARAGRAPH:
		if (ssml_sp->tag_type == SSML_SENTENCE) {
			// new paragraph implies end-of-sentence or end-of-paragraph
			voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
		}
		if (ssml_sp->tag_type == SSML_PARAGRAPH) {
			// new paragraph implies end-of-sentence or end-of-paragraph
			voice_change_flag |= GetVoiceAttributes(px, SSML_PARAGRAPH+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
		}
		voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
		return CLAUSE_PARAGRAPH + voice_change_flag;
	case SSML_SENTENCE + SSML_CLOSE:
		if (ssml_sp->tag_type == SSML_SENTENCE) {
			// end of a sentence which specified a language
			voice_change_flag = GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
		}
		return CLAUSE_PERIOD + voice_change_flag;
	case SSML_PARAGRAPH + SSML_CLOSE:
		if ((ssml_sp->tag_type == SSML_SENTENCE) || (ssml_sp->tag_type == SSML_PARAGRAPH)) {
			// End of a paragraph which specified a language.
			// (End-of-paragraph also implies end-of-sentence)
			return GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) + CLAUSE_PARAGRAPH;
		}
		return CLAUSE_PARAGRAPH;
	}
	return 0;
}
#pragma GCC visibility push(default)

ESPEAK_API void espeak_SetUriCallback(int (*UriCallback)(int, const char *, const char *))
{
	uri_callback = UriCallback;
}

#pragma GCC visibility pop

static const MNEM_TAB xml_entity_mnemonics[] = {
	{ "gt", '>' },
	{ "lt", 0xe000 + '<' }, // private usage area, to avoid confusion with XML tag
	{ "amp", '&' },
	{ "quot", '"' },
	{ "nbsp", ' ' },
	{ "apos", '\'' },
	{ NULL, -1 }
};

int ParseSsmlReference(char *ref, int *c1, int *c2)
{
	// Check if buffer *ref contains an XML character or entity reference
	// if found, set *c1 to the replacement char
	// change *c2 for entity references
	// returns >= 0 on success

	if (ref[0] == '#') {
		// character reference
		if (ref[1] == 'x')
			return sscanf(&ref[2], "%x", c1);
		else
			return sscanf(&ref[1], "%d", c1);
	} else {
		// entity reference
		int found;
		if ((found = LookupMnem(xml_entity_mnemonics, ref)) != -1) {
			*c1 = found;
			if (*c2 == 0)
				*c2 = ' ';
			return found;
		}
	}
	return -1;
}
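
For context on how the URI callback stored in this file is used: an application can pass a function to espeak_SetUriCallback() so that <audio> URIs are reported back to it rather than loaded via LoadSoundFile2() (see the SSML_AUDIO case above). The sketch below is illustrative only and is not part of ssml.c; the callback name, the example SSML string, and the espeak_Initialize() arguments are assumptions chosen for the example.

// Illustrative sketch (not part of ssml.c): register a URI callback so that
// <audio src="..."> URIs are passed back to the application instead of being
// loaded with LoadSoundFile2(). Names and parameters here are examples only.
#include <stdio.h>
#include <string.h>
#include <espeak-ng/speak_lib.h>

static int my_uri_callback(int type, const char *uri, const char *base)
{
	// type 1 = <audio> element (the only type used by the SSML_AUDIO case above)
	printf("audio uri: %s (xml:base: %s)\n", uri, base ? base : "(none)");
	return 0; // 0: accept the URI, so espeak embeds a 'U' marker for it;
	          // non-zero: skip the audio and speak the element's alternative text
}

int main(void)
{
	const char *ssml =
		"<speak><audio src=\"chime.wav\">chime</audio> hello</speak>";

	if (espeak_Initialize(AUDIO_OUTPUT_SYNCH_PLAYBACK, 0, NULL, 0) < 0)
		return 1;
	espeak_SetUriCallback(my_uri_callback); // must be set before synthesis
	espeak_Synth(ssml, strlen(ssml) + 1, 0, POS_CHARACTER, 0,
	             espeakCHARS_UTF8 | espeakSSML, NULL, NULL);
	espeak_Synchronize();
	return 0;
}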