eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

synth_mbrola.c 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627
  1. /*
  2. * Copyright (C) 2005 to 2013 by Jonathan Duddington
  3. * email: [email protected]
  4. * Copyright (C) 2015-2016 Reece H. Dunn
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, see: <http://www.gnu.org/licenses/>.
  18. */
  19. #include "config.h"
  20. #include <ctype.h>
  21. #include <errno.h>
  22. #include <math.h>
  23. #include <stdbool.h>
  24. #include <stdint.h>
  25. #include <stdio.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. #include <espeak-ng/espeak_ng.h>
  29. #include <espeak-ng/speak_lib.h>
  30. #include <espeak-ng/encoding.h>
  31. #include "dictionary.h"
  32. #include "mbrola.h"
  33. #include "setlengths.h"
  34. #include "synthdata.h"
  35. #include "wavegen.h"
  36. #include "common.h"
  37. #include "phoneme.h"
  38. #include "voice.h"
  39. #include "speech.h"
  40. #include "synthesize.h"
  41. #include "translate.h"
  42. // defined outside the USE_MBROLA guard below so tests can find these even when MBROLA support is not compiled in
  43. int mbrola_delay; // LoadMbrolaTable resets this to 0 and sets it to 1000 once a voice has been loaded
  44. char mbrola_name[20]; // LoadMbrolaTable clears this and copies the voice name into it on success
  45. #if USE_MBROLA
  46. #if defined(_WIN32) || defined(_WIN64)
  47. #include <windows.h>
  48. #endif
  49. #include "mbrowrap.h"
  50. static MBROLA_TAB *mbrola_tab = NULL; // phoneme name translation table; (re)allocated and filled by LoadMbrolaTable, searched by GetMbrName
  51. static int mbrola_control = 0; // first 4-byte word of the phtrans file; its low byte divided by 16 is passed to setVolumeRatio_MBR
  52. static int mbr_name_prefix = 0; // set by GetMbrName for table entries with control bit 4; merged into the next name it returns
  53. espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate)
  54. {
  55. // Load a phoneme name translation table from espeak-ng-data/mbrola
  56. int size;
  57. int ix;
  58. int *pw;
  59. FILE *f_in;
  60. char path[sizeof(path_home)+15];
  61. mbrola_name[0] = 0;
  62. mbrola_delay = 0;
  63. mbr_name_prefix = 0;
  64. if (mbrola_voice == NULL) {
  65. samplerate = samplerate;
  66. SetParameter(espeakVOICETYPE, 0, 0);
  67. return ENS_OK;
  68. }
  69. if (!load_MBR())
  70. return ENS_MBROLA_NOT_FOUND;
  71. sprintf(path, "%s/mbrola/%s", path_home, mbrola_voice);
  72. #if PLATFORM_POSIX
  73. // if not found, then also look in
  74. // usr/share/mbrola/xx, /usr/share/mbrola/xx/xx, /usr/share/mbrola/voices/xx
  75. if (GetFileLength(path) <= 0) {
  76. sprintf(path, "/usr/share/mbrola/%s", mbrola_voice);
  77. if (GetFileLength(path) <= 0) {
  78. sprintf(path, "/usr/share/mbrola/%s/%s", mbrola_voice, mbrola_voice);
  79. if (GetFileLength(path) <= 0)
  80. sprintf(path, "/usr/share/mbrola/voices/%s", mbrola_voice);
  81. // Show error message
  82. if (GetFileLength(path) <= 0) {
  83. fprintf(stderr, "Cannot find MBROLA voice file '%s' in neither of paths:\n"
  84. " - /usr/share/mbrola/%s\n"
  85. " - /usr/share/mbrola/%s/%s\n"
  86. " - /usr/share/mbrola/voices/%s\n"
  87. "Please install necessary MBROLA voice!\n",
  88. mbrola_voice, mbrola_voice, mbrola_voice, mbrola_voice, mbrola_voice);
  89. // Set path back to simple name, otherwise it shows misleading error only for
  90. // last unsuccessfully searched path
  91. sprintf(path, "%s", mbrola_voice);
  92. }
  93. }
  94. }
  95. close_MBR();
  96. #endif
  97. if (init_MBR(path) != 0) // initialise the required mbrola voice
  98. return ENS_MBROLA_VOICE_NOT_FOUND;
  99. setNoError_MBR(1); // don't stop on phoneme errors
  100. // read eSpeak's mbrola phoneme translation data, eg. en1_phtrans
  101. sprintf(path, "%s/mbrola_ph/%s", path_home, phtrans);
  102. size = GetFileLength(path);
  103. if (size < 0) // size == -errno
  104. return -size;
  105. if ((f_in = fopen(path, "rb")) == NULL) {
  106. int error = errno;
  107. close_MBR();
  108. return error;
  109. }
  110. MBROLA_TAB *new_mbrola_tab = (MBROLA_TAB *)realloc(mbrola_tab, size);
  111. if (new_mbrola_tab == NULL) {
  112. fclose(f_in);
  113. close_MBR();
  114. return ENOMEM;
  115. }
  116. mbrola_tab = new_mbrola_tab;
  117. mbrola_control = Read4Bytes(f_in);
  118. pw = (int *)mbrola_tab;
  119. for (ix = 4; ix < size; ix += 4)
  120. *pw++ = Read4Bytes(f_in);
  121. fclose(f_in);
  122. setVolumeRatio_MBR((float)(mbrola_control & 0xff) /16.0f);
  123. samplerate = *srate = getFreq_MBR();
  124. if (*srate == 22050)
  125. SetParameter(espeakVOICETYPE, 0, 0);
  126. else
  127. SetParameter(espeakVOICETYPE, 1, 0);
  128. strcpy(mbrola_name, mbrola_voice);
  129. mbrola_delay = 1000; // improve synchronization of events
  130. return ENS_OK;
  131. }
  132. static int GetMbrName(PHONEME_LIST *plist, PHONEME_TAB *ph, PHONEME_TAB *ph_prev, PHONEME_TAB *ph_next, int *name2, int *split, int *control)
  133. {
  134. // Look up a phoneme in the mbrola phoneme name translation table
  135. // It may give none, 1, or 2 mbrola phonemes
  136. MBROLA_TAB *pr;
  137. PHONEME_TAB *other_ph;
  138. bool found = false;
  139. static int mnem;
  140. // control
  141. // bit 0 skip the next phoneme
  142. // bit 1 match this and Previous phoneme
  143. // bit 2 only at the start of a word
  144. // bit 3 don't match two phonemes across a word boundary
  145. // bit 4 add this phoneme name as a prefix to the next phoneme name (used for de4 phoneme prefix '?')
  146. // bit 5 only in stressed syllable
  147. // bit 6 only at the end of a word
  148. *name2 = 0;
  149. *split = 0;
  150. *control = 0;
  151. mnem = ph->mnemonic;
  152. pr = mbrola_tab;
  153. while (pr->name != 0) {
  154. if (mnem == pr->name) {
  155. if (pr->next_phoneme == 0)
  156. found = true;
  157. else if ((pr->next_phoneme == ':') && (plist->synthflags & SFLAG_LENGTHEN))
  158. found = true;
  159. else {
  160. if (pr->control & 2)
  161. other_ph = ph_prev;
  162. else if ((pr->control & 8) && ((plist+1)->newword))
  163. other_ph = phoneme_tab[phPAUSE]; // don't match the next phoneme over a word boundary
  164. else
  165. other_ph = ph_next;
  166. if ((pr->next_phoneme == other_ph->mnemonic) ||
  167. ((pr->next_phoneme == 2) && (other_ph->type == phVOWEL)) ||
  168. ((pr->next_phoneme == '_') && (other_ph->type == phPAUSE)))
  169. found = true;
  170. }
  171. if ((pr->control & 4) && (plist->newword == 0)) // only at start of word
  172. found = false;
  173. if ((pr->control & 0x40) && (plist[1].newword == 0)) // only at the end of a word
  174. found = false;
  175. if ((pr->control & 0x20) && (plist->stresslevel < plist->wordstress))
  176. found = false; // only in stressed syllables
  177. if (found) {
  178. *name2 = pr->mbr_name2;
  179. *split = pr->percent;
  180. *control = pr->control;
  181. if (pr->control & 0x10) {
  182. mbr_name_prefix = pr->mbr_name;
  183. return 0;
  184. }
  185. mnem = pr->mbr_name;
  186. break;
  187. }
  188. }
  189. pr++;
  190. }
  191. if (mbr_name_prefix != 0)
  192. mnem = (mnem << 8) | (mbr_name_prefix & 0xff);
  193. mbr_name_prefix = 0;
  194. return mnem;
  195. }
  196. static char *WritePitch(int env, int pitch1, int pitch2, int split, int final)
  197. {
  198. // final=1: only give the final pitch value.
  199. int x;
  200. int ix;
  201. int pitch_base;
  202. int pitch_range;
  203. int p1, p2, p_end;
  204. const unsigned char *pitch_env;
  205. int max = -1;
  206. int min = 999;
  207. int y_max = 0;
  208. int y_min = 0;
  209. int env100 = 80; // apply the pitch change only over this proportion of the mbrola phoneme(s)
  210. int y2;
  211. int y[4];
  212. int env_split;
  213. char buf[50];
  214. static char output[50];
  215. MAKE_MEM_UNDEFINED(&output, sizeof(output));
  216. output[0] = 0;
  217. pitch_env = envelope_data[env];
  218. SetPitch2(voice, pitch1, pitch2, &pitch_base, &pitch_range);
  219. env_split = (split * 128)/100;
  220. if (env_split < 0)
  221. env_split = 0-env_split;
  222. // find max and min in the pitch envelope
  223. for (x = 0; x < 128; x++) {
  224. if (pitch_env[x] > max) {
  225. max = pitch_env[x];
  226. y_max = x;
  227. }
  228. if (pitch_env[x] < min) {
  229. min = pitch_env[x];
  230. y_min = x;
  231. }
  232. }
  233. // set an additional pitch point half way through the phoneme.
  234. // but look for a maximum or a minimum and use that instead
  235. y[2] = 64;
  236. if ((y_max > 0) && (y_max < 127))
  237. y[2] = y_max;
  238. if ((y_min > 0) && (y_min < 127))
  239. y[2] = y_min;
  240. y[1] = y[2] / 2;
  241. y[3] = y[2] + (127 - y[2])/2;
  242. // set initial pitch
  243. p1 = ((pitch_env[0]*pitch_range)>>8) + pitch_base; // Hz << 12
  244. p_end = ((pitch_env[127]*pitch_range)>>8) + pitch_base;
  245. if (split >= 0) {
  246. sprintf(buf, " 0 %d", p1/4096);
  247. strcat(output, buf);
  248. }
  249. // don't use intermediate pitch points for linear rise and fall
  250. if (env > 1) {
  251. for (ix = 1; ix < 4; ix++) {
  252. p2 = ((pitch_env[y[ix]]*pitch_range)>>8) + pitch_base;
  253. if (split > 0)
  254. y2 = (y[ix] * env100)/env_split;
  255. else if (split < 0)
  256. y2 = ((y[ix]-env_split) * env100)/env_split;
  257. else
  258. y2 = (y[ix] * env100)/128;
  259. if ((y2 > 0) && (y2 <= env100)) {
  260. sprintf(buf, " %d %d", y2, p2/4096);
  261. strcat(output, buf);
  262. }
  263. }
  264. }
  265. p_end = p_end/4096;
  266. if (split <= 0) {
  267. sprintf(buf, " %d %d", env100, p_end);
  268. strcat(output, buf);
  269. }
  270. if (env100 < 100) {
  271. sprintf(buf, " %d %d", 100, p_end);
  272. strcat(output, buf);
  273. }
  274. strcat(output, "\n");
  275. if (final)
  276. sprintf(output, "\t100 %d\n", p_end);
  277. return output;
  278. }
  279. int MbrolaTranslate(PHONEME_LIST *plist, int n_phonemes, bool resume, FILE *f_mbrola)
  280. {
  281. // Generate a mbrola pho file
  282. unsigned int name;
  283. int len;
  284. int len1;
  285. PHONEME_TAB *ph;
  286. PHONEME_TAB *ph_next;
  287. PHONEME_TAB *ph_prev;
  288. PHONEME_LIST *p;
  289. PHONEME_LIST *next;
  290. PHONEME_DATA phdata;
  291. FMT_PARAMS fmtp;
  292. int pause = 0;
  293. bool released;
  294. int name2;
  295. int control;
  296. bool done;
  297. int len_percent;
  298. const char *final_pitch;
  299. char *ptr;
  300. char mbr_buf[120];
  301. char phbuf[5];
  302. static int phix;
  303. static int embedded_ix;
  304. static int word_count;
  305. if (!resume) {
  306. phix = 1;
  307. embedded_ix = 0;
  308. word_count = 0;
  309. }
  310. while (phix < n_phonemes) {
  311. if (WcmdqFree() < MIN_WCMDQ)
  312. return 1;
  313. ptr = mbr_buf;
  314. p = &plist[phix];
  315. next = &plist[phix+1];
  316. ph = p->ph;
  317. ph_prev = plist[phix-1].ph;
  318. ph_next = plist[phix+1].ph;
  319. if (p->synthflags & SFLAG_EMBEDDED)
  320. DoEmbedded(&embedded_ix, p->sourceix);
  321. if (p->newword & PHLIST_START_OF_SENTENCE)
  322. DoMarker(espeakEVENT_SENTENCE, (p->sourceix & 0x7ff) + clause_start_char, 0, count_sentences);
  323. if (p->newword & PHLIST_START_OF_SENTENCE)
  324. DoMarker(espeakEVENT_WORD, (p->sourceix & 0x7ff) + clause_start_char, p->sourceix >> 11, clause_start_word + word_count++);
  325. name = GetMbrName(p, ph, ph_prev, ph_next, &name2, &len_percent, &control);
  326. if (control & 1)
  327. phix++;
  328. if (name == 0) {
  329. phix++;
  330. continue; // ignore this phoneme
  331. }
  332. if ((ph->type == phPAUSE) && (name == ph->mnemonic)) {
  333. // a pause phoneme, which has not been changed by the translation
  334. name = '_';
  335. len = (p->length * speed.pause_factor)/256;
  336. if (len == 0)
  337. len = 1;
  338. } else
  339. len = (80 * speed.wav_factor)/256;
  340. if (ph->code != phonEND_WORD) {
  341. char phoneme_name[16];
  342. WritePhMnemonic(phoneme_name, p->ph, p, option_phoneme_events & espeakINITIALIZE_PHONEME_IPA, NULL);
  343. DoPhonemeMarker(espeakEVENT_PHONEME, (p->sourceix & 0x7ff) + clause_start_char, 0, phoneme_name);
  344. }
  345. ptr += sprintf(ptr, "%s\t", WordToString(phbuf, name));
  346. if (name2 == '_') {
  347. // add a pause after this phoneme
  348. pause = len_percent;
  349. name2 = 0;
  350. }
  351. done = false;
  352. final_pitch = "";
  353. switch (ph->type)
  354. {
  355. case phVOWEL:
  356. len = ph->std_length;
  357. if (p->synthflags & SFLAG_LENGTHEN)
  358. len += phoneme_tab[phonLENGTHEN]->std_length; // phoneme was followed by an extra : symbol
  359. if (ph_next->type == phPAUSE)
  360. len += 50; // lengthen vowels before a pause
  361. len = (len * p->length)/256;
  362. if (name2 == 0) {
  363. char *pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 0);
  364. ptr += sprintf(ptr, "%d\t%s", len, pitch);
  365. } else {
  366. char *pitch;
  367. pitch = WritePitch(p->env, p->pitch1, p->pitch2, len_percent, 0);
  368. len1 = (len * len_percent)/100;
  369. ptr += sprintf(ptr, "%d\t%s", len1, pitch);
  370. pitch = WritePitch(p->env, p->pitch1, p->pitch2, -len_percent, 0);
  371. ptr += sprintf(ptr, "%s\t%d\t%s", WordToString(phbuf, name2), len-len1, pitch);
  372. }
  373. done = true;
  374. break;
  375. case phSTOP:
  376. released = false;
  377. if (next->type == phVOWEL) released = true;
  378. if (next->type == phLIQUID && !next->newword) released = true;
  379. if (released == false)
  380. p->synthflags |= SFLAG_NEXT_PAUSE;
  381. InterpretPhoneme(NULL, 0, p, &phdata, NULL);
  382. len = DoSample3(&phdata, 0, -1);
  383. len = (len * 1000)/samplerate; // convert to mS
  384. len += PauseLength(p->prepause, 1);
  385. break;
  386. case phVSTOP:
  387. len = (80 * speed.wav_factor)/256;
  388. break;
  389. case phFRICATIVE:
  390. len = 0;
  391. InterpretPhoneme(NULL, 0, p, &phdata, NULL);
  392. if (p->synthflags & SFLAG_LENGTHEN)
  393. len = DoSample3(&phdata, p->length, -1); // play it twice for [s:] etc.
  394. len += DoSample3(&phdata, p->length, -1);
  395. len = (len * 1000)/samplerate; // convert to mS
  396. break;
  397. case phNASAL:
  398. if (next->type != phVOWEL) {
  399. memset(&fmtp, 0, sizeof(fmtp));
  400. InterpretPhoneme(NULL, 0, p, &phdata, NULL);
  401. fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
  402. len = DoSpect2(p->ph, 0, &fmtp, p, -1);
  403. len = (len * 1000)/samplerate;
  404. if (next->type == phPAUSE)
  405. len += 50;
  406. final_pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 1);
  407. }
  408. break;
  409. case phLIQUID:
  410. if (next->type == phPAUSE) {
  411. len += 50;
  412. final_pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 1);
  413. }
  414. break;
  415. }
  416. if (!done) {
  417. if (name2 != 0) {
  418. len1 = (len * len_percent)/100;
  419. ptr += sprintf(ptr, "%d\n%s\t", len1, WordToString(phbuf, name2));
  420. len -= len1;
  421. }
  422. ptr += sprintf(ptr, "%d%s\n", len, final_pitch);
  423. }
  424. if (pause) {
  425. len += PauseLength(pause, 0);
  426. ptr += sprintf(ptr, "_ \t%d\n", PauseLength(pause, 0));
  427. pause = 0;
  428. }
  429. if (f_mbrola)
  430. fwrite(mbr_buf, 1, (ptr-mbr_buf), f_mbrola); // write .pho to a file
  431. else {
  432. int res = write_MBR(mbr_buf);
  433. if (res < 0)
  434. return 0; // don't get stuck on error
  435. if (res == 0)
  436. return 1;
  437. wcmdq[wcmdq_tail][0] = WCMD_MBROLA_DATA;
  438. wcmdq[wcmdq_tail][1] = len;
  439. WcmdqInc();
  440. }
  441. phix++;
  442. }
  443. if (!f_mbrola) {
  444. flush_MBR();
  445. // flush the mbrola output buffer
  446. wcmdq[wcmdq_tail][0] = WCMD_MBROLA_DATA;
  447. wcmdq[wcmdq_tail][1] = 500;
  448. WcmdqInc();
  449. }
  450. return 0;
  451. }
  452. int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume)
  453. {
  454. FILE *f_mbrola = NULL;
  455. if (*n_ph == 0)
  456. return 0;
  457. if (option_phonemes & espeakPHONEMES_MBROLA) {
  458. // send mbrola data to a file, not to the mbrola library
  459. f_mbrola = f_trans;
  460. }
  461. int again = MbrolaTranslate(phoneme_list, *n_ph, resume, f_mbrola);
  462. if (!again)
  463. *n_ph = 0;
  464. return again;
  465. }
  466. int MbrolaFill(int length, bool resume, int amplitude)
  467. {
  468. // Read audio data from Mbrola (length is in millisecs)
  469. static int n_samples;
  470. int req_samples, result;
  471. int ix;
  472. short value16;
  473. int value;
  474. if (!resume)
  475. n_samples = samplerate * length / 1000;
  476. req_samples = (out_end - out_ptr)/2;
  477. if (req_samples > n_samples)
  478. req_samples = n_samples;
  479. result = read_MBR((short *)out_ptr, req_samples);
  480. if (result <= 0)
  481. return 0;
  482. for (ix = 0; ix < result; ix++) {
  483. value16 = out_ptr[0] + (out_ptr[1] << 8);
  484. value = value16 * amplitude;
  485. value = value / 40; // adjust this constant to give a suitable amplitude for mbrola voices
  486. if (value > 0x7fff)
  487. value = 0x7fff;
  488. if (value < -0x8000)
  489. value = 0x8000;
  490. out_ptr[0] = value;
  491. out_ptr[1] = value >> 8;
  492. out_ptr += 2;
  493. }
  494. n_samples -= result;
  495. return n_samples ? 1 : 0;
  496. }
  497. void MbrolaReset(void)
  498. {
  499. // Reset the Mbrola engine and flush the pending audio; all the work is done by reset_MBR()
  500. reset_MBR();
  501. }
  502. #else
  503. // The mbrola interface is not compiled in (USE_MBROLA is false): provide dummy functions with the same signatures.
  504. espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate)
  505. {
  506. (void)mbrola_voice; // unused parameter
  507. (void)phtrans; // unused parameter
  508. (void)srate; // unused parameter; *srate is never written by this stub
  509. return ENS_NOT_SUPPORTED; // always reported, even when mbrola_voice is NULL
  510. }
  511. int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume) // dummy used when USE_MBROLA is false
  512. {
  513. (void)phoneme_list; // unused parameter
  514. (void)n_ph; // unused parameter; *n_ph is left unchanged by this stub
  515. (void)resume; // unused parameter
  516. return 0; // nothing is generated
  517. }
  518. int MbrolaFill(int length, bool resume, int amplitude) // dummy used when USE_MBROLA is false
  519. {
  520. (void)length; // unused parameter
  521. (void)resume; // unused parameter
  522. (void)amplitude; // unused parameter
  523. return 0; // no audio is written to the output buffer
  524. }
  525. void MbrolaReset(void) // dummy used when USE_MBROLA is false: there is no engine to reset
  526. {
  527. }
  528. #endif