eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

synth_mbrola.c 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592
  1. /*
  2. * Copyright (C) 2005 to 2013 by Jonathan Duddington
  3. * email: [email protected]
  4. * Copyright (C) 2015-2016 Reece H. Dunn
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, see: <http://www.gnu.org/licenses/>.
  18. */
  19. #include "config.h"
  20. #include <ctype.h>
  21. #include <errno.h>
  22. #include <math.h>
  23. #include <stdint.h>
  24. #include <stdio.h>
  25. #include <stdlib.h>
  26. #include <string.h>
  27. #include <espeak-ng/espeak_ng.h>
  28. #include <espeak-ng/speak_lib.h>
  29. #include "speech.h"
  30. #include "phoneme.h"
  31. #include "synthesize.h"
  32. #include "translate.h"
  33. #include "voice.h"
  34. #ifdef INCLUDE_MBROLA
  35. extern int Read4Bytes(FILE *f);
  36. extern void SetPitch2(voice_t *voice, int pitch1, int pitch2, int *pitch_base, int *pitch_range);
  37. extern unsigned char *outbuf;
  38. #if defined(_WIN32) || defined(_WIN64)
  39. #include <windows.h>
  40. #endif
  41. #include "mbrowrap.h"
  42. static MBROLA_TAB *mbrola_tab = NULL;
  43. static int mbrola_control = 0;
  44. static int mbr_name_prefix = 0;
  45. espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate)
  46. {
  47. // Load a phoneme name translation table from espeak-ng-data/mbrola
  48. int size;
  49. int ix;
  50. int *pw;
  51. FILE *f_in;
  52. char path[sizeof(path_home)+15];
  53. mbrola_name[0] = 0;
  54. mbrola_delay = 0;
  55. mbr_name_prefix = 0;
  56. if (mbrola_voice == NULL) {
  57. samplerate = samplerate_native;
  58. SetParameter(espeakVOICETYPE, 0, 0);
  59. return ENS_OK;
  60. }
  61. if (!load_MBR())
  62. return ENS_MBROLA_NOT_FOUND;
  63. sprintf(path, "%s/mbrola/%s", path_home, mbrola_voice);
  64. #ifdef PLATFORM_POSIX
  65. // if not found, then also look in
  66. // usr/share/mbrola/xx, /usr/share/mbrola/xx/xx, /usr/share/mbrola/voices/xx
  67. if (GetFileLength(path) <= 0) {
  68. sprintf(path, "/usr/share/mbrola/%s", mbrola_voice);
  69. if (GetFileLength(path) <= 0) {
  70. sprintf(path, "/usr/share/mbrola/%s/%s", mbrola_voice, mbrola_voice);
  71. if (GetFileLength(path) <= 0)
  72. sprintf(path, "/usr/share/mbrola/voices/%s", mbrola_voice);
  73. }
  74. }
  75. close_MBR();
  76. #endif
  77. if (init_MBR(path) != 0) // initialise the required mbrola voice
  78. return ENS_MBROLA_VOICE_NOT_FOUND;
  79. setNoError_MBR(1); // don't stop on phoneme errors
  80. // read eSpeak's mbrola phoneme translation data, eg. en1_phtrans
  81. sprintf(path, "%s/mbrola_ph/%s", path_home, phtrans);
  82. size = GetFileLength(path);
  83. if (size < 0) // size == -errno
  84. return -size;
  85. if ((f_in = fopen(path, "rb")) == NULL) {
  86. int error = errno;
  87. close_MBR();
  88. return error;
  89. }
  90. MBROLA_TAB *new_mbrola_tab = (MBROLA_TAB *)realloc(mbrola_tab, size);
  91. if (new_mbrola_tab == NULL) {
  92. fclose(f_in);
  93. close_MBR();
  94. return ENOMEM;
  95. }
  96. mbrola_tab = new_mbrola_tab;
  97. mbrola_control = Read4Bytes(f_in);
  98. pw = (int *)mbrola_tab;
  99. for (ix = 4; ix < size; ix += 4)
  100. *pw++ = Read4Bytes(f_in);
  101. fclose(f_in);
  102. setVolumeRatio_MBR((float)(mbrola_control & 0xff) /16.0f);
  103. samplerate = *srate = getFreq_MBR();
  104. if (*srate == 22050)
  105. SetParameter(espeakVOICETYPE, 0, 0);
  106. else
  107. SetParameter(espeakVOICETYPE, 1, 0);
  108. strcpy(mbrola_name, mbrola_voice);
  109. mbrola_delay = 1000; // improve synchronization of events
  110. return ENS_OK;
  111. }
  112. static int GetMbrName(PHONEME_LIST *plist, PHONEME_TAB *ph, PHONEME_TAB *ph_prev, PHONEME_TAB *ph_next, int *name2, int *split, int *control)
  113. {
  114. // Look up a phoneme in the mbrola phoneme name translation table
  115. // It may give none, 1, or 2 mbrola phonemes
  116. MBROLA_TAB *pr;
  117. PHONEME_TAB *other_ph;
  118. int found = 0;
  119. static int mnem;
  120. // control
  121. // bit 0 skip the next phoneme
  122. // bit 1 match this and Previous phoneme
  123. // bit 2 only at the start of a word
  124. // bit 3 don't match two phonemes across a word boundary
  125. // bit 4 add this phoneme name as a prefix to the next phoneme name (used for de4 phoneme prefix '?')
  126. // bit 5 only in stressed syllable
  127. // bit 6 only at the end of a word
  128. *name2 = 0;
  129. *split = 0;
  130. *control = 0;
  131. mnem = ph->mnemonic;
  132. pr = mbrola_tab;
  133. while (pr->name != 0) {
  134. if (mnem == pr->name) {
  135. if (pr->next_phoneme == 0)
  136. found = 1;
  137. else if ((pr->next_phoneme == ':') && (plist->synthflags & SFLAG_LENGTHEN))
  138. found = 1;
  139. else {
  140. if (pr->control & 2)
  141. other_ph = ph_prev;
  142. else if ((pr->control & 8) && ((plist+1)->newword))
  143. other_ph = phoneme_tab[phPAUSE]; // don't match the next phoneme over a word boundary
  144. else
  145. other_ph = ph_next;
  146. if ((pr->next_phoneme == other_ph->mnemonic) ||
  147. ((pr->next_phoneme == 2) && (other_ph->type == phVOWEL)) ||
  148. ((pr->next_phoneme == '_') && (other_ph->type == phPAUSE)))
  149. found = 1;
  150. }
  151. if ((pr->control & 4) && (plist->newword == 0)) // only at start of word
  152. found = 0;
  153. if ((pr->control & 0x40) && (plist[1].newword == 0)) // only at the end of a word
  154. found = 0;
  155. if ((pr->control & 0x20) && (plist->stresslevel < plist->wordstress))
  156. found = 0; // only in stressed syllables
  157. if (found) {
  158. *name2 = pr->mbr_name2;
  159. *split = pr->percent;
  160. *control = pr->control;
  161. if (pr->control & 0x10) {
  162. mbr_name_prefix = pr->mbr_name;
  163. return 0;
  164. }
  165. mnem = pr->mbr_name;
  166. break;
  167. }
  168. }
  169. pr++;
  170. }
  171. if (mbr_name_prefix != 0)
  172. mnem = (mnem << 8) | (mbr_name_prefix & 0xff);
  173. mbr_name_prefix = 0;
  174. return mnem;
  175. }
  176. static char *WritePitch(int env, int pitch1, int pitch2, int split, int final)
  177. {
  178. // final=1: only give the final pitch value.
  179. int x;
  180. int ix;
  181. int pitch_base;
  182. int pitch_range;
  183. int p1, p2, p_end;
  184. unsigned char *pitch_env;
  185. int max = -1;
  186. int min = 999;
  187. int y_max = 0;
  188. int y_min = 0;
  189. int env100 = 80; // apply the pitch change only over this proportion of the mbrola phoneme(s)
  190. int y2;
  191. int y[4];
  192. int env_split;
  193. char buf[50];
  194. static char output[50];
  195. output[0] = 0;
  196. pitch_env = envelope_data[env];
  197. SetPitch2(voice, pitch1, pitch2, &pitch_base, &pitch_range);
  198. env_split = (split * 128)/100;
  199. if (env_split < 0)
  200. env_split = 0-env_split;
  201. // find max and min in the pitch envelope
  202. for (x = 0; x < 128; x++) {
  203. if (pitch_env[x] > max) {
  204. max = pitch_env[x];
  205. y_max = x;
  206. }
  207. if (pitch_env[x] < min) {
  208. min = pitch_env[x];
  209. y_min = x;
  210. }
  211. }
  212. // set an additional pitch point half way through the phoneme.
  213. // but look for a maximum or a minimum and use that instead
  214. y[2] = 64;
  215. if ((y_max > 0) && (y_max < 127))
  216. y[2] = y_max;
  217. if ((y_min > 0) && (y_min < 127))
  218. y[2] = y_min;
  219. y[1] = y[2] / 2;
  220. y[3] = y[2] + (127 - y[2])/2;
  221. // set initial pitch
  222. p1 = ((pitch_env[0]*pitch_range)>>8) + pitch_base; // Hz << 12
  223. p_end = ((pitch_env[127]*pitch_range)>>8) + pitch_base;
  224. if (split >= 0) {
  225. sprintf(buf, " 0 %d", p1/4096);
  226. strcat(output, buf);
  227. }
  228. // don't use intermediate pitch points for linear rise and fall
  229. if (env > 1) {
  230. for (ix = 1; ix < 4; ix++) {
  231. p2 = ((pitch_env[y[ix]]*pitch_range)>>8) + pitch_base;
  232. if (split > 0)
  233. y2 = (y[ix] * env100)/env_split;
  234. else if (split < 0)
  235. y2 = ((y[ix]-env_split) * env100)/env_split;
  236. else
  237. y2 = (y[ix] * env100)/128;
  238. if ((y2 > 0) && (y2 <= env100)) {
  239. sprintf(buf, " %d %d", y2, p2/4096);
  240. strcat(output, buf);
  241. }
  242. }
  243. }
  244. p_end = p_end/4096;
  245. if (split <= 0) {
  246. sprintf(buf, " %d %d", env100, p_end);
  247. strcat(output, buf);
  248. }
  249. if (env100 < 100) {
  250. sprintf(buf, " %d %d", 100, p_end);
  251. strcat(output, buf);
  252. }
  253. strcat(output, "\n");
  254. if (final)
  255. sprintf(output, "\t100 %d\n", p_end);
  256. return output;
  257. }
  258. int MbrolaTranslate(PHONEME_LIST *plist, int n_phonemes, int resume, FILE *f_mbrola)
  259. {
  260. // Generate a mbrola pho file
  261. unsigned int name;
  262. int len;
  263. int len1;
  264. PHONEME_TAB *ph;
  265. PHONEME_TAB *ph_next;
  266. PHONEME_TAB *ph_prev;
  267. PHONEME_LIST *p;
  268. PHONEME_LIST *next;
  269. PHONEME_DATA phdata;
  270. FMT_PARAMS fmtp;
  271. int pause = 0;
  272. int released;
  273. int name2;
  274. int control;
  275. int done;
  276. int len_percent;
  277. const char *final_pitch;
  278. char *ptr;
  279. char mbr_buf[120];
  280. static int phix;
  281. static int embedded_ix;
  282. static int word_count;
  283. if (!resume) {
  284. phix = 1;
  285. embedded_ix = 0;
  286. word_count = 0;
  287. }
  288. while (phix < n_phonemes) {
  289. if (WcmdqFree() < MIN_WCMDQ)
  290. return 1;
  291. ptr = mbr_buf;
  292. p = &plist[phix];
  293. next = &plist[phix+1];
  294. ph = p->ph;
  295. ph_prev = plist[phix-1].ph;
  296. ph_next = plist[phix+1].ph;
  297. if (p->synthflags & SFLAG_EMBEDDED)
  298. DoEmbedded(&embedded_ix, p->sourceix);
  299. if (p->newword & 4)
  300. DoMarker(espeakEVENT_SENTENCE, (p->sourceix & 0x7ff) + clause_start_char, 0, count_sentences);
  301. if (p->newword & 1)
  302. DoMarker(espeakEVENT_WORD, (p->sourceix & 0x7ff) + clause_start_char, p->sourceix >> 11, clause_start_word + word_count++);
  303. name = GetMbrName(p, ph, ph_prev, ph_next, &name2, &len_percent, &control);
  304. if (control & 1)
  305. phix++;
  306. if (name == 0) {
  307. phix++;
  308. continue; // ignore this phoneme
  309. }
  310. if ((ph->type == phPAUSE) && (name == ph->mnemonic)) {
  311. // a pause phoneme, which has not been changed by the translation
  312. name = '_';
  313. len = (p->length * speed.pause_factor)/256;
  314. if (len == 0)
  315. len = 1;
  316. } else
  317. len = (80 * speed.wav_factor)/256;
  318. if (ph->code != phonEND_WORD) {
  319. char phoneme_name[16];
  320. WritePhMnemonic(phoneme_name, p->ph, p, option_phoneme_events & espeakINITIALIZE_PHONEME_IPA, NULL);
  321. DoPhonemeMarker(espeakEVENT_PHONEME, (p->sourceix & 0x7ff) + clause_start_char, 0, phoneme_name);
  322. }
  323. ptr += sprintf(ptr, "%s\t", WordToString(name));
  324. if (name2 == '_') {
  325. // add a pause after this phoneme
  326. pause = len_percent;
  327. name2 = 0;
  328. }
  329. done = 0;
  330. final_pitch = "";
  331. switch (ph->type)
  332. {
  333. case phVOWEL:
  334. len = ph->std_length;
  335. if (p->synthflags & SFLAG_LENGTHEN)
  336. len += phoneme_tab[phonLENGTHEN]->std_length; // phoneme was followed by an extra : symbol
  337. if (ph_next->type == phPAUSE)
  338. len += 50; // lengthen vowels before a pause
  339. len = (len * p->length)/256;
  340. if (name2 == 0) {
  341. char *pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 0);
  342. ptr += sprintf(ptr, "%d\t%s", len, pitch);
  343. } else {
  344. char *pitch;
  345. pitch = WritePitch(p->env, p->pitch1, p->pitch2, len_percent, 0);
  346. len1 = (len * len_percent)/100;
  347. ptr += sprintf(ptr, "%d\t%s", len1, pitch);
  348. pitch = WritePitch(p->env, p->pitch1, p->pitch2, -len_percent, 0);
  349. ptr += sprintf(ptr, "%s\t%d\t%s", WordToString(name2), len-len1, pitch);
  350. }
  351. done = 1;
  352. break;
  353. case phSTOP:
  354. released = 0;
  355. if (next->type == phVOWEL) released = 1;
  356. if (next->type == phLIQUID && !next->newword) released = 1;
  357. if (released == 0)
  358. p->synthflags |= SFLAG_NEXT_PAUSE;
  359. InterpretPhoneme(NULL, 0, p, &phdata, NULL);
  360. len = DoSample3(&phdata, 0, -1);
  361. len = (len * 1000)/samplerate; // convert to mS
  362. len += PauseLength(p->prepause, 1);
  363. break;
  364. case phVSTOP:
  365. len = (80 * speed.wav_factor)/256;
  366. break;
  367. case phFRICATIVE:
  368. len = 0;
  369. InterpretPhoneme(NULL, 0, p, &phdata, NULL);
  370. if (p->synthflags & SFLAG_LENGTHEN)
  371. len = DoSample3(&phdata, p->length, -1); // play it twice for [s:] etc.
  372. len += DoSample3(&phdata, p->length, -1);
  373. len = (len * 1000)/samplerate; // convert to mS
  374. break;
  375. case phNASAL:
  376. if (next->type != phVOWEL) {
  377. memset(&fmtp, 0, sizeof(fmtp));
  378. InterpretPhoneme(NULL, 0, p, &phdata, NULL);
  379. fmtp.fmt_addr = phdata.sound_addr[pd_FMT];
  380. len = DoSpect2(p->ph, 0, &fmtp, p, -1);
  381. len = (len * 1000)/samplerate;
  382. if (next->type == phPAUSE)
  383. len += 50;
  384. final_pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 1);
  385. }
  386. break;
  387. case phLIQUID:
  388. if (next->type == phPAUSE) {
  389. len += 50;
  390. final_pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 1);
  391. }
  392. break;
  393. }
  394. if (!done) {
  395. if (name2 != 0) {
  396. len1 = (len * len_percent)/100;
  397. ptr += sprintf(ptr, "%d\n%s\t", len1, WordToString(name2));
  398. len -= len1;
  399. }
  400. ptr += sprintf(ptr, "%d%s\n", len, final_pitch);
  401. }
  402. if (pause) {
  403. len += PauseLength(pause, 0);
  404. ptr += sprintf(ptr, "_ \t%d\n", PauseLength(pause, 0));
  405. pause = 0;
  406. }
  407. if (f_mbrola)
  408. fwrite(mbr_buf, 1, (ptr-mbr_buf), f_mbrola); // write .pho to a file
  409. else {
  410. int res = write_MBR(mbr_buf);
  411. if (res < 0)
  412. return 0; // don't get stuck on error
  413. if (res == 0)
  414. return 1;
  415. wcmdq[wcmdq_tail][0] = WCMD_MBROLA_DATA;
  416. wcmdq[wcmdq_tail][1] = len;
  417. WcmdqInc();
  418. }
  419. phix++;
  420. }
  421. if (!f_mbrola) {
  422. flush_MBR();
  423. // flush the mbrola output buffer
  424. wcmdq[wcmdq_tail][0] = WCMD_MBROLA_DATA;
  425. wcmdq[wcmdq_tail][1] = 500;
  426. WcmdqInc();
  427. }
  428. return 0;
  429. }
  430. int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, int resume)
  431. {
  432. FILE *f_mbrola = NULL;
  433. if (*n_ph == 0)
  434. return 0;
  435. if (option_phonemes & espeakPHONEMES_MBROLA) {
  436. // send mbrola data to a file, not to the mbrola library
  437. f_mbrola = f_trans;
  438. }
  439. int again = MbrolaTranslate(phoneme_list, *n_ph, resume, f_mbrola);
  440. if (!again)
  441. *n_ph = 0;
  442. return again;
  443. }
  444. int MbrolaFill(int length, int resume, int amplitude)
  445. {
  446. // Read audio data from Mbrola (length is in millisecs)
  447. static int n_samples;
  448. int req_samples, result;
  449. int ix;
  450. short value16;
  451. int value;
  452. if (!resume)
  453. n_samples = samplerate * length / 1000;
  454. req_samples = (out_end - out_ptr)/2;
  455. if (req_samples > n_samples)
  456. req_samples = n_samples;
  457. result = read_MBR((short *)out_ptr, req_samples);
  458. if (result <= 0)
  459. return 0;
  460. for (ix = 0; ix < result; ix++) {
  461. value16 = out_ptr[0] + (out_ptr[1] << 8);
  462. value = value16 * amplitude;
  463. value = value / 40; // adjust this constant to give a suitable amplitude for mbrola voices
  464. if (value > 0x7fff)
  465. value = 0x7fff;
  466. if (value < -0x8000)
  467. value = 0x8000;
  468. out_ptr[0] = value;
  469. out_ptr[1] = value >> 8;
  470. out_ptr += 2;
  471. }
  472. n_samples -= result;
  473. return n_samples ? 1 : 0;
  474. }
  /*
   * Reset the Mbrola engine and flush the pending audio.
   * This is a direct call to reset_MBR().
   */
  void MbrolaReset(void)
  {
      reset_MBR();
  }
  480. #else
  481. // mbrola interface is not compiled, provide dummy functions.
  482. espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate)
  483. {
  484. return ENS_NOT_SUPPORTED;
  485. }
  486. int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, int resume)
  487. {
  488. return 0;
  489. }
  /*
   * Dummy used when the mbrola interface is not compiled in: no audio is
   * produced and 0 is returned for any arguments.
   */
  int MbrolaFill(int length, int resume, int amplitude)
  {
      (void)length;
      (void)resume;
      (void)amplitude;
      return 0;
  }
  /*
   * Dummy used when the mbrola interface is not compiled in: there is no
   * engine to reset, so this does nothing.
   */
  void MbrolaReset(void)
  {
      return;
  }
  497. #endif