eSpeak NG is an open-source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

setlengths.cpp 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618
  1. /***************************************************************************
  2. * Copyright (C) 2005 to 2007 by Jonathan Duddington *
  3. * email: [email protected] *
  4. * *
  5. * This program is free software; you can redistribute it and/or modify *
  6. * it under the terms of the GNU General Public License as published by *
  7. * the Free Software Foundation; either version 3 of the License, or *
  8. * (at your option) any later version. *
  9. * *
  10. * This program is distributed in the hope that it will be useful, *
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  13. * GNU General Public License for more details. *
  14. * *
  15. * You should have received a copy of the GNU General Public License *
  16. * along with this program; if not, write see: *
  17. * <http://www.gnu.org/licenses/>. *
  18. ***************************************************************************/
  19. #include "StdAfx.h"
  20. #include <stdlib.h>
  21. #include <stdio.h>
  22. #include <wctype.h>
  23. #include "speak_lib.h"
  24. #include "speech.h"
  25. #include "phoneme.h"
  26. #include "synthesize.h"
  27. #include "voice.h"
  28. #include "translate.h"
  29. extern int GetAmplitude(void);
  // Converts a words-per-minute rate into the internal speed factor.
  // Entry i holds the factor for (80 + i) wpm, so the table spans 80..369 wpm;
  // SetSpeed() clamps the rate into that range before indexing.  The comment at
  // the end of each row is the wpm of the row's first entry.
  // Read-only data, hence const.
  static const unsigned char speed_lookup[290] = {
      250, 246, 243, 239, 236,   //  80
      233, 229, 226, 223, 220,   //  85
      217, 214, 211, 208, 205,   //  90
      202, 197, 194, 192, 190,   //  95
      187, 185, 183, 180, 178,   // 100
      176, 174, 172, 170, 168,   // 105
      166, 164, 161, 159, 158,   // 110
      156, 154, 152, 150, 148,   // 115
      146, 145, 143, 141, 137,   // 120
      136, 135, 133, 132, 131,   // 125
      129, 128, 127, 126, 125,   // 130
      124, 122, 121, 120, 119,   // 135
      117, 116, 115, 114, 113,   // 140
      112, 111, 110, 108, 107,   // 145
      106, 105, 104, 103, 102,   // 150
      101, 100,  99,  98,  97,   // 155
       96,  95,  93,  92,  92,   // 160
       91,  90,  89,  89,  88,   // 165
       87,  87,  86,  85,  85,   // 170
       84,  83,  83,  82,  81,   // 175
       80,  80,  79,  78,  78,   // 180
       77,  76,  76,  75,  73,   // 185
       72,  72,  71,  71,  70,   // 190
       70,  69,  69,  68,  67,   // 195
       67,  66,  66,  65,  65,   // 200
       64,  64,  63,  63,  62,   // 205
       62,  61,  60,  60,  59,   // 210
       59,  58,  58,  57,  57,   // 215
       56,  56,  55,  55,  55,   // 220
       54,  54,  53,  53,  52,   // 225
       52,  51,  51,  50,  50,   // 230
       49,  49,  49,  48,  48,   // 235
       47,  47,  46,  46,  46,   // 240
       45,  45,  44,  44,  43,   // 245
       43,  43,  42,  42,  41,   // 250
       41,  41,  40,  40,  39,   // 255
       39,  39,  38,  38,  38,   // 260
       37,  37,  37,  36,  36,   // 265
       35,  35,  35,  34,  34,   // 270
       34,  33,  33,  33,  32,   // 275
       32,  32,  32,  31,  31,   // 280
       31,  30,  30,  30,  29,   // 285
       29,  29,  29,  28,  28,   // 290
       28,  28,  27,  27,  27,   // 295
       26,  26,  26,  26,  25,   // 300
       25,  25,  22,  22,  22,   // 305
       22,  22,  22,  22,  22,   // 310
       21,  21,  21,  21,  21,   // 315
       21,  20,  20,  20,  20,   // 320
       20,  15,  15,  15,  15,   // 325
       15,  15,  15,  15,  16,   // 330
       16,  16,  16,  15,  15,   // 335
       15,  15,  15,  15,  15,   // 340
       15,  17,  17,  16,  16,   // 345
       15,  15,  14,  14,  13,   // 350
       13,  12,  12,  11,  11,   // 355
       10,  10,   9,   8,   8,   // 360
        7,   6,   5,   5,   4,   // 365
  };

  // speed_factor2 values for rates of 370 to 390 wpm; entry i is for (370 + i) wpm.
  // Read-only data, hence const.
  static const unsigned char faster[] = {
      114, 112, 110, 109, 107, 105, 104, 102, 100, 98,   // 370-379
      96, 94, 92, 90, 88, 85, 83, 80, 78, 75, 72         // 380-390
  };

  // Speed factors applied to vowel lengths in CalcLengths(), chosen by how many
  // syllables remain in the word after the vowel: none (speed1), one (speed2),
  // two or more (speed3).  SetSpeed() overwrites these initial values.
  static int speed1 = 130;
  static int speed2 = 121;
  static int speed3 = 118;
  98. void SetSpeed(int control)
  99. {//=======================
  100. int x;
  101. int s1;
  102. int wpm;
  103. int wpm2;
  104. wpm2 = wpm = embedded_value[EMBED_S];
  105. if(wpm > 369) wpm = 369;
  106. if(wpm < 80) wpm = 80;
  107. x = speed_lookup[wpm-80];
  108. if(control & 1)
  109. {
  110. // set speed factors for different syllable positions within a word
  111. // these are used in CalcLengths()
  112. speed1 = (x * voice->speedf1)/256;
  113. speed2 = (x * voice->speedf2)/256;
  114. speed3 = (x * voice->speedf3)/256;
  115. }
  116. if(control & 2)
  117. {
  118. // these are used in synthesis file
  119. s1 = (x * voice->speedf1)/256;
  120. speed_factor1 = (256 * s1)/115; // full speed adjustment, used for pause length
  121. if(speed_factor1 < 15)
  122. speed_factor1 = 15;
  123. if(wpm >= 170)
  124. // speed_factor2 = 100 + (166*s1)/128; // reduced speed adjustment, used for playing recorded sounds
  125. speed_factor2 = 110 + (150*s1)/128; // reduced speed adjustment, used for playing recorded sounds
  126. else
  127. speed_factor2 = 128 + (128*s1)/130; // = 215 at 170 wpm
  128. if(wpm2 > 369)
  129. {
  130. if(wpm2 > 390)
  131. wpm2 = 390;
  132. speed_factor2 = faster[wpm2 - 370];
  133. }
  134. }
  135. speed_min_sample_len = 450;
  136. } // end of SetSpeed
  #ifdef deleted
  // SetAmplitude: only compiled when `deleted` is defined.
  // Maps an amplitude step in the range 0..20 through amplitude_factor[] and
  // stores the result, scaled by 480/256, in option_amplitude.  Values outside
  // 0..20 are ignored.
  void SetAmplitude(int amp)
  {
      static unsigned char amplitude_factor[] = {
          0, 5, 6, 7, 9, 11, 14, 17, 21, 26, 32,
          38, 44, 50, 56, 63, 70, 77, 84, 91, 100
      };

      if ((amp < 0) || (amp > 20))
          return;

      option_amplitude = (amplitude_factor[amp] * 480) / 256;
  }
  #endif
  147. void SetParameter(int parameter, int value, int relative)
  148. {//======================================================
  149. // parameter: reset-all, amp, pitch, speed, linelength, expression, capitals, number grouping
  150. // relative 0=absolute 1=relative
  151. int new_value = value;
  152. int default_value;
  153. if(relative)
  154. {
  155. if(parameter < 5)
  156. {
  157. default_value = param_defaults[parameter];
  158. new_value = default_value + (default_value * value)/100;
  159. }
  160. }
  161. param_stack[0].parameter[parameter] = new_value;
  162. switch(parameter)
  163. {
  164. case espeakRATE:
  165. embedded_value[EMBED_S] = new_value;
  166. embedded_value[EMBED_S2] = new_value;
  167. SetSpeed(3);
  168. break;
  169. case espeakVOLUME:
  170. embedded_value[EMBED_A] = new_value;
  171. GetAmplitude();
  172. break;
  173. case espeakPITCH:
  174. if(new_value > 99) new_value = 99;
  175. if(new_value < 0) new_value = 0;
  176. embedded_value[EMBED_P] = new_value;
  177. break;
  178. case espeakRANGE:
  179. if(new_value > 99) new_value = 99;
  180. embedded_value[EMBED_R] = new_value;
  181. break;
  182. case espeakLINELENGTH:
  183. option_linelength = new_value;
  184. break;
  185. case espeakWORDGAP:
  186. option_wordgap = new_value;
  187. break;
  188. case espeakINTONATION:
  189. if((new_value & 0xff) != 0)
  190. translator->langopts.intonation_group = new_value & 0xff;
  191. option_tone_flags = new_value;
  192. break;
  193. default:
  194. break;
  195. }
  196. } // end of SetParameter
  197. static void DoEmbedded2(int &embix)
  198. {//================================
  199. // There were embedded commands in the text at this point
  200. unsigned int word;
  201. do {
  202. word = embedded_list[embix++];
  203. if((word & 0x1f) == EMBED_S)
  204. {
  205. // speed
  206. SetEmbedded(word & 0x7f, word >> 8); // adjusts embedded_value[EMBED_S]
  207. SetSpeed(1);
  208. }
  209. } while((word & 0x80) == 0);
  210. }
  211. void Translator::CalcLengths()
  212. {//===========================
  213. int ix;
  214. int ix2;
  215. PHONEME_LIST *prev;
  216. PHONEME_LIST *next;
  217. PHONEME_LIST *next2;
  218. PHONEME_LIST *next3;
  219. PHONEME_LIST *p;
  220. PHONEME_LIST *p2;
  221. int stress;
  222. int type;
  223. static int more_syllables=0;
  224. int pre_sonorant=0;
  225. int pre_voiced=0;
  226. int last_pitch = 0;
  227. int pitch_start;
  228. int length_mod;
  229. int len;
  230. int env2;
  231. int end_of_clause;
  232. int embedded_ix = 0;
  233. int min_drop;
  234. int emphasized;
  235. unsigned char *pitch_env=NULL;
  236. for(ix=1; ix<n_phoneme_list; ix++)
  237. {
  238. prev = &phoneme_list[ix-1];
  239. p = &phoneme_list[ix];
  240. stress = p->tone & 0x7;
  241. emphasized = p->tone & 0x8;
  242. next = &phoneme_list[ix+1];
  243. if(p->synthflags & SFLAG_EMBEDDED)
  244. {
  245. DoEmbedded2(embedded_ix);
  246. }
  247. type = p->type;
  248. if(p->synthflags & SFLAG_SYLLABLE)
  249. type = phVOWEL;
  250. switch(type)
  251. {
  252. case phPAUSE:
  253. last_pitch = 0;
  254. break;
  255. case phSTOP:
  256. last_pitch = 0;
  257. if(prev->type == phFRICATIVE)
  258. p->prepause = 20;
  259. else
  260. if((more_syllables > 0) || (stress < 4))
  261. p->prepause = 40;
  262. else
  263. p->prepause = 60;
  264. if((langopts.word_gap & 0x10) && (p->newword))
  265. p->prepause = 60;
  266. if(p->synthflags & SFLAG_LENGTHEN)
  267. p->prepause += langopts.long_stop;
  268. break;
  269. case phVFRICATIVE:
  270. if(next->type==phVOWEL)
  271. {
  272. pre_voiced = 1;
  273. } // drop through
  274. case phFRICATIVE:
  275. if(p->newword)
  276. p->prepause = 15;
  277. if(next->type==phPAUSE && prev->type==phNASAL && !(p->ph->phflags&phFORTIS))
  278. p->prepause = 25;
  279. if(prev->ph->phflags & phBRKAFTER)
  280. p->prepause = 30;
  281. if((p->ph->phflags & phSIBILANT) && next->type==phSTOP && !next->newword)
  282. {
  283. if(prev->type == phVOWEL)
  284. p->length = 200; // ?? should do this if it's from a prefix
  285. else
  286. p->length = 150;
  287. }
  288. else
  289. p->length = 256;
  290. if((langopts.word_gap & 0x10) && (p->newword))
  291. p->prepause = 30;
  292. break;
  293. case phVSTOP:
  294. if(prev->type==phVFRICATIVE || prev->type==phFRICATIVE || (prev->ph->phflags & phSIBILANT) || (prev->type == phLIQUID))
  295. p->prepause = 30;
  296. if(next->type==phVOWEL || next->type==phLIQUID)
  297. {
  298. if((next->type==phVOWEL) || !next->newword)
  299. pre_voiced = 1;
  300. p->prepause = 40;
  301. if((prev->type == phPAUSE) || (prev->type == phVOWEL)) // || (prev->ph->mnemonic == ('/'*256+'r')))
  302. p->prepause = 0;
  303. else
  304. if(p->newword==0)
  305. {
  306. if(prev->type==phLIQUID)
  307. p->prepause = 20;
  308. if(prev->type==phNASAL)
  309. p->prepause = 12;
  310. if(prev->type==phSTOP && !(prev->ph->phflags & phFORTIS))
  311. p->prepause = 0;
  312. }
  313. }
  314. if((langopts.word_gap & 0x10) && (p->newword) && (p->prepause < 20))
  315. p->prepause = 20;
  316. break;
  317. case phLIQUID:
  318. case phNASAL:
  319. p->amp = stress_amps[1]; // unless changed later
  320. p->length = 256; // TEMPORARY
  321. min_drop = 0;
  322. if(p->newword)
  323. {
  324. if(prev->type==phLIQUID)
  325. p->prepause = 25;
  326. if(prev->type==phVOWEL)
  327. p->prepause = 12;
  328. }
  329. if(next->type==phVOWEL)
  330. {
  331. pre_sonorant = 1;
  332. }
  333. else
  334. if((prev->type==phVOWEL) || (prev->type == phLIQUID))
  335. {
  336. p->length = prev->length;
  337. p->pitch2 = last_pitch;
  338. if(p->pitch2 < 7)
  339. p->pitch2 = 7;
  340. p->pitch1 = p->pitch2 - 8;
  341. p->env = PITCHfall;
  342. pre_voiced = 0;
  343. if(p->type == phLIQUID)
  344. {
  345. p->length = speed1;
  346. //p->pitch1 = p->pitch2 - 20; // post vocalic [r/]
  347. }
  348. if(next->type == phVSTOP)
  349. {
  350. p->length = (p->length * 160)/100;
  351. }
  352. if(next->type == phVFRICATIVE)
  353. {
  354. p->length = (p->length * 120)/100;
  355. }
  356. }
  357. else
  358. {
  359. p->pitch2 = last_pitch;
  360. for(ix2=ix; ix2<n_phoneme_list; ix2++)
  361. {
  362. if(phoneme_list[ix2].type == phVOWEL)
  363. {
  364. p->pitch2 = phoneme_list[ix2].pitch2;
  365. break;
  366. }
  367. }
  368. p->pitch1 = p->pitch2-8;
  369. p->env = PITCHfall;
  370. pre_voiced = 0;
  371. }
  372. break;
  373. case phVOWEL:
  374. min_drop = 0;
  375. next2 = &phoneme_list[ix+2];
  376. next3 = &phoneme_list[ix+3];
  377. if(stress > 7) stress = 7;
  378. if(pre_sonorant)
  379. p->amp = stress_amps[stress]-1;
  380. else
  381. p->amp = stress_amps[stress];
  382. if(emphasized)
  383. p->amp = 25;
  384. if(ix >= (n_phoneme_list-3))
  385. {
  386. // last phoneme of a clause, limit its amplitude
  387. if(p->amp > langopts.param[LOPT_MAXAMP_EOC])
  388. p->amp = langopts.param[LOPT_MAXAMP_EOC];
  389. }
  390. // is the last syllable of a word ?
  391. more_syllables=0;
  392. end_of_clause = 0;
  393. for(p2 = p+1; p2->newword== 0; p2++)
  394. {
  395. if((p2->type == phVOWEL) && !(p2->ph->phflags & phNONSYLLABIC))
  396. more_syllables++;
  397. }
  398. if((p2->newword & 2) && (more_syllables==0))
  399. {
  400. end_of_clause = 2;
  401. }
  402. // calc length modifier
  403. if(more_syllables==0)
  404. {
  405. len = langopts.length_mods0[next2->ph->length_mod *10+ next->ph->length_mod];
  406. if((next->newword) && (langopts.word_gap & 0x20))
  407. {
  408. // consider as a pause + first phoneme of the next word
  409. length_mod = (len + langopts.length_mods0[next->ph->length_mod *10+ 1])/2;
  410. }
  411. else
  412. length_mod = len;
  413. }
  414. else
  415. {
  416. length_mod = langopts.length_mods[next2->ph->length_mod *10+ next->ph->length_mod];
  417. if((next->type == phNASAL) && (next2->type == phSTOP || next2->type == phVSTOP) && (next3->ph->phflags & phFORTIS))
  418. length_mod -= 15;
  419. }
  420. if(more_syllables==0)
  421. length_mod *= speed1;
  422. else
  423. if(more_syllables==1)
  424. length_mod *= speed2;
  425. else
  426. length_mod *= speed3;
  427. length_mod = length_mod / 128;
  428. // if(length_mod < 9)
  429. // length_mod = 9; // restrict how much lengths can be reduced
  430. if(length_mod < 8)
  431. length_mod = 8; // restrict how much lengths can be reduced
  432. if(stress >= 7)
  433. {
  434. // tonic syllable, include a constant component so it doesn't decrease directly with speed
  435. length_mod += 20;
  436. }
  437. else
  438. if(emphasized)
  439. {
  440. length_mod += 20;
  441. }
  442. if((len = stress_lengths[stress]) == 0)
  443. len = stress_lengths[6];
  444. length_mod = (length_mod * len)/128;
  445. if(end_of_clause == 2)
  446. {
  447. // this is the last syllable in the clause, lengthen it - more for short vowels
  448. length_mod = length_mod * (256 + (280 - p->ph->std_length)/3)/256;
  449. }
  450. if(p->type != phVOWEL)
  451. {
  452. length_mod = 256; // syllabic consonant
  453. min_drop = 8;
  454. }
  455. p->length = length_mod;
  456. // pre-vocalic part
  457. // set last-pitch
  458. env2 = p->env;
  459. if(env2 > 1) env2++; // version for use with preceding semi-vowel
  460. if(p->tone_ph != 0)
  461. {
  462. pitch_env = LookupEnvelope(phoneme_tab[p->tone_ph]->spect);
  463. }
  464. else
  465. {
  466. pitch_env = envelope_data[env2];
  467. }
  468. pitch_start = p->pitch1 + ((p->pitch2-p->pitch1)*pitch_env[0])/256;
  469. if(pre_sonorant || pre_voiced)
  470. {
  471. // set pitch for pre-vocalic part
  472. if(pitch_start - last_pitch > 8) // was 9
  473. last_pitch = pitch_start - 8;
  474. prev->pitch1 = last_pitch;
  475. prev->pitch2 = pitch_start;
  476. if(last_pitch < pitch_start)
  477. {
  478. prev->env = PITCHrise;
  479. p->env = env2;
  480. }
  481. else
  482. {
  483. prev->env = PITCHfall;
  484. }
  485. prev->length = length_mod;
  486. prev->amp = p->amp;
  487. if((prev->type != phLIQUID) && (prev->amp > 18))
  488. prev->amp = 18;
  489. }
  490. // vowel & post-vocalic part
  491. next->synthflags &= ~SFLAG_SEQCONTINUE;
  492. if(next->type == phNASAL && next2->type != phVOWEL)
  493. next->synthflags |= SFLAG_SEQCONTINUE;
  494. if(next->type == phLIQUID)
  495. {
  496. next->synthflags |= SFLAG_SEQCONTINUE;
  497. if(next2->type == phVOWEL)
  498. {
  499. next->synthflags &= ~SFLAG_SEQCONTINUE;
  500. }
  501. if(next2->type != phVOWEL)
  502. {
  503. if(next->ph->mnemonic == ('/'*256+'r'))
  504. {
  505. next->synthflags &= ~SFLAG_SEQCONTINUE;
  506. // min_drop = 15;
  507. }
  508. }
  509. }
  510. if((min_drop > 0) && ((p->pitch2 - p->pitch1) < min_drop))
  511. {
  512. p->pitch1 = p->pitch2 - min_drop;
  513. if(p->pitch1 < 0)
  514. p->pitch1 = 0;
  515. }
  516. last_pitch = p->pitch1 + ((p->pitch2-p->pitch1)*envelope_data[p->env][127])/256;
  517. pre_sonorant = 0;
  518. pre_voiced = 0;
  519. break;
  520. }
  521. }
  522. } // end of CalcLengths