eSpeak NG is an open-source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

setlengths.cpp 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. /***************************************************************************
  2. * Copyright (C) 2005,2006 by Jonathan Duddington *
  3. * [email protected] *
  4. * *
  5. * This program is free software; you can redistribute it and/or modify *
  6. * it under the terms of the GNU General Public License as published by *
  7. * the Free Software Foundation; either version 2 of the License, or *
  8. * (at your option) any later version. *
  9. * *
  10. * This program is distributed in the hope that it will be useful, *
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  13. * GNU General Public License for more details. *
  14. * *
  15. * You should have received a copy of the GNU General Public License *
  16. * along with this program; if not, write to the *
  17. * Free Software Foundation, Inc., *
  18. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
  19. ***************************************************************************/
  20. #include "StdAfx.h"
  21. #include <stdlib.h>
  22. #include <stdio.h>
  23. #include <wctype.h>
  24. #include "speak_lib.h"
  25. #include "speech.h"
  26. #include "voice.h"
  27. #include "phoneme.h"
  28. #include "synthesize.h"
  29. #include "translate.h"
  30. extern int GetAmplitude(void);
  // Converts a words-per-minute rate to the internal speed factor.
  // Entry i is the factor for (80 + i) wpm; each row below holds five
  // consecutive rates and its trailing comment gives the rate of the row's
  // first entry.  The 290 entries therefore cover 80..369 wpm.
  // The table is only ever read, so it is declared const.
  static const unsigned char speed_lookup[290] = {
    250, 246, 243, 239, 236, // 80
    233, 229, 226, 223, 220, // 85
    217, 214, 211, 208, 205, // 90
    202, 197, 194, 192, 190, // 95
    187, 185, 183, 180, 178, // 100
    176, 174, 172, 170, 168, // 105
    166, 164, 161, 159, 158, // 110
    156, 154, 152, 150, 148, // 115
    146, 145, 143, 141, 137, // 120
    136, 135, 133, 132, 131, // 125
    129, 128, 127, 126, 125, // 130
    124, 122, 121, 120, 119, // 135
    117, 116, 115, 114, 113, // 140
    112, 111, 110, 108, 107, // 145
    106, 105, 104, 103, 102, // 150
    101, 100, 99, 98, 97, // 155
    96, 95, 93, 92, 92, // 160
    91, 90, 89, 89, 88, // 165
    87, 87, 86, 85, 85, // 170
    84, 83, 83, 82, 81, // 175
    80, 80, 79, 78, 78, // 180
    77, 76, 76, 75, 73, // 185
    72, 72, 71, 71, 70, // 190
    70, 69, 69, 68, 67, // 195
    67, 66, 66, 65, 65, // 200
    64, 64, 63, 63, 62, // 205
    62, 61, 60, 60, 59, // 210
    59, 58, 58, 57, 57, // 215
    56, 56, 55, 55, 55, // 220
    54, 54, 53, 53, 52, // 225
    52, 51, 51, 50, 50, // 230
    49, 49, 49, 48, 48, // 235
    47, 47, 46, 46, 46, // 240
    45, 45, 44, 44, 43, // 245
    43, 43, 42, 42, 41, // 250
    41, 41, 40, 40, 39, // 255
    39, 39, 38, 38, 38, // 260
    37, 37, 37, 36, 36, // 265
    35, 35, 35, 34, 34, // 270
    34, 33, 33, 33, 32, // 275
    32, 32, 32, 31, 31, // 280
    31, 30, 30, 30, 29, // 285
    29, 29, 29, 28, 28, // 290
    28, 28, 27, 27, 27, // 295
    26, 26, 26, 26, 25, // 300
    25, 25, 22, 22, 22, // 305
    22, 22, 22, 22, 22, // 310
    21, 21, 21, 21, 21, // 315
    21, 20, 20, 20, 20, // 320
    20, 15, 15, 15, 15, // 325
    15, 15, 15, 15, 16, // 330
    16, 16, 16, 15, 15, // 335
    15, 15, 15, 15, 15, // 340
    15, 17, 17, 16, 16, // 345
    15, 15, 14, 14, 13, // 350
    13, 12, 12, 11, 11, // 355
    10, 10, 9, 8, 8, // 360
    7, 7, 6, 6, 5, // 365
  };

  // Speed factors by syllable position within a word.  SetSpeed() rewrites
  // them and CalcLengths() multiplies vowel length modifiers by them:
  // speed1 when no further syllable follows in the word, speed2 when exactly
  // one follows, speed3 otherwise.
  static int speed1 = 130;
  static int speed2 = 121;
  static int speed3 = 118;
  95. void SetSpeed(int control)
  96. {//=======================
  97. int x;
  98. int s1;
  99. int wpm;
  100. wpm = embedded_value[EMBED_S];
  101. if(wpm > 369) wpm = 369;
  102. if(wpm < 80) wpm = 80;
  103. x = speed_lookup[wpm-80];
  104. if(control & 1)
  105. {
  106. // set speed factors for different syllable positions within a word
  107. // these are used in CalcLengths()
  108. speed1 = (x * voice->speedf1)/256;
  109. speed2 = (x * voice->speedf2)/256;
  110. speed3 = (x * voice->speedf3)/256;
  111. }
  112. if(control & 2)
  113. {
  114. // these are used in synthesis file
  115. s1 = (x * voice->speedf1)/256;
  116. speed_factor1 = (256 * s1)/115; // full speed adjustment, used for pause length
  117. if(speed_factor1 < 16)
  118. speed_factor1 = 16;
  119. if(wpm >= 170)
  120. // speed_factor2 = 100 + (166*s1)/128; // reduced speed adjustment, used for playing recorded sounds
  121. speed_factor2 = 110 + (151*s1)/128; // reduced speed adjustment, used for playing recorded sounds
  122. else
  123. speed_factor2 = 128 + (128*s1)/130; // = 215 at 170 wpm
  124. }
  125. } // end of SetSpeed
  #ifdef deleted
  // Compiled out unless 'deleted' is defined.
  // Maps an amplitude setting of 0..20 through a fixed table and stores the
  // scaled result in option_amplitude; values outside 0..20 are ignored.
  void SetAmplitude(int amp)
  {//=======================
      static unsigned char amplitude_factor[] = {0,5,6,7,9,11,14,17,21,26, 32, 38,44,50,56,63,70,77,84,91,100 };

      if((amp < 0) || (amp > 20))
      {
          return;   // out of range: leave option_amplitude untouched
      }

      option_amplitude = (amplitude_factor[amp] * 480)/256;
  }
  #endif
  136. void SetParameter(int parameter, int value, int relative)
  137. {//======================================================
  138. // parameter: reset-all, amp, pitch, speed, linelength, expression, capitals, number grouping
  139. // relative 0=absolute 1=relative
  140. int new_value = value;
  141. int default_value;
  142. if(relative)
  143. {
  144. if(parameter < 5)
  145. {
  146. default_value = param_defaults[parameter];
  147. new_value = default_value + (default_value * value)/100;
  148. }
  149. }
  150. param_stack[0].parameter[parameter] = new_value;
  151. switch(parameter)
  152. {
  153. case espeakRATE:
  154. embedded_value[EMBED_S] = new_value;
  155. embedded_value[EMBED_S2] = new_value;
  156. SetSpeed(3);
  157. break;
  158. case espeakVOLUME:
  159. embedded_value[EMBED_A] = new_value;
  160. GetAmplitude();
  161. break;
  162. case espeakPITCH:
  163. if(new_value > 99) new_value = 99;
  164. embedded_value[EMBED_P] = new_value;
  165. break;
  166. case espeakRANGE:
  167. if(new_value > 99) new_value = 99;
  168. embedded_value[EMBED_R] = new_value;
  169. break;
  170. case espeakPUNCTUATION:
  171. break;
  172. case espeakCAPITALS:
  173. break;
  174. case espeakLINELENGTH:
  175. option_linelength = new_value;
  176. break;
  177. default:
  178. break;
  179. }
  180. } // end of SetParameter
  181. static void DoEmbedded2(int &embix)
  182. {//================================
  183. // There were embedded commands in the text at this point
  184. unsigned int word;
  185. do {
  186. word = embedded_list[embix++];
  187. if((word & 0x1f) == EMBED_S)
  188. {
  189. // speed
  190. SetEmbedded(word & 0x7f, word >> 8); // adjusts embedded_value[EMBED_S]
  191. SetSpeed(1);
  192. }
  193. } while((word & 0x80) == 0);
  194. }
  // CalcLengths: walks phoneme_list[1 .. n_phoneme_list-1] and, depending on
  // each phoneme's type, fills in its prepause, length, amp, pitch1/pitch2,
  // env and the SFLAG_SEQCONTINUE bit of synthflags.  Some cases also write
  // to the neighbouring entries (prev, next).  Embedded speed commands are
  // applied as they are met, via DoEmbedded2(), so speed1..speed3 can change
  // part-way through the list.
  195. void Translator::CalcLengths()
  196. {//===========================
  197. int ix;
  198. int ix2;
  199. PHONEME_LIST *prev;
  200. PHONEME_LIST *next;
  201. PHONEME_LIST *next2;
  202. PHONEME_LIST *next3;
  203. PHONEME_LIST *p;
  204. PHONEME_LIST *p2;
  205. int stress;
  206. int type;
  // static: keeps its value between calls.  The phSTOP case reads it, so a
  // stop that precedes the first vowel of a call sees the count left by the
  // last vowel of the previous call.
  207. static int more_syllables=0;
  // pre_sonorant / pre_voiced: set by a consonant case when the following
  // vowel should also set that consonant's pitch, length and amplitude
  208. int pre_sonorant=0;
  209. int pre_voiced=0;
  // pitch at the end of the most recent vowel; reset to 0 by pauses and stops
  210. int last_pitch = 0;
  211. int pitch_start;
  212. int length_mod;
  213. int len;
  214. int env2;
  215. int end_of_clause;
  // read position in embedded_list, advanced by DoEmbedded2()
  216. int embedded_ix = 0;
  217. int min_drop;
  218. unsigned char *pitch_env=NULL;
  219. for(ix=1; ix<n_phoneme_list; ix++)
  220. {
  221. prev = &phoneme_list[ix-1];
  222. p = &phoneme_list[ix];
  // low 4 bits of tone hold the stress level
  223. stress = p->tone & 0xf;
  // NOTE(review): next (and next2/next3 in the vowel case) index up to ix+3,
  // i.e. past n_phoneme_list-1 near the end of the loop; presumably
  // phoneme_list has valid entries beyond n_phoneme_list - confirm
  224. next = &phoneme_list[ix+1];
  225. if(p->synthflags & SFLAG_EMBEDDED)
  226. {
  227. DoEmbedded2(embedded_ix);
  228. }
  229. type = p->type;
  // a phoneme flagged SFLAG_SYLLABLE is handled by the vowel case whatever
  // its own type is
  230. if(p->synthflags & SFLAG_SYLLABLE)
  231. type = phVOWEL;
  232. switch(type)
  233. {
  234. case phPAUSE:
  235. last_pitch = 0;
  236. break;
  237. case phSTOP:
  238. last_pitch = 0;
  // choose the pause before the stop: 20 after a fricative, otherwise 40 or
  // 60 depending on more_syllables and stress
  239. if(prev->type == phFRICATIVE)
  240. p->prepause = 20;
  241. else
  242. if((more_syllables > 0) || (stress < 4))
  243. p->prepause = 40;
  244. else
  245. p->prepause = 60;
  246. if((langopts.word_gap & 0x10) && (p->newword))
  247. p->prepause = 60;
  248. if(p->synthflags & SFLAG_LENGTHEN)
  249. p->prepause += langopts.long_stop;
  250. break;
  251. case phVFRICATIVE:
  252. if(next->type==phVOWEL)
  253. {
  254. pre_voiced = 1;
  255. }
  // no break here: execution continues into the phFRICATIVE case
  256. case phFRICATIVE:
  257. if(p->newword)
  258. p->prepause = 15;
  259. if(next->type==phPAUSE && prev->type==phNASAL && !(p->ph->phflags&phFORTIS))
  260. p->prepause = 25;
  // a sibilant directly before a stop in the same word gets a reduced length
  261. if((p->ph->phflags & phSIBILANT) && next->type==phSTOP && !next->newword)
  262. {
  263. if(prev->type == phVOWEL)
  264. p->length = 200; // ?? should do this if it's from a prefix
  265. else
  266. p->length = 150;
  267. }
  268. else
  269. p->length = 256;
  270. if((langopts.word_gap & 0x10) && (p->newword))
  271. p->prepause = 30;
  272. break;
  273. case phVSTOP:
  274. if(prev->type==phVFRICATIVE || prev->type==phFRICATIVE || (prev->ph->phflags & phSIBILANT) || (prev->type == phLIQUID))
  275. p->prepause = 30;
  276. if(next->type==phVOWEL || next->type==phLIQUID)
  277. {
  278. if((next->type==phVOWEL) || !next->newword)
  279. pre_voiced = 1;
  280. p->prepause = 40;
  281. if((prev->type == phPAUSE) || (prev->type == phVOWEL)) // || (prev->ph->mnemonic == ('/'*256+'r')))
  282. p->prepause = 0;
  283. else
  284. if(p->newword==0)
  285. {
  // within a word: later tests override earlier ones
  286. if(prev->type==phLIQUID)
  287. p->prepause = 20;
  288. if(prev->type==phNASAL)
  289. p->prepause = 12;
  290. if(prev->type==phSTOP && !(prev->ph->phflags & phFORTIS))
  291. p->prepause = 0;
  292. }
  293. }
  // with word_gap bit 0x10 set, a word-initial voiced stop gets at least 20
  294. if((langopts.word_gap & 0x10) && (p->newword) && (p->prepause < 20))
  295. p->prepause = 20;
  296. break;
  297. case phLIQUID:
  298. case phNASAL:
  299. p->amp = stress_amps[1]; // unless changed later
  300. p->length = 256; // TEMPORARY
  // min_drop is not read in this case; it is only tested in the phVOWEL case,
  // which assigns it again first
  301. min_drop = 0;
  302. if(p->newword)
  303. {
  304. if(prev->type==phLIQUID)
  305. p->prepause = 25;
  306. if(prev->type==phVOWEL)
  307. p->prepause = 12;
  308. }
  309. if(next->type==phVOWEL)
  310. {
  // the following vowel will set this phoneme's pitch, length and amplitude
  311. pre_sonorant = 1;
  312. }
  313. else
  314. if((prev->type==phVOWEL) || (prev->type == phLIQUID))
  315. {
  // after a vowel or liquid: falling pitch that ends at last_pitch
  // (raised to at least 7)
  316. p->length = prev->length;
  317. p->pitch2 = last_pitch;
  318. if(p->pitch2 < 7)
  319. p->pitch2 = 7;
  320. p->pitch1 = p->pitch2 - 8;
  321. p->env = PITCHfall;
  322. pre_voiced = 0;
  323. if(p->type == phLIQUID)
  324. {
  325. p->length = speed1;
  326. p->pitch1 = p->pitch2 - 20; // post vocalic [r/]
  327. }
  328. if(next->type == phVSTOP)
  329. {
  330. p->length = (p->length * 160)/100;
  331. }
  332. if(next->type == phVFRICATIVE)
  333. {
  334. p->length = (p->length * 120)/100;
  335. }
  336. }
  337. else
  338. {
  // otherwise take pitch2 from the next vowel found later in the list, or
  // keep last_pitch if there is none
  339. p->pitch2 = last_pitch;
  340. for(ix2=ix; ix2<n_phoneme_list; ix2++)
  341. {
  342. if(phoneme_list[ix2].type == phVOWEL)
  343. {
  344. p->pitch2 = phoneme_list[ix2].pitch2;
  345. break;
  346. }
  347. }
  348. p->pitch1 = p->pitch2-8;
  349. p->env = PITCHfall;
  350. pre_voiced = 0;
  351. }
  352. break;
  353. case phVOWEL:
  354. min_drop = 0;
  355. next2 = &phoneme_list[ix+2];
  356. next3 = &phoneme_list[ix+3];
  // cap stress at 7 before it indexes stress_amps[] and stress_lengths[]
  357. if(stress > 7) stress = 7;
  358. if(pre_sonorant)
  359. p->amp = stress_amps[stress]-1;
  360. else
  361. p->amp = stress_amps[stress];
  362. if(ix >= (n_phoneme_list-3))
  363. {
  364. // last phoneme of a clause, limit its amplitude
  365. if(p->amp > langopts.param[LOPT_MAXAMP_EOC])
  366. p->amp = langopts.param[LOPT_MAXAMP_EOC];
  367. }
  368. // is the last syllable of a word ?
  369. more_syllables=0;
  370. end_of_clause = 0;
  // count the vowels between p and the next phoneme that has newword set.
  // NOTE(review): the loop has no bound of its own; it relies on such a
  // phoneme existing after p - confirm
  371. for(p2 = p+1; p2->newword== 0; p2++)
  372. {
  373. if(p2->type == phVOWEL)
  374. more_syllables++;
  375. }
  376. if((p2->newword & 2) && (more_syllables==0))
  377. {
  378. end_of_clause = 2;
  379. }
  380. // calc length modifier
  // the tables are indexed by the length_mod values of the next two
  // phonemes: next2's * 10 + next's
  381. if(more_syllables==0)
  382. {
  383. len = langopts.length_mods0[next2->ph->length_mod *10+ next->ph->length_mod];
  384. if((next->newword) && (langopts.word_gap & 0x4))
  385. {
  386. // consider as a pause + first phoneme of the next word
  387. length_mod = (len + langopts.length_mods0[next->ph->length_mod *10+ 1])/2;
  388. }
  389. else
  390. length_mod = len;
  391. }
  392. else
  393. {
  394. length_mod = langopts.length_mods[next2->ph->length_mod *10+ next->ph->length_mod];
  395. if((next->type == phNASAL) && (next2->type == phSTOP || next2->type == phVSTOP) && (next3->ph->phflags & phFORTIS))
  396. length_mod -= 15;
  397. }
  // scale by the speed factor for this syllable position: speed1 when no
  // more syllables follow in the word, speed2 for one more, speed3 otherwise
  398. if(more_syllables==0)
  399. length_mod *= speed1;
  400. else
  401. if(more_syllables==1)
  402. length_mod *= speed2;
  403. else
  404. length_mod *= speed3;
  405. length_mod = length_mod / 128;
  406. // if(length_mod < 24)
  407. // length_mod = 24; // restrict how much lengths can be reduced
  408. if(length_mod < 9)
  409. length_mod = 9; // restrict how much lengths can be reduced
  410. if(stress >= 7)
  411. {
  412. // tonic syllable, include a constant component so it doesn't decrease directly with speed
  413. length_mod += 22;
  414. }
  415. length_mod = (length_mod * stress_lengths[stress])/128;
  416. if(end_of_clause == 2)
  417. {
  418. // this is the last syllable in the clause, lengthen it - more for short vowels
  419. length_mod = length_mod * (256 + (280 - p->ph->std_length)/3)/256;
  420. }
  // true only when this case was entered through SFLAG_SYLLABLE: the
  // computed length is discarded and a minimum pitch drop is requested
  421. if(p->type != phVOWEL)
  422. {
  423. length_mod = 256; // syllabic consonant
  424. min_drop = 8;
  425. }
  426. p->length = length_mod;
  427. // pre-vocalic part
  428. // set last-pitch
  429. env2 = p->env;
  430. if(env2 > 1) env2++; // version for use with preceding semi-vowel
  431. if(p->tone_ph != 0)
  432. {
  433. pitch_env = LookupEnvelope(phoneme_tab[p->tone_ph]->spect);
  434. }
  435. else
  436. {
  437. pitch_env = envelope_data[env2];
  438. }
  // pitch at the first envelope point: pitch1 plus pitch_env[0]/256 of the
  // pitch1..pitch2 range
  439. pitch_start = p->pitch1 + ((p->pitch2-p->pitch1)*pitch_env[0])/256;
  440. if(pre_sonorant || pre_voiced)
  441. {
  442. // set pitch for pre-vocalic part
  // limit the rise from last_pitch up to the vowel's start pitch to 9
  443. if(pitch_start - last_pitch > 9)
  444. last_pitch = pitch_start - 9;
  445. prev->pitch1 = last_pitch;
  446. prev->pitch2 = pitch_start;
  447. if(last_pitch < pitch_start)
  448. {
  449. prev->env = PITCHrise;
  450. p->env = env2;
  451. }
  452. else
  453. {
  454. prev->env = PITCHfall;
  455. }
  // the preceding consonant takes the vowel's length and amplitude
  // (amplitude capped at 18 unless it is a liquid)
  456. prev->length = length_mod;
  457. prev->amp = p->amp;
  458. if((prev->type != phLIQUID) && (prev->amp > 18))
  459. prev->amp = 18;
  460. }
  461. // vowel & post-vocalic part
  // SFLAG_SEQCONTINUE on next: cleared first, then set for a nasal or liquid
  // that is not itself followed by a vowel - except the liquid with mnemonic
  // '/r', for which it is cleared again
  462. next->synthflags &= ~SFLAG_SEQCONTINUE;
  463. if(next->type == phNASAL && next2->type != phVOWEL)
  464. next->synthflags |= SFLAG_SEQCONTINUE;
  465. if(next->type == phLIQUID)
  466. {
  467. next->synthflags |= SFLAG_SEQCONTINUE;
  468. if(next2->type == phVOWEL)
  469. {
  470. next->synthflags &= ~SFLAG_SEQCONTINUE;
  471. }
  472. if(next2->type != phVOWEL)
  473. {
  474. if(next->ph->mnemonic == ('/'*256+'r'))
  475. {
  476. next->synthflags &= ~SFLAG_SEQCONTINUE;
  477. // min_drop = 15;
  478. }
  479. }
  480. }
  // enforce the minimum pitch drop by lowering pitch1, but not below 0
  481. if((min_drop > 0) && ((p->pitch2 - p->pitch1) < min_drop))
  482. {
  483. p->pitch1 = p->pitch2 - min_drop;
  484. if(p->pitch1 < 0)
  485. p->pitch1 = 0;
  486. }
  // remember the pitch at envelope point 127 for the phonemes that follow
  487. last_pitch = p->pitch1 + ((p->pitch2-p->pitch1)*envelope_data[p->env][127])/256;
  488. pre_sonorant = 0;
  489. pre_voiced = 0;
  490. break;
  491. }
  492. }
  493. } // end of CalcLengths