eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.

ttsengobj.cpp

/*******************************************************************************
* TtsEngObj.cpp *
*---------------*
* Description:
*   This module is the main implementation file for the CTTSEngObj class.
*-------------------------------------------------------------------------------
* Creation Date: 03/24/99
* Copyright (c) Microsoft Corporation. All rights reserved.
* All Rights Reserved
*
*******************************************************************************/

//--- Additional includes
#include "stdafx.h"
#include "TtsEngObj.h"
#include "src/speak_lib.h"
#include <stdio.h>

//--- Local
//#define LOG_DEBUG
#define CTRL_EMBEDDED    1
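// CTRL_EMBEDDED is the control character which introduces an embedded command
// (volume, rate, pitch, etc.) in the text buffer passed to eSpeak,
// e.g. "\x01" "100A" to set the amplitude.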
CTTSEngObj *m_EngObj;
ISpTTSEngineSite* m_OutputSite;
FILE *f_log2=NULL;

extern int AddNameData(const char *name, int wide);
extern void InitNamedata(void);

int master_volume = 100;
int master_rate = 0;

int gVolume = 100;
int gSpeed = -1;
int gPitch = -1;
int gRange = -1;
int gEmphasis = 0;
int gSayas = 0;

char *path_install = NULL;

unsigned long audio_offset = 0;
unsigned long audio_latest = 0;

unsigned int gBufSize = 0;
wchar_t *TextBuf=NULL;
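// For each text fragment copied into the buffer sent to eSpeak, record where it
// starts in that buffer (bufix), its offset in the original SAPI text (textix),
// and how many embedded-command characters were inserted before it (cmdlen), so
// that word positions reported by eSpeak can be mapped back to SAPI text offsets.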
typedef struct {
    unsigned int bufix;
    unsigned int textix;
    unsigned int cmdlen;
} FRAG_OFFSET;

int srate;   // samplerate, Hz/50
int n_frag_offsets = 0;
int frag_ix = 0;
int frag_count=0;
FRAG_OFFSET *frag_offsets = NULL;
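// Callback passed to espeak_SetSynthCallback(): receives synthesized audio and
// word/mark events from eSpeak, converts the events into SAPI SPEVENTs, and
// writes the audio samples to the SAPI output site.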
int SynthCallback(short *wav, int numsamples, espeak_EVENT *events);

int SynthCallback(short *wav, int numsamples, espeak_EVENT *events)
{//================================================================
    int hr;
    wchar_t *tailptr;
    int text_offset;   // signed, so that the clamp to zero below can take effect
    int length;
    espeak_EVENT *event;

#define N_EVENTS 100
    int n_Events = 0;
    SPEVENT *Event;
    SPEVENT Events[N_EVENTS];

    if(m_OutputSite->GetActions() & SPVES_ABORT)
        return(1);

    m_EngObj->CheckActions(m_OutputSite);

    // return the events
    for(event=events; event->type != 0; event++)
    {
        audio_latest = event->audio_position + audio_offset;

        if((event->type == espeakEVENT_WORD) && (event->length > 0))
        {
            while(((frag_ix+1) < frag_count) &&
                ((event->text_position -1 + frag_offsets[frag_ix+1].cmdlen) >= frag_offsets[frag_ix+1].bufix))
            {
                frag_ix++;
            }
            text_offset = (int)(frag_offsets[frag_ix].textix +
                event->text_position -1 - frag_offsets[frag_ix].bufix + frag_offsets[frag_ix].cmdlen);
            length = event->length - frag_offsets[frag_ix].cmdlen;
            frag_offsets[frag_ix].cmdlen = 0;

            if(text_offset < 0)
                text_offset = 0;

            Event = &Events[n_Events++];
            Event->eEventId = SPEI_WORD_BOUNDARY;
            Event->elParamType = SPET_LPARAM_IS_UNDEFINED;
            Event->ullAudioStreamOffset = ((event->audio_position + audio_offset) * srate)/10;  // ms -> bytes
            Event->lParam = text_offset;
            Event->wParam = length;
        }
        if(event->type == espeakEVENT_MARK)
        {
            Event = &Events[n_Events++];
            Event->eEventId = SPEI_TTS_BOOKMARK;
            Event->elParamType = SPET_LPARAM_IS_STRING;
            Event->ullAudioStreamOffset = ((event->audio_position + audio_offset) * srate)/10;  // ms -> bytes
            Event->lParam = (LPARAM)event->id.name;
            Event->wParam = wcstol((wchar_t *)event->id.name,&tailptr,10);
        }
#ifdef deleted
        if(event->type == espeakEVENT_SENTENCE)
        {
            Event = &Events[n_Events++];
            Event->eEventId = SPEI_SENTENCE_BOUNDARY;
            Event->elParamType = SPET_LPARAM_IS_UNDEFINED;
            Event->ullAudioStreamOffset = (event->audio_position * 441)/10;  // ms -> bytes
            Event->lParam = event->text_position-1 + text_offset;
            Event->wParam = 0;  // TEMP
        }
#endif
    }
    if(n_Events > 0)
        m_OutputSite->AddEvents(Events, n_Events);

    // return the sound data
    hr = m_OutputSite->Write(wav, numsamples*2, NULL);
    return(hr);
}
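// The following helpers map SAPI prosody adjustments onto eSpeak parameter values
// via lookup tables.  Rate and range adjustments are clamped to -10..+10 and pitch
// to -20..+20; the rate table is in words per minute, pitch and range are on
// eSpeak's 0..99 scale.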
static int ConvertRate(int new_rate)
{//=================================
    int rate;
    static int rate_table[21] = {80,100,116,124,132,140,148,156,164,170,176,
        182,188,197,208,220,240,270,300,335,370 };

    rate = new_rate + master_rate;
    if(rate < -10) rate = -10;
    if(rate > 10) rate = 10;
    return(rate_table[rate+10]);
}   // end of ConvertRate

static int ConvertPitch(int pitch)
{//===============================
    static int pitch_table[41] =
        {0, 0, 0, 0, 0, 0, 0, 0, 4, 8,12,16,20,24,28,32,36,40,44,47,50,
        54,58,62,66,70,74,78,82,84,88,92,96,99,99,99,99,99,99,99,99};
//      {0,3,5,8,10,13,15,18,20,23,25,28,30,33,35,38,40,43,45,48,50,
//      53,55,58,60,63,65,68,70,73,75,78,80,83,85,88,90,93,95,97,99};

    if(pitch < -20) pitch = -20;
    if(pitch > 20) pitch = 20;
    return(pitch_table[pitch+20]);
}

static int ConvertRange(int range)
{//===============================
    static int range_table[21] = {16,28,39,49,58,66,74,81,88,94,100,105,110,115,120,125,130,135,140,145,150};

    if(range < -10) range = -10;
    if(range > 10) range = 10;
    return(range_table[range+10]/2);
}
/*****************************************************************************
* CTTSEngObj::FinalConstruct *
*----------------------------*
* Description:
*   Constructor
*****************************************************************************/
HRESULT CTTSEngObj::FinalConstruct()
{
    SPDBG_FUNC( "CTTSEngObj::FinalConstruct" );
    HRESULT hr = S_OK;

#ifdef LOG_DEBUG
    f_log2=fopen("C:\\log_espeak","a");
    if(f_log2) fprintf(f_log2,"\n****\n");
#endif

    //--- Init vars
    m_hVoiceData = NULL;
    m_pVoiceData = NULL;
    m_pWordList = NULL;
    m_ulNumWords = 0;

    m_EngObj = this;
    return hr;
} /* CTTSEngObj::FinalConstruct */

/*****************************************************************************
* CTTSEngObj::FinalRelease *
*--------------------------*
* Description:
*   Destructor
*****************************************************************************/
void CTTSEngObj::FinalRelease()
{
    SPDBG_FUNC( "CTTSEngObj::FinalRelease" );

    delete m_pWordList;

#ifdef LOG_DEBUG
    if(f_log2!=NULL) fclose(f_log2);
#endif

    if( m_pVoiceData )
    {
        ::UnmapViewOfFile( (void*)m_pVoiceData );
    }
    if( m_hVoiceData )
    {
        ::CloseHandle( m_hVoiceData );
    }
} /* CTTSEngObj::FinalRelease */

//
//=== ISpObjectWithToken Implementation ======================================
//
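// Narrowing copy of a wide string into a char buffer of at most len bytes;
// the result is always NUL-terminated (non-ASCII characters are truncated to
// their low byte).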
void WcharToChar(char *out, const wchar_t *in, int len)
{//====================================================
    int ix;

    for(ix=0; ix<len; ix++)
    {
        if((out[ix] = (char)in[ix]) == 0)
            break;
    }
    out[len-1] = 0;
}
/*****************************************************************************
* CTTSEngObj::SetObjectToken *
*----------------------------*
* Description:
*   Read the "VoiceName" attribute from the registry, and use it to select
*   an eSpeak voice file
*****************************************************************************/
STDMETHODIMP CTTSEngObj::SetObjectToken(ISpObjectToken * pToken)
{
    char voice[80];
    strcpy(voice,"default");

    SPDBG_FUNC( "CTTSEngObj::SetObjectToken" );
    HRESULT hr = SpGenericSetObjectToken(pToken, m_cpToken);

    if( SUCCEEDED( hr ) )
    {
        CSpDynamicString voicename;
        CSpDynamicString path;
        HRESULT hr2;
        int len;

        hr2 = m_cpToken->GetStringValue( L"VoiceName", &voicename);
        if( SUCCEEDED(hr2) )
        {
            WcharToChar(voice,voicename,sizeof(voice));
        }

        hr2 = m_cpToken->GetStringValue( L"Path", &path);
        if( SUCCEEDED(hr2) )
        {
            len = wcslen(path)+1;
            path_install = (char *)malloc(len);
            if(path_install != NULL)
                WcharToChar(path_install,path,len);
        }
    }

    gVolume = 100;
    gSpeed = -1;
    gPitch = -1;
    gRange = -1;
    gEmphasis = 0;
    gSayas = 0;

    espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,100,path_install);
    espeak_SetVoiceByName(voice);
    espeak_SetSynthCallback(SynthCallback);

    return hr;
} /* CTTSEngObj::SetObjectToken */
//
//=== ISpTTSEngine Implementation ============================================
//
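// Walk the SAPI text fragment list and build the text buffer which is passed to
// eSpeak.  Attribute changes (volume, rate, pitch, range, emphasis, say-as) are
// written into the buffer as embedded commands, and bookmarks become embedded
// mark commands.  The function is called twice per Speak() request: first with
// pW_start == NULL to measure the required buffer size, then again to fill the
// buffer.  It returns the total number of characters needed and sets *n_text to
// the number of fragments processed.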
int CTTSEngObj::ProcessFragList(const SPVTEXTFRAG* pTextFragList, wchar_t *pW_start, ISpTTSEngineSite* pOutputSite, int *n_text)
{//=============================================================================================================================
    int action;
    int control;
    wchar_t *pW;
    const SPVSTATE *state;
    unsigned int ix;
    unsigned int len;
    unsigned int total=0;
    char cmdbuf[50];
    wchar_t markbuf[32];

    int speed;
    int volume;
    int pitch;
    int range;
    int emphasis;
    int sayas;

    unsigned int text_offset = 0;

    frag_count = 0;
    frag_ix = 0;
    pW = pW_start;

    while(pTextFragList != NULL)
    {
        action = pTextFragList->State.eAction;
        control = pOutputSite->GetActions();
        len = pTextFragList->ulTextLen;

        if(control & SPVES_ABORT)
            break;

        CheckActions(pOutputSite);
        sayas = 0;

        switch(action)
        {
        case SPVA_SpellOut:
            sayas = 0x12;   // SAYAS_CHARS;  // drop through to SPVA_Speak
        case SPVA_Speak:
            text_offset = pTextFragList->ulTextSrcOffset;
            audio_offset = audio_latest;

#ifdef deleted
            // attempt to recognise when JAWS is spelling, it doesn't use SPVA_SpellOut
            if((pW != NULL) && (*n_text == 1) && ((len == 1) || ((len==2) && (pTextFragList->pTextStart[1]==' '))))
            {
                // A single text fragment with one character.  Speak as a character, not a word
                sayas = 0x11;
                gSayas = 0;
            }
#endif

            if(frag_count >= n_frag_offsets)
            {
                // grow the array; use a temporary pointer so the old block
                // is not lost if realloc() fails
                FRAG_OFFSET *new_offsets = (FRAG_OFFSET *)realloc(frag_offsets,sizeof(FRAG_OFFSET)*(frag_count+500));
                if(new_offsets != NULL)
                {
                    frag_offsets = new_offsets;
                    n_frag_offsets = frag_count+500;
                }
            }

            // first set the volume, rate, pitch
            state = &pTextFragList->State;

            volume = (state->Volume * master_volume)/100;
            speed = ConvertRate(state->RateAdj);
            pitch = ConvertPitch(state->PitchAdj.MiddleAdj);
            range = ConvertRange(state->PitchAdj.RangeAdj);
            emphasis = state->EmphAdj;
            if(emphasis != 0)
                emphasis = 3;

            len = 0;
            if(volume != gVolume)
            {
                sprintf(&cmdbuf[len],"%c%dA",CTRL_EMBEDDED,volume);
                len += strlen(&cmdbuf[len]);
            }
            if(speed != gSpeed)
            {
                sprintf(&cmdbuf[len],"%c%dS",CTRL_EMBEDDED,speed);
                len += strlen(&cmdbuf[len]);
            }
            if(pitch != gPitch)
            {
                sprintf(&cmdbuf[len],"%c%dP",CTRL_EMBEDDED,pitch);
                len += strlen(&cmdbuf[len]);
            }
            if(range != gRange)
            {
                sprintf(&cmdbuf[len],"%c%dR",CTRL_EMBEDDED,range);
                len += strlen(&cmdbuf[len]);
            }
            if(emphasis != gEmphasis)
            {
                sprintf(&cmdbuf[len],"%c%dF",CTRL_EMBEDDED,emphasis);
                len += strlen(&cmdbuf[len]);
            }
            if(sayas != gSayas)
            {
                sprintf(&cmdbuf[len],"%c%dY",CTRL_EMBEDDED,sayas);
                len += strlen(&cmdbuf[len]);
            }

            gVolume = volume;
            gSpeed = speed;
            gPitch = pitch;
            gRange = range;
            gEmphasis = emphasis;
            gSayas = sayas;

            total += (len + pTextFragList->ulTextLen);
            if(pTextFragList->ulTextLen > 0)
            {
                total++;
            }

            if(pW != NULL)
            {
                for(ix=0; ix<len; ix++)
                {
                    *pW++ = cmdbuf[ix];
                }

                frag_offsets[frag_count].textix = text_offset;
                frag_offsets[frag_count].bufix = pW - pW_start;
                frag_offsets[frag_count].cmdlen = len;

                for(ix=0; ix<pTextFragList->ulTextLen; ix++)
                {
                    *pW++ = pTextFragList->pTextStart[ix];
                }
                if(pTextFragList->ulTextLen > 0)
                {
                    *pW++ = ' ';
                }
            }
            frag_count++;
            break;

        case SPVA_Bookmark:
            total += (2 + pTextFragList->ulTextLen);

            if(pW != NULL)
            {
                int index;

                // copy the bookmark name, clipped to the size of markbuf
                for(ix=0; (ix < pTextFragList->ulTextLen) && (ix < (sizeof(markbuf)/sizeof(markbuf[0]) - 1)); ix++)
                {
                    markbuf[ix] = (char)pTextFragList->pTextStart[ix];
                }
                markbuf[ix] = 0;

                if((index = AddNameData((const char *)markbuf,1)) >= 0)
                {
                    sprintf(cmdbuf,"%c%dM",CTRL_EMBEDDED,index);
                    len = strlen(cmdbuf);
                    for(ix=0; ix<len; ix++)
                    {
                        *pW++ = cmdbuf[ix];
                    }
                }
            }
            break;
        }
        pTextFragList = pTextFragList->pNext;
    }

    if(pW != NULL)
    {
        *pW = 0;
    }
    *n_text = frag_count;
    return(total);
}   // end of ProcessFragList
/*****************************************************************************
* CTTSEngObj::Speak *
*-------------------*
* Description:
*   This is the primary method that SAPI calls to render text.
*-----------------------------------------------------------------------------
* Input Parameters
*
*   pUser
*       Pointer to the current user profile object. This object contains
*       information like what languages are being used and this object
*       also gives access to resources like the SAPI master lexicon object.
*
*   dwSpeakFlags
*       This is a set of flags used to control the behavior of the
*       SAPI voice object and the associated engine.
*
*   VoiceFmtIndex
*       Zero-based index specifying the output format that should
*       be used during rendering.
*
*   pTextFragList
*       A linked list of text fragments to be rendered. There is
*       one fragment per XML state change. If the input text does
*       not contain any XML markup, there will only be a single fragment.
*
*   pOutputSite
*       The interface back to SAPI where all output audio samples and events are written.
*
* Return Values
*   S_OK - This should be returned after successful rendering or if
*          rendering was interrupted because *pfContinue changed to FALSE.
*   E_INVALIDARG
*   E_OUTOFMEMORY
*
*****************************************************************************/
STDMETHODIMP CTTSEngObj::Speak( DWORD dwSpeakFlags,
                                REFGUID rguidFormatId,
                                const WAVEFORMATEX * pWaveFormatEx,
                                const SPVTEXTFRAG* pTextFragList,
                                ISpTTSEngineSite* pOutputSite )
{
    SPDBG_FUNC( "CTTSEngObj::Speak" );
    HRESULT hr = S_OK;
    unsigned int size;

    int xVolume;
    int xSpeed;
    int xPitch;
    int xRange;
    int xEmphasis;
    int xSayas;
    int punctuation;
    int n_text_frag=0;

    //--- Check args
    if( SP_IS_BAD_INTERFACE_PTR( pOutputSite ) ||
        SP_IS_BAD_READ_PTR( pTextFragList ) )
    {
        hr = E_INVALIDARG;
    }
    else
    {
        InitNamedata();

        //--- Init some vars
        m_pCurrFrag = pTextFragList;
        m_pNextChar = m_pCurrFrag->pTextStart;
        m_pEndChar = m_pNextChar + m_pCurrFrag->ulTextLen;
        m_ullAudioOff = 0;

        m_OutputSite = pOutputSite;

        // save the current attribute values so that the sizing pass does not
        // permanently change them
        xVolume = gVolume;
        xSpeed = gSpeed;
        xPitch = gPitch;
        xRange = gRange;
        xEmphasis = gEmphasis;
        xSayas = gSayas;

        // find the size of the text buffer needed for this Speak() request
        size = ProcessFragList(pTextFragList,NULL,pOutputSite,&n_text_frag);

        gVolume = xVolume;
        gSpeed = xSpeed;
        gPitch = xPitch;
        gRange = xRange;
        gEmphasis = xEmphasis;
        gSayas = xSayas;

        punctuation = 0;
        if(dwSpeakFlags & SPF_NLP_SPEAK_PUNC)
            punctuation = 1;

        espeak_SetParameter(espeakPUNCTUATION,punctuation,0);

        size = (size + 50)*sizeof(wchar_t);
        if(size > gBufSize)
        {
            size += 1000;  // some extra so we don't need to realloc() again too often
            wchar_t *new_buf = (wchar_t *)realloc(TextBuf,size);
            if(new_buf == NULL)
            {
                return E_OUTOFMEMORY;
            }
            TextBuf = new_buf;
            gBufSize = size;
        }

        audio_latest = 0;
        size = ProcessFragList(pTextFragList,TextBuf,pOutputSite,&n_text_frag);

        if(size > 0)
        {
            espeak_Synth(TextBuf,0,0,POS_CHARACTER,0,espeakCHARS_WCHAR | espeakKEEP_NAMEDATA,NULL,NULL);
        }
    }
    return hr;
} /* CTTSEngObj::Speak */
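// Poll the SAPI output site for real-time actions: pick up any volume or rate
// change requested by the application while synthesis is in progress.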
HRESULT CTTSEngObj::CheckActions( ISpTTSEngineSite* pOutputSite )
{//==============================================================
    int control;
    USHORT volume;
    long rate;

    control = pOutputSite->GetActions();

    if(control & SPVES_VOLUME)
    {
        if(pOutputSite->GetVolume(&volume) == S_OK)
        {
            master_volume = volume;
        }
    }
    if(control & SPVES_RATE)
    {
        if(pOutputSite->GetRate(&rate) == S_OK)
        {
            master_rate = rate;
        }
    }
    return(S_OK);
}   // end of CTTSEngObj::CheckActions
/*****************************************************************************
* CTTSEngObj::GetOutputFormat *
*-----------------------------*
* Description:
*   This method returns the output data format associated with the
*   specified format index. Formats are in order of quality with the best
*   starting at 0.
*****************************************************************************/
STDMETHODIMP CTTSEngObj::GetOutputFormat( const GUID * pTargetFormatId, const WAVEFORMATEX * pTargetWaveFormatEx,
                                          GUID * pDesiredFormatId, WAVEFORMATEX ** ppCoMemDesiredWaveFormatEx )
{
    SPDBG_FUNC( "CTTSEngObj::GetOutputFormat" );
    HRESULT hr = S_OK;
    enum SPSTREAMFORMAT sample_rate = SPSF_22kHz16BitMono;

    srate = 441;   // 22050 Hz / 50
    if(espeak_GetParameter(espeakVOICETYPE,1) == 1)
    {
        srate = 320;   // 16000 Hz / 50
        sample_rate = SPSF_16kHz16BitMono;  // an mbrola voice
    }
    hr = SpConvertStreamFormatEnum(sample_rate, pDesiredFormatId, ppCoMemDesiredWaveFormatEx);

    return hr;
} /* CTTSEngObj::GetOutputFormat */
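// Exported helper: compile the pronunciation dictionary for the given voice from
// the data files in the installation directory, writing any compiler messages to
// path_log.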
int FAR PASCAL CompileDictionary(const char *voice, const char *path_log)
{//=======================================================================
    FILE *f_log;
    char fname[120];

    f_log = fopen(path_log,"w");

    sprintf(fname,"%s/",path_install);
    espeak_SetVoiceByName(voice);
    espeak_CompileDictionary(fname,f_log);

    if(f_log != NULL)
        fclose(f_log);
    return(0);
}
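/* For reference only: a minimal sketch of a SAPI 5 client that would exercise this
   engine.  It assumes an eSpeak voice token has been registered and selected as the
   default SAPI voice; SAPI then loads this engine and calls SetObjectToken(),
   GetOutputFormat() and Speak() on CTTSEngObj above.

    #include <windows.h>
    #include <sapi.h>

    int main()
    {
        ::CoInitialize(NULL);
        ISpVoice *pVoice = NULL;
        // create the shared SAPI voice object (link with sapi.lib and ole32.lib)
        if (SUCCEEDED(::CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL,
                                         IID_ISpVoice, (void **)&pVoice)))
        {
            pVoice->Speak(L"Hello from eSpeak", SPF_DEFAULT, NULL);
            pVoice->Release();
        }
        ::CoUninitialize();
        return 0;
    }
*/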