eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
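The sPlayer.c module shown below plugs the speechPlayer engine into eSpeak's Klatt wave-generation path. For orientation, here is a minimal sketch of driving eSpeak NG through the public C API declared in espeak-ng/speak_lib.h; the initialization parameters and text are illustrative assumptions, not taken from this repository.

#include <espeak-ng/speak_lib.h>
#include <string.h>

int main(void) {
	// Initialize for synchronous playback; returns the sample rate, or a negative value on error.
	if (espeak_Initialize(AUDIO_OUTPUT_SYNCH_PLAYBACK, 0, NULL, 0) < 0)
		return 1;
	const char *text = "Hello from eSpeak NG.";
	// espeakCHARS_AUTO lets the library detect the text encoding.
	espeak_Synth(text, strlen(text) + 1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
	espeak_Synchronize(); // block until speech has finished
	espeak_Terminate();
	return 0;
}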

sPlayer.c

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/speak_lib.h>
#include "sPlayer.h"

extern unsigned char *out_ptr;
extern unsigned char *out_end;

static speechPlayer_handle_t speechPlayerHandle=NULL;
static const unsigned int minFadeLength=110;

static int MIN(int a, int b) { return a < b ? a : b; }

static bool needsMixWaveFile(WGEN_DATA *wdata) {
	return (bool)wdata->n_mix_wavefile;
}
// Mixes the currently queued eSpeak consonant wave file into the existing content of the given sample buffer.
// This is used for voiced consonants (e.g. z, v) where the voiced part is generated by speechPlayer
// but the consonant part comes from a wave file in eSpeak.
// @param maxNumSamples the maximum number of samples that can be mixed into the sample buffer.
// @param sampleBuf the buffer of existing samples.
static void mixWaveFile(WGEN_DATA *wdata, unsigned int maxNumSamples, sample* sampleBuf) {
	unsigned int i=0;
	for(;wdata->mix_wavefile_ix<wdata->n_mix_wavefile;++wdata->mix_wavefile_ix) {
		if(i>=maxNumSamples) break;
		int val;
		if(wdata->mix_wave_scale==0) {
			// 16-bit wave data: assemble the sample from its low byte and signed high byte.
			val=wdata->mix_wavefile[wdata->mix_wavefile_ix+wdata->mix_wavefile_offset];
			++(wdata->mix_wavefile_ix);
			signed char c=wdata->mix_wavefile[wdata->mix_wavefile_ix+wdata->mix_wavefile_offset];
			val+=(c*256);
		} else {
			// 8-bit wave data, scaled by mix_wave_scale.
			val=(signed char)wdata->mix_wavefile[wdata->mix_wavefile_ix+wdata->mix_wavefile_offset]*wdata->mix_wave_scale;
		}
		val*=(wdata->amplitude_v/1024.0);
		val=(val*wdata->mix_wave_amp)/40;
		sampleBuf[i].value+=val;
		if((wdata->mix_wavefile_ix+wdata->mix_wavefile_offset)>=wdata->mix_wavefile_max) {
			// Reached the end of the wave data: rewind the offset so the wave keeps looping.
			wdata->mix_wavefile_offset-=(wdata->mix_wavefile_max*3)/4;
		}
		++i;
	}
}
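// Worked example for the 16-bit branch above (illustrative byte values): a low byte of 0x34
// followed by a high byte of 0x12 assembles to 0x34 + 0x12*256 = 4660, before the amplitude
// and mix_wave_amp scaling is applied.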
static bool isKlattFrameFollowing() {
	// eSpeak implements its command queue as a circular buffer.
	// To walk it, start just after the command at the head and move towards the tail,
	// wrapping around to the beginning of the buffer where necessary.
	for(int i=(wcmdq_head+1)%N_WCMDQ;i!=wcmdq_tail;i=(i+1)%N_WCMDQ) {
		int cmd=wcmdq[i][0];
		if(cmd==WCMD_PAUSE||cmd==WCMD_WAVE) {
			break;
		}
		if(cmd==WCMD_KLATT) {
			return true;
		}
	}
	return false;
}
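// Worked example of the wrap-around (hypothetical queue size): if N_WCMDQ were 8 and
// wcmdq_head were 6, the loop above would visit index (6+1)%8 == 7 and then (7+1)%8 == 0,
// continuing until it reaches wcmdq_tail.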
static void fillSpeechPlayerFrame(WGEN_DATA *wdata, voice_t *wvoice, frame_t *eFrame, speechPlayer_frame_t *spFrame) {
	// eSpeak stores pitch in 4096ths of a Hz; specifically, comments in voice.h mention pitch<<12.
	// speechPlayer deals with floating point values in Hz.
	spFrame->voicePitch=(wdata->pitch)/4096.0;
	// eSpeak stores voicing amplitude with 64 representing 100%, according to comments in voice.h.
	// speechPlayer uses a floating point value of 1 as 100%.
	spFrame->voiceAmplitude=(wvoice->voicing)/64.0;
	spFrame->aspirationAmplitude=(wvoice->breath[1])/64.0;
	// All of eSpeak's relative formant frequency ratios are stored with 256 representing 100%, according to comments in voice.h.
	spFrame->cf1=(eFrame->ffreq[1]*wvoice->freq[1]/256.0)+wvoice->freqadd[1];
	spFrame->cf2=(eFrame->ffreq[2]*wvoice->freq[2]/256.0)+wvoice->freqadd[2];
	spFrame->cf3=(eFrame->ffreq[3]*wvoice->freq[3]/256.0)+wvoice->freqadd[3];
	spFrame->cf4=(eFrame->ffreq[4]*wvoice->freq[4]/256.0)+wvoice->freqadd[4];
	spFrame->cf5=(eFrame->ffreq[5]*wvoice->freq[5]/256.0)+wvoice->freqadd[5];
	spFrame->cf6=(eFrame->ffreq[6]*wvoice->freq[6]/256.0)+wvoice->freqadd[6];
	// Default nasal pole/zero frequencies; the nasal pole amplitude (caNP) is only switched on
	// when eSpeak supplies a nasal zero frequency.
	spFrame->cfNP=200;
	spFrame->cfN0=250;
	if(eFrame->klattp[KLATT_FNZ]>0) {
		spFrame->caNP=1;
		spFrame->cfN0=eFrame->klattp[KLATT_FNZ]*2;
	} else {
		spFrame->caNP=0;
	}
	// Bandwidths: scale eSpeak's stored value by 2 and by the voice's width ratio (256 == 100%).
	spFrame->cb1=eFrame->bw[0]*2*(wvoice->width[1]/256.0);
	spFrame->cb2=eFrame->bw[1]*2*(wvoice->width[2]/256.0);
	spFrame->cb3=eFrame->bw[2]*2*(wvoice->width[3]/256.0);
	spFrame->cb4=eFrame->bw[3]*2*(wvoice->width[4]/256.0);
	spFrame->cb5=1000;
	spFrame->cb6=1000;
	spFrame->cbNP=100;
	spFrame->cbN0=100;
	spFrame->preFormantGain=1;
	spFrame->outputGain=3*(wdata->amplitude/100.0);
	spFrame->endVoicePitch=spFrame->voicePitch;
}
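// Worked example of the unit conversions above (illustrative values): a wdata->pitch of 409600
// corresponds to 409600/4096.0 = 100 Hz; a wvoice->voicing of 64 gives a voiceAmplitude of
// 64/64.0 = 1.0 (100%); a wvoice->freq[n] of 256 scales eFrame->ffreq[n] by 256/256.0 = 1.0,
// i.e. leaves it unchanged.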
void KlattInitSP() {
	// 22050 Hz matches eSpeak's native output sample rate.
	speechPlayerHandle=speechPlayer_initialize(22050);
}

void KlattFiniSP() {
	speechPlayer_terminate(speechPlayerHandle);
}

void KlattResetSP() {
	KlattFiniSP();
	KlattInitSP();
}
int Wavegen_KlattSP(WGEN_DATA *wdata, voice_t *wvoice, int length, int resume, frame_t *fr1, frame_t *fr2) {
	if(!resume) {
		speechPlayer_frame_t spFrame1={0};
		fillSpeechPlayerFrame(wdata, wvoice, fr1,&spFrame1);
		speechPlayer_frame_t spFrame2={0};
		fillSpeechPlayerFrame(wdata, wvoice, fr2,&spFrame2);
		// Advance eSpeak's pitch envelope to the end of this frame pair so that spFrame2 ends on the right pitch.
		wdata->pitch_ix+=(wdata->pitch_inc*(length/STEPSIZE));
		wdata->pitch=((wdata->pitch_env[MIN(wdata->pitch_ix>>8,127)]*wdata->pitch_range)>>8)+wdata->pitch_base;
		spFrame2.endVoicePitch=wdata->pitch/4096;
		bool willMixWaveFile=needsMixWaveFile(wdata);
		if(willMixWaveFile) {
			// A consonant wave file will be mixed in later: reduce the voiced gain to make room for it.
			spFrame1.outputGain/=5;
			spFrame2.outputGain/=5;
		}
		int mainLength=length;
		// Queue a short transition into the first frame.
		speechPlayer_queueFrame(speechPlayerHandle,&spFrame1,minFadeLength,minFadeLength,-1,false);
		mainLength-=minFadeLength;
		bool fadeOut=!isKlattFrameFollowing();
		if(fadeOut) {
			mainLength-=minFadeLength;
		}
		if(mainLength>=1) {
			speechPlayer_queueFrame(speechPlayerHandle,&spFrame2,mainLength,mainLength,-1,false);
		}
		if(fadeOut) {
			// No further Klatt frames are queued: fade the voice out to silence.
			spFrame2.voicePitch=spFrame2.endVoicePitch;
			spFrame2.preFormantGain=0;
			speechPlayer_queueFrame(speechPlayerHandle,&spFrame2,minFadeLength/2,minFadeLength/2,-1,false);
			spFrame2.outputGain=0;
			speechPlayer_queueFrame(speechPlayerHandle,&spFrame2,minFadeLength/2,minFadeLength/2,-1,false);
		}
	}
	// Synthesize as many samples as fit in eSpeak's output buffer, then mix in any queued consonant wave file.
	unsigned int maxLength=(out_end-out_ptr)/sizeof(sample);
	unsigned int outLength=speechPlayer_synthesize(speechPlayerHandle,maxLength,(sample*)out_ptr);
	mixWaveFile(wdata, outLength,(sample*)out_ptr);
	out_ptr=out_ptr+(sizeof(sample)*outLength);
	if(out_ptr>=out_end) return 1; // output buffer full: call again with resume=1 for the remaining samples
	return 0; // finished synthesizing this frame pair
}
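The sketch below illustrates the resume protocol of Wavegen_KlattSP as it appears from this file: a return value of 1 means the output buffer filled up before the queued frames were finished, and the caller is expected to flush the buffer and call again with resume set. Everything here (the driver function, the local chunk buffer, and how wdata, wvoice, fr1 and fr2 are obtained) is a hypothetical illustration, not code from eSpeak.

// Illustrative only: a hypothetical standalone driver for Wavegen_KlattSP.
// In eSpeak itself, out_ptr and out_end are managed by the wavegen code; here a local
// buffer stands in for that machinery.
static void exampleSynthKlattPair(WGEN_DATA *wdata, voice_t *wvoice, int length,
                                  frame_t *fr1, frame_t *fr2) {
	static sample chunk[1024]; // hypothetical output chunk
	int resume = 0;
	for (;;) {
		out_ptr = (unsigned char *)chunk;
		out_end = (unsigned char *)(chunk + 1024);
		// 1 == chunk filled before the queued frames were finished; 0 == frame pair complete.
		if (!Wavegen_KlattSP(wdata, wvoice, length, resume, fr1, fr2))
			break;
		resume = 1; // come back for the remaining samples of the same frame pair
		// ...hand `chunk` to the audio output here (omitted)...
	}
}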