eSpeak NG is an open-source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

sPlayer.c 5.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. #include <espeak-ng/espeak_ng.h>
  2. #include <espeak-ng/speak_lib.h>
  3. #include "sPlayer.h"
  // Current write position in the output buffer (defined in another translation unit).
  // Wavegen_KlattSP writes synthesized samples here and advances it.
  4. extern unsigned char *out_ptr;
  // End of the output buffer (defined in another translation unit).
  // Wavegen_KlattSP returns 1 once out_ptr has reached this address.
  5. extern unsigned char *out_end;
  // Handle returned by speechPlayer_initialize; stays NULL until KlattInitSP has run.
  6. static speechPlayer_handle_t speechPlayerHandle=NULL;
  // Length passed to speechPlayer_queueFrame for the leading frame, and (split in half) for the two fade-out frames.
  // NOTE(review): presumably a count of samples -- confirm against the speechPlayer API.
  7. static const unsigned int minFadeLength=110;
  // Returns the larger of the two given integers (b when they are equal).
  static int MAX(int a, int b) {
      if (a > b) {
          return a;
      }
      return b;
  }
  // Returns the smaller of the two given integers (b when they are equal).
  static int MIN(int a, int b) {
      if (a < b) {
          return a;
      }
      return b;
  }
  10. static bool needsMixWaveFile(WGEN_DATA *wdata) {
  11. return (bool)wdata->n_mix_wavefile;
  12. }
  13. // mixes the currently queued espeak consonant wave file into the existing content in the given sample buffer.
  14. // This would be used for voiced consonants where the voiced part is generated by speechPlayer, but the consonant comes from a wave file in eSpeak.
  15. // e.g. z, v.
  16. // @param maxNumSamples the maximum number of samples that can be mixed into the sample buffer.
  17. // @param sampleBuf the buffer of existing samples.
  18. static void mixWaveFile(WGEN_DATA *wdata, unsigned int maxNumSamples, sample* sampleBuf) {
  19. unsigned int i=0;
  20. for(;wdata->mix_wavefile_ix<wdata->n_mix_wavefile;++wdata->mix_wavefile_ix) {
  21. if(i>=maxNumSamples) break;
  22. int val;
  23. if(wdata->mix_wave_scale==0) {
  24. val=wdata->mix_wavefile[wdata->mix_wavefile_ix+wdata->mix_wavefile_offset];
  25. ++(wdata->mix_wavefile_ix);
  26. signed char c=wdata->mix_wavefile[wdata->mix_wavefile_ix+wdata->mix_wavefile_offset];
  27. val+=(c*256);
  28. } else {
  29. val=(signed char)wdata->mix_wavefile[wdata->mix_wavefile_ix+wdata->mix_wavefile_offset]*wdata->mix_wave_scale;
  30. }
  31. val*=(wdata->amplitude_v/1024.0);
  32. val=(val*wdata->mix_wave_amp)/40;
  33. sampleBuf[i].value+=val;
  34. if((wdata->mix_wavefile_ix+wdata->mix_wavefile_offset)>=wdata->mix_wavefile_max) {
  35. wdata->mix_wavefile_offset-=(wdata->mix_wavefile_max*3)/4;
  36. }
  37. ++i;
  38. }
  39. }
  40. static bool isKlattFrameFollowing() {
  41. // eSpeak implements its command queue with a circular buffer.
  42. // Thus to walk it, we start from the head, walking to the tail, which may wrap around to the beginning of the buffer as it is circular.
  43. for(int i=(wcmdq_head+1)%N_WCMDQ;i!=wcmdq_tail;i=(i+1)%N_WCMDQ) {
  44. int cmd=wcmdq[i][0];
  45. if(cmd==WCMD_PAUSE||cmd==WCMD_WAVE) {
  46. break;
  47. }
  48. if(cmd==WCMD_KLATT) {
  49. return true;
  50. }
  51. }
  52. return false;
  53. }
  54. static void fillSpeechPlayerFrame(WGEN_DATA *wdata, voice_t *wvoice, frame_t * eFrame, speechPlayer_frame_t* spFrame) {
  55. // eSpeak stores pitch in 4096ths of a hz. Specifically comments in voice.h mentions pitch<<12.
  56. // SpeechPlayer deals with floating point values of hz.
  57. spFrame->voicePitch=(wdata->pitch)/4096.0;
  58. // eSpeak stores voicing amplitude with 64 representing 100% according to comments in voice.h.
  59. // speechPlayer uses floating point value of 1 as 100%.
  60. spFrame->voiceAmplitude=(wvoice->voicing)/64.0;
  61. // All of eSpeak's relative formant frequency ratio values are stored with 256 representing 100% according to comments in voice.h.
  62. spFrame->cf1=(eFrame->ffreq[1]*wvoice->freq[1]/256.0)+wvoice->freqadd[1];
  63. spFrame->cf2=(eFrame->ffreq[2]*wvoice->freq[2]/256.0)+wvoice->freqadd[2];
  64. spFrame->cf3=(eFrame->ffreq[3]*wvoice->freq[3]/256.0)+wvoice->freqadd[3];
  65. spFrame->cf4=(eFrame->ffreq[4]*wvoice->freq[4]/256.0)+wvoice->freqadd[4];
  66. spFrame->cf5=(eFrame->ffreq[5]*wvoice->freq[5]/256.0)+wvoice->freqadd[5];
  67. spFrame->cf6=(eFrame->ffreq[6]*wvoice->freq[6]/256.0)+wvoice->freqadd[6];
  68. spFrame->cfNP=200;
  69. spFrame->cfN0=250;
  70. if(eFrame->klattp[KLATT_FNZ]>0) {
  71. spFrame->caNP=1;
  72. spFrame->cfN0=eFrame->klattp[KLATT_FNZ]*2;
  73. } else {
  74. spFrame->caNP=0;
  75. }
  76. spFrame->cb1=eFrame->bw[1]*2*(wvoice->width[1]/256.0);
  77. spFrame->cb2=eFrame->bw[2]*2*(wvoice->width[2]/256.0);
  78. spFrame->cb3=eFrame->bw[3]*2*(wvoice->width[3]/256.0);
  79. spFrame->cb4=eFrame->bw[4]*2*(wvoice->width[4]/256.0);
  80. spFrame->cb5=1000;
  81. spFrame->cb6=1000;
  82. spFrame->cbNP=100;
  83. spFrame->cbN0=100;
  84. spFrame->preFormantGain=1;
  85. spFrame->outputGain=3*(wdata->amplitude/100.0);
  86. spFrame->endVoicePitch=spFrame->voicePitch;
  87. }
  88. void KlattInitSP() {
  89. speechPlayerHandle=speechPlayer_initialize(22050);
  90. }
  91. void KlattResetSP() {
  92. speechPlayer_terminate(speechPlayerHandle);
  93. speechPlayerHandle=speechPlayer_initialize(22050);
  94. }
  95. int Wavegen_KlattSP(WGEN_DATA *wdata, voice_t *wvoice, int length, int resume, frame_t *fr1, frame_t *fr2){
  96. if(!resume) {
  97. speechPlayer_frame_t spFrame1={0};
  98. fillSpeechPlayerFrame(wdata, wvoice, fr1,&spFrame1);
  99. speechPlayer_frame_t spFrame2={0};
  100. fillSpeechPlayerFrame(wdata, wvoice, fr2,&spFrame2);
  101. wdata->pitch_ix+=(wdata->pitch_inc*(length/STEPSIZE));
  102. wdata->pitch=((wdata->pitch_env[MIN(wdata->pitch_ix>>8,127)]*wdata->pitch_range)>>8)+wdata->pitch_base;
  103. spFrame2.endVoicePitch=wdata->pitch/4096;
  104. bool willMixWaveFile=needsMixWaveFile(wdata);
  105. if(willMixWaveFile) {
  106. spFrame1.outputGain/=5;
  107. spFrame2.outputGain/=5;
  108. }
  109. int mainLength=length;
  110. speechPlayer_queueFrame(speechPlayerHandle,&spFrame1,minFadeLength,minFadeLength,-1,false);
  111. mainLength-=minFadeLength;
  112. bool fadeOut=!isKlattFrameFollowing();
  113. if(fadeOut) {
  114. mainLength-=minFadeLength;
  115. }
  116. if(mainLength>=1) {
  117. speechPlayer_queueFrame(speechPlayerHandle,&spFrame2,mainLength,mainLength,-1,false);
  118. }
  119. if(fadeOut) {
  120. spFrame2.voicePitch=spFrame2.endVoicePitch;
  121. spFrame2.preFormantGain=0;
  122. speechPlayer_queueFrame(speechPlayerHandle,&spFrame2,minFadeLength/2,minFadeLength/2,-1,false);
  123. spFrame2.outputGain=0;
  124. speechPlayer_queueFrame(speechPlayerHandle,&spFrame2,minFadeLength/2,minFadeLength/2,-1,false);
  125. }
  126. }
  127. unsigned int maxLength=(out_end-out_ptr)/sizeof(sample);
  128. unsigned int outLength=speechPlayer_synthesize(speechPlayerHandle,maxLength,(sample*)out_ptr);
  129. mixWaveFile(wdata, outLength,(sample*)out_ptr);
  130. out_ptr=out_ptr+(sizeof(sample)*outLength);
  131. if(out_ptr>=out_end) return 1;
  132. return 0;
  133. }