/***************************************************************************
* Copyright (C) 2005 to 2007 by Jonathan Duddington *
* email: jonsd@users.sourceforge.net *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write see: *
*               &lt;http://www.gnu.org/licenses/&gt;.                           *
***************************************************************************/
#include "stdafx.h"
#include "TtsEngObj.h"
#include "src/speak_lib.h"
#include "stdio.h"
// Control character which introduces an embedded command (volume, rate,
// pitch, ...) in the text passed to espeak_Synth().
#define CTRL_EMBEDDED 1
// Back-pointers so that SynthCallback() (a plain function) can reach the
// engine object and the SAPI output site of the current Speak() request.
CTTSEngObj *m_EngObj;
ISpTTSEngineSite* m_OutputSite;
FILE *f_log2=NULL;   // optional debug log (see LOG_DEBUG)
// bitmask of SAPI event types the client registered interest in
ULONGLONG event_interest;
extern int AddNameData(const char *name, int wide);
extern void InitNamedata(void);
// volume/rate set in real time through ISpTTSEngineSite (see CheckActions)
int master_volume = 100;
int master_rate = 0;
int initialised = 0;   // espeak_Initialize() has been called
// prosody values most recently sent as embedded commands; unchanged values
// are not re-sent for every fragment
int gVolume = 100;
int gSpeed = -1;
int gPitch = -1;
int gRange = -1;
int gEmphasis = 0;
int gSayas = 0;
char g_voice_name[80];          // currently selected eSpeak voice
char *path_install = NULL;      // installation path from the voice token
// audio position bookkeeping across fragments of one Speak() request
unsigned long audio_offset = 0;
unsigned long audio_latest = 0;
// previous phoneme, used when generating SAPI viseme events
int prev_phoneme = 0;
int prev_phoneme_position = 0;
unsigned long prev_phoneme_time = 0;
unsigned int gBufSize = 0;      // allocated size of TextBuf, bytes
wchar_t *TextBuf=NULL;          // synthesis text buffer, grown on demand in Speak()
// Maps a text fragment to its position in TextBuf so that eSpeak word events
// (buffer positions) can be converted back to source-text offsets.
typedef struct {
unsigned int bufix;    // position of the fragment within TextBuf
unsigned int textix;   // offset of the fragment in the original source text
unsigned int cmdlen;   // length of the embedded commands at the fragment start
} FRAG_OFFSET;
int srate; // samplerate, Hz/50
int n_frag_offsets = 0;    // allocated entries in frag_offsets
int frag_ix = 0;           // fragment currently scanned by SynthCallback
int frag_count=0;          // fragments used by the current Speak() request
FRAG_OFFSET *frag_offsets = NULL;
//#define TEST_INPUT // printf input text received from SAPI to espeak_text_log.txt
#ifdef TEST_INPUT
static int utf8_out(unsigned int c, char *buf)
{//====================================
// Write a unicode character into a buffer as utf8.
// Returns the number of bytes written (1..4).
// NOTE: the encoding loop had been lost to angle-bracket stripping during
// extraction; it is restored here.
	int n_bytes;    // number of continuation bytes
	int j;
	int shift;
	static char unsigned code[4] = {0,0xc0,0xe0,0xf0};   // lead-byte prefixes

	if(c < 0x80)
	{
		buf[0] = c;          // plain ASCII, single byte
		return(1);
	}
	if(c >= 0x110000)
	{
		buf[0] = ' ';        // out of range character code
		return(1);
	}
	if(c < 0x0800)
		n_bytes = 1;
	else
	if(c < 0x10000)
		n_bytes = 2;
	else
		n_bytes = 3;

	shift = 6*n_bytes;
	buf[0] = code[n_bytes] | (c >> shift);
	for(j=0; j<n_bytes; j++)
	{
		shift -= 6;
		buf[j+1] = 0x80 + ((c >> shift) & 0x3f);   // continuation bytes, 6 bits each
	}
	return(n_bytes+1);
}  // end of utf8_out
#endif
int VisemeCode(unsigned int phoneme_name)
{//======================================
// Convert eSpeak phoneme name into a SAPI viseme code.
// phoneme_name: the phoneme mnemonic packed into an integer, first
// character in the low byte.  Returns 255 for phonemes to be ignored.
	int ix;
	unsigned int ph;
	unsigned int ph_name;

// combine two characters into an integer for phoneme name
// (operands parenthesized so the macro expands safely in any expression)
#define PH(c1,c2) (((c2)<<8)+(c1))

	// viseme chosen from the first character of the phoneme mnemonic
	// (static const: built once, not re-initialised on every call)
	static const unsigned char initial_to_viseme[128] = {
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,19, 0, 0, 0, 0, 0,
	 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,
	 4, 2,18,16,17, 4,18,20,12, 6,16,20,14,21,20, 3,
	21,20,13,16,17, 4, 1, 5,20, 7,16, 0, 0, 0, 0, 0,
	 0, 1,21,16,19, 4,18,20,12, 6, 6,20,14,21,19, 8,
	21,20,13,15,19, 7,18, 7,20, 7,15, 0, 0, 0, 0, 0 };

	// two-character names whose viseme is not determined by the first letter
	static const unsigned int viseme_exceptions[] = {
		PH('a','I'), 11,
		PH('a','U'), 9,
		PH('O','I'), 10,
		PH('t','S'), 16,
		PH('d','Z'), 16,
		PH('_','|'), 255,   // pause, ignored
		0
	};

	ph_name = phoneme_name & 0xffff;
	for(ix=0; (ph = viseme_exceptions[ix]) != 0; ix+=2)
	{
		if(ph == ph_name)
		{
			return(viseme_exceptions[ix+1]);
		}
	}
	// default: classify by the first (ascii) character of the name
	return(initial_to_viseme[phoneme_name & 0x7f]);
}
int SynthCallback(short *wav, int numsamples, espeak_EVENT *events);
int SynthCallback(short *wav, int numsamples, espeak_EVENT *events)
{//================================================================
int hr;
wchar_t *tailptr;
unsigned int text_offset;
int length;
int phoneme_duration;
int this_viseme;
espeak_EVENT *event;
#define N_EVENTS 100
int n_Events = 0;
SPEVENT *Event;
SPEVENT Events[N_EVENTS];
if(m_OutputSite->GetActions() & SPVES_ABORT)
return(1);
m_EngObj->CheckActions(m_OutputSite);
// return the events
for(event=events; (event->type != 0) && (n_Events < N_EVENTS); event++)
{
audio_latest = event->audio_position + audio_offset;
if((event->type == espeakEVENT_WORD) && (event->length > 0))
{
while(((frag_ix+1) < frag_count) &&
((event->text_position -1 + frag_offsets[frag_ix+1].cmdlen) >= frag_offsets[frag_ix+1].bufix))
{
frag_ix++;
}
text_offset = frag_offsets[frag_ix].textix +
event->text_position -1 - frag_offsets[frag_ix].bufix + frag_offsets[frag_ix].cmdlen;
length = event->length - frag_offsets[frag_ix].cmdlen;
frag_offsets[frag_ix].cmdlen = 0;
if(text_offset < 0)
text_offset = 0;
Event = &Events[n_Events++];
Event->eEventId = SPEI_WORD_BOUNDARY;
Event->elParamType = SPET_LPARAM_IS_UNDEFINED;
Event->ullAudioStreamOffset = ((event->audio_position + audio_offset) * srate)/10; // ms -> bytes
Event->lParam = text_offset;
Event->wParam = length;
}
if(event->type == espeakEVENT_MARK)
{
Event = &Events[n_Events++];
Event->eEventId = SPEI_TTS_BOOKMARK;
Event->elParamType = SPET_LPARAM_IS_STRING;
Event->ullAudioStreamOffset = ((event->audio_position + audio_offset) * srate)/10; // ms -> bytes
Event->lParam = (long)event->id.name;
Event->wParam = wcstol((wchar_t *)event->id.name,&tailptr,10);
}
if(event->type == espeakEVENT_PHONEME)
{
if(event_interest & SPEI_VISEME)
{
phoneme_duration = audio_latest - prev_phoneme_time;
// ignore some phonemes (which translate to viseme=255)
if((this_viseme = VisemeCode(event->id.number)) != 255)
{
Event = &Events[n_Events++];
Event->eEventId = SPEI_VISEME;
Event->elParamType = SPET_LPARAM_IS_UNDEFINED;
Event->ullAudioStreamOffset = ((prev_phoneme_position + audio_offset) * srate)/10; // ms -> bytes
Event->lParam = phoneme_duration << 16 | this_viseme;
Event->wParam = VisemeCode(prev_phoneme);
prev_phoneme = event->id.number;
prev_phoneme_time = audio_latest;
prev_phoneme_position = event->audio_position;
}
}
}
#ifdef deleted
if(event->type == espeakEVENT_SENTENCE)
{
Event = &Events[n_Events++];
Event->eEventId = SPEI_SENTENCE_BOUNDARY;
Event->elParamType = SPET_LPARAM_IS_UNDEFINED;
Event->ullAudioStreamOffset = ((event->audio_position + audio_offset) * srate)/10; // ms -> bytes
Event->lParam = 0;
Event->wParam = 0; // TEMP
}
#endif
}
if(n_Events > 0)
m_OutputSite->AddEvents(Events, n_Events );
// return the sound data
hr = m_OutputSite->Write(wav, numsamples*2, NULL);
return(hr);
}
static int ConvertRate(int new_rate)
{//=================================
	// Map a SAPI rate setting (-10 .. +10, combined with the real-time
	// master_rate) onto an eSpeak speed value (words per minute).
	static const int rate_table[21] = {80,100,115,124,133,142,151,159,168,174,180,
			187,196,208,220,240,270,300,335,369,390 };

	int setting = new_rate + master_rate;

	if(setting > 10)
		setting = 10;
	else if(setting < -10)
		setting = -10;

	return rate_table[setting + 10];
}  // end of ConvertRate
static int ConvertPitch(int pitch)
{//===============================
	// Map a SAPI pitch adjustment (-20 .. +20) onto an eSpeak pitch (0 .. 99).
	static const int pitch_table[41] =
		{0, 0, 0, 0, 0, 0, 0, 0, 4, 8,12,16,20,24,28,32,36,40,44,47,50,
		54,58,62,66,70,74,78,82,84,88,92,96,99,99,99,99,99,99,99,99};
//		{0,3,5,8,10,13,15,18,20,23,25,28,30,33,35,38,40,43,45,48,50,
//		53,55,58,60,63,65,68,70,73,75,78,80,83,85,88,90,93,95,97,99};

	int setting = pitch;

	if(setting > 20)
		setting = 20;
	else if(setting < -20)
		setting = -20;

	return pitch_table[setting + 20];
}
static int ConvertRange(int range)
{//===============================
	// Map a SAPI pitch-range adjustment (-10 .. +10) onto an eSpeak
	// inflection value (table entry halved: 8 .. 75, default 50).
	static const int range_table[21] = {16,28,39,49,58,66,74,81,88,94,100,105,110,115,120,125,130,135,140,145,150};

	int setting = range;

	if(setting > 10)
		setting = 10;
	else if(setting < -10)
		setting = -10;

	return range_table[setting + 10] / 2;
}
// ATL final construction: initialise members and record `this` so that the
// plain-function SynthCallback() can reach the engine object.
HRESULT CTTSEngObj::FinalConstruct()
{//=================================
SPDBG_FUNC( "CTTSEngObj::FinalConstruct" );
HRESULT hr = S_OK;
#ifdef LOG_DEBUG
// optional debug log, appended to on every construction
f_log2=fopen("C:\\log_espeak","a");
if(f_log2) fprintf(f_log2,"\n****\n");
#endif
//--- Init vars
m_hVoiceData = NULL;
m_pVoiceData = NULL;
m_pWordList = NULL;
m_ulNumWords = 0;
// remember this instance for SynthCallback()
m_EngObj = this;
return hr;
} /* CTTSEngObj::FinalConstruct */
// ATL final release: free the word list, close the debug log, and release
// the (presumably memory-mapped) voice data — unmap the view, then close
// the file-mapping handle.
void CTTSEngObj::FinalRelease()
{//============================
SPDBG_FUNC( "CTTSEngObj::FinalRelease" );
delete m_pWordList;
#ifdef LOG_DEBUG
if(f_log2!=NULL) fclose(f_log2);
#endif
if( m_pVoiceData )
{
::UnmapViewOfFile( (void*)m_pVoiceData );
}
if( m_hVoiceData )
{
::CloseHandle( m_hVoiceData );
}
} /* CTTSEngObj::FinalRelease */
//
//=== ISpObjectWithToken Implementation ======================================
//
void WcharToChar(char *out, const wchar_t *in, int len)
{//====================================================
int ix;
for(ix=0; ixGetStringValue( L"VoiceName", &voicename);
if( SUCCEEDED(hr2) )
{
WcharToChar(voice_name,voicename,sizeof(voice_name));
}
hr2 = m_cpToken->GetStringValue( L"Path", &path);
if( SUCCEEDED(hr2) )
{
len = wcslen(path)+1;
path_install = (char *)malloc(len);
WcharToChar(path_install,path,len);
}
}
gVolume = 100;
gSpeed = -1;
gPitch = -1;
gRange = -1;
gEmphasis = 0;
gSayas = 0;
if(initialised==0)
{
espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,100,path_install,1);
espeak_SetSynthCallback(SynthCallback);
initialised = 1;
// g_voice_name[0] = 0;
}
strcpy(g_voice_name, voice_name);
espeak_SetVoiceByName(g_voice_name);
return hr;
} /* CTTSEngObj::SetObjectToken */
//
//=== ISpTTSEngine Implementation ============================================
//
// combine two characters into an integer: first character in the high byte
// (used to match espeak_VOICE language codes, e.g. L('e','n'))
#define L(c1,c2) (c1<<8)+c2 // combine two characters into an integer
// eSpeak phoneme mnemonics for English, indexed by SAPI phoneme id
// (SPPHONEID), as used by WritePhonemes().  NULL entries have no
// translation; "'" and "," are the primary/secondary stress marks and
// index 3 is the word space.  The list is NULL-terminated.
static char *phoneme_names_en[] = {
NULL,NULL,NULL," ",NULL,NULL,NULL,NULL,"'",",",
"A:","a","V","0","aU","@","aI",
"b","tS","d","D","E","3:","eI",
"f","g","h","I","i:","dZ","k",
"l","m","n","N","oU","OI","p",
"r","s","S","t","T","U","u:",
"v","w","j","z","Z",
NULL
};
int CTTSEngObj::WritePhonemes(SPPHONEID *phons, wchar_t *pW)
{//=========================================================
// Translate a SAPI phoneme-id string into eSpeak "[[...]]" phoneme input.
// Returns the number of characters produced (0 if there is no phoneme
// table for the current voice's language).  If pW is not NULL the result,
// including the terminating 0, is also copied there as wide characters.
	int ph;
	int ix=2;              // write position in phbuf, after the "[["
	int skip=0;            // set when the following phoneme was consumed as a stress mark
	int maxph = 49;        // highest valid index into the phoneme table
	char *p;
	int j;
	int lang;
	char **phoneme_names;
	char phbuf[200];
	espeak_VOICE *voice;

	// select the phoneme table for the current voice's language
	voice = espeak_GetCurrentVoice();
	lang = (voice->languages[1] << 8) + (voice->languages[2]);
	phoneme_names = phoneme_names_en;
	maxph = 0;

	if(lang == L('e','n'))
	{
		phoneme_names = phoneme_names_en;
		maxph = 49;
	}
	if(maxph == 0)
		return(0);    // no table for this language

	strcpy(phbuf,"[[");

	// leave room for: stress mark + a 2-char phoneme name + "]]" + the 0
	// (the previous "- 3" limit allowed the final "]]" to overflow phbuf)
	while(((ph = *phons++) != 0) && (ix < (int)(sizeof(phbuf) - 6)))
	{
		if(skip)
		{
			skip = 0;
			continue;
		}
		if(ph > maxph)
			continue;    // phoneme id outside the table

		// look at the phoneme after this one
		// (bounds check added: the lookahead id may exceed the table)
		p = (phons[0] <= maxph) ? phoneme_names[phons[0]] : NULL;
		if(p != NULL)
		{
			if(p[0] == '\'')
			{
				phbuf[ix++] = '\'';   // primary stress, put before the vowel, not after
				skip=1;
			}
			if(p[0] == ',')
			{
				phbuf[ix++] = ',';    // secondary stress
				skip=1;
			}
		}

		p = phoneme_names[ph];   // look at this phoneme
		if(p != NULL)
		{
			strcpy(&phbuf[ix],p);
			ix += strlen(p);
		}
	}
	strcpy(&phbuf[ix],"]]");
	ix += 2;

	if(pW != NULL)
	{
		// copy to the wide output, including the terminating 0
		for(j=0; j<=ix; j++)
		{
			pW[j] = phbuf[j];
		}
	}
	return(strlen(phbuf));
}
int CTTSEngObj::ProcessFragList(const SPVTEXTFRAG* pTextFragList, wchar_t *pW_start, ISpTTSEngineSite* pOutputSite, int *n_text)
{//============================================================================================================================
// Walk the SAPI fragment list and build the text to give to eSpeak,
// prefixing each fragment with embedded commands for any prosody settings
// which have changed.  Called twice per Speak() request:
//   pW_start == NULL : measuring pass, returns the required length (wchars)
//   pW_start != NULL : writing pass, fills the buffer
// *n_text receives the number of spoken-text fragments.
// NOTE(review): several spans in this function (the command-copy loops,
// the fragment-offset recording, the TEST_INPUT block header and the
// SPVA_Pronounce label) were lost to angle-bracket stripping during
// extraction and have been reconstructed — verify against the original.
	int action;
	int control;
	wchar_t *pW;
	const SPVSTATE *state;
	unsigned int ix;
	unsigned int len;
	unsigned int total=0;
	char cmdbuf[50];
	wchar_t markbuf[32];

	int speed;
	int volume;
	int pitch;
	int range;
	int emphasis;
	int sayas;

	unsigned int text_offset = 0;

	frag_count = 0;
	frag_ix = 0;
	pW = pW_start;

	// check that the current voice is correct for this request
	if(strcmp(voice_name, g_voice_name) != 0)
	{
		strcpy(g_voice_name, voice_name);
		espeak_SetVoiceByName(g_voice_name);
	}

	while(pTextFragList != NULL)
	{
		action = pTextFragList->State.eAction;
		control = pOutputSite->GetActions();
		len = pTextFragList->ulTextLen;

		if(control & SPVES_ABORT)
			break;

		CheckActions(pOutputSite);
		sayas = 0;
		state = &pTextFragList->State;

		switch(action)
		{
		case SPVA_SpellOut:
			sayas = 0x12;   // SAYAS_CHARS;  // drop through to SPVA_Speak
		case SPVA_Speak:
			text_offset = pTextFragList->ulTextSrcOffset;
			audio_offset = audio_latest;
#ifdef deleted
			// attempt to recognise when JAWS is spelling, it doesn't use SPVA_SpellOut
			if((pW != NULL) && (*n_text == 1) && ((len == 1) || ((len==2) && (pTextFragList->pTextStart[1]==' '))))
			{
				// A single text fragment with one character. Speak as a character, not a word
				sayas = 0x11;
				gSayas = 0;
			}
#endif
			// grow the fragment-offset array in chunks of 500
			if(frag_count >= n_frag_offsets)
			{
				if((frag_offsets = (FRAG_OFFSET *)realloc(frag_offsets,sizeof(FRAG_OFFSET)*(frag_count+500))) != NULL)
				{
					n_frag_offsets = frag_count+500;
				}
			}

			// first set the volume, rate, pitch
			volume = (state->Volume * master_volume)/100;
			speed = ConvertRate(state->RateAdj);
			pitch = ConvertPitch(state->PitchAdj.MiddleAdj);
			range = ConvertRange(state->PitchAdj.RangeAdj);
			emphasis = state->EmphAdj;
			if(emphasis != 0)
				emphasis = 3;

			// build embedded commands for any settings which have changed
			len = 0;
			if(volume != gVolume)
			{
				sprintf(&cmdbuf[len],"%c%dA",CTRL_EMBEDDED,volume);
				len += strlen(&cmdbuf[len]);
			}
			if(speed != gSpeed)
			{
				sprintf(&cmdbuf[len],"%c%dS",CTRL_EMBEDDED,speed);
				len += strlen(&cmdbuf[len]);
			}
			if(pitch != gPitch)
			{
				sprintf(&cmdbuf[len],"%c%dP",CTRL_EMBEDDED,pitch);
				len += strlen(&cmdbuf[len]);
			}
			if(range != gRange)
			{
				sprintf(&cmdbuf[len],"%c%dR",CTRL_EMBEDDED,range);
				len += strlen(&cmdbuf[len]);
			}
			if(emphasis != gEmphasis)
			{
				sprintf(&cmdbuf[len],"%c%dF",CTRL_EMBEDDED,emphasis);
				len += strlen(&cmdbuf[len]);
			}
			if(sayas != gSayas)
			{
				sprintf(&cmdbuf[len],"%c%dY",CTRL_EMBEDDED,sayas);
				len += strlen(&cmdbuf[len]);
			}

			gVolume = volume;
			gSpeed = speed;
			gPitch = pitch;
			gRange = range;
			gEmphasis = emphasis;
			gSayas = sayas;

			total += (len + pTextFragList->ulTextLen);

			if(pTextFragList->ulTextLen > 0)
			{
				total++;   // allow for the separating space
			}

			if(pW != NULL)
			{
				// copy the embedded commands
				for(ix=0; ix<len; ix++)
				{
					*pW++ = cmdbuf[ix];
				}

				// record where this fragment lies in the buffer, for the
				// word-boundary events in SynthCallback()
				if(frag_count < n_frag_offsets)   // guard: realloc above may have failed
				{
					frag_offsets[frag_count].textix = text_offset;
					frag_offsets[frag_count].bufix = pW - pW_start;
					frag_offsets[frag_count].cmdlen = len;
				}

#ifdef TEST_INPUT
				{
					FILE *f;
					unsigned int c;
					int n;
					char buf[10];

					if((f = fopen("C:\\espeak_text_log.txt","a")) != NULL)
					{
						for(ix=0; ix<pTextFragList->ulTextLen; ix++)
						{
							c = pTextFragList->pTextStart[ix];
							n = utf8_out(c,buf);
							buf[n] = 0;
							fprintf(f,"%s",buf);
						}
						fprintf(f,"\n");
						fclose(f);
					}
				}
#endif
				// copy the fragment text
				for(ix=0; ix<pTextFragList->ulTextLen; ix++)
				{
					*pW++ = pTextFragList->pTextStart[ix];
				}
				if(pTextFragList->ulTextLen > 0)
				{
					*pW++ = ' ';
				}
			}
			frag_count++;
			break;

		case SPVA_Bookmark:
			total += (2 + pTextFragList->ulTextLen);

			if(pW != NULL)
			{
				int index;
				unsigned int n_mark = pTextFragList->ulTextLen;

				// clamp added: a long bookmark name would overflow markbuf
				if(n_mark >= (sizeof(markbuf)/sizeof(markbuf[0])))
					n_mark = (sizeof(markbuf)/sizeof(markbuf[0])) - 1;

				for(ix=0; ix<n_mark; ix++)
				{
					markbuf[ix] = (char )pTextFragList->pTextStart[ix];
				}
				markbuf[ix] = 0;

				if((index = AddNameData((const char *)markbuf,1)) >= 0)
				{
					// embed a mark command which refers to the stored name
					sprintf(cmdbuf,"%c%dM",CTRL_EMBEDDED,index);
					len = strlen(cmdbuf);
					for(ix=0; ix<len; ix++)
					{
						*pW++ = cmdbuf[ix];
					}
				}
			}
			break;

		case SPVA_Pronounce:
			total += WritePhonemes(pTextFragList->pPhoneIds, pW);
			if(pW != NULL)
			{
				// NOTE(review): advances by the running total rather than by
				// this fragment's length — looks suspicious, but it is what
				// the visible original did; verify before changing
				pW += total;
			}
			break;
		}
		pTextFragList = pTextFragList->pNext;
	}

	if(pW != NULL)
	{
		*pW = 0;
	}
	*n_text = frag_count;
	return(total);
}  // end of ProcessFragList
/*****************************************************************************
* CTTSEngObj::Speak *
*-------------------*
* Description:
* This is the primary method that SAPI calls to render text.
*-----------------------------------------------------------------------------
* Input Parameters
*
* pUser
* Pointer to the current user profile object. This object contains
* information like what languages are being used and this object
* also gives access to resources like the SAPI master lexicon object.
*
* dwSpeakFlags
* This is a set of flags used to control the behavior of the
* SAPI voice object and the associated engine.
*
* VoiceFmtIndex
* Zero based index specifying the output format that should
* be used during rendering.
*
* pTextFragList
* A linked list of text fragments to be rendered. There is
* one fragement per XML state change. If the input text does
* not contain any XML markup, there will only be a single fragment.
*
* pOutputSite
* The interface back to SAPI where all output audio samples and events are written.
*
* Return Values
* S_OK - This should be returned after successful rendering or if
* rendering was interrupted because *pfContinue changed to FALSE.
* E_INVALIDARG
* E_OUTOFMEMORY
*
*****************************************************************************/
STDMETHODIMP CTTSEngObj::Speak( DWORD dwSpeakFlags,
				REFGUID rguidFormatId,
				const WAVEFORMATEX * pWaveFormatEx,
				const SPVTEXTFRAG* pTextFragList,
				ISpTTSEngineSite* pOutputSite )
{
// Primary SAPI entry point: measure, build and synthesize the fragment list.
	SPDBG_FUNC( "CTTSEngObj::Speak" );
	HRESULT hr = S_OK;
	unsigned int size;

	int xVolume;
	int xSpeed;
	int xPitch;
	int xRange;
	int xEmphasis;
	int xSayas;
	int punctuation;
	int n_text_frag=0;

	//--- Check args
	if( SP_IS_BAD_INTERFACE_PTR( pOutputSite ) ||
		SP_IS_BAD_READ_PTR( pTextFragList )  )
	{
		hr = E_INVALIDARG;
	}
	else
	{
		InitNamedata();

		//--- Init some vars
		m_pCurrFrag = pTextFragList;
		m_pNextChar = m_pCurrFrag->pTextStart;
		m_pEndChar = m_pNextChar + m_pCurrFrag->ulTextLen;
		m_ullAudioOff = 0;

		m_OutputSite = pOutputSite;
		pOutputSite->GetEventInterest(&event_interest);

		// save the cached prosody settings: the measuring pass below must see
		// the same "previous" values as the writing pass, so restore them after
		xVolume = gVolume;
		xSpeed = gSpeed;
		xPitch = gPitch;
		xRange = gRange;
		xEmphasis = gEmphasis;
		xSayas = gSayas;

		// find the size of the text buffer needed for this Speak() request
		size = ProcessFragList(pTextFragList,NULL,pOutputSite,&n_text_frag);

		gVolume = xVolume;
		gSpeed = xSpeed;
		gPitch = xPitch;
		gRange = xRange;
		gEmphasis = xEmphasis;
		gSayas = xSayas;

		punctuation = 0;
		if(dwSpeakFlags & SPF_NLP_SPEAK_PUNC)
			punctuation = 1;
		espeak_SetParameter(espeakPUNCTUATION,punctuation,0);

		size = (size + 50)*sizeof(wchar_t);
		if(size > gBufSize)
		{
			size += 1000;  // some extra so we don't need to realloc() again too often

			// use a temporary so the old buffer isn't leaked if realloc() fails
			// (previously: TextBuf was overwritten with NULL and `1` was
			// returned, which is not a failure HRESULT)
			wchar_t *new_buf = (wchar_t *)realloc(TextBuf,size);
			if(new_buf == NULL)
			{
				return E_OUTOFMEMORY;   // old TextBuf/gBufSize remain valid
			}
			TextBuf = new_buf;
			gBufSize = size;
		}

		audio_latest = 0;
		prev_phoneme = 0;
		prev_phoneme_time = 0;
		prev_phoneme_position = 0;

		// second pass: fill the buffer, then synthesize it
		size = ProcessFragList(pTextFragList,TextBuf,pOutputSite,&n_text_frag);
		if(size > 0)
		{
			espeak_Synth(TextBuf,0,0,POS_CHARACTER,0,espeakCHARS_WCHAR | espeakKEEP_NAMEDATA | espeakPHONEMES,NULL,NULL);
		}
	}
	return hr;
} /* CTTSEngObj::Speak */
HRESULT CTTSEngObj::CheckActions( ISpTTSEngineSite* pOutputSite )
{//==============================================================
	// Poll SAPI for real-time volume/rate changes and update the
	// master settings used when building embedded commands.
	const int control = pOutputSite->GetActions();

	if(control & SPVES_VOLUME)
	{
		USHORT volume;

		if(pOutputSite->GetVolume(&volume) == S_OK)
			master_volume = volume;
	}

	if(control & SPVES_RATE)
	{
		long rate;

		if(pOutputSite->GetRate(&rate) == S_OK)
			master_rate = rate;
	}

	return(S_OK);
}  // end of CTTSEngObj::CheckActions
STDMETHODIMP CTTSEngObj::GetOutputFormat( const GUID * pTargetFormatId, const WAVEFORMATEX * pTargetWaveFormatEx,
					GUID * pDesiredFormatId, WAVEFORMATEX ** ppCoMemDesiredWaveFormatEx )
{//========================================================================
	SPDBG_FUNC( "CTTSEngObj::GetVoiceFormat" );

	// Report 16-bit mono output: 16kHz for an mbrola voice, otherwise 22.05kHz.
	// Also remember the sample rate (in Hz/50) for audio-offset calculations.
	enum SPSTREAMFORMAT sample_rate;

	if(espeak_GetParameter(espeakVOICETYPE,1) == 1)
	{
		sample_rate = SPSF_16kHz16BitMono;   // an mbrola voice
		srate = 320;
	}
	else
	{
		sample_rate = SPSF_22kHz16BitMono;
		srate = 441;
	}

	return SpConvertStreamFormatEnum(sample_rate, pDesiredFormatId, ppCoMemDesiredWaveFormatEx);
} /* CTTSEngObj::GetVoiceFormat */
int FAR PASCAL CompileDictionary(const char *voice, const char *path_log)
{//===========================================================
// Compile the pronunciation dictionary for `voice`, writing compiler
// messages to the file path_log.  Returns 0 on success, 1 on error.
	FILE *f_log3;
	char fname[120];

	// guard added: sprintf below would be passed NULL or overflow fname
	if((path_install == NULL) || (strlen(path_install) >= (sizeof(fname)-2)))
		return(1);

	// a NULL log stream is acceptable: eSpeak then reports to stderr
	f_log3 = fopen(path_log,"w");

	sprintf(fname,"%s/",path_install);
	espeak_SetVoiceByName(voice);
	espeak_CompileDictionary(fname,f_log3,0);

	if(f_log3 != NULL)   // was unconditional: fclose(NULL) crashes when fopen failed
		fclose(f_log3);
	return(0);
}