/* * Copyright (C) 2005 to 2013 by Jonathan Duddington * email: jonsd@users.sourceforge.net * Copyright (C) 2015-2016 Reece H. Dunn * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see: . */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include "speech.h" #include "phoneme.h" #include "synthesize.h" #include "translate.h" #include "voice.h" #ifdef INCLUDE_MBROLA extern int Read4Bytes(FILE *f); extern void SetPitch2(voice_t *voice, int pitch1, int pitch2, int *pitch_base, int *pitch_range); extern unsigned char *outbuf; #if defined(_WIN32) || defined(_WIN64) #include #endif #include "mbrowrap.h" static MBROLA_TAB *mbrola_tab = NULL; static int mbrola_control = 0; static int mbr_name_prefix = 0; espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate) { // Load a phoneme name translation table from espeak-data/mbrola int size; int ix; int *pw; FILE *f_in; char path[sizeof(path_home)+15]; mbrola_name[0] = 0; mbrola_delay = 0; mbr_name_prefix = 0; if (mbrola_voice == NULL) { samplerate = samplerate_native; SetParameter(espeakVOICETYPE, 0, 0); return ENS_OK; } if (!load_MBR()) return ENS_MBROLA_NOT_FOUND; sprintf(path, "%s/mbrola/%s", path_home, mbrola_voice); #ifdef PLATFORM_POSIX // if not found, then also look in // usr/share/mbrola/xx, /usr/share/mbrola/xx/xx, /usr/share/mbrola/voices/xx if (GetFileLength(path) <= 0) { sprintf(path, "/usr/share/mbrola/%s", mbrola_voice); if (GetFileLength(path) <= 0) { sprintf(path, "/usr/share/mbrola/%s/%s", mbrola_voice, mbrola_voice); if (GetFileLength(path) <= 0) sprintf(path, "/usr/share/mbrola/voices/%s", mbrola_voice); } } close_MBR(); #endif if (init_MBR(path) != 0) // initialise the required mbrola voice return ENS_MBROLA_VOICE_NOT_FOUND; setNoError_MBR(1); // don't stop on phoneme errors // read eSpeak's mbrola phoneme translation data, eg. en1_phtrans sprintf(path, "%s/mbrola_ph/%s", path_home, phtrans); size = GetFileLength(path); if ((f_in = fopen(path, "rb")) == NULL) { int error = errno; close_MBR(); return error; } MBROLA_TAB *new_mbrola_tab = (MBROLA_TAB *)realloc(mbrola_tab, size); if (new_mbrola_tab == NULL) { fclose(f_in); close_MBR(); return ENOMEM; } mbrola_tab = new_mbrola_tab; mbrola_control = Read4Bytes(f_in); pw = (int *)mbrola_tab; for (ix = 4; ix < size; ix += 4) *pw++ = Read4Bytes(f_in); fclose(f_in); setVolumeRatio_MBR((float)(mbrola_control & 0xff) /16.0f); samplerate = *srate = getFreq_MBR(); if (*srate == 22050) SetParameter(espeakVOICETYPE, 0, 0); else SetParameter(espeakVOICETYPE, 1, 0); strcpy(mbrola_name, mbrola_voice); mbrola_delay = 1000; // improve synchronization of events return ENS_OK; } static int GetMbrName(PHONEME_LIST *plist, PHONEME_TAB *ph, PHONEME_TAB *ph_prev, PHONEME_TAB *ph_next, int *name2, int *split, int *control) { // Look up a phoneme in the mbrola phoneme name translation table // It may give none, 1, or 2 mbrola phonemes MBROLA_TAB *pr; PHONEME_TAB *other_ph; int found = 0; static int mnem; // control // bit 0 skip the next phoneme // bit 1 match this and Previous phoneme // bit 2 only at the start of a word // bit 3 don't match two phonemes across a word boundary // bit 4 add this phoneme name as a prefix to the next phoneme name (used for de4 phoneme prefix '?') // bit 5 only in stressed syllable // bit 6 only at the end of a word *name2 = 0; *split = 0; *control = 0; mnem = ph->mnemonic; pr = mbrola_tab; while (pr->name != 0) { if (mnem == pr->name) { if (pr->next_phoneme == 0) found = 1; else if ((pr->next_phoneme == ':') && (plist->synthflags & SFLAG_LENGTHEN)) found = 1; else { if (pr->control & 2) other_ph = ph_prev; else if ((pr->control & 8) && ((plist+1)->newword)) other_ph = phoneme_tab[phPAUSE]; // don't match the next phoneme over a word boundary else other_ph = ph_next; if ((pr->next_phoneme == other_ph->mnemonic) || ((pr->next_phoneme == 2) && (other_ph->type == phVOWEL)) || ((pr->next_phoneme == '_') && (other_ph->type == phPAUSE))) found = 1; } if ((pr->control & 4) && (plist->newword == 0)) // only at start of word found = 0; if ((pr->control & 0x40) && (plist[1].newword == 0)) // only at the end of a word found = 0; if ((pr->control & 0x20) && (plist->stresslevel < plist->wordstress)) found = 0; // only in stressed syllables if (found) { *name2 = pr->mbr_name2; *split = pr->percent; *control = pr->control; if (pr->control & 0x10) { mbr_name_prefix = pr->mbr_name; return 0; } mnem = pr->mbr_name; break; } } pr++; } if (mbr_name_prefix != 0) mnem = (mnem << 8) | (mbr_name_prefix & 0xff); mbr_name_prefix = 0; return mnem; } static char *WritePitch(int env, int pitch1, int pitch2, int split, int final) { // final=1: only give the final pitch value. int x; int ix; int pitch_base; int pitch_range; int p1, p2, p_end; unsigned char *pitch_env; int max = -1; int min = 999; int y_max = 0; int y_min = 0; int env100 = 80; // apply the pitch change only over this proportion of the mbrola phoneme(s) int y2; int y[4]; int env_split; char buf[50]; static char output[50]; output[0] = 0; pitch_env = envelope_data[env]; SetPitch2(voice, pitch1, pitch2, &pitch_base, &pitch_range); env_split = (split * 128)/100; if (env_split < 0) env_split = 0-env_split; // find max and min in the pitch envelope for (x = 0; x < 128; x++) { if (pitch_env[x] > max) { max = pitch_env[x]; y_max = x; } if (pitch_env[x] < min) { min = pitch_env[x]; y_min = x; } } // set an additional pitch point half way through the phoneme. // but look for a maximum or a minimum and use that instead y[2] = 64; if ((y_max > 0) && (y_max < 127)) y[2] = y_max; if ((y_min > 0) && (y_min < 127)) y[2] = y_min; y[1] = y[2] / 2; y[3] = y[2] + (127 - y[2])/2; // set initial pitch p1 = ((pitch_env[0]*pitch_range)>>8) + pitch_base; // Hz << 12 p_end = ((pitch_env[127]*pitch_range)>>8) + pitch_base; if (split >= 0) { sprintf(buf, " 0 %d", p1/4096); strcat(output, buf); } // don't use intermediate pitch points for linear rise and fall if (env > 1) { for (ix = 1; ix < 4; ix++) { p2 = ((pitch_env[y[ix]]*pitch_range)>>8) + pitch_base; if (split > 0) y2 = (y[ix] * env100)/env_split; else if (split < 0) y2 = ((y[ix]-env_split) * env100)/env_split; else y2 = (y[ix] * env100)/128; if ((y2 > 0) && (y2 <= env100)) { sprintf(buf, " %d %d", y2, p2/4096); strcat(output, buf); } } } p_end = p_end/4096; if (split <= 0) { sprintf(buf, " %d %d", env100, p_end); strcat(output, buf); } if (env100 < 100) { sprintf(buf, " %d %d", 100, p_end); strcat(output, buf); } strcat(output, "\n"); if (final) sprintf(output, "\t100 %d\n", p_end); return output; } int MbrolaTranslate(PHONEME_LIST *plist, int n_phonemes, int resume, FILE *f_mbrola) { // Generate a mbrola pho file unsigned int name; int len; int len1; PHONEME_TAB *ph; PHONEME_TAB *ph_next; PHONEME_TAB *ph_prev; PHONEME_LIST *p; PHONEME_LIST *next; PHONEME_DATA phdata; FMT_PARAMS fmtp; int pause = 0; int released; int name2; int control; int done; int len_percent; const char *final_pitch; char *ptr; char mbr_buf[120]; static int phix; static int embedded_ix; static int word_count; if (!resume) { phix = 1; embedded_ix = 0; word_count = 0; } while (phix < n_phonemes) { if (WcmdqFree() < MIN_WCMDQ) return 1; ptr = mbr_buf; p = &plist[phix]; next = &plist[phix+1]; ph = p->ph; ph_prev = plist[phix-1].ph; ph_next = plist[phix+1].ph; if (p->synthflags & SFLAG_EMBEDDED) DoEmbedded(&embedded_ix, p->sourceix); if (p->newword & 4) DoMarker(espeakEVENT_SENTENCE, (p->sourceix & 0x7ff) + clause_start_char, 0, count_sentences); if (p->newword & 1) DoMarker(espeakEVENT_WORD, (p->sourceix & 0x7ff) + clause_start_char, p->sourceix >> 11, clause_start_word + word_count++); name = GetMbrName(p, ph, ph_prev, ph_next, &name2, &len_percent, &control); if (control & 1) phix++; if (name == 0) { phix++; continue; // ignore this phoneme } if ((ph->type == phPAUSE) && (name == ph->mnemonic)) { // a pause phoneme, which has not been changed by the translation name = '_'; len = (p->length * speed.pause_factor)/256; if (len == 0) len = 1; } else len = (80 * speed.wav_factor)/256; if (ph->code != phonEND_WORD) { char phoneme_name[16]; WritePhMnemonic(phoneme_name, p->ph, p, option_phoneme_events & espeakINITIALIZE_PHONEME_IPA, NULL); DoPhonemeMarker(espeakEVENT_PHONEME, (p->sourceix & 0x7ff) + clause_start_char, 0, phoneme_name); } ptr += sprintf(ptr, "%s\t", WordToString(name)); if (name2 == '_') { // add a pause after this phoneme pause = len_percent; name2 = 0; } done = 0; final_pitch = ""; switch (ph->type) { case phVOWEL: len = ph->std_length; if (p->synthflags & SFLAG_LENGTHEN) len += phoneme_tab[phonLENGTHEN]->std_length; // phoneme was followed by an extra : symbol if (ph_next->type == phPAUSE) len += 50; // lengthen vowels before a pause len = (len * p->length)/256; if (name2 == 0) { char *pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 0); ptr += sprintf(ptr, "%d\t%s", len, pitch); } else { char *pitch; pitch = WritePitch(p->env, p->pitch1, p->pitch2, len_percent, 0); len1 = (len * len_percent)/100; ptr += sprintf(ptr, "%d\t%s", len1, pitch); pitch = WritePitch(p->env, p->pitch1, p->pitch2, -len_percent, 0); ptr += sprintf(ptr, "%s\t%d\t%s", WordToString(name2), len-len1, pitch); } done = 1; break; case phSTOP: released = 0; if (next->type == phVOWEL) released = 1; if (next->type == phLIQUID && !next->newword) released = 1; if (released == 0) p->synthflags |= SFLAG_NEXT_PAUSE; InterpretPhoneme(NULL, 0, p, &phdata, NULL); len = DoSample3(&phdata, 0, -1); len = (len * 1000)/samplerate; // convert to mS len += PauseLength(p->prepause, 1); break; case phVSTOP: len = (80 * speed.wav_factor)/256; break; case phFRICATIVE: len = 0; InterpretPhoneme(NULL, 0, p, &phdata, NULL); if (p->synthflags & SFLAG_LENGTHEN) len = DoSample3(&phdata, p->length, -1); // play it twice for [s:] etc. len += DoSample3(&phdata, p->length, -1); len = (len * 1000)/samplerate; // convert to mS break; case phNASAL: if (next->type != phVOWEL) { memset(&fmtp, 0, sizeof(fmtp)); InterpretPhoneme(NULL, 0, p, &phdata, NULL); fmtp.fmt_addr = phdata.sound_addr[pd_FMT]; len = DoSpect2(p->ph, 0, &fmtp, p, -1); len = (len * 1000)/samplerate; if (next->type == phPAUSE) len += 50; final_pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 1); } break; case phLIQUID: if (next->type == phPAUSE) { len += 50; final_pitch = WritePitch(p->env, p->pitch1, p->pitch2, 0, 1); } break; } if (!done) { if (name2 != 0) { len1 = (len * len_percent)/100; ptr += sprintf(ptr, "%d\n%s\t", len1, WordToString(name2)); len -= len1; } ptr += sprintf(ptr, "%d%s\n", len, final_pitch); } if (pause) { len += PauseLength(pause, 0); ptr += sprintf(ptr, "_ \t%d\n", PauseLength(pause, 0)); pause = 0; } if (f_mbrola) fwrite(mbr_buf, 1, (ptr-mbr_buf), f_mbrola); // write .pho to a file else { int res = write_MBR(mbr_buf); if (res < 0) return 0; // don't get stuck on error if (res == 0) return 1; wcmdq[wcmdq_tail][0] = WCMD_MBROLA_DATA; wcmdq[wcmdq_tail][1] = len; WcmdqInc(); } phix++; } if (!f_mbrola) { flush_MBR(); // flush the mbrola output buffer wcmdq[wcmdq_tail][0] = WCMD_MBROLA_DATA; wcmdq[wcmdq_tail][1] = 500; WcmdqInc(); } return 0; } int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, int resume) { FILE *f_mbrola = NULL; if (*n_ph == 0) return 0; if (option_phonemes & espeakPHONEMES_MBROLA) { // send mbrola data to a file, not to the mbrola library f_mbrola = f_trans; } int again = MbrolaTranslate(phoneme_list, *n_ph, resume, f_mbrola); if (!again) *n_ph = 0; return again; } int MbrolaFill(int length, int resume, int amplitude) { // Read audio data from Mbrola (length is in millisecs) static int n_samples; int req_samples, result; int ix; short value16; int value; if (!resume) n_samples = samplerate * length / 1000; req_samples = (out_end - out_ptr)/2; if (req_samples > n_samples) req_samples = n_samples; result = read_MBR((short *)out_ptr, req_samples); if (result <= 0) return 0; for (ix = 0; ix < result; ix++) { value16 = out_ptr[0] + (out_ptr[1] << 8); value = value16 * amplitude; value = value / 40; // adjust this constant to give a suitable amplitude for mbrola voices if (value > 0x7fff) value = 0x7fff; if (value < -0x8000) value = 0x8000; out_ptr[0] = value; out_ptr[1] = value >> 8; out_ptr += 2; } n_samples -= result; return n_samples ? 1 : 0; } void MbrolaReset(void) { // Reset the Mbrola engine and flush the pending audio reset_MBR(); } #else // mbrola interface is not compiled, provide dummy functions. espeak_ng_STATUS LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int *srate) { return ENS_NOT_SUPPORTED; } int MbrolaGenerate(PHONEME_LIST *phoneme_list, int *n_ph, int resume) { return 0; } int MbrolaFill(int length, int resume, int amplitude) { return 0; } void MbrolaReset(void) { } #endif