123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616 |
- /*
- * Copyright (C) 2008 Google Inc.
- * Copyright (C) 2012 Reece H. Dunn
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
- * This file contains the TtsEngine implementation for the eSpeak
- * Text-to-Speech engine.
- *
- * Android Version: 2.2 (Froyo)
- * API Version: 8
- */
-
-
- #include <stdio.h>
- #include <unistd.h>
- #include <stdlib.h>
-
- #define LOG_TAG "eSpeak Engine"
- #define DEBUG true
-
- #include <speak_lib.h>
- #include <TtsEngine.h>
- #include <Log.h>
-
/*
 * This is the Manager layer. It sits on top of the native eSpeak engine
 * and provides the interface to the defined Google TTS engine API.
 * The Google engine API is the boundary that allows a TTS engine to be swapped.
 * The Manager layer also provides the SSML tag interpretation.
 * The supported SSML tags are mapped to corresponding tags natively supported by eSpeak.
 * Native eSpeak functions always begin with espeak_XXX.
 *
 * Only a subset of SSML 1.0 tags is supported.
 * Some SSML tags involve significant complexity.
 * If the language is changed through an SSML tag, there is a latency for the load.
 */
-
- using namespace android;
-
// Name of the directory, below the configured data path, that holds the eSpeak data files.
const char *ESPEAK_DIRECTORY = "espeak-data";

// Files (relative to ESPEAK_DIRECTORY) that must be present before eSpeak is initialized.
const char *eSpeakBaseResources[] = {
    "intonations",
    "phondata",
    "phonindex",
    "phontab",
    "en_dict",
    "voices/en/en-us"
};

// Derived from the array so the count cannot drift when an entry is added or removed.
const int NUM_BASE_RESOURCES =
        static_cast<int>(sizeof(eSpeakBaseResources) / sizeof(eSpeakBaseResources[0]));
-
// Table of voices this wrapper knows about.
// Format is {espeak voice, iso3 code, name}.
// Lookups scan the table from the top and stop at the first match, so row order matters:
// a plain language row must come before its country/variant rows.
const char *eSpeakSupportedVoices[][3] = {
    {"en-us",  "eng",        "English"},
    {"en-us",  "eng-USA",    "English (US)"},
    {"en",     "eng-GBR",    "English (UK)"},
    {"en-sc",  "eng-GBR-sc", "English (Scottish)"},
    {"en-n",   "eng-GBR-n",  "English (Northern UK)"},
    {"en-rp",  "eng-GBR-rp", "English (Received Pronunciation)"},
    {"en-wm",  "eng-GBR-wm", "English (West Midlands)"},
    {"af",     "afr",        "Afrikaans"},
    {"bs",     "bos",        "Bosnian"},
    {"ca",     "cat",        "Catalan"},
    {"cs",     "ces",        "Czech"},
    {"da",     "dan",        "Danish"},
    {"de",     "deu",        "German"},
    {"el",     "ell",        "Greek"},
    {"eo",     "epo",        "Esperanto"},
    {"es",     "spa",        "Spanish"},
    {"es-la",  "spa-MEX",    "Spanish (Latin America)"},
    {"fi",     "fin",        "Finnish"},
    {"fr",     "fra",        "French"},
    {"hr",     "hrv",        "Croatian"},
    {"hu",     "hun",        "Hungarian"},
    {"it",     "ita",        "Italian"},
    {"kn",     "kan",        "Kannada"},
    {"ku",     "kur",        "Kurdish"},
    {"lv",     "lav",        "Latvian"},
    {"nl",     "nld",        "Dutch"},
    {"pl",     "pol",        "Polish"},
    {"pt",     "por",        "Portuguese (Brazil)"},
    {"pt",     "por-BRA",    "Portuguese (Brazil)"},
    {"pt-pt",  "por-PRT",    "Portuguese"},
    {"ro",     "ron",        "Romanian"},
    {"sk",     "slk",        "Slovak"},
    {"sr",     "srp",        "Serbian"},
    {"sv",     "swe",        "Swedish"},
    {"sw",     "swa",        "Swahili"},
    {"ta",     "tam",        "Tamil"},
    {"tr",     "tur",        "Turkish"},
    {"zh",     "zho",        "Chinese (Mandarin)"},
    {"cy",     "cym",        "Welsh"},
    {"hi",     "hin",        "Hindi"},
    {"hy",     "hye",        "Armenian"},
    {"id",     "ind",        "Indonesian"},
    {"is",     "isl",        "Icelandic"},
    {"ka",     "kat",        "Georgian"},
    {"la",     "lat",        "Latin"},
    {"mk",     "mkd",        "Macedonian"},
    {"no",     "nor",        "Norwegian"},
    {"ru",     "rus",        "Russian"},
    {"sq",     "sqi",        "Albanian"},
    {"vi",     "vie",        "Vietnamese"},
    {"zh-yue", "zho-HKG",    "Chinese (Cantonese)"},
    {"grc",    "grc",        "Ancient Greek"},
    {"jbo",    "jbo",        "Lojban"},
    {"nci",    "nci",        "Nahuatl (Classical)"},
    {"pap",    "pap",        "Papiamento"}
};

// Derived from the table so the count cannot drift when a voice is added or removed.
const int NUM_SUPPORTED_VOICES =
        static_cast<int>(sizeof(eSpeakSupportedVoices) / sizeof(eSpeakSupportedVoices[0]));
-
- // Callback to the TTS API
- synthDoneCB_t *ttsSynthDoneCBPointer;
-
- char *eSpeakDataPath = NULL;
-
- char currentLanguage[33];
- char currentLang[10];
- char currentCountry[10];
- char currentVariant[10];
-
- int sampleRate = 0;
- bool hasInitialized = false;
-
- /* Functions internal to the eSpeak engine wrapper */
- static void setSpeechRate(int speechRate) {
- espeak_ERROR err = espeak_SetParameter(espeakRATE, speechRate, 0);
- }
-
- /* Functions exposed to the TTS API */
-
- /* Callback from espeak. Should call back to the TTS API */
- static int eSpeakCallback(short *wav, int numsamples, espeak_EVENT *events) {
- LOGI("Callback with %d samples", numsamples);
-
- int8_t * castedWav = (int8_t *) wav;
- size_t bufferSize = 0;
- if (numsamples < 1) {
- int8_t silenceData[] = { 0, 0 };
- size_t silenceBufferSize = sizeof(silenceData)/sizeof(silenceData[0]);
- int8_t *silence = silenceData; // Passing in an empty buffer can cause a crash.
- ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, silence,
- silenceBufferSize, TTS_SYNTH_DONE);
- return 1;
- }
- bufferSize = numsamples * sizeof(short);
- ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, castedWav,
- bufferSize, TTS_SYNTH_PENDING);
- return 0; // continue synthesis (1 is to abort)
- }
-
- static bool fileExists(char *fileName) {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- FILE *file = fopen(fileName, "r");
-
- if (file == NULL) {
- return false;
- } else {
- fclose(file);
- return true;
- }
- }
-
- static bool hasBaseResources() {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- char filename[255];
-
- for (int i = 0; i < NUM_BASE_RESOURCES; i++) {
- sprintf(filename, "%s/%s/%s", eSpeakDataPath, ESPEAK_DIRECTORY, eSpeakBaseResources[i]);
-
- if (!fileExists(filename)) {
- LOGE("Missing resource: %s", filename);
- return false;
- }
- }
-
- return true;
- }
-
- /* Google Engine API function implementations */
-
- tts_result attemptInit() {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- if (hasInitialized) {
- return TTS_SUCCESS;
- }
-
- if (!hasBaseResources()) {
- return TTS_FAILURE;
- }
-
- // TODO Make sure that the speech data is loaded in
- // the directory /sdcard/espeak-data before calling this.
- sampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 4096, eSpeakDataPath, 0);
-
- if (sampleRate <= 0) {
- LOGE("eSpeak initialization failed!");
- return TTS_FAILURE;
- }
-
- espeak_SetSynthCallback(eSpeakCallback);
-
- espeak_VOICE voice;
- memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
- const char *langNativeString = "en-us"; //Default to US English
- voice.languages = langNativeString;
- voice.variant = 0;
- espeak_SetVoiceByProperties(&voice);
-
- strcpy(currentLanguage, "en-us");
- hasInitialized = true;
-
- return TTS_SUCCESS;
- }
-
- /** init
- * Allocates eSpeak memory block and initializes the eSpeak system.
- * synthDoneCBPtr - Pointer to callback function which will receive generated samples
- * config - the engine configuration parameters, not used here
- * return tts_result
- */
- tts_result TtsEngine::init(synthDoneCB_t synthDoneCBPtr, const char *engineConfig) {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- ttsSynthDoneCBPointer = synthDoneCBPtr;
- hasInitialized = false;
-
- if ((engineConfig != NULL) && (strlen(engineConfig) > 0)) {
- eSpeakDataPath = (char *) malloc(strlen(engineConfig));
- strcpy(eSpeakDataPath, engineConfig);
- } else {
- eSpeakDataPath = NULL;
- LOGE("Data path not specified!");
- return TTS_FAILURE;
- }
-
- return attemptInit();
- }
-
- /** shutdown
- * Unloads all eSpeak resources; terminates eSpeak system and frees eSpeak memory block.
- * return tts_result
- */
- tts_result TtsEngine::shutdown(void) {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- if (eSpeakDataPath != NULL) {
- free(eSpeakDataPath);
- }
-
- espeak_Terminate();
-
- return TTS_SUCCESS;
- }
-
- tts_result TtsEngine::loadLanguage(const char *lang, const char *country, const char *variant) {
- if (DEBUG) LOGV("loadLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);
-
- return TTS_FAILURE;
- }
-
- tts_support_result isLanguageSupported(const char *lang, const char *country, const char *variant,
- int *pindex) {
- if (DEBUG) LOGV("isLanguageSupported(\"%s\", \"%s\", \"%s\")", lang, country, variant);
-
- if ((lang == NULL) || (strlen(lang) == 0)) {
- LOGE("TtsEngine::isLanguageAvailable called with no language");
- return TTS_LANG_NOT_SUPPORTED;
- }
-
- if (pindex != NULL) {
- *pindex = -1;
- }
-
- int langIndex = -1;
- int countryIndex = -1;
- int variantIndex = -1;
-
- if (strlen(lang) == 3) {
- for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
- if (strncmp(lang, eSpeakSupportedVoices[i][1], 3) == 0) {
- LOGI("Found ISO3 language at index %d", i);
- langIndex = i;
- break;
- }
- }
- } else if (strlen(lang) == 2) {
- for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
- if (strncmp(lang, eSpeakSupportedVoices[i][0], 2) == 0) {
- LOGI("Found ISO2 language at index %d", i);
- langIndex = i;
- break;
- }
- }
- }
-
- if (langIndex < 0) {
- LOGV("TtsEngine::isLanguageAvailable called with unsupported language");
- return TTS_LANG_NOT_SUPPORTED;
- }
-
- if ((country == NULL) || (strlen(country) == 0)) {
- // TODO: Check whether resources are available for this language.
-
- if (pindex != NULL) {
- *pindex = langIndex;
- }
-
- LOGI("No country specified, language is available");
- return TTS_LANG_AVAILABLE;
- }
-
- char lang_country[10];
- sprintf(lang_country, "%s-%s", lang, country);
-
- // Find country
- if (strlen(country) == 3) {
- for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
- if ((strcmp(lang_country, eSpeakSupportedVoices[i][1]) == 0)) {
- LOGI("Found ISO3 country at index %d", i);
- countryIndex = i;
- break;
- }
- }
- } else if (strlen(country) == 2) {
- for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
- if ((strcmp(lang_country, eSpeakSupportedVoices[i][0]) == 0)) {
- LOGI("Found ISO2 country at index %d", i);
- countryIndex = i;
- break;
- }
- }
- }
-
- if (countryIndex < 0) {
- if (pindex != NULL) {
- *pindex = langIndex;
- }
-
- LOGI("No country found, language is available");
- return TTS_LANG_AVAILABLE;
- }
-
- if ((variant == NULL) || (strlen(variant) == 0)) {
- if (pindex != NULL) {
- *pindex = countryIndex;
- }
-
- LOGI("No variant specified, language and country are available");
- return TTS_LANG_COUNTRY_AVAILABLE;
- }
-
- char lang_country_variant[15];
- sprintf(lang_country_variant, "%s-%s-%s", lang, country, variant);
-
- // Find variant
- for (int i = countryIndex; i < NUM_SUPPORTED_VOICES; i++) {
- if ((strcmp(lang_country_variant, eSpeakSupportedVoices[i][1]) == 0)) {
- LOGI("Found variant at index %d", i);
- variantIndex = i;
- break;
- }
- }
-
- if (variantIndex < 0) {
- if (pindex != NULL) {
- *pindex = countryIndex;
- }
-
- LOGI("No variant found, language and country are available");
- return TTS_LANG_COUNTRY_AVAILABLE;
- }
-
- if (pindex != NULL) {
- *pindex = variantIndex;
- }
-
- LOGI("Language, country, and variant are available");
- return TTS_LANG_COUNTRY_VAR_AVAILABLE;
- }
-
- tts_result TtsEngine::setLanguage(const char *lang, const char *country, const char *variant) {
- if (DEBUG) LOGV("setLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);
-
- // Make sure the engine is initialized!
- attemptInit();
-
- int index = -1;
-
- isLanguageSupported(lang, country, variant, &index);
-
- if (index < 0) {
- LOGE("setLanguage called with unsupported language");
- return TTS_FAILURE;
- }
-
- strcpy(currentLanguage, lang);
- strcpy(currentLang, lang);
- strcpy(currentCountry, country);
- strcpy(currentVariant, variant);
-
- char espeakLangStr[7];
- strcpy(espeakLangStr, eSpeakSupportedVoices[index][0]);
-
- espeak_VOICE voice;
- memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
- voice.variant = 0;
- voice.languages = espeakLangStr;
- espeak_ERROR err = espeak_SetVoiceByProperties(&voice);
-
- if (err != EE_OK) {
- LOGE("Error code %d when setting voice properties!", err);
- return TTS_FAILURE;
- }
-
- return TTS_SUCCESS;
- }
-
- tts_support_result TtsEngine::isLanguageAvailable(const char *lang, const char *country,
- const char *variant) {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- return isLanguageSupported(lang, country, variant, NULL);
- }
-
- tts_result TtsEngine::getLanguage(char *language, char *country, char *variant) {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- strcpy(language, currentLang);
- strcpy(country, currentCountry);
- strcpy(variant, currentVariant);
-
- return TTS_SUCCESS;
- }
-
- /** setAudioFormat
- * sets the audio format to use for synthesis, returns what is actually used.
- * @encoding - reference to encoding format
- * @rate - reference to sample rate
- * @channels - reference to number of channels
- * return tts_result
- * */
- tts_result TtsEngine::setAudioFormat(tts_audio_format& encoding, uint32_t& rate, int& channels) {
- LOGE("setAudioFormat(%d, %d, %d) is unsupported", encoding, rate, channels);
-
- encoding = TTS_AUDIO_FORMAT_PCM_16_BIT;
- rate = sampleRate;
- channels = 1;
- return TTS_SUCCESS;
- }
-
- // Sets the property with the specified value
- tts_result TtsEngine::setProperty(const char *property, const char *value, const size_t size) {
- if (DEBUG) LOGV("setProperty(\"%s\", \"%s\", %d)", property, value, size);
-
- /* Set a specific property for the engine.
- Supported properties include: language (locale), rate, pitch, volume. */
- /* Sanity check */
- if (property == NULL) {
- LOGE("setProperty called with property NULL");
- return TTS_PROPERTY_UNSUPPORTED;
- }
-
- if (value == NULL) {
- LOGE("setProperty called with value NULL");
- return TTS_VALUE_INVALID;
- }
-
- espeak_ERROR result;
-
- if (strncmp(property, "language", 8) == 0) {
- // TODO: Set this property
- result = EE_OK;
- } else if (strncmp(property, "rate", 4) == 0) {
- int rate = atoi(value) * espeak_GetParameter(espeakRATE, 0) / 100;
- if (DEBUG) LOGV("setProperty rate : rate=%s, wpm=%d", value, rate);
- result = espeak_SetParameter(espeakRATE, rate, 0);
- } else if (strncmp(property, "pitch", 5) == 0) {
- int pitch = atoi(value);
- // The values of pitch from android range from 50 - 200, with 100 being normal.
- // The values espeak supports are from 0 - 100, with 50 being normal.
- // Therefore, halve the value to get the value that espeak supports:
- pitch = pitch / 2;
- if (DEBUG) LOGV("setProperty pitch : pitch=%d", pitch);
- result = espeak_SetParameter(espeakPITCH, pitch, 0);
- } else if (strncmp(property, "volume", 6) == 0) {
- int volume = atoi(value);
- result = espeak_SetParameter(espeakVOLUME, volume, 0);
- } else {
- return TTS_PROPERTY_UNSUPPORTED;
- }
-
- if (result == EE_OK) {
- return TTS_SUCCESS;
- } else {
- return TTS_FAILURE;
- }
- }
-
- // Sets the property with the specified value
- tts_result TtsEngine::getProperty(const char *property, char *value, size_t *iosize) {
- if (DEBUG) LOGV("getProperty(\"%s\", ...)", property);
-
- /* Get the property for the engine.
- This property was previously set by setProperty or by default. */
- /* sanity check */
- if (property == NULL) {
- LOGE("getProperty called with property NULL");
- return TTS_PROPERTY_UNSUPPORTED;
- }
-
- if (value == NULL) {
- LOGE("getProperty called with value NULL");
- return TTS_VALUE_INVALID;
- }
-
- if (strncmp(property, "language", 8) == 0) {
- if (currentLanguage == NULL) {
- strcpy(value, "");
- } else {
- if (*iosize < strlen(currentLanguage)+1) {
- *iosize = strlen(currentLanguage) + 1;
- return TTS_PROPERTY_SIZE_TOO_SMALL;
- }
- strcpy(value, currentLanguage);
- }
- return TTS_SUCCESS;
- } else if (strncmp(property, "rate", 4) == 0) {
- int rate = espeak_GetParameter(espeakRATE, 1) * 100 / espeak_GetParameter(espeakRATE, 0);
- char tmprate[4];
- sprintf(tmprate, "%d", rate);
- if (*iosize < strlen(tmprate)+1) {
- *iosize = strlen(tmprate) + 1;
- return TTS_PROPERTY_SIZE_TOO_SMALL;
- }
- strcpy(value, tmprate);
- return TTS_SUCCESS;
- } else if (strncmp(property, "pitch", 5) == 0) {
- char tmppitch[4];
- sprintf(tmppitch, "%d", (espeak_GetParameter(espeakPITCH, 1) * 2));
- if (*iosize < strlen(tmppitch)+1) {
- *iosize = strlen(tmppitch) + 1;
- return TTS_PROPERTY_SIZE_TOO_SMALL;
- }
- strcpy(value, tmppitch);
- return TTS_SUCCESS;
- } else if (strncmp(property, "volume", 6) == 0) {
- char tmpvolume[4];
- sprintf(tmpvolume, "%d", espeak_GetParameter(espeakVOLUME, 1));
- if (*iosize < strlen(tmpvolume)+1) {
- *iosize = strlen(tmpvolume) + 1;
- return TTS_PROPERTY_SIZE_TOO_SMALL;
- }
- strcpy(value, tmpvolume);
- return TTS_SUCCESS;
- }
-
- LOGE("Unsupported property");
- return TTS_PROPERTY_UNSUPPORTED;
- }
-
- /** synthesizeText
- * Synthesizes a text string.
- * The text string could be annotated with SSML tags.
- * @text - text to synthesize
- * @buffer - buffer which will receive generated samples
- * @bufferSize - size of buffer
- * @userdata - pointer to user data which will be passed back to callback function
- * return tts_result
- */
- tts_result TtsEngine::synthesizeText(const char *text, int8_t *buffer, size_t bufferSize,
- void *userdata) {
- if (DEBUG) LOGV("%s", __FUNCTION__);
-
- espeak_SetSynthCallback(eSpeakCallback);
-
- unsigned int unique_identifier;
-
- espeak_Synth(text, strlen(text), 0, // position
- POS_CHARACTER, 0, // end position (0 means no end position)
- espeakCHARS_UTF8, // text is UTF-8 encoded
- &unique_identifier, userdata);
- espeak_Synchronize();
-
- LOGI("Synthesis done");
-
- return TTS_SUCCESS;
- }
-
- /** stop
- * Aborts the running synthesis.
- * return tts_result
- */
- tts_result TtsEngine::stop() {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- espeak_Cancel();
- return TTS_SUCCESS;
- }
-
- #ifdef __cplusplus
- extern "C" {
- #endif
-
- TtsEngine* getTtsEngine() {
- if (DEBUG) LOGV("%s", __FUNCTION__);
- return new TtsEngine();
- }
-
- #ifdef __cplusplus
- }
- #endif
|