@@ -0,0 +1,26 @@ | |||
/* | |||
* Copyright (C) 2009 Google Inc. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#ifndef LOG_H_
#define LOG_H_
#include <android/log.h>
// Thin wrappers around the Android logcat API.
// Each translation unit using these macros must #define LOG_TAG (the tag
// string shown in logcat) before the macros are expanded; arguments are
// printf-style.
#define LOGV(...) __android_log_print(ANDROID_LOG_VERBOSE, LOG_TAG, __VA_ARGS__)
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
#endif /* LOG_H_ */
@@ -0,0 +1,245 @@ | |||
/* | |||
* Copyright (C) 2009 Google Inc. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
// This header defines the interface used by the Android platform | |||
// to access Text-To-Speech functionality in shared libraries that implement | |||
// speech synthesis and the management of resources associated with the | |||
// synthesis. | |||
// An example of the implementation of this interface can be found in | |||
// FIXME: add path+name to implementation of default TTS engine | |||
// Libraries implementing this interface are used in: | |||
// frameworks/base/tts/jni/android_tts_SpeechSynthesis.cpp | |||
#ifndef TTS_ENGINE_H_ | |||
#define TTS_ENGINE_H_ | |||
namespace android { | |||
#define ANDROID_TTS_ENGINE_PROPERTY_CONFIG "engineConfig" | |||
#define ANDROID_TTS_ENGINE_PROPERTY_PITCH "pitch" | |||
#define ANDROID_TTS_ENGINE_PROPERTY_RATE "rate" | |||
#define ANDROID_TTS_ENGINE_PROPERTY_VOLUME "volume" | |||
// Reported by the engine through the synthesis callback: whether the buffer
// just delivered completes the utterance or more data will follow.
enum tts_synth_status {
    TTS_SYNTH_DONE = 0,
    TTS_SYNTH_PENDING = 1
};
// Returned by the client from the synthesis callback: whether the engine
// should keep producing audio or stop.
enum tts_callback_status {
    TTS_CALLBACK_HALT = 0,
    TTS_CALLBACK_CONTINUE = 1
};
// Audio sample formats the engine may produce.
// NOTE: This is duplicated in compat/jni/tts.h. Please
// make changes there as well.
enum tts_audio_format {
    TTS_AUDIO_FORMAT_INVALID = -1,
    TTS_AUDIO_FORMAT_DEFAULT = 0,
    TTS_AUDIO_FORMAT_PCM_16_BIT = 1,
    TTS_AUDIO_FORMAT_PCM_8_BIT = 2,
};
// The callback is used by the implementation of this interface to deliver
// synthesized audio to its client, the Android TTS service, and to signal
// completion of the last requested synthesis.
// The callback takes:
//    @param [inout] void *&       - The userdata pointer set in the original
//                                   synth call
//    @param [in]    uint32_t      - Track sampling rate in Hz
//    @param [in]    tts_audio_format - The audio format
//    @param [in]    int           - The number of channels
//    @param [inout] int8_t *&     - A buffer of audio data only valid during the
//                                   execution of the callback
//    @param [inout] size_t &      - The size of the buffer
//    @param [in] tts_synth_status - indicate whether the synthesis is done, or
//                                   if more data is to be synthesized.
//    @return TTS_CALLBACK_HALT to indicate the synthesis must stop,
//            TTS_CALLBACK_CONTINUE to indicate the synthesis must continue if
//            there is more data to produce.
// NOTE(review): this header uses uint32_t/int8_t/size_t but includes neither
// <stdint.h> nor <stddef.h>; it appears to rely on an including file or the
// toolchain to provide them — confirm before reusing standalone.
typedef tts_callback_status (synthDoneCB_t)(void *&, uint32_t,
        tts_audio_format, int, int8_t *&, size_t&, tts_synth_status);
class TtsEngine;
// Factory entry point looked up by the TTS service when it dlopen()s the
// engine shared library; returns a new engine instance.
extern "C" TtsEngine* getTtsEngine();
// General result codes for engine operations.
enum tts_result {
    TTS_SUCCESS = 0,
    TTS_FAILURE = -1,
    TTS_FEATURE_UNSUPPORTED = -2,
    TTS_VALUE_INVALID = -3,
    TTS_PROPERTY_UNSUPPORTED = -4,
    TTS_PROPERTY_SIZE_TOO_SMALL = -5,
    TTS_MISSING_RESOURCES = -6
};
// Granularity of locale support reported by isLanguageAvailable(): ordered
// from best match (language+country+variant) down to unsupported.
enum tts_support_result {
    TTS_LANG_COUNTRY_VAR_AVAILABLE = 2,
    TTS_LANG_COUNTRY_AVAILABLE = 1,
    TTS_LANG_AVAILABLE = 0,
    TTS_LANG_MISSING_DATA = -1,
    TTS_LANG_NOT_SUPPORTED = -2
};
// Abstract interface every TTS engine shared library implements.  The Android
// TTS service drives synthesis exclusively through this class; audio flows
// back through the synthDoneCB_t callback registered in init().
class TtsEngine
{
public:
    virtual ~TtsEngine() {}
    // Initialize the TTS engine and returns whether initialization succeeded.
    // @param synthDoneCBPtr synthesis callback function pointer
    // @param engineConfig engine-specific configuration string (may be NULL)
    // @return TTS_SUCCESS, or TTS_FAILURE
    virtual tts_result init(synthDoneCB_t synthDoneCBPtr, const char *engineConfig);
    // Shut down the TTS engine and releases all associated resources.
    // @return TTS_SUCCESS, or TTS_FAILURE
    virtual tts_result shutdown();
    // Interrupt synthesis and flushes any synthesized data that hasn't been
    // output yet. This will block until callbacks underway are completed.
    // @return TTS_SUCCESS, or TTS_FAILURE
    virtual tts_result stop();
    // Returns the level of support for the language, country and variant.
    // @return TTS_LANG_COUNTRY_VAR_AVAILABLE if the language, country and variant are supported,
    //            and the corresponding resources are correctly installed
    //         TTS_LANG_COUNTRY_AVAILABLE if the language and country are supported and the
    //             corresponding resources are correctly installed, but there is no match for
    //             the specified variant
    //         TTS_LANG_AVAILABLE if the language is supported and the
    //             corresponding resources are correctly installed, but there is no match for
    //             the specified country and variant
    //         TTS_LANG_MISSING_DATA if the required resources to provide any level of support
    //             for the language are not correctly installed
    //         TTS_LANG_NOT_SUPPORTED if the language is not supported by the TTS engine.
    virtual tts_support_result isLanguageAvailable(const char *lang, const char *country,
            const char *variant);
    // Load the resources associated with the specified language. The loaded
    // language will only be used once a call to setLanguage() with the same
    // language value is issued. Language and country values are coded according to the ISO three
    // letter codes for languages and countries, as can be retrieved from a java.util.Locale
    // instance. The variant value is encoded as the variant string retrieved from a
    // java.util.Locale instance built with that variant data.
    // @param lang pointer to the ISO three letter code for the language
    // @param country pointer to the ISO three letter code for the country
    // @param variant pointer to the variant code
    // @return TTS_SUCCESS, or TTS_FAILURE
    virtual tts_result loadLanguage(const char *lang, const char *country, const char *variant);
    // Select the specified language, country and Locale variant for subsequent
    // synthesis, loading the associated resources if necessary.
    // Language and country values are coded according to the ISO three
    // letter codes for languages and countries, as can be retrieved from a java.util.Locale
    // instance. The variant value is encoded as the variant string retrieved from a
    // java.util.Locale instance built with that variant data.
    // @param lang pointer to the ISO three letter code for the language
    // @param country pointer to the ISO three letter code for the country
    // @param variant pointer to the variant code
    // @return TTS_SUCCESS, or TTS_FAILURE
    virtual tts_result setLanguage(const char *lang, const char *country, const char *variant);
    // Retrieve the currently set language, country and variant, or empty strings if none of
    // parameters have been set. Language and country are represented by their 3-letter ISO code
    // @param[out] language pointer to the retrieved 3-letter code language value
    // @param[out] country pointer to the retrieved 3-letter code country value
    // @param[out] variant pointer to the retrieved variant value
    // @return TTS_SUCCESS, or TTS_FAILURE
    virtual tts_result getLanguage(char *language, char *country, char *variant);
    // Notifies the engine what audio parameters should be used for the synthesis.
    // This is meant to be used as a hint, the engine implementation will set the output values
    // to those of the synthesis format, based on a given hint.
    // @param[inout] encoding in: the desired audio sample format
    //                         out: the format used by the TTS engine
    // @param[inout] rate in: the desired audio sample rate
    //                         out: the sample rate used by the TTS engine
    // @param[inout] channels in: the desired number of audio channels
    //                         out: the number of channels used by the TTS engine
    // @return TTS_SUCCESS, or TTS_FAILURE
    virtual tts_result setAudioFormat(tts_audio_format& encoding, uint32_t& rate,
            int& channels);
    // Set a property for the the TTS engine
    // "size" is the maximum size of "value" for properties "property"
    // @param property pointer to the property name
    // @param value pointer to the property value
    // @param size maximum size required to store this type of property
    // @return TTS_PROPERTY_UNSUPPORTED, or TTS_SUCCESS, or TTS_FAILURE,
    //          or TTS_VALUE_INVALID
    virtual tts_result setProperty(const char *property, const char *value,
            const size_t size);
    // Retrieve a property from the TTS engine
    // @param property pointer to the property name
    // @param[out]   value pointer to the retrieved language value
    // @param[inout] iosize in: stores the size available to store the
    //                          property value.
    //                      out: stores the size required to hold the language
    //                           value if getLanguage() returned
    //                           TTS_PROPERTY_SIZE_TOO_SMALL, unchanged otherwise
    // @return TTS_PROPERTY_UNSUPPORTED, or TTS_SUCCESS,
    //         or TTS_PROPERTY_SIZE_TOO_SMALL
    virtual tts_result getProperty(const char *property, char *value,
            size_t *iosize);
    // Synthesize the text.
    // As the synthesis is performed, the engine invokes the callback to notify
    // the TTS framework that it has filled the given buffer, and indicates how
    // many bytes it wrote. The callback is called repeatedly until the engine
    // has generated all the audio data corresponding to the text.
    // Note about the format of the input: the text parameter may use the
    // following elements
    // and their respective attributes as defined in the SSML 1.0 specification:
    //    * lang
    //    * say-as:
    //          o interpret-as
    //    * phoneme
    //    * voice:
    //          o gender,
    //          o age,
    //          o variant,
    //          o name
    //    * emphasis
    //    * break:
    //          o strength,
    //          o time
    //    * prosody:
    //          o pitch,
    //          o contour,
    //          o range,
    //          o rate,
    //          o duration,
    //          o volume
    //    * mark
    // Differences between this text format and SSML are:
    //    * full SSML documents are not supported
    //    * namespaces are not supported
    // Text is coded in UTF-8.
    // @param text the UTF-8 text to synthesize
    // @param userdata pointer to be returned when the call is invoked
    // @param buffer the location where the synthesized data must be written
    // @param bufferSize the number of bytes that can be written in buffer
    // @return TTS_SUCCESS or TTS_FAILURE
    virtual tts_result synthesizeText(const char *text, int8_t *buffer,
            size_t bufferSize, void *userdata);
};
} // namespace android | |||
#endif /* TTS_ENGINE_H_ */ |
@@ -0,0 +1,593 @@ | |||
/* | |||
* Copyright (C) 2008 Google Inc. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define LOG_TAG "eSpeak Engine"
#include <speak_lib.h>
#include <TtsEngine.h>
#include <Log.h>
/* | |||
* This is the Manager layer. It sits on top of the native eSpeak engine | |||
* and provides the interface to the defined Google TTS engine API. | |||
* The Google engine API is the boundary to allow a TTS engine to be swapped. | |||
* The Manager layer also provide the SSML tag interpretation. | |||
* The supported SSML tags are mapped to corresponding tags natively supported by eSpeak. | |||
* Native eSpeak functions always begin with espeak_XXX. | |||
* | |||
* Only a subset of SSML 1.0 tags are supported. | |||
* Some SSML tags involve significant complexity. | |||
* If the language is changed through an SSML tag, there is a latency for the load. | |||
*/ | |||
using namespace android;
// Name of the eSpeak resource directory expected under eSpeakDataPath.
const char *ESPEAK_DIRECTORY = "espeak-data";
// Files that must exist before espeak_Initialize() can succeed.
const char *eSpeakBaseResources[] = {"intonations", "phondata", "phonindex", "phontab",
        "en_dict", "voices/en/en-us" };
const int NUM_BASE_RESOURCES = 6;
// Voice lookup table.  Format is {espeak voice, iso3 code, name}.
// Rows for the same language are contiguous, which the country/variant
// searches in isLanguageSupported() rely on (they scan forward from the
// first language match).
const char *eSpeakSupportedVoices[][3] = {
        {"en-us",       "eng",          "English"},
        {"en-us",       "eng-USA",      "English (US)"},
        {"en",          "eng-GBR",      "English (UK)"},
        {"en-sc",       "eng-GBR-sc",   "English (Scottish)"},
        {"en-n",        "eng-GBR-n",    "English (Northern UK)"},
        {"en-rp",       "eng-GBR-rp",   "English (Received Pronunciation)"},
        {"en-wm",       "eng-GBR-wm",   "English (West Midlands)"},
        {"af",          "afr",          "Afrikaans"},
        {"bs",          "bos",          "Bosnian"},
        {"ca",          "cat",          "Catalan"},
        {"cs",          "ces",          "Czech"},
        {"da",          "dan",          "Danish"},
        {"de",          "deu",          "German"},
        {"el",          "ell",          "Greek"},
        {"eo",          "epo",          "Esperanto"},
        {"es",          "spa",          "Spanish"},
        {"es-la",       "spa-MEX",      "Spanish (Latin America)"},
        {"fi",          "fin",          "Finnish"},
        {"fr",          "fra",          "French"},
        {"hr",          "hrv",          "Croatian"},
        {"hu",          "hun",          "Hungarian"},
        {"it",          "ita",          "Italian"},
        {"kn",          "kan",          "Kannada"},
        {"ku",          "kur",          "Kurdish"},
        {"lv",          "lav",          "Latvian"},
        {"nl",          "nld",          "Dutch"},
        {"pl",          "pol",          "Polish"},
        {"pt",          "por",          "Portuguese (Brazil)"},
        {"pt",          "por-BRA",      "Portuguese (Brazil)"},
        {"pt-pt",       "por-PRT",      "Portuguese"},
        {"ro",          "ron",          "Romanian"},
        {"sk",          "slk",          "Slovak"},
        {"sr",          "srp",          "Serbian"},
        {"sv",          "swe",          "Swedish"},
        {"sw",          "swa",          "Swahili"},
        {"ta",          "tam",          "Tamil"},
        {"tr",          "tur",          "Turkish"},
        {"zh",          "zho",          "Chinese (Mandarin)"},
        {"cy",          "cym",          "Welsh"},
        {"hi",          "hin",          "Hindi"},
        {"hy",          "hye",          "Armenian"},
        {"id",          "ind",          "Indonesian"},
        {"is",          "isl",          "Icelandic"},
        {"ka",          "kat",          "Georgian"},
        {"la",          "lat",          "Latin"},
        {"mk",          "mkd",          "Macedonian"},
        {"no",          "nor",          "Norwegian"},
        {"ru",          "rus",          "Russian"},
        {"sq",          "sqi",          "Albanian"},
        {"vi",          "vie",          "Vietnamese"},
        {"zh-yue",      "zho-HKG",      "Chinese (Cantonese)"},
        {"grc",         "grc",          "Ancient Greek"},
        {"jbo",         "jbo",          "Lojban"},
        {"nci",         "nci",          "Nahuatl (Classical)"},
        {"pap",         "pap",          "Papiamento" }
};
// Must match the number of rows in eSpeakSupportedVoices above.
const int NUM_SUPPORTED_VOICES = 55;
/* Integer constants */
// eSpeak's notion of 100% speaking rate, in words per minute.
const int DEFAULT_SPEECH_RATE = 150;
// Callback to the TTS API, registered by TtsEngine::init().
synthDoneCB_t *ttsSynthDoneCBPointer;
// Language most recently selected (read back by getProperty("language")).
char *currentLanguage = (char *) "en-us";
char *currentRate = (char *) "150";
// Root directory passed to init(); heap-allocated, freed in shutdown().
char *eSpeakDataPath = NULL;
// Locale components recorded by setLanguage() for getLanguage().
char currentLang[10];
char currentCountry[10];
char currentVariant[10];
// True once espeak_Initialize() has succeeded (see attemptInit()).
bool hasInitialized = false;
/* Functions internal to the eSpeak engine wrapper */ | |||
static void setSpeechRate(int speechRate) { | |||
espeak_ERROR err = espeak_SetParameter(espeakRATE, speechRate, 0); | |||
} | |||
/* Functions exposed to the TTS API */ | |||
/* Callback from espeak. Should call back to the TTS API */ | |||
static int eSpeakCallback(short *wav, int numsamples, espeak_EVENT *events) { | |||
LOGI("Callback with %d samples", numsamples); | |||
int8_t * castedWav = (int8_t *) wav; | |||
size_t bufferSize = 0; | |||
if (numsamples < 1) { | |||
size_t silenceBufferSize = 2; | |||
int8_t *silence = new int8_t[silenceBufferSize]; // TODO: This will be a small memory leak, but do it this way for now because passing in an empty buffer can cause a crash. | |||
silence[0] = 0; | |||
silence[1] = 0; | |||
ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, silence, | |||
silenceBufferSize, TTS_SYNTH_DONE); | |||
return 1; | |||
} | |||
bufferSize = numsamples * sizeof(short); | |||
ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, castedWav, | |||
bufferSize, TTS_SYNTH_PENDING); | |||
return 0; // continue synthesis (1 is to abort) | |||
} | |||
// Returns true if fileName names a file this process can open for reading.
// fopen() is used purely as an existence/readability probe; no data is read.
// The parameter is const-correct now (the function never modifies it), which
// remains compatible with callers passing writable buffers.
static bool fileExists(const char *fileName) {
    FILE *file = fopen(fileName, "r");
    if (file == NULL) {
        return false;
    }
    fclose(file);
    return true;
}
static bool hasBaseResources() { | |||
char filename[255]; | |||
for (int i = 0; i < NUM_BASE_RESOURCES; i++) { | |||
sprintf(filename, "%s/%s/%s", eSpeakDataPath, ESPEAK_DIRECTORY, eSpeakBaseResources[i]); | |||
if (!fileExists(filename)) { | |||
LOGE("Missing resource: %s", filename); | |||
return false; | |||
} | |||
} | |||
return true; | |||
} | |||
/* Google Engine API function implementations */ | |||
// Brings up the native eSpeak engine if it is not running yet.
// Idempotent: returns immediately with TTS_SUCCESS once initialization has
// succeeded.  Requires eSpeakDataPath to be set (see TtsEngine::init) and the
// base resources to be present on disk.
// @return TTS_SUCCESS, or TTS_FAILURE
tts_result attemptInit() {
    if (hasInitialized) {
        return TTS_SUCCESS;
    }
    if (!hasBaseResources()) {
        return TTS_FAILURE;
    }
    // TODO Make sure that the speech data is loaded in
    // the directory /sdcard/espeak-data before calling this.
    // Synchronous mode: espeak_Synth() blocks and delivers audio through the
    // registered callback on the calling thread.
    int sampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 4096, eSpeakDataPath, 0);
    if (sampleRate <= 0) {
        LOGE("eSpeak initialization failed!");
        return TTS_FAILURE;
    }
    espeak_SetSynthCallback(eSpeakCallback);
    espeak_SetParameter(espeakRATE, DEFAULT_SPEECH_RATE, 0);
    espeak_VOICE voice;
    memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
    const char *langNativeString = "en-us";   //Default to US English
    voice.languages = langNativeString;
    voice.variant = 0;
    espeak_SetVoiceByProperties(&voice);
    hasInitialized = true;
    return TTS_SUCCESS;
}
/** init | |||
* Allocates eSpeak memory block and initializes the eSpeak system. | |||
* synthDoneCBPtr - Pointer to callback function which will receive generated samples | |||
* config - the engine configuration parameters, not used here | |||
* return tts_result | |||
*/ | |||
tts_result TtsEngine::init(synthDoneCB_t synthDoneCBPtr, const char *engineConfig) { | |||
ttsSynthDoneCBPointer = synthDoneCBPtr; | |||
hasInitialized = false; | |||
if ((engineConfig != NULL) && (strlen(engineConfig) > 0)) { | |||
eSpeakDataPath = (char *) malloc(strlen(engineConfig)); | |||
strcpy(eSpeakDataPath, engineConfig); | |||
} else { | |||
eSpeakDataPath = NULL; | |||
LOGE("Data path not specified!"); | |||
return TTS_FAILURE; | |||
} | |||
return attemptInit(); | |||
} | |||
/** shutdown | |||
* Unloads all eSpeak resources; terminates eSpeak system and frees eSpeak memory block. | |||
* return tts_result | |||
*/ | |||
tts_result TtsEngine::shutdown(void) { | |||
if (eSpeakDataPath != NULL) { | |||
free(eSpeakDataPath); | |||
} | |||
espeak_Terminate(); | |||
return TTS_SUCCESS; | |||
} | |||
// Stub: pre-loading is not implemented by this engine; the call is logged and
// always reports failure.  Voice selection happens in setLanguage() instead.
tts_result TtsEngine::loadLanguage(const char *lang, const char *country, const char *variant) {
    LOGV("loadLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);
    return TTS_FAILURE;
}
// Looks up (lang, country, variant) in eSpeakSupportedVoices and reports the
// closest level of support.  The table keeps rows for the same language
// contiguous, so the country and variant searches scan forward from the first
// language match.
// @param lang    ISO2 or ISO3 language code; NULL/empty => not supported
// @param country ISO2 or ISO3 country code; may be NULL or empty
// @param variant locale variant string; may be NULL or empty
// @param pindex  when non-NULL, receives the index of the best-matching voice
//                row (-1 when no language match was found)
// @return the tts_support_result matching the best row found
tts_support_result isLanguageSupported(const char *lang, const char *country, const char *variant,
        int *pindex) {
    LOGV("isLanguageSupported(\"%s\", \"%s\", \"%s\")", lang, country, variant);
    if ((lang == NULL) || (strlen(lang) == 0)) {
        LOGE("TtsEngine::isLanguageAvailable called with no language");
        return TTS_LANG_NOT_SUPPORTED;
    }
    if (pindex != NULL) {
        *pindex = -1;
    }
    int langIndex = -1;
    int countryIndex = -1;
    int variantIndex = -1;
    if (strlen(lang) == 3) {
        // Column 1 holds the ISO3 code (optionally followed by "-COUNTRY").
        for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
            if (strncmp(lang, eSpeakSupportedVoices[i][1], 3) == 0) {
                LOGI("Found ISO3 language at index %d", i);
                langIndex = i;
                break;
            }
        }
    } else if (strlen(lang) == 2) {
        // Column 0 holds the eSpeak (ISO2-based) voice name.
        for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
            if (strncmp(lang, eSpeakSupportedVoices[i][0], 2) == 0) {
                LOGI("Found ISO2 language at index %d", i);
                langIndex = i;
                break;
            }
        }
    }
    if (langIndex < 0) {
        LOGV("TtsEngine::isLanguageAvailable called with unsupported language");
        return TTS_LANG_NOT_SUPPORTED;
    }
    if ((country == NULL) || (strlen(country) == 0)) {
        // TODO: Check whether resources are available for this language.
        if (pindex != NULL) {
            *pindex = langIndex;
        }
        LOGI("No country specified, language is available");
        return TTS_LANG_AVAILABLE;
    }
    // snprintf: "country" (and "variant" below) come from the caller with
    // arbitrary length; the old sprintf could overflow these fixed buffers.
    char lang_country[10];
    snprintf(lang_country, sizeof(lang_country), "%s-%s", lang, country);
    // Find country
    if (strlen(country) == 3) {
        for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
            if ((strcmp(lang_country, eSpeakSupportedVoices[i][1]) == 0)) {
                LOGI("Found ISO3 country at index %d", i);
                countryIndex = i;
                break;
            }
        }
    } else if (strlen(country) == 2) {
        for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
            if ((strcmp(lang_country, eSpeakSupportedVoices[i][0]) == 0)) {
                LOGI("Found ISO2 country at index %d", i);
                countryIndex = i;
                break;
            }
        }
    }
    if (countryIndex < 0) {
        if (pindex != NULL) {
            *pindex = langIndex;
        }
        LOGI("No country found, language is available");
        return TTS_LANG_AVAILABLE;
    }
    if ((variant == NULL) || (strlen(variant) == 0)) {
        if (pindex != NULL) {
            *pindex = countryIndex;
        }
        LOGI("No variant specified, language and country are available");
        return TTS_LANG_COUNTRY_AVAILABLE;
    }
    char lang_country_variant[15];
    snprintf(lang_country_variant, sizeof(lang_country_variant), "%s-%s-%s", lang, country,
            variant);
    // Find variant
    for (int i = countryIndex; i < NUM_SUPPORTED_VOICES; i++) {
        if ((strcmp(lang_country_variant, eSpeakSupportedVoices[i][1]) == 0)) {
            LOGI("Found variant at index %d", i);
            variantIndex = i;
            break;
        }
    }
    if (variantIndex < 0) {
        if (pindex != NULL) {
            *pindex = countryIndex;
        }
        LOGI("No variant found, language and country are available");
        return TTS_LANG_COUNTRY_AVAILABLE;
    }
    if (pindex != NULL) {
        *pindex = variantIndex;
    }
    LOGI("Language, country, and variant are available");
    return TTS_LANG_COUNTRY_VAR_AVAILABLE;
}
// Selects the eSpeak voice matching (lang, country, variant) and records the
// locale for later getLanguage()/getProperty("language") calls.
// @return TTS_SUCCESS, or TTS_FAILURE when the locale is unsupported or the
//         voice could not be applied
tts_result TtsEngine::setLanguage(const char *lang, const char *country, const char *variant) {
    LOGV("setLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);
    // Make sure the engine is initialized!
    attemptInit();
    int index = -1;
    isLanguageSupported(lang, country, variant, &index);
    if (index < 0) {
        LOGE("setLanguage called with unsupported language");
        return TTS_FAILURE;
    }
    // Record the requested locale.  The destination globals are 10 bytes, so
    // copy with an explicit bound; country/variant may legitimately be NULL
    // (isLanguageSupported accepts that), which the old strcpy would crash on.
    snprintf(currentLang, sizeof(currentLang), "%s", lang);
    snprintf(currentCountry, sizeof(currentCountry), "%s", (country == NULL) ? "" : country);
    snprintf(currentVariant, sizeof(currentVariant), "%s", (variant == NULL) ? "" : variant);
    // Longest eSpeak voice name in the table is "zh-yue" (6 chars), so 7 fits.
    char espeakLangStr[7];
    snprintf(espeakLangStr, sizeof(espeakLangStr), "%s", eSpeakSupportedVoices[index][0]);
    espeak_VOICE voice;
    memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
    voice.variant = 0;
    voice.languages = espeakLangStr;
    espeak_ERROR err = espeak_SetVoiceByProperties(&voice);
    // +1 for the NUL terminator: the original allocation was one byte short,
    // making the strcpy below a heap overflow.
    // TODO(review): the previous heap value of currentLanguage leaks here; it
    // initially points at a string literal, so it cannot simply be delete[]d.
    currentLanguage = new char[strlen(lang) + 1];
    strcpy(currentLanguage, lang);
    if (err != EE_OK) {
        LOGE("Error code %d when setting voice properties!", err);
        return TTS_FAILURE;
    }
    return TTS_SUCCESS;
}
// Thin wrapper over isLanguageSupported() that discards the matched table
// index (callers here only need the support level).
tts_support_result TtsEngine::isLanguageAvailable(const char *lang, const char *country,
        const char *variant) {
    return isLanguageSupported(lang, country, variant, NULL);
}
// Copies the locale most recently recorded by setLanguage() into the caller's
// buffers.  NOTE(review): the backing globals are 10 bytes each and the copies
// are unbounded strcpy calls — callers must supply buffers of at least that
// size; confirm against the framework caller.
tts_result TtsEngine::getLanguage(char *language, char *country, char *variant) {
    strcpy(language, currentLang);
    strcpy(country, currentCountry);
    strcpy(variant, currentVariant);
    return TTS_SUCCESS;
}
/** setAudioFormat | |||
* sets the audio format to use for synthesis, returns what is actually used. | |||
* @encoding - reference to encoding format | |||
* @rate - reference to sample rate | |||
* @channels - reference to number of channels | |||
* return tts_result | |||
* */ | |||
tts_result TtsEngine::setAudioFormat(tts_audio_format& encoding, uint32_t& rate, int& channels) { | |||
LOGE("setAudioFormat(%d, %d, %d) is unsupported", encoding, rate, channels); | |||
// TODO: Fix this! | |||
return TTS_SUCCESS; | |||
} | |||
// Sets the property with the specified value.
// Supported properties: language (TODO: currently accepted but a no-op),
// rate (percentage of DEFAULT_SPEECH_RATE), pitch and volume (parsed with
// atoi and handed straight to eSpeak).
// @param property NUL-terminated property name
// @param value    NUL-terminated property value
// @param size     maximum size of "value" (unused; value is NUL-terminated)
// @return TTS_SUCCESS, TTS_FAILURE, TTS_PROPERTY_UNSUPPORTED or TTS_VALUE_INVALID
tts_result TtsEngine::setProperty(const char *property, const char *value, const size_t size) {
    // %zu: size is a size_t; the old %d specifier was mismatched.
    LOGV("setProperty(\"%s\", \"%s\", %zu)", property, value, size);
    /* Sanity check */
    if (property == NULL) {
        LOGE("setProperty called with property NULL");
        return TTS_PROPERTY_UNSUPPORTED;
    }
    if (value == NULL) {
        LOGE("setProperty called with value NULL");
        return TTS_VALUE_INVALID;
    }
    espeak_ERROR result;
    if (strncmp(property, "language", 8) == 0) {
        // TODO: Set this property
        result = EE_OK;
    } else if (strncmp(property, "rate", 4) == 0) {
        // The framework expresses rate as a percentage of the default speed.
        int rate = atoi(value) * DEFAULT_SPEECH_RATE / 100;
        result = espeak_SetParameter(espeakRATE, rate, 0);
    } else if (strncmp(property, "pitch", 5) == 0) {
        int pitch = atoi(value);
        result = espeak_SetParameter(espeakPITCH, pitch, 0);
    } else if (strncmp(property, "volume", 6) == 0) {
        int volume = atoi(value);
        result = espeak_SetParameter(espeakVOLUME, volume, 0);
    } else {
        return TTS_PROPERTY_UNSUPPORTED;
    }
    return (result == EE_OK) ? TTS_SUCCESS : TTS_FAILURE;
}
// Retrieves a property value previously set via setProperty() (or a default).
// Supported properties: language, rate, pitch, volume.
// @param property  NUL-terminated property name
// @param[out] value     receives the value as a string
// @param[inout] iosize  in: capacity of value; out: required size when
//                       TTS_PROPERTY_SIZE_TOO_SMALL is returned
// @return TTS_SUCCESS, TTS_VALUE_INVALID, TTS_PROPERTY_UNSUPPORTED or
//         TTS_PROPERTY_SIZE_TOO_SMALL
tts_result TtsEngine::getProperty(const char *property, char *value, size_t *iosize) {
    LOGV("getProperty(\"%s\", ...)", property);
    /* sanity check */
    if (property == NULL) {
        LOGE("getProperty called with property NULL");
        return TTS_PROPERTY_UNSUPPORTED;
    }
    if (value == NULL) {
        LOGE("getProperty called with value NULL");
        return TTS_VALUE_INVALID;
    }
    if (strncmp(property, "language", 8) == 0) {
        if (currentLanguage == NULL) {
            strcpy(value, "");
        } else {
            if (*iosize < strlen(currentLanguage)+1) {
                *iosize = strlen(currentLanguage) + 1;
                return TTS_PROPERTY_SIZE_TOO_SMALL;
            }
            strcpy(value, currentLanguage);
        }
        return TTS_SUCCESS;
    } else if (strncmp(property, "rate", 4) == 0) {
        int rate = espeak_GetParameter(espeakRATE, 1) * 100 / DEFAULT_SPEECH_RATE;
        // 12 bytes hold any int; the old char[4] overflowed for 4+ digit
        // values (e.g. rates above 999%).
        char tmprate[12];
        snprintf(tmprate, sizeof(tmprate), "%d", rate);
        if (*iosize < strlen(tmprate)+1) {
            *iosize = strlen(tmprate) + 1;
            return TTS_PROPERTY_SIZE_TOO_SMALL;
        }
        strcpy(value, tmprate);
        return TTS_SUCCESS;
    } else if (strncmp(property, "pitch", 5) == 0) {
        char tmppitch[12];
        snprintf(tmppitch, sizeof(tmppitch), "%d", espeak_GetParameter(espeakPITCH, 1));
        if (*iosize < strlen(tmppitch)+1) {
            *iosize = strlen(tmppitch) + 1;
            return TTS_PROPERTY_SIZE_TOO_SMALL;
        }
        strcpy(value, tmppitch);
        return TTS_SUCCESS;
    } else if (strncmp(property, "volume", 6) == 0) {
        char tmpvolume[12];
        snprintf(tmpvolume, sizeof(tmpvolume), "%d", espeak_GetParameter(espeakVOLUME, 1));
        if (*iosize < strlen(tmpvolume)+1) {
            *iosize = strlen(tmpvolume) + 1;
            return TTS_PROPERTY_SIZE_TOO_SMALL;
        }
        strcpy(value, tmpvolume);
        return TTS_SUCCESS;
    }
    LOGE("Unsupported property");
    return TTS_PROPERTY_UNSUPPORTED;
}
/** synthesizeText
 *  Synthesizes a text string, blocking until synthesis completes.
 *  The text string may be annotated with (a subset of) SSML tags; eSpeak is
 *  invoked with espeakCHARS_UTF8 | espeakSSML so tags are interpreted.
 *  Audio is delivered through eSpeakCallback / the registered framework
 *  callback, NOT through the buffer parameters.
 *  @text - UTF-8 text to synthesize
 *  @buffer - buffer which will receive generated samples (unused here;
 *            samples flow through the callback instead)
 *  @bufferSize - size of buffer (unused, see above)
 *  @userdata - pointer passed back to the callback function
 *  return tts_result
 */
tts_result TtsEngine::synthesizeText(const char *text, int8_t *buffer, size_t bufferSize,
        void *userdata) {
    LOGI("Synthesize: %s", text);
    espeak_SetSynthCallback(eSpeakCallback);
    unsigned int unique_identifier;
    espeak_Synth(text, strlen(text), 0,  // position
            POS_CHARACTER, 0,  // end position (0 means no end position)
            espeakCHARS_UTF8 | espeakSSML,  // use or ignore xml tags
            &unique_identifier, userdata);
    // Synchronous engine: wait for all callbacks to finish before returning.
    espeak_Synchronize();
    LOGI("Synthesis done");
    return TTS_SUCCESS;
}
/** stop
 *  Aborts the running synthesis by cancelling all of eSpeak's queued work.
 *  return tts_result (always TTS_SUCCESS)
 */
tts_result TtsEngine::stop() {
    espeak_Cancel();
    return TTS_SUCCESS;
}
#ifdef __cplusplus
extern "C" {
#endif
// Unmangled factory symbol the TTS service resolves with dlsym() after
// loading this shared library.  Ownership of the returned engine passes to
// the caller.
TtsEngine* getTtsEngine() {
    return new TtsEngine();
}
#ifdef __cplusplus
}
#endif
@@ -0,0 +1,341 @@ | |||
/* | |||
* Copyright (C) 2011 Google Inc. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
#include <stdio.h> | |||
#include <unistd.h> | |||
#include <stdlib.h> | |||
#include <jni.h> | |||
#include <speak_lib.h> | |||
#include <TtsEngine.h> | |||
#include <Log.h> | |||
#define LOG_TAG "eSpeakService"
#define DEBUG false
// Channel-count values mirrored from android.media.AudioFormat.
enum audio_channel_count {
    CHANNEL_COUNT_MONO = 1,
    CHANNEL_COUNT_STEREO = 2
};
// Encoding constants mirrored from android.media.AudioFormat.ENCODING_*.
enum audio_encoding {
    ENCODING_INVALID = 0x00,
    ENCODING_DEFAULT = 0x01,
    ENCODING_PCM_16BIT = 0x02,
    ENCODING_PCM_8BIT = 0x03
};
// Return values for the eSpeak synthesis callback (see SynthCallback).
enum synthesis_result {
    SYNTH_CONTINUE = 0,
    SYNTH_ABORT = 1
};
const int DEFAULT_SAMPLE_RATE = 22050;
const int DEFAULT_CHANNEL_COUNT = CHANNEL_COUNT_MONO;
const int DEFAULT_AUDIO_FORMAT = ENCODING_PCM_16BIT;
// Buffer size hint in milliseconds.
const int DEFAULT_BUFFER_SIZE = 1000;
struct native_data_t { | |||
JNIEnv *env; | |||
jobject object; | |||
int sampleRate; | |||
int channelCount; | |||
int audioFormat; | |||
int bufferSizeInMillis; | |||
native_data_t() { | |||
env = NULL; | |||
object = NULL; | |||
sampleRate = DEFAULT_SAMPLE_RATE; | |||
channelCount = DEFAULT_CHANNEL_COUNT; | |||
audioFormat = DEFAULT_AUDIO_FORMAT; | |||
bufferSizeInMillis = DEFAULT_BUFFER_SIZE; | |||
} | |||
}; | |||
// JNI IDs cached once in nativeClassInit() and reused on every call.
jmethodID METHOD_nativeSynthCallback;
jfieldID FIELD_mNativeData;

// Recovers the native_data_t pointer stored in the Java object's mNativeData
// int field. NOTE(review): storing a pointer in a jint assumes 32-bit
// pointers — not safe on 64-bit ABIs; confirm target ABIs before reuse.
static inline native_data_t *getNativeData(JNIEnv *env, jobject object) {
  return (native_data_t *) (env->GetIntField(object, FIELD_mNativeData));
}
/* Callback from espeak. Should call back to the TTS API */ | |||
static int SynthCallback(short *audioData, int numSamples, | |||
espeak_EVENT *events) { | |||
native_data_t *nat = (native_data_t *) events->user_data; | |||
JNIEnv *env = nat->env; | |||
jobject object = nat->object; | |||
if (numSamples < 1) { | |||
env->CallVoidMethod(object, METHOD_nativeSynthCallback, NULL); | |||
return SYNTH_ABORT; | |||
} else { | |||
jbyteArray arrayAudioData = env->NewByteArray(numSamples * 2); | |||
env->SetByteArrayRegion(arrayAudioData, 0, (numSamples * 2), (jbyte *) audioData); | |||
env->CallVoidMethod(object, METHOD_nativeSynthCallback, arrayAudioData); | |||
return SYNTH_CONTINUE; | |||
} | |||
} | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif /* __cplusplus */ | |||
/* Invoked by the VM when the library is loaded; reports the JNI version
 * this library requires, or -1 if the environment cannot be obtained. */
JNIEXPORT jint
JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
  JNIEnv *env = NULL;
  const jint status = vm->GetEnv((void **) &env, JNI_VERSION_1_6);
  if (status == JNI_OK) {
    return JNI_VERSION_1_6;
  }
  LOGE("Failed to get the environment using GetEnv()");
  return -1;
}
/* Caches the JNI method and field IDs used on the synthesis path.
 * Returns JNI_FALSE (with a Java exception pending) if a lookup fails,
 * instead of leaving a NULL ID to crash on later. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeClassInit(
    JNIEnv* env, jclass clazz) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  METHOD_nativeSynthCallback = env->GetMethodID(clazz, "nativeSynthCallback", "([B)V");
  FIELD_mNativeData = env->GetFieldID(clazz, "mNativeData", "I");
  // A failed lookup leaves NoSuchMethodError/NoSuchFieldError pending.
  if (METHOD_nativeSynthCallback == NULL || FIELD_mNativeData == NULL) {
    return JNI_FALSE;
  }
  return JNI_TRUE;
}
/* Allocates the per-instance native state, stores its pointer in the Java
 * object's mNativeData field, and initializes eSpeak with the given
 * voice-data path. Returns JNI_TRUE on success; on failure all state
 * created here is rolled back so nothing leaks or dangles.
 * Pairs with nativeDestroy(). */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeCreate(
    JNIEnv *env, jobject object, jstring path) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  // Plain new throws on OOM, so nat is never NULL past this point.
  native_data_t *nat = new native_data_t;
  env->SetIntField(object, FIELD_mNativeData, (jint) nat);

  const char *c_path = env->GetStringUTFChars(path, NULL);
  if (c_path == NULL) {
    // Conversion failed (exception pending); undo the field + allocation.
    env->SetIntField(object, FIELD_mNativeData, 0);
    delete nat;
    return JNI_FALSE;
  }
  nat->object = env->NewWeakGlobalRef(object);
  if (DEBUG) LOGV("Initializing with path %s", c_path);
  // AUDIO_OUTPUT_SYNCHRONOUS: eSpeak delivers audio through SynthCallback
  // rather than playing it itself; the return value is the sample rate, or
  // a non-positive error code when the data path is invalid.
  nat->sampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, nat->bufferSizeInMillis, c_path, 0);
  env->ReleaseStringUTFChars(path, c_path);

  if (nat->sampleRate > 0) {
    return JNI_TRUE;
  }
  // Initialization failed: release everything so the native state does not
  // leak and mNativeData does not hold a dangling pointer.
  env->DeleteWeakGlobalRef(nat->object);
  env->SetIntField(object, FIELD_mNativeData, 0);
  delete nat;
  return JNI_FALSE;
}
/* Releases the native state created by nativeCreate(): the weak global ref
 * and the native_data_t itself, then clears mNativeData so a double call
 * is harmless. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeDestroy(
    JNIEnv *env, jobject object) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  native_data_t *nat = getNativeData(env, object);
  if (nat != NULL) {
    env->DeleteWeakGlobalRef(nat->object);
    // nat was allocated with new in nativeCreate(); releasing it with
    // free() is undefined behavior and skips the destructor — use delete.
    delete nat;
    env->SetIntField(object, FIELD_mNativeData, 0);
  }
  return JNI_TRUE;
}
/* Returns the sample rate (Hz) recorded from espeak_Initialize(). */
JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetSampleRate(
    JNIEnv *env, jobject object) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  return (jint) getNativeData(env, object)->sampleRate;
}
/* Returns the output channel count (see audio_channel_count). */
JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetChannelCount(
    JNIEnv *env, jobject object) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  return (jint) getNativeData(env, object)->channelCount;
}
/* Returns the audio encoding (see audio_encoding). */
JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetAudioFormat(
    JNIEnv *env, jobject object) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  return (jint) getNativeData(env, object)->audioFormat;
}
/* Returns the eSpeak buffer length, in milliseconds. */
JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetBufferSizeInMillis(
    JNIEnv *env, jobject object) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  return (jint) getNativeData(env, object)->bufferSizeInMillis;
}
/* Returns a flat String[] describing the available voices; each voice
 * contributes four consecutive entries: language, identifier, gender, age.
 * Returns NULL (with an exception pending) if the array cannot be built. */
JNIEXPORT jobjectArray
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetAvailableVoices(
    JNIEnv *env, jobject object) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  const espeak_VOICE **voices = espeak_ListVoices(NULL);

  // First, count the number of voices returned.
  int count;
  for (count = 0; voices[count] != NULL; count++);

  // Next, create a Java String array with four slots per voice.
  jobjectArray voicesArray = (jobjectArray) env->NewObjectArray(
      count * 4, env->FindClass("java/lang/String"), NULL);
  if (voicesArray == NULL) {
    return NULL;  // allocation failed; OutOfMemoryError pending
  }

  char gender_buf[12];
  char age_buf[12];

  // Finally, populate the array.
  const espeak_VOICE *v;
  for (int i = 0, voicesIndex = 0; (v = voices[i]) != NULL; i++) {
    // languages holds a leading priority byte before the language name;
    // skip it. NOTE(review): only the first listed language is surfaced.
    const char *lang_name = v->languages + 1;
    const char *identifier = v->identifier;
    snprintf(gender_buf, sizeof(gender_buf), "%d", v->gender);
    snprintf(age_buf, sizeof(age_buf), "%d", v->age);

    const char *fields[] = { lang_name, identifier, gender_buf, age_buf };
    for (int f = 0; f < 4; f++) {
      jstring str = env->NewStringUTF(fields[f]);
      env->SetObjectArrayElement(voicesArray, voicesIndex++, str);
      // Drop the local ref immediately: with four refs per voice, a long
      // voice list can otherwise overflow the local reference table.
      env->DeleteLocalRef(str);
    }
  }
  return voicesArray;
}
/* Asks eSpeak to select a voice matching the given name, language list,
 * gender, age, and variant. Returns JNI_TRUE iff eSpeak reports EE_OK. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetVoiceByProperties(
    JNIEnv *env, jobject object, jstring name, jstring languages, jint gender, jint age,
    jint variant) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  const char *c_name = env->GetStringUTFChars(name, NULL);
  const char *c_languages = env->GetStringUTFChars(languages, NULL);
  if (c_name == NULL || c_languages == NULL) {
    // A conversion failed (exception pending); release whatever succeeded.
    if (c_name != NULL) env->ReleaseStringUTFChars(name, c_name);
    if (c_languages != NULL) env->ReleaseStringUTFChars(languages, c_languages);
    return JNI_FALSE;
  }

  espeak_VOICE voice_select;
  memset(&voice_select, 0, sizeof(espeak_VOICE));
  voice_select.name = c_name;
  voice_select.languages = c_languages;
  voice_select.age = (int) age;
  voice_select.gender = (int) gender;
  voice_select.variant = (int) variant;

  const espeak_ERROR result = espeak_SetVoiceByProperties(&voice_select);

  env->ReleaseStringUTFChars(name, c_name);
  env->ReleaseStringUTFChars(languages, c_languages);
  return (result == EE_OK) ? JNI_TRUE : JNI_FALSE;
}
/* Selects a voice by language code and variant.
 * Returns JNI_TRUE iff eSpeak reports EE_OK. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetLanguage(
    JNIEnv *env, jobject object, jstring language, jint variant) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  const char *c_language = env->GetStringUTFChars(language, NULL);
  if (c_language == NULL) {
    return JNI_FALSE;  // conversion failed; exception pending
  }
  // Copy the string so the JNI chars can be released before calling eSpeak.
  // len + 1: the original calloc(len) left no room for the NUL terminator,
  // so strcpy wrote one byte past the end of the buffer (heap overflow).
  const size_t len = strlen(c_language);
  char *lang_copy = (char *) calloc(len + 1, sizeof(char));
  if (lang_copy == NULL) {
    env->ReleaseStringUTFChars(language, c_language);
    return JNI_FALSE;
  }
  strcpy(lang_copy, c_language);
  env->ReleaseStringUTFChars(language, c_language);

  espeak_VOICE voice;
  memset(&voice, 0, sizeof(espeak_VOICE));  // Zero out the voice first
  voice.languages = lang_copy;
  voice.variant = (int) variant;
  const espeak_ERROR result = espeak_SetVoiceByProperties(&voice);
  // NOTE(review): lang_copy is intentionally not freed, matching the
  // original code — confirm eSpeak does not retain the pointer before
  // adding free(lang_copy).
  return (result == EE_OK) ? JNI_TRUE : JNI_FALSE;
}
/* Sets eSpeak's speech-rate parameter; JNI_TRUE iff eSpeak accepted it. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetRate(
    JNIEnv *env, jobject object, jint rate) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  return (espeak_SetParameter(espeakRATE, (int) rate, 0) == EE_OK)
      ? JNI_TRUE
      : JNI_FALSE;
}
/* Sets eSpeak's pitch parameter; JNI_TRUE iff eSpeak accepted it. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetPitch(
    JNIEnv *env, jobject object, jint pitch) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  return (espeak_SetParameter(espeakPITCH, (int) pitch, 0) == EE_OK)
      ? JNI_TRUE
      : JNI_FALSE;
}
/* Synthesizes the given UTF-8/SSML text, streaming audio back to Java via
 * SynthCallback. Blocks (espeak_Synchronize) until synthesis completes or
 * is cancelled. Returns JNI_TRUE iff espeak_Synth reported EE_OK. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSynthesize(
    JNIEnv *env, jobject object, jstring text) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  native_data_t *nat = getNativeData(env, object);
  const char *c_text = env->GetStringUTFChars(text, NULL);
  if (c_text == NULL) {
    return JNI_FALSE;  // conversion failed; exception pending
  }
  unsigned int unique_identifier;

  // SynthCallback runs on this thread during espeak_Synth/Synchronize, so
  // publishing the current JNIEnv here is safe; clear it afterwards so nat
  // never holds a stale env pointer past this call.
  nat->env = env;
  espeak_SetSynthCallback(SynthCallback);
  const espeak_ERROR result =
      espeak_Synth(c_text, strlen(c_text), 0, // position
                   POS_CHARACTER, 0, // end position (0 means no end position)
                   espeakCHARS_UTF8 | espeakSSML, // use or ignore xml tags
                   &unique_identifier, nat);
  espeak_Synchronize();
  nat->env = NULL;
  env->ReleaseStringUTFChars(text, c_text);
  return (result == EE_OK) ? JNI_TRUE : JNI_FALSE;
}
/* Cancels any synthesis currently in progress via espeak_Cancel().
 * Always reports JNI_TRUE; cancellation is best-effort. */
JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeStop(
    JNIEnv *env, jobject object) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  espeak_Cancel();
  return JNI_TRUE;
}
#ifdef __cplusplus | |||
} | |||
#endif /* __cplusplus */ |