
pull in the android JNI bindings from eyes-free

Reece H. Dunn committed 12 years ago (branch: master)
commit 2b78ade000

jni/include/Log.h (+26, -0)

/*
* Copyright (C) 2009 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LOG_H_
#define LOG_H_

#include <android/log.h>

#define LOGV(...) __android_log_print(ANDROID_LOG_VERBOSE, LOG_TAG, __VA_ARGS__)
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)

#endif /* LOG_H_ */
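These macros assume that LOG_TAG is defined before Log.h is included. A minimal usage sketch (the tag and function below are illustrative, not part of this commit):

    // Hypothetical client of Log.h: LOG_TAG must be defined first.
    #define LOG_TAG "MyEngine"
    #include <Log.h>

    static void reportInit(int sampleRate) {
        if (sampleRate <= 0) {
            LOGE("initialization failed (sample rate %d)", sampleRate);
        } else {
            LOGI("initialized at %d Hz", sampleRate);
        }
    }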

jni/include/TtsEngine.h (+245, -0)

/*
* Copyright (C) 2009 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// This header defines the interface used by the Android platform
// to access Text-To-Speech functionality in shared libraries that implement
// speech synthesis and the management of resources associated with the
// synthesis.
// An example of the implementation of this interface can be found in
// FIXME: add path+name to implementation of default TTS engine
// Libraries implementing this interface are used in:
// frameworks/base/tts/jni/android_tts_SpeechSynthesis.cpp

#ifndef TTS_ENGINE_H_
#define TTS_ENGINE_H_

namespace android {

#define ANDROID_TTS_ENGINE_PROPERTY_CONFIG "engineConfig"
#define ANDROID_TTS_ENGINE_PROPERTY_PITCH "pitch"
#define ANDROID_TTS_ENGINE_PROPERTY_RATE "rate"
#define ANDROID_TTS_ENGINE_PROPERTY_VOLUME "volume"


enum tts_synth_status {
TTS_SYNTH_DONE = 0,
TTS_SYNTH_PENDING = 1
};

enum tts_callback_status {
TTS_CALLBACK_HALT = 0,
TTS_CALLBACK_CONTINUE = 1
};

// NOTE: This is duplicated in compat/jni/tts.h. Please
// make changes there as well.
enum tts_audio_format {
TTS_AUDIO_FORMAT_INVALID = -1,
TTS_AUDIO_FORMAT_DEFAULT = 0,
TTS_AUDIO_FORMAT_PCM_16_BIT = 1,
TTS_AUDIO_FORMAT_PCM_8_BIT = 2,
};

// The implementation of this interface uses this callback to notify its
// client, the Android TTS service, that a buffer of synthesized audio is
// ready, and eventually that the last requested synthesis has completed.
// The callback takes:
// @param [inout] void *& - The userdata pointer set in the original
// synth call
// @param [in] uint32_t - Track sampling rate in Hz
// @param [in] tts_audio_format - The audio format
// @param [in] int - The number of channels
// @param [inout] int8_t *& - A buffer of audio data only valid during the
// execution of the callback
// @param [inout] size_t & - The size of the buffer
// @param [in] tts_synth_status - indicate whether the synthesis is done, or
// if more data is to be synthesized.
// @return TTS_CALLBACK_HALT to indicate the synthesis must stop,
// TTS_CALLBACK_CONTINUE to indicate the synthesis must continue if
// there is more data to produce.
typedef tts_callback_status (synthDoneCB_t)(void *&, uint32_t,
tts_audio_format, int, int8_t *&, size_t&, tts_synth_status);

class TtsEngine;
extern "C" TtsEngine* getTtsEngine();

enum tts_result {
TTS_SUCCESS = 0,
TTS_FAILURE = -1,
TTS_FEATURE_UNSUPPORTED = -2,
TTS_VALUE_INVALID = -3,
TTS_PROPERTY_UNSUPPORTED = -4,
TTS_PROPERTY_SIZE_TOO_SMALL = -5,
TTS_MISSING_RESOURCES = -6
};

enum tts_support_result {
TTS_LANG_COUNTRY_VAR_AVAILABLE = 2,
TTS_LANG_COUNTRY_AVAILABLE = 1,
TTS_LANG_AVAILABLE = 0,
TTS_LANG_MISSING_DATA = -1,
TTS_LANG_NOT_SUPPORTED = -2
};


class TtsEngine
{
public:
virtual ~TtsEngine() {}

// Initializes the TTS engine and returns whether initialization succeeded.
// @param synthDoneCBPtr synthesis callback function pointer
// @param engineConfig engine configuration string
// @return TTS_SUCCESS, or TTS_FAILURE
virtual tts_result init(synthDoneCB_t synthDoneCBPtr, const char *engineConfig);

// Shuts down the TTS engine and releases all associated resources.
// @return TTS_SUCCESS, or TTS_FAILURE
virtual tts_result shutdown();

// Interrupts synthesis and flushes any synthesized data that hasn't been
// output yet. This call blocks until any callbacks underway have completed.
// @return TTS_SUCCESS, or TTS_FAILURE
virtual tts_result stop();

// Returns the level of support for the language, country and variant.
// @return TTS_LANG_COUNTRY_VAR_AVAILABLE if the language, country and variant are supported,
// and the corresponding resources are correctly installed
// TTS_LANG_COUNTRY_AVAILABLE if the language and country are supported and the
// corresponding resources are correctly installed, but there is no match for
// the specified variant
// TTS_LANG_AVAILABLE if the language is supported and the
// corresponding resources are correctly installed, but there is no match for
// the specified country and variant
// TTS_LANG_MISSING_DATA if the required resources to provide any level of support
// for the language are not correctly installed
// TTS_LANG_NOT_SUPPORTED if the language is not supported by the TTS engine.
virtual tts_support_result isLanguageAvailable(const char *lang, const char *country,
const char *variant);

// Load the resources associated with the specified language. The loaded
// language will only be used once a call to setLanguage() with the same
// language value is issued. Language and country values are coded according to the ISO three
// letter codes for languages and countries, as can be retrieved from a java.util.Locale
// instance. The variant value is encoded as the variant string retrieved from a
// java.util.Locale instance built with that variant data.
// @param lang pointer to the ISO three letter code for the language
// @param country pointer to the ISO three letter code for the country
// @param variant pointer to the variant code
// @return TTS_SUCCESS, or TTS_FAILURE
virtual tts_result loadLanguage(const char *lang, const char *country, const char *variant);

// Set the language, country and Locale variant to use for synthesis, loading the
// associated resources if they have not already been loaded via loadLanguage().
// Language and country values are coded according to the ISO three
// letter codes for languages and countries, as can be retrieved from a java.util.Locale
// instance. The variant value is encoded as the variant string retrieved from a
// java.util.Locale instance built with that variant data.
// @param lang pointer to the ISO three letter code for the language
// @param country pointer to the ISO three letter code for the country
// @param variant pointer to the variant code
// @return TTS_SUCCESS, or TTS_FAILURE
virtual tts_result setLanguage(const char *lang, const char *country, const char *variant);

// Retrieve the currently set language, country and variant, or empty strings if none
// of the parameters has been set. Language and country are represented by their 3-letter ISO codes.
// @param[out] pointer to the retrieved 3-letter code language value
// @param[out] pointer to the retrieved 3-letter code country value
// @param[out] pointer to the retrieved variant value
// @return TTS_SUCCESS, or TTS_FAILURE
virtual tts_result getLanguage(char *language, char *country, char *variant);

// Notifies the engine of the audio parameters that should be used for synthesis.
// These values are hints: the engine sets each output parameter to the value it
// will actually use for synthesis, which may differ from the requested one.
// @param[inout] encoding in: the desired audio sample format
// out: the format used by the TTS engine
// @param[inout] rate in: the desired audio sample rate
// out: the sample rate used by the TTS engine
// @param[inout] channels in: the desired number of audio channels
// out: the number of channels used by the TTS engine
// @return TTS_SUCCESS, or TTS_FAILURE
virtual tts_result setAudioFormat(tts_audio_format& encoding, uint32_t& rate,
int& channels);

// Set a property for the TTS engine.
// "size" is the maximum size of "value" for the property named by "property".
// @param property pointer to the property name
// @param value pointer to the property value
// @param size maximum size required to store this type of property
// @return TTS_PROPERTY_UNSUPPORTED, or TTS_SUCCESS, or TTS_FAILURE,
// or TTS_VALUE_INVALID
virtual tts_result setProperty(const char *property, const char *value,
const size_t size);

// Retrieve a property from the TTS engine
// @param property pointer to the property name
// @param[out] value pointer to the retrieved property value
// @param[inout] iosize in: stores the size available to store the
// property value.
// out: stores the size required to hold the property
// value if getProperty() returned
// TTS_PROPERTY_SIZE_TOO_SMALL, unchanged otherwise
// @return TTS_PROPERTY_UNSUPPORTED, or TTS_SUCCESS,
// or TTS_PROPERTY_SIZE_TOO_SMALL
virtual tts_result getProperty(const char *property, char *value,
size_t *iosize);

// Synthesize the text.
// As the synthesis is performed, the engine invokes the callback to notify
// the TTS framework that it has filled the given buffer, and indicates how
// many bytes it wrote. The callback is called repeatedly until the engine
// has generated all the audio data corresponding to the text.
// Note on the input format: the text parameter may use the following
// elements and their respective attributes as defined in the SSML 1.0
// specification:
// * lang
// * say-as:
// o interpret-as
// * phoneme
// * voice:
// o gender,
// o age,
// o variant,
// o name
// * emphasis
// * break:
// o strength,
// o time
// * prosody:
// o pitch,
// o contour,
// o range,
// o rate,
// o duration,
// o volume
// * mark
// Differences between this text format and SSML are:
// * full SSML documents are not supported
// * namespaces are not supported
// Text is coded in UTF-8.
// @param text the UTF-8 text to synthesize
// @param buffer the location where the synthesized data must be written
// @param bufferSize the number of bytes that can be written in buffer
// @param userdata pointer passed back to the caller when the callback is invoked
// @return TTS_SUCCESS or TTS_FAILURE
virtual tts_result synthesizeText(const char *text, int8_t *buffer,
size_t bufferSize, void *userdata);

};

} // namespace android

#endif /* TTS_ENGINE_H_ */
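For reference, a minimal sketch of a client callback conforming to synthDoneCB_t (the name mySynthDone and the consumption logic are illustrative; assumes this header is included and `using namespace android;` is in effect):

    // Receives one buffer of synthesized audio per invocation.
    static tts_callback_status mySynthDone(void *&userdata, uint32_t rate,
            tts_audio_format format, int channels, int8_t *&buffer,
            size_t &bufferSize, tts_synth_status status) {
        // The buffer is only valid during this call; copy the bufferSize
        // bytes out here if they are needed later.
        if (status == TTS_SYNTH_DONE) {
            return TTS_CALLBACK_HALT;     // nothing left to synthesize
        }
        return TTS_CALLBACK_CONTINUE;     // request more audio data
    }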

jni/jni/com_google_espeakengine.cpp (+593, -0)

/*
* Copyright (C) 2008 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>

#define LOG_TAG "eSpeak Engine"

#include <speak_lib.h>
#include <TtsEngine.h>
#include <Log.h>

/*
* This is the Manager layer. It sits on top of the native eSpeak engine
* and provides the interface to the defined Google TTS engine API.
* The Google engine API is the boundary to allow a TTS engine to be swapped.
* The Manager layer also provides the SSML tag interpretation.
* The supported SSML tags are mapped to corresponding tags natively supported by eSpeak.
* Native eSpeak functions always begin with espeak_XXX.
*
* Only a subset of SSML 1.0 tags is supported.
* Some SSML tags involve significant complexity.
* If the language is changed through an SSML tag, loading the new language introduces latency.
*/

using namespace android;

const char *ESPEAK_DIRECTORY = "espeak-data";

const char *eSpeakBaseResources[] = {"intonations", "phondata", "phonindex", "phontab",
"en_dict", "voices/en/en-us" };

const int NUM_BASE_RESOURCES = 6;

// Format is {espeak voice, iso3 code, name}
const char *eSpeakSupportedVoices[][3] = {
{"en-us", "eng", "English"},
{"en-us", "eng-USA", "English (US)"},
{"en", "eng-GBR", "English (UK)"},
{"en-sc", "eng-GBR-sc", "English (Scottish)"},
{"en-n", "eng-GBR-n", "English (Northern UK)"},
{"en-rp", "eng-GBR-rp", "English (Received Pronunciation)"},
{"en-wm", "eng-GBR-wm", "English (West Midlands)"},
{"af", "afr", "Afrikaans"},
{"bs", "bos", "Bosnian"},
{"ca", "cat", "Catalan"},
{"cs", "ces", "Czech"},
{"da", "dan", "Danish"},
{"de", "deu", "German"},
{"el", "ell", "Greek"},
{"eo", "epo", "Esperanto"},
{"es", "spa", "Spanish"},
{"es-la", "spa-MEX", "Spanish (Latin America)"},
{"fi", "fin", "Finnish"},
{"fr", "fra", "French"},
{"hr", "hrv", "Croatian"},
{"hu", "hun", "Hungarian"},
{"it", "ita", "Italian"},
{"kn", "kan", "Kannada"},
{"ku", "kur", "Kurdish"},
{"lv", "lav", "Latvian"},
{"nl", "nld", "Dutch"},
{"pl", "pol", "Polish"},
{"pt", "por", "Portuguese (Brazil)"},
{"pt", "por-BRA", "Portuguese (Brazil)"},
{"pt-pt", "por-PRT", "Portuguese"},
{"ro", "ron", "Romanian"},
{"sk", "slk", "Slovak"},
{"sr", "srp", "Serbian"},
{"sv", "swe", "Swedish"},
{"sw", "swa", "Swahili"},
{"ta", "tam", "Tamil"},
{"tr", "tur", "Turkish"},
{"zh", "zho", "Chinese (Mandarin)"},
{"cy", "cym", "Welsh"},
{"hi", "hin", "Hindi"},
{"hy", "hye", "Armenian"},
{"id", "ind", "Indonesian"},
{"is", "isl", "Icelandic"},
{"ka", "kat", "Georgian"},
{"la", "lat", "Latin"},
{"mk", "mkd", "Macedonian"},
{"no", "nor", "Norwegian"},
{"ru", "rus", "Russian"},
{"sq", "sqi", "Albanian"},
{"vi", "vie", "Vietnamese"},
{"zh-yue", "zho-HKG", "Chinese (Cantonese)"},
{"grc", "grc", "Ancient Greek"},
{"jbo", "jbo", "Lojban"},
{"nci", "nci", "Nahuatl (Classical)"},
{"pap", "pap", "Papiamento" }
};

const int NUM_SUPPORTED_VOICES = 55;

/* Integer constants */
const int DEFAULT_SPEECH_RATE = 150;

// Callback to the TTS API
synthDoneCB_t *ttsSynthDoneCBPointer;

char *currentLanguage = (char *) "en-us";
char *currentRate = (char *) "150";
char *eSpeakDataPath = NULL;

char currentLang[10];
char currentCountry[10];
char currentVariant[10];

bool hasInitialized = false;

/* Functions internal to the eSpeak engine wrapper */
static void setSpeechRate(int speechRate) {
espeak_SetParameter(espeakRATE, speechRate, 0);
}

/* Functions exposed to the TTS API */

/* Callback from espeak. Should call back to the TTS API */
static int eSpeakCallback(short *wav, int numsamples, espeak_EVENT *events) {
LOGI("Callback with %d samples", numsamples);

int8_t * castedWav = (int8_t *) wav;
size_t bufferSize = 0;
if (numsamples < 1) {
size_t silenceBufferSize = 2;
int8_t *silence = new int8_t[silenceBufferSize]; // TODO: This will be a small memory leak, but do it this way for now because passing in an empty buffer can cause a crash.
silence[0] = 0;
silence[1] = 0;
ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, silence,
silenceBufferSize, TTS_SYNTH_DONE);
return 1;
}
bufferSize = numsamples * sizeof(short);
ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, castedWav,
bufferSize, TTS_SYNTH_PENDING);
return 0; // continue synthesis (1 is to abort)
}

static bool fileExists(const char *fileName) {
FILE *file = fopen(fileName, "r");

if (file == NULL) {
return false;
} else {
fclose(file);
return true;
}
}

static bool hasBaseResources() {
char filename[255];

for (int i = 0; i < NUM_BASE_RESOURCES; i++) {
snprintf(filename, sizeof(filename), "%s/%s/%s", eSpeakDataPath, ESPEAK_DIRECTORY, eSpeakBaseResources[i]);

if (!fileExists(filename)) {
LOGE("Missing resource: %s", filename);
return false;
}
}

return true;
}

/* Google Engine API function implementations */

tts_result attemptInit() {
if (hasInitialized) {
return TTS_SUCCESS;
}

if (!hasBaseResources()) {
return TTS_FAILURE;
}

// TODO Make sure that the speech data is loaded in
// the directory /sdcard/espeak-data before calling this.
int sampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 4096, eSpeakDataPath, 0);

if (sampleRate <= 0) {
LOGE("eSpeak initialization failed!");
return TTS_FAILURE;
}

espeak_SetSynthCallback(eSpeakCallback);
espeak_SetParameter(espeakRATE, DEFAULT_SPEECH_RATE, 0);

espeak_VOICE voice;
memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
const char *langNativeString = "en-us"; // Default to US English
voice.languages = langNativeString;
voice.variant = 0;
espeak_SetVoiceByProperties(&voice);

hasInitialized = true;

return TTS_SUCCESS;
}

/** init
* Allocates eSpeak memory block and initializes the eSpeak system.
* synthDoneCBPtr - Pointer to callback function which will receive generated samples
* config - the engine configuration parameters, not used here
* return tts_result
*/
tts_result TtsEngine::init(synthDoneCB_t synthDoneCBPtr, const char *engineConfig) {
ttsSynthDoneCBPointer = synthDoneCBPtr;
hasInitialized = false;

if ((engineConfig != NULL) && (strlen(engineConfig) > 0)) {
eSpeakDataPath = (char *) malloc(strlen(engineConfig) + 1); // +1 for the terminating NUL
strcpy(eSpeakDataPath, engineConfig);
} else {
eSpeakDataPath = NULL;
LOGE("Data path not specified!");
return TTS_FAILURE;
}

return attemptInit();
}

/** shutdown
* Unloads all eSpeak resources; terminates eSpeak system and frees eSpeak memory block.
* return tts_result
*/
tts_result TtsEngine::shutdown(void) {
if (eSpeakDataPath != NULL) {
free(eSpeakDataPath);
eSpeakDataPath = NULL;
}

espeak_Terminate();

return TTS_SUCCESS;
}

tts_result TtsEngine::loadLanguage(const char *lang, const char *country, const char *variant) {
LOGV("loadLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);

return TTS_FAILURE;
}

tts_support_result isLanguageSupported(const char *lang, const char *country, const char *variant,
int *pindex) {
LOGV("isLanguageSupported(\"%s\", \"%s\", \"%s\")", lang, country, variant);

if ((lang == NULL) || (strlen(lang) == 0)) {
LOGE("TtsEngine::isLanguageAvailable called with no language");
return TTS_LANG_NOT_SUPPORTED;
}

if (pindex != NULL) {
*pindex = -1;
}

int langIndex = -1;
int countryIndex = -1;
int variantIndex = -1;

if (strlen(lang) == 3) {
for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
if (strncmp(lang, eSpeakSupportedVoices[i][1], 3) == 0) {
LOGI("Found ISO3 language at index %d", i);
langIndex = i;
break;
}
}
} else if (strlen(lang) == 2) {
for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
if (strncmp(lang, eSpeakSupportedVoices[i][0], 2) == 0) {
LOGI("Found ISO2 language at index %d", i);
langIndex = i;
break;
}
}
}

if (langIndex < 0) {
LOGV("TtsEngine::isLanguageAvailable called with unsupported language");
return TTS_LANG_NOT_SUPPORTED;
}

if ((country == NULL) || (strlen(country) == 0)) {
// TODO: Check whether resources are available for this language.

if (pindex != NULL) {
*pindex = langIndex;
}

LOGI("No country specified, language is available");
return TTS_LANG_AVAILABLE;
}

char lang_country[10];
snprintf(lang_country, sizeof(lang_country), "%s-%s", lang, country);

// Find country
if (strlen(country) == 3) {
for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
if ((strcmp(lang_country, eSpeakSupportedVoices[i][1]) == 0)) {
LOGI("Found ISO3 country at index %d", i);
countryIndex = i;
break;
}
}
} else if (strlen(country) == 2) {
for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
if ((strcmp(lang_country, eSpeakSupportedVoices[i][0]) == 0)) {
LOGI("Found ISO2 country at index %d", i);
countryIndex = i;
break;
}
}
}

if (countryIndex < 0) {
if (pindex != NULL) {
*pindex = langIndex;
}

LOGI("No country found, language is available");
return TTS_LANG_AVAILABLE;
}

if ((variant == NULL) || (strlen(variant) == 0)) {
if (pindex != NULL) {
*pindex = countryIndex;
}

LOGI("No variant specified, language and country are available");
return TTS_LANG_COUNTRY_AVAILABLE;
}

char lang_country_variant[15];
snprintf(lang_country_variant, sizeof(lang_country_variant), "%s-%s-%s", lang, country, variant);

// Find variant
for (int i = countryIndex; i < NUM_SUPPORTED_VOICES; i++) {
if ((strcmp(lang_country_variant, eSpeakSupportedVoices[i][1]) == 0)) {
LOGI("Found variant at index %d", i);
variantIndex = i;
break;
}
}

if (variantIndex < 0) {
if (pindex != NULL) {
*pindex = countryIndex;
}

LOGI("No variant found, language and country are available");
return TTS_LANG_COUNTRY_AVAILABLE;
}

if (pindex != NULL) {
*pindex = variantIndex;
}

LOGI("Language, country, and variant are available");
return TTS_LANG_COUNTRY_VAR_AVAILABLE;
}
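Against the voice table above, the lookup resolves as follows (illustrative calls; i receives the matching table index):

    // isLanguageSupported("eng", "GBR", "rp", &i) -> TTS_LANG_COUNTRY_VAR_AVAILABLE, i = 5
    // isLanguageSupported("eng", "GBR", "xx", &i) -> TTS_LANG_COUNTRY_AVAILABLE,     i = 2
    // isLanguageSupported("eng", "",    "",   &i) -> TTS_LANG_AVAILABLE,             i = 0
    // isLanguageSupported("xyz", "",    "",   &i) -> TTS_LANG_NOT_SUPPORTED,         i = -1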

tts_result TtsEngine::setLanguage(const char *lang, const char *country, const char *variant) {
LOGV("setLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);

// Make sure the engine is initialized!
attemptInit();

int index = -1;

isLanguageSupported(lang, country, variant, &index);

if (index < 0) {
LOGE("setLanguage called with unsupported language");
return TTS_FAILURE;
}

strcpy(currentLang, lang);
strcpy(currentCountry, country == NULL ? "" : country);
strcpy(currentVariant, variant == NULL ? "" : variant);

char espeakLangStr[7];
strcpy(espeakLangStr, eSpeakSupportedVoices[index][0]);

espeak_VOICE voice;
memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
voice.variant = 0;
voice.languages = espeakLangStr;
espeak_ERROR err = espeak_SetVoiceByProperties(&voice);
currentLanguage = new char[strlen(lang) + 1]; // +1 for the terminating NUL
strcpy(currentLanguage, lang);

if (err != EE_OK) {
LOGE("Error code %d when setting voice properties!", err);
return TTS_FAILURE;
}

return TTS_SUCCESS;
}

tts_support_result TtsEngine::isLanguageAvailable(const char *lang, const char *country,
const char *variant) {
return isLanguageSupported(lang, country, variant, NULL);
}

tts_result TtsEngine::getLanguage(char *language, char *country, char *variant) {
strcpy(language, currentLang);
strcpy(country, currentCountry);
strcpy(variant, currentVariant);

return TTS_SUCCESS;
}

/** setAudioFormat
* sets the audio format to use for synthesis, returns what is actually used.
* @encoding - reference to encoding format
* @rate - reference to sample rate
* @channels - reference to number of channels
* return tts_result
*/
tts_result TtsEngine::setAudioFormat(tts_audio_format& encoding, uint32_t& rate, int& channels) {
LOGE("setAudioFormat(%d, %d, %d) is unsupported", encoding, rate, channels);

// TODO: Fix this!
return TTS_SUCCESS;
}
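The stub above logs an error but ignores its in/out parameters. Since eSpeakCallback always reports 22050 Hz, 16-bit PCM, mono, a sketch of an implementation consistent with that fixed output might look like this (not the commit's code):

    tts_result TtsEngine::setAudioFormat(tts_audio_format& encoding, uint32_t& rate, int& channels) {
        // Report the engine's fixed synthesis format back to the caller.
        encoding = TTS_AUDIO_FORMAT_PCM_16_BIT;
        rate = 22050;
        channels = 1;
        return TTS_SUCCESS;
    }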

// Sets the property with the specified value
tts_result TtsEngine::setProperty(const char *property, const char *value, const size_t size) {
LOGV("setProperty(\"%s\", \"%s\", %d)", property, value, size);

/* Set a specific property for the engine.
Supported properties include: language (locale), rate, pitch, volume. */
/* Sanity check */
if (property == NULL) {
LOGE("setProperty called with property NULL");
return TTS_PROPERTY_UNSUPPORTED;
}

if (value == NULL) {
LOGE("setProperty called with value NULL");
return TTS_VALUE_INVALID;
}

espeak_ERROR result;

if (strncmp(property, "language", 8) == 0) {
// TODO: Set this property
result = EE_OK;
} else if (strncmp(property, "rate", 4) == 0) {
int rate = atoi(value) * DEFAULT_SPEECH_RATE / 100;
result = espeak_SetParameter(espeakRATE, rate, 0);
} else if (strncmp(property, "pitch", 5) == 0) {
int pitch = atoi(value);
result = espeak_SetParameter(espeakPITCH, pitch, 0);
} else if (strncmp(property, "volume", 6) == 0) {
int volume = atoi(value);
result = espeak_SetParameter(espeakVOLUME, volume, 0);
} else {
return TTS_PROPERTY_UNSUPPORTED;
}

if (result == EE_OK) {
return TTS_SUCCESS;
} else {
return TTS_FAILURE;
}
}

// Retrieves the value of the specified property
tts_result TtsEngine::getProperty(const char *property, char *value, size_t *iosize) {
LOGV("getProperty(\"%s\", ...)", property);

/* Get the property for the engine.
This property was previously set by setProperty or by default. */
/* sanity check */
if (property == NULL) {
LOGE("getProperty called with property NULL");
return TTS_PROPERTY_UNSUPPORTED;
}

if (value == NULL) {
LOGE("getProperty called with value NULL");
return TTS_VALUE_INVALID;
}

if (strncmp(property, "language", 8) == 0) {
if (currentLanguage == NULL) {
strcpy(value, "");
} else {
if (*iosize < strlen(currentLanguage)+1) {
*iosize = strlen(currentLanguage) + 1;
return TTS_PROPERTY_SIZE_TOO_SMALL;
}
strcpy(value, currentLanguage);
}
return TTS_SUCCESS;
} else if (strncmp(property, "rate", 4) == 0) {
int rate = espeak_GetParameter(espeakRATE, 1) * 100 / DEFAULT_SPEECH_RATE;
char tmprate[4];
sprintf(tmprate, "%d", rate);
if (*iosize < strlen(tmprate)+1) {
*iosize = strlen(tmprate) + 1;
return TTS_PROPERTY_SIZE_TOO_SMALL;
}
strcpy(value, tmprate);
return TTS_SUCCESS;
} else if (strncmp(property, "pitch", 5) == 0) {
char tmppitch[4];
sprintf(tmppitch, "%d", espeak_GetParameter(espeakPITCH, 1));
if (*iosize < strlen(tmppitch)+1) {
*iosize = strlen(tmppitch) + 1;
return TTS_PROPERTY_SIZE_TOO_SMALL;
}
strcpy(value, tmppitch);
return TTS_SUCCESS;
} else if (strncmp(property, "volume", 6) == 0) {
char tmpvolume[4];
sprintf(tmpvolume, "%d", espeak_GetParameter(espeakVOLUME, 1));
if (*iosize < strlen(tmpvolume)+1) {
*iosize = strlen(tmpvolume) + 1;
return TTS_PROPERTY_SIZE_TOO_SMALL;
}
strcpy(value, tmpvolume);
return TTS_SUCCESS;
}

LOGE("Unsupported property");
return TTS_PROPERTY_UNSUPPORTED;
}
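When the supplied buffer is too small, getProperty() stores the required size in *iosize and returns TTS_PROPERTY_SIZE_TOO_SMALL, so callers are expected to retry. A usage sketch (the engine pointer is assumed to come from getTtsEngine()):

    char small[2];
    size_t size = sizeof(small);
    tts_result r = engine->getProperty("rate", small, &size);
    if (r == TTS_PROPERTY_SIZE_TOO_SMALL) {
        char *value = new char[size];     // size includes the terminator
        r = engine->getProperty("rate", value, &size);
        // ... use value ...
        delete[] value;
    }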

/** synthesizeText
* Synthesizes a text string.
* The text string could be annotated with SSML tags.
* @text - text to synthesize
* @buffer - buffer which will receive generated samples
* @bufferSize - size of buffer
* @userdata - pointer to user data which will be passed back to callback function
* return tts_result
*/
tts_result TtsEngine::synthesizeText(const char *text, int8_t *buffer, size_t bufferSize,
void *userdata) {
LOGI("Synthesize: %s", text);

espeak_SetSynthCallback(eSpeakCallback);

unsigned int unique_identifier;

espeak_Synth(text, strlen(text), 0, // position
POS_CHARACTER, 0, // end position (0 means no end position)
espeakCHARS_UTF8 | espeakSSML, // use or ignore xml tags
&unique_identifier, userdata);
espeak_Synchronize();

LOGI("Synthesis done");

return TTS_SUCCESS;
}

/** stop
* Aborts the running synthesis.
* return tts_result
*/
tts_result TtsEngine::stop() {
espeak_Cancel();
return TTS_SUCCESS;
}

#ifdef __cplusplus
extern "C" {
#endif

TtsEngine* getTtsEngine() {
return new TtsEngine();
}

#ifdef __cplusplus
}
#endif
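Putting the pieces together, a host could drive this engine as follows. This is a sketch: the data path is illustrative (the engine expects an espeak-data directory beneath it), and the callback merely counts bytes.

    // Assumes <TtsEngine.h> is included and `using namespace android;` is in effect.
    static size_t totalBytes = 0;

    static tts_callback_status countBytes(void *&userdata, uint32_t rate,
            tts_audio_format format, int channels, int8_t *&buffer,
            size_t &bufferSize, tts_synth_status status) {
        totalBytes += bufferSize;
        return (status == TTS_SYNTH_DONE) ? TTS_CALLBACK_HALT
                                          : TTS_CALLBACK_CONTINUE;
    }

    int main() {
        TtsEngine *engine = getTtsEngine();
        if (engine->init(countBytes, "/sdcard") != TTS_SUCCESS) {
            return 1;
        }
        engine->setLanguage("eng", "USA", "");
        engine->synthesizeText("Hello, world.", NULL, 0, NULL);
        engine->shutdown();
        delete engine;
        return 0;
    }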

jni/jni/com_googlecode_eyesfree_espeak_eSpeakService.cpp (+341, -0)

/*
* Copyright (C) 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <jni.h>

#include <speak_lib.h>
#include <TtsEngine.h>
#include <Log.h>

#define LOG_TAG "eSpeakService"
#define DEBUG false

enum audio_channel_count {
CHANNEL_COUNT_MONO = 1,
CHANNEL_COUNT_STEREO = 2
};

enum audio_encoding {
ENCODING_INVALID = 0x00,
ENCODING_DEFAULT = 0x01,
ENCODING_PCM_16BIT = 0x02,
ENCODING_PCM_8BIT = 0x03
};

enum synthesis_result {
SYNTH_CONTINUE = 0,
SYNTH_ABORT = 1
};

const int DEFAULT_SAMPLE_RATE = 22050;
const int DEFAULT_CHANNEL_COUNT = CHANNEL_COUNT_MONO;
const int DEFAULT_AUDIO_FORMAT = ENCODING_PCM_16BIT;
const int DEFAULT_BUFFER_SIZE = 1000;

struct native_data_t {
JNIEnv *env;
jobject object;
int sampleRate;
int channelCount;
int audioFormat;
int bufferSizeInMillis;

native_data_t() {
env = NULL;
object = NULL;
sampleRate = DEFAULT_SAMPLE_RATE;
channelCount = DEFAULT_CHANNEL_COUNT;
audioFormat = DEFAULT_AUDIO_FORMAT;
bufferSizeInMillis = DEFAULT_BUFFER_SIZE;
}
};

jmethodID METHOD_nativeSynthCallback;
jfieldID FIELD_mNativeData;

static inline native_data_t *getNativeData(JNIEnv *env, jobject object) {
return (native_data_t *) (env->GetIntField(object, FIELD_mNativeData));
}

/* Callback from espeak. Should call back to the TTS API */
static int SynthCallback(short *audioData, int numSamples,
espeak_EVENT *events) {
native_data_t *nat = (native_data_t *) events->user_data;
JNIEnv *env = nat->env;
jobject object = nat->object;

if (numSamples < 1) {
env->CallVoidMethod(object, METHOD_nativeSynthCallback, NULL);
return SYNTH_ABORT;
} else {
jbyteArray arrayAudioData = env->NewByteArray(numSamples * 2);
env->SetByteArrayRegion(arrayAudioData, 0, (numSamples * 2), (jbyte *) audioData);
env->CallVoidMethod(object, METHOD_nativeSynthCallback, arrayAudioData);
return SYNTH_CONTINUE;
}
}

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

JNIEXPORT jint
JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
JNIEnv *env;

if (vm->GetEnv((void **) &env, JNI_VERSION_1_6) != JNI_OK) {
LOGE("Failed to get the environment using GetEnv()");
return -1;
}

return JNI_VERSION_1_6;
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeClassInit(
JNIEnv* env, jclass clazz) {
if (DEBUG) LOGV("%s", __FUNCTION__);
METHOD_nativeSynthCallback = env->GetMethodID(clazz, "nativeSynthCallback", "([B)V");
FIELD_mNativeData = env->GetFieldID(clazz, "mNativeData", "I");

return JNI_TRUE;
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeCreate(
JNIEnv *env, jobject object, jstring path) {
if (DEBUG) LOGV("%s", __FUNCTION__);
native_data_t *nat = new native_data_t;

if (nat == NULL) {
LOGE("%s: out of memory!", __FUNCTION__);
return JNI_FALSE;
}

env->SetIntField(object, FIELD_mNativeData, (jint) nat);

const char *c_path = env->GetStringUTFChars(path, NULL);

nat->object = env->NewWeakGlobalRef(object);
if (DEBUG) LOGV("Initializing with path %s", c_path);
nat->sampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, nat->bufferSizeInMillis, c_path, 0);

env->ReleaseStringUTFChars(path, c_path);

if (nat->sampleRate > 0) {
return JNI_TRUE;
} else {
return JNI_FALSE;
}
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeDestroy(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);
native_data_t *nat = getNativeData(env, object);

env->DeleteWeakGlobalRef(nat->object);

delete nat; // matches the 'new' in nativeCreate

return JNI_TRUE;
}

JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetSampleRate(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);
const native_data_t *nat = getNativeData(env, object);
return (jint) nat->sampleRate;
}

JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetChannelCount(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);
const native_data_t *nat = getNativeData(env, object);
return (jint) nat->channelCount;
}

JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetAudioFormat(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);
const native_data_t *nat = getNativeData(env, object);
return (jint) nat->audioFormat;
}

JNIEXPORT jint
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetBufferSizeInMillis(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);
const native_data_t *nat = getNativeData(env, object);
return (jint) nat->bufferSizeInMillis;
}

JNIEXPORT jobjectArray
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeGetAvailableVoices(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);

const espeak_VOICE **voices = espeak_ListVoices(NULL);

int count;

// First, count the number of voices returned.
for (count = 0; voices[count] != NULL; count++);

// Next, create a Java String array.
jobjectArray voicesArray = (jobjectArray) env->NewObjectArray(
count * 4, env->FindClass("java/lang/String"), NULL);

const espeak_VOICE *v;
char gender_buf[12];
char age_buf[12];

// Finally, populate the array.
for (int i = 0, voicesIndex = 0; (v = voices[i]) != NULL; i++) {
const char *lang_name = v->languages + 1;
const char *identifier = v->identifier;
sprintf(gender_buf, "%d", v->gender);
sprintf(age_buf, "%d", v->age);

env->SetObjectArrayElement(
voicesArray, voicesIndex++, env->NewStringUTF(lang_name));
env->SetObjectArrayElement(
voicesArray, voicesIndex++, env->NewStringUTF(identifier));
env->SetObjectArrayElement(
voicesArray, voicesIndex++, env->NewStringUTF(gender_buf));
env->SetObjectArrayElement(
voicesArray, voicesIndex++, env->NewStringUTF(age_buf));
}

return voicesArray;
}
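The returned String array is flat, four entries per voice; the Java side is expected to consume it in strides of four:

    // [4*i + 0] language name (v->languages + 1 skips espeak's priority byte)
    // [4*i + 1] voice identifier
    // [4*i + 2] gender, as a decimal string
    // [4*i + 3] age, as a decimal string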

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetVoiceByProperties(
JNIEnv *env, jobject object, jstring name, jstring languages, jint gender, jint age,
jint variant) {
if (DEBUG) LOGV("%s", __FUNCTION__);

const char *c_name = env->GetStringUTFChars(name, NULL);
const char *c_languages = env->GetStringUTFChars(languages, NULL);

espeak_VOICE voice_select;
memset(&voice_select, 0, sizeof(espeak_VOICE));

voice_select.name = c_name;
voice_select.languages = c_languages;
voice_select.age = (int) age;
voice_select.gender = (int) gender;
voice_select.variant = (int) variant;

const espeak_ERROR result = espeak_SetVoiceByProperties(&voice_select);

env->ReleaseStringUTFChars(name, c_name);
env->ReleaseStringUTFChars(languages, c_languages);

if (result == EE_OK)
return JNI_TRUE;
else
return JNI_FALSE;
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetLanguage(
JNIEnv *env, jobject object, jstring language, jint variant) {
if (DEBUG) LOGV("%s", __FUNCTION__);
const char *c_language = env->GetStringUTFChars(language, NULL);
const int len = strlen(c_language);
char *lang_copy = (char *) calloc(len + 1, sizeof(char)); // +1 for the terminating NUL
strcpy(lang_copy, c_language);
env->ReleaseStringUTFChars(language, c_language);

espeak_VOICE voice;
memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
voice.languages = lang_copy;
voice.variant = (int) variant;
const espeak_ERROR result = espeak_SetVoiceByProperties(&voice);

if (result == EE_OK)
return JNI_TRUE;
else
return JNI_FALSE;
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetRate(
JNIEnv *env, jobject object, jint rate) {
if (DEBUG) LOGV("%s", __FUNCTION__);
const espeak_ERROR result = espeak_SetParameter(espeakRATE, (int) rate, 0);

if (result == EE_OK)
return JNI_TRUE;
else
return JNI_FALSE;
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSetPitch(
JNIEnv *env, jobject object, jint pitch) {
if (DEBUG) LOGV("%s", __FUNCTION__);
const espeak_ERROR result = espeak_SetParameter(espeakPITCH, (int) pitch, 0);

if (result == EE_OK)
return JNI_TRUE;
else
return JNI_FALSE;
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeSynthesize(
JNIEnv *env, jobject object, jstring text) {
if (DEBUG) LOGV("%s", __FUNCTION__);
native_data_t *nat = getNativeData(env, object);
const char *c_text = env->GetStringUTFChars(text, NULL);
unsigned int unique_identifier;

nat->env = env;

espeak_SetSynthCallback(SynthCallback);
espeak_Synth(c_text, strlen(c_text), 0, // position
POS_CHARACTER, 0, // end position (0 means no end position)
espeakCHARS_UTF8 | espeakSSML, // use or ignore xml tags
&unique_identifier, nat);
espeak_Synchronize();

env->ReleaseStringUTFChars(text, c_text);

return JNI_TRUE;
}

JNIEXPORT jboolean
JNICALL Java_com_googlecode_eyesfree_espeak_SpeechSynthesis_nativeStop(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);
espeak_Cancel();

return JNI_TRUE;
}

#ifdef __cplusplus
}
#endif /* __cplusplus */
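nativeGetBufferSizeInMillis() reports the value passed to espeak_Initialize(), which is in milliseconds of audio per synthesis callback. A sketch of converting that figure to a byte count for 16-bit PCM (the helper name is illustrative):

    // Bytes of 16-bit PCM needed to hold 'millis' ms of audio.
    static size_t bufferBytes(int sampleRate, int channels, int millis) {
        const int bytesPerSample = 2;   // ENCODING_PCM_16BIT
        return (size_t) sampleRate * channels * bytesPerSample * millis / 1000;
    }
    // e.g. bufferBytes(22050, 1, 1000) == 44100 bytes per second of speech.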
