eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

eSpeakService.cpp 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. /*
  2. * Copyright (C) 2012-2013 Reece H. Dunn
  3. * Copyright (C) 2011 Google Inc.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /*
  18. * This file contains the JNI bindings to eSpeak used by SpeechSynthesis.java.
  19. *
  20. * Android Version: 4.0 (Ice Cream Sandwich)
  21. * API Version: 14
  22. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <new>

#include <jni.h>

#include <speak_lib.h>
#include <Log.h>
  29. /** @name Java to Wide String Helpers
  30. * @brief These are helpers for converting a jstring to wchar_t*.
  31. *
  32. * This assumes that wchar_t is a 32-bit (UTF-32) value.
  33. */
  34. //@{
  35. const char *utf8_read(const char *in, wchar_t &c)
  36. {
  37. if (uint8_t(*in) < 0x80)
  38. c = *in++;
  39. else switch (uint8_t(*in) & 0xF0)
  40. {
  41. default:
  42. c = uint8_t(*in++) & 0x1F;
  43. c = (c << 6) + (uint8_t(*in++) & 0x3F);
  44. break;
  45. case 0xE0:
  46. c = uint8_t(*in++) & 0x0F;
  47. c = (c << 6) + (uint8_t(*in++) & 0x3F);
  48. c = (c << 6) + (uint8_t(*in++) & 0x3F);
  49. break;
  50. case 0xF0:
  51. c = uint8_t(*in++) & 0x07;
  52. c = (c << 6) + (uint8_t(*in++) & 0x3F);
  53. c = (c << 6) + (uint8_t(*in++) & 0x3F);
  54. c = (c << 6) + (uint8_t(*in++) & 0x3F);
  55. break;
  56. }
  57. return in;
  58. }
  59. class unicode_string
  60. {
  61. static_assert(sizeof(wchar_t) == 4, "wchar_t is not UTF-32");
  62. public:
  63. unicode_string(JNIEnv *env, jstring str);
  64. ~unicode_string();
  65. const wchar_t *c_str() const { return mString; }
  66. private:
  67. wchar_t *mString;
  68. };
  69. unicode_string::unicode_string(JNIEnv *env, jstring str)
  70. : mString(NULL)
  71. {
  72. if (str == NULL) return;
  73. const char *utf8 = env->GetStringUTFChars(str, NULL);
  74. mString = (wchar_t *)malloc((strlen(utf8) + 1) * sizeof(wchar_t));
  75. const char *utf8_current = utf8;
  76. wchar_t *utf32_current = mString;
  77. while (*utf8_current)
  78. {
  79. utf8_current = utf8_read(utf8_current, *utf32_current);
  80. ++utf32_current;
  81. }
  82. *utf32_current = 0;
  83. env->ReleaseStringUTFChars(str, utf8);
  84. }
  85. unicode_string::~unicode_string()
  86. {
  87. if (mString) free(mString);
  88. }
  89. //@}
// Tag used by the LOGV/LOGE macros from Log.h.
#define LOG_TAG "eSpeakService"
// When true, each JNI entry point logs its invocation.
#define DEBUG true

// Channel configurations reported back to the Java layer.
// NOTE(review): values appear to mirror android.media.AudioFormat
// channel counts — confirm against the Java side.
enum audio_channel_count {
	CHANNEL_COUNT_MONO = 1,
	CHANNEL_COUNT_STEREO = 2
};

// Audio sample encodings reported back to the Java layer.
// NOTE(review): values appear to mirror android.media.AudioFormat
// ENCODING_* constants — confirm against the Java side.
enum audio_encoding {
	ENCODING_INVALID = 0x00,
	ENCODING_DEFAULT = 0x01,
	ENCODING_PCM_16BIT = 0x02,
	ENCODING_PCM_8BIT = 0x03
};

// Return values for SynthCallback: tells eSpeak whether to keep
// delivering audio or abort the current synthesis.
enum synthesis_result {
	SYNTH_CONTINUE = 0,
	SYNTH_ABORT = 1
};

const int DEFAULT_CHANNEL_COUNT = CHANNEL_COUNT_MONO;
const int DEFAULT_AUDIO_FORMAT = ENCODING_PCM_16BIT;
// Internal buffer length handed to espeak_Initialize, in milliseconds.
const int DEFAULT_BUFFER_SIZE = 1000;
  109. struct native_data_t {
  110. JNIEnv *env;
  111. jobject object;
  112. int sampleRate;
  113. int channelCount;
  114. int audioFormat;
  115. int bufferSizeInMillis;
  116. native_data_t() {
  117. env = NULL;
  118. object = NULL;
  119. sampleRate = 0;
  120. channelCount = DEFAULT_CHANNEL_COUNT;
  121. audioFormat = DEFAULT_AUDIO_FORMAT;
  122. bufferSizeInMillis = DEFAULT_BUFFER_SIZE;
  123. }
  124. };
// JNI handles cached once by nativeClassInit and reused by every call.
jmethodID METHOD_nativeSynthCallback;
jfieldID FIELD_mNativeData;

// Recovers the native_data_t pointer stored in the Java object's
// mNativeData int field (set by nativeCreate).
// NOTE(review): a pointer squeezed into a 32-bit "I" field is truncated on
// 64-bit platforms; a long ("J") field would be required there — verify
// the target ABI before reusing this code.
static inline native_data_t *getNativeData(JNIEnv *env, jobject object) {
	return (native_data_t *) (env->GetIntField(object, FIELD_mNativeData));
}
  130. /* Callback from espeak. Should call back to the TTS API */
  131. static int SynthCallback(short *audioData, int numSamples,
  132. espeak_EVENT *events) {
  133. native_data_t *nat = (native_data_t *) events->user_data;
  134. JNIEnv *env = nat->env;
  135. jobject object = nat->object;
  136. if (numSamples < 1) {
  137. env->CallVoidMethod(object, METHOD_nativeSynthCallback, NULL);
  138. return SYNTH_ABORT;
  139. } else {
  140. jbyteArray arrayAudioData = env->NewByteArray(numSamples * 2);
  141. env->SetByteArrayRegion(arrayAudioData, 0, (numSamples * 2), (jbyte *) audioData);
  142. env->CallVoidMethod(object, METHOD_nativeSynthCallback, arrayAudioData);
  143. return SYNTH_CONTINUE;
  144. }
  145. }
  146. #ifdef __cplusplus
  147. extern "C" {
  148. #endif /* __cplusplus */
  149. JNIEXPORT jint
  150. JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
  151. JNIEnv *env;
  152. if (vm->GetEnv((void **) &env, JNI_VERSION_1_6) != JNI_OK) {
  153. LOGE("Failed to get the environment using GetEnv()");
  154. return -1;
  155. }
  156. return JNI_VERSION_1_6;
  157. }
  158. JNIEXPORT jboolean
  159. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeClassInit(
  160. JNIEnv* env, jclass clazz) {
  161. if (DEBUG) LOGV("%s", __FUNCTION__);
  162. METHOD_nativeSynthCallback = env->GetMethodID(clazz, "nativeSynthCallback", "([B)V");
  163. FIELD_mNativeData = env->GetFieldID(clazz, "mNativeData", "I");
  164. return JNI_TRUE;
  165. }
  166. JNIEXPORT jboolean
  167. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeCreate(
  168. JNIEnv *env, jobject object, jstring path) {
  169. if (DEBUG) LOGV("%s [env=%p, object=%p]", __FUNCTION__, env, object);
  170. native_data_t *nat = new native_data_t;
  171. if (nat == NULL) {
  172. LOGE("%s: out of memory!", __FUNCTION__);
  173. return JNI_FALSE;
  174. }
  175. env->SetIntField(object, FIELD_mNativeData, (jint) nat);
  176. const char *c_path = path ? env->GetStringUTFChars(path, NULL) : NULL;
  177. nat->object = env->NewWeakGlobalRef(object);
  178. if (DEBUG) LOGV("Initializing with path %s", c_path);
  179. nat->sampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, nat->bufferSizeInMillis, c_path, 0);
  180. if (c_path) env->ReleaseStringUTFChars(path, c_path);
  181. return (nat->sampleRate > 0) ? JNI_TRUE : JNI_FALSE;
  182. }
  183. JNIEXPORT jboolean
  184. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeDestroy(
  185. JNIEnv *env, jobject object) {
  186. if (DEBUG) LOGV("%s [env=%p, object=%p]", __FUNCTION__, env, object);
  187. native_data_t *nat = getNativeData(env, object);
  188. if (nat) {
  189. env->DeleteWeakGlobalRef(nat->object);
  190. delete nat;
  191. } else {
  192. env->DeleteWeakGlobalRef(object);
  193. }
  194. return JNI_TRUE;
  195. }
  196. JNIEXPORT jobject
  197. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeGetVersion(
  198. JNIEnv *env, jclass clazz) {
  199. if (DEBUG) LOGV("%s", __FUNCTION__);
  200. return env->NewStringUTF(espeak_Info(NULL));
  201. }
  202. JNIEXPORT jint
  203. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeGetSampleRate(
  204. JNIEnv *env, jobject object) {
  205. if (DEBUG) LOGV("%s", __FUNCTION__);
  206. const native_data_t *nat = getNativeData(env, object);
  207. return (jint)(nat ? nat->sampleRate : 0);
  208. }
  209. JNIEXPORT jint
  210. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeGetChannelCount(
  211. JNIEnv *env, jobject object) {
  212. if (DEBUG) LOGV("%s", __FUNCTION__);
  213. const native_data_t *nat = getNativeData(env, object);
  214. return (jint) nat->channelCount;
  215. }
  216. JNIEXPORT jint
  217. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeGetAudioFormat(
  218. JNIEnv *env, jobject object) {
  219. if (DEBUG) LOGV("%s", __FUNCTION__);
  220. const native_data_t *nat = getNativeData(env, object);
  221. return (jint) nat->audioFormat;
  222. }
  223. JNIEXPORT jint
  224. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeGetBufferSizeInMillis(
  225. JNIEnv *env, jobject object) {
  226. if (DEBUG) LOGV("%s", __FUNCTION__);
  227. const native_data_t *nat = getNativeData(env, object);
  228. return (jint) nat->bufferSizeInMillis;
  229. }
  230. JNIEXPORT jobjectArray
  231. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeGetAvailableVoices(
  232. JNIEnv *env, jobject object) {
  233. if (DEBUG) LOGV("%s", __FUNCTION__);
  234. const espeak_VOICE **voices = espeak_ListVoices(NULL);
  235. int count;
  236. // First, count the number of voices returned.
  237. for (count = 0; voices[count] != NULL; count++);
  238. // Next, create a Java String array.
  239. jobjectArray voicesArray = (jobjectArray) env->NewObjectArray(
  240. count * 4, env->FindClass("java/lang/String"), NULL);
  241. const espeak_VOICE *v;
  242. char gender_buf[12];
  243. char age_buf[12];
  244. // Finally, populate the array.
  245. for (int i = 0, voicesIndex = 0; (v = voices[i]) != NULL; i++) {
  246. const char *lang_name = v->languages + 1;
  247. const char *identifier = v->identifier;
  248. sprintf(gender_buf, "%d", v->gender);
  249. sprintf(age_buf, "%d", v->age);
  250. env->SetObjectArrayElement(
  251. voicesArray, voicesIndex++, env->NewStringUTF(lang_name));
  252. env->SetObjectArrayElement(
  253. voicesArray, voicesIndex++, env->NewStringUTF(identifier));
  254. env->SetObjectArrayElement(
  255. voicesArray, voicesIndex++, env->NewStringUTF(gender_buf));
  256. env->SetObjectArrayElement(
  257. voicesArray, voicesIndex++, env->NewStringUTF(age_buf));
  258. }
  259. return voicesArray;
  260. }
  261. JNIEXPORT jboolean
  262. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeSetVoiceByName(
  263. JNIEnv *env, jobject object, jstring name) {
  264. const char *c_name = name ? env->GetStringUTFChars(name, NULL) : NULL;
  265. if (DEBUG) LOGV("%s(name=%s)", __FUNCTION__, c_name);
  266. const espeak_ERROR result = espeak_SetVoiceByName(c_name);
  267. if (c_name) env->ReleaseStringUTFChars(name, c_name);
  268. switch (result) {
  269. case EE_OK: return JNI_TRUE;
  270. case EE_INTERNAL_ERROR: LOGE("espeak_SetVoiceByName: internal error."); break;
  271. case EE_BUFFER_FULL: LOGE("espeak_SetVoiceByName: buffer full."); break;
  272. case EE_NOT_FOUND: LOGE("espeak_SetVoiceByName: not found."); break;
  273. }
  274. return JNI_FALSE;
  275. }
  276. JNIEXPORT jboolean
  277. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeSetVoiceByProperties(
  278. JNIEnv *env, jobject object, jstring language, jint gender, jint age) {
  279. const char *c_language = language ? env->GetStringUTFChars(language, NULL) : NULL;
  280. if (DEBUG) LOGV("%s(language=%s, gender=%d, age=%d)", __FUNCTION__, c_language, gender, age);
  281. espeak_VOICE voice_select;
  282. memset(&voice_select, 0, sizeof(espeak_VOICE));
  283. voice_select.languages = c_language;
  284. voice_select.gender = (int) gender;
  285. voice_select.age = (int) age;
  286. const espeak_ERROR result = espeak_SetVoiceByProperties(&voice_select);
  287. if (c_language) env->ReleaseStringUTFChars(language, c_language);
  288. switch (result) {
  289. case EE_OK: return JNI_TRUE;
  290. case EE_INTERNAL_ERROR: LOGE("espeak_SetVoiceByProperties: internal error."); break;
  291. case EE_BUFFER_FULL: LOGE("espeak_SetVoiceByProperties: buffer full."); break;
  292. case EE_NOT_FOUND: LOGE("espeak_SetVoiceByProperties: not found."); break;
  293. }
  294. return JNI_FALSE;
  295. }
  296. JNIEXPORT jboolean
  297. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeSetParameter(
  298. JNIEnv *env, jobject object, jint parameter, jint value) {
  299. if (DEBUG) LOGV("%s(parameter=%d, value=%d)", __FUNCTION__, parameter, value);
  300. const espeak_ERROR result = espeak_SetParameter((espeak_PARAMETER)parameter, (int)value, 0);
  301. switch (result) {
  302. case EE_OK: return JNI_TRUE;
  303. case EE_INTERNAL_ERROR: LOGE("espeak_SetParameter: internal error."); break;
  304. case EE_BUFFER_FULL: LOGE("espeak_SetParameter: buffer full."); break;
  305. case EE_NOT_FOUND: LOGE("espeak_SetParameter: not found."); break;
  306. }
  307. return JNI_FALSE;
  308. }
  309. JNIEXPORT jint
  310. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeGetParameter(
  311. JNIEnv *env, jobject object, jint parameter, jint current) {
  312. if (DEBUG) LOGV("%s(parameter=%d, pitch=%d)", __FUNCTION__, parameter, current);
  313. return espeak_GetParameter((espeak_PARAMETER)parameter, (int)current);
  314. }
  315. JNIEXPORT jboolean
  316. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeSetPunctuationCharacters(
  317. JNIEnv *env, jobject object, jstring characters) {
  318. if (DEBUG) LOGV("%s)", __FUNCTION__);
  319. unicode_string list(env, characters);
  320. const espeak_ERROR result = espeak_SetPunctuationList(list.c_str());
  321. switch (result) {
  322. case EE_OK: return JNI_TRUE;
  323. case EE_INTERNAL_ERROR: LOGE("espeak_SetPunctuationList: internal error."); break;
  324. case EE_BUFFER_FULL: LOGE("espeak_SetPunctuationList: buffer full."); break;
  325. case EE_NOT_FOUND: LOGE("espeak_SetPunctuationList: not found."); break;
  326. }
  327. return JNI_FALSE;
  328. }
  329. JNIEXPORT jboolean
  330. JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeSynthesize(
  331. JNIEnv *env, jobject object, jstring text, jboolean isSsml) {
  332. if (DEBUG) LOGV("%s", __FUNCTION__);
  333. native_data_t *nat = getNativeData(env, object);
  334. const char *c_text = text ? env->GetStringUTFChars(text, NULL) : NULL;
  335. unsigned int unique_identifier;
  336. nat->env = env;
  337. espeak_SetSynthCallback(SynthCallback);
  338. const espeak_ERROR result = espeak_Synth(c_text, strlen(c_text), 0, // position
  339. POS_CHARACTER, 0, // end position (0 means no end position)
  340. isSsml ? espeakCHARS_UTF8 | espeakSSML // UTF-8 encoded SSML
  341. : espeakCHARS_UTF8, // UTF-8 encoded text
  342. &unique_identifier, nat);
  343. espeak_Synchronize();
  344. if (c_text) env->ReleaseStringUTFChars(text, c_text);
  345. switch (result) {
  346. case EE_OK: return JNI_TRUE;
  347. case EE_INTERNAL_ERROR: LOGE("espeak_Synth: internal error."); break;
  348. case EE_BUFFER_FULL: LOGE("espeak_Synth: buffer full."); break;
  349. case EE_NOT_FOUND: LOGE("espeak_Synth: not found."); break;
  350. }
  351. return JNI_TRUE;
  352. }
// Cancels any synthesis currently in progress. Always reports success
// (espeak_Cancel's return value is intentionally ignored).
JNIEXPORT jboolean
JNICALL Java_com_reecedunn_espeak_SpeechSynthesis_nativeStop(
JNIEnv *env, jobject object) {
if (DEBUG) LOGV("%s", __FUNCTION__);
espeak_Cancel();
return JNI_TRUE;
}
  360. #ifdef __cplusplus
  361. }
  362. #endif /* __cplusplus */