// mahta.fetrat / HomoFast-eSpeak-Persian
							/*
 * Copyright (C) 2008 Google Inc.
 * Copyright (C) 2012 Reece H. Dunn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This file contains the TtsEngine implementation for the eSpeak
 * Text-to-Speech engine.
 *
 * Android Version: 2.2 (Froyo)
 * API Version:     8
 */


#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>

#define LOG_TAG "eSpeak Engine"
#define DEBUG true

#include <speak_lib.h>
#include <TtsEngine.h>
#include <Log.h>

/*
 * This is the Manager layer.  It sits on top of the native eSpeak engine
 * and provides the interface to the defined Google TTS engine API.
 * The Google engine API is the boundary to allow a TTS engine to be swapped.
 * The Manager layer also provides the SSML tag interpretation.
 * The supported SSML tags are mapped to corresponding tags natively supported by eSpeak.
 * Native eSpeak functions always begin with espeak_XXX.
 *
 * Only a subset of SSML 1.0 tags are supported.
 * Some SSML tags involve significant complexity.
 * If the language is changed through an SSML tag, there is a latency for the load.
 */

using namespace android;

// Name of the sub-directory, below the configured data path, that holds the eSpeak data files.
const char *ESPEAK_DIRECTORY = "espeak-data";

// Files, relative to ESPEAK_DIRECTORY, that hasBaseResources() looks for before eSpeak is started.
const char *eSpeakBaseResources[] = {
    "intonations",
    "phondata",
    "phonindex",
    "phontab",
    "en_dict",
    "voices/en/en-us"
};

// Number of entries in eSpeakBaseResources, derived from the array itself.
const int NUM_BASE_RESOURCES = sizeof(eSpeakBaseResources) / sizeof(eSpeakBaseResources[0]);

// Table of the voices this engine exposes.
// Format is {espeak voice, iso3 code, name}; the iso3 code is the language, optionally
// followed by "-COUNTRY" and "-variant" (e.g. "eng-GBR-sc").
// Row order matters: isLanguageSupported() takes the FIRST row matching the requested
// language and then scans forward from that row for the country and the variant.
const char *eSpeakSupportedVoices[][3] = {
        {"en-us",  "eng",        "English"},
        {"en-us",  "eng-USA",    "English (US)"},
        {"en",     "eng-GBR",    "English (UK)"},
        {"en-sc",  "eng-GBR-sc", "English (Scottish)"},
        {"en-n",   "eng-GBR-n",  "English (Northern UK)"},
        {"en-rp",  "eng-GBR-rp", "English (Received Pronunciation)"},
        {"en-wm",  "eng-GBR-wm", "English (West Midlands)"},
        {"af",     "afr",        "Afrikaans"},
        {"bs",     "bos",        "Bosnian"},
        {"ca",     "cat",        "Catalan"},
        {"cs",     "ces",        "Czech"},
        {"da",     "dan",        "Danish"},
        {"de",     "deu",        "German"},
        {"el",     "ell",        "Greek"},
        {"eo",     "epo",        "Esperanto"},
        {"es",     "spa",        "Spanish"},
        {"es-la",  "spa-MEX",    "Spanish (Latin America)"},
        {"fi",     "fin",        "Finnish"},
        {"fr",     "fra",        "French"},
        {"hr",     "hrv",        "Croatian"},
        {"hu",     "hun",        "Hungarian"},
        {"it",     "ita",        "Italian"},
        {"kn",     "kan",        "Kannada"},
        {"ku",     "kur",        "Kurdish"},
        {"lv",     "lav",        "Latvian"},
        {"nl",     "nld",        "Dutch"},
        {"pl",     "pol",        "Polish"},
        {"pt",     "por",        "Portuguese (Brazil)"},
        {"pt",     "por-BRA",    "Portuguese (Brazil)"},
        {"pt-pt",  "por-PRT",    "Portuguese"},
        {"ro",     "ron",        "Romanian"},
        {"sk",     "slk",        "Slovak"},
        {"sr",     "srp",        "Serbian"},
        {"sv",     "swe",        "Swedish"},
        {"sw",     "swa",        "Swahili"},
        {"ta",     "tam",        "Tamil"},
        {"tr",     "tur",        "Turkish"},
        {"zh",     "zho",        "Chinese (Mandarin)"},
        {"cy",     "cym",        "Welsh"},
        {"hi",     "hin",        "Hindi"},
        {"hy",     "hye",        "Armenian"},
        {"id",     "ind",        "Indonesian"},
        {"is",     "isl",        "Icelandic"},
        {"ka",     "kat",        "Georgian"},
        {"la",     "lat",        "Latin"},
        {"mk",     "mkd",        "Macedonian"},
        {"no",     "nor",        "Norwegian"},
        {"ru",     "rus",        "Russian"},
        {"sq",     "sqi",        "Albanian"},
        {"vi",     "vie",        "Vietnamese"},
        {"zh-yue", "zho-HKG",    "Chinese (Cantonese)"},
        {"grc",    "grc",        "Ancient Greek"},
        {"jbo",    "jbo",        "Lojban"},
        {"nci",    "nci",        "Nahuatl (Classical)"},
        {"pap",    "pap",        "Papiamento" }
    };

// Number of rows in eSpeakSupportedVoices; must be kept in sync by hand when rows are
// added or removed, because the lookup loops use it as their upper bound.
const int NUM_SUPPORTED_VOICES = 55;

// Callback to the TTS API; assigned in TtsEngine::init() and invoked from eSpeakCallback().
synthDoneCB_t *ttsSynthDoneCBPointer;

// Value reported by getProperty("language"); replaced by setLanguage().
char *currentLanguage = (char *) "en-us";
// NOTE(review): not referenced anywhere else in the visible part of this file.
char *currentRate = (char *) "150";
// Copy of the engineConfig string given to TtsEngine::init(); used as the eSpeak data path.
char *eSpeakDataPath = NULL;

// Locale components stored by setLanguage() and copied out by getLanguage().
char currentLang[10];
char currentCountry[10];
char currentVariant[10];

// Set by attemptInit() once eSpeak has been initialized; cleared at the start of init().
bool hasInitialized = false;

/* Functions internal to the eSpeak engine wrapper */
static void setSpeechRate(int speechRate) {
  espeak_ERROR err = espeak_SetParameter(espeakRATE, speechRate, 0);
}

/* Functions exposed to the TTS API */

/*
 * Synthesis callback registered with eSpeak; hands every chunk on to the TTS API callback.
 * wav        - samples produced by eSpeak (treated as 16-bit PCM)
 * numsamples - number of samples in wav
 * events     - eSpeak event data; its user_data field is passed through to the TTS API
 * Returns 0 after forwarding audio (continue synthesis) and 1 after reporting completion.
 */
static int eSpeakCallback(short *wav, int numsamples, espeak_EVENT *events) {
  LOGI("Callback with %d samples", numsamples);

  if (numsamples >= 1) {
    // Audio available: forward it as a byte buffer and ask eSpeak to keep going.
    int8_t *audioBytes = (int8_t *) wav;
    size_t audioByteCount = numsamples * sizeof(short);
    ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, audioBytes,
                          audioByteCount, TTS_SYNTH_PENDING);
    return 0; // continue synthesis (1 is to abort)
  }

  // No samples: signal completion. Passing in an empty buffer can cause a crash, so two
  // zero bytes are sent instead.
  int8_t zeroBytes[] = { 0, 0 };
  int8_t *zeroBuffer = zeroBytes;
  size_t zeroByteCount = sizeof(zeroBytes) / sizeof(zeroBytes[0]);
  ttsSynthDoneCBPointer(events->user_data, 22050, TTS_AUDIO_FORMAT_PCM_16_BIT, 1, zeroBuffer,
                        zeroByteCount, TTS_SYNTH_DONE);
  return 1;
}

/*
 * Reports whether fileName can be opened for reading.
 * The file is closed again immediately; only the outcome of fopen() is used.
 */
static bool fileExists(char *fileName) {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  FILE *handle = fopen(fileName, "r");
  if (handle != NULL) {
    fclose(handle);
    return true;
  }

  return false;
}

/*
 * Checks that every file listed in eSpeakBaseResources exists below
 * <eSpeakDataPath>/<ESPEAK_DIRECTORY>/.
 * Returns false when the data path is unset, when a resource path does not fit into the
 * local buffer, or when a file cannot be opened; true when all files are present.
 */
static bool hasBaseResources() {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  // Without a data path there is nothing to check (and NULL must not reach "%s").
  if (eSpeakDataPath == NULL) {
    LOGE("Data path not specified!");
    return false;
  }

  char filename[255];

  for (int i = 0; i < NUM_BASE_RESOURCES; i++) {
    // snprintf never writes past the buffer; a result >= the buffer size means truncation.
    const int written = snprintf(filename, sizeof(filename), "%s/%s/%s", eSpeakDataPath,
                                 ESPEAK_DIRECTORY, eSpeakBaseResources[i]);

    if ((written < 0) || ((size_t) written >= sizeof(filename))) {
      LOGE("Resource path too long for: %s", eSpeakBaseResources[i]);
      return false;
    }

    if (!fileExists(filename)) {
      LOGE("Missing resource: %s", filename);
      return false;
    }
  }

  return true;
}

/* Google Engine API function implementations */

/*
 * Initializes eSpeak unless that has already happened.
 * Verifies the base resources, starts eSpeak in synchronous output mode, installs
 * eSpeakCallback and selects the "en-us" voice.
 * return TTS_SUCCESS when eSpeak is (or already was) initialized, TTS_FAILURE otherwise
 */
tts_result attemptInit() {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  if (hasInitialized) return TTS_SUCCESS;
  if (!hasBaseResources()) return TTS_FAILURE;

  // TODO Make sure that the speech data is loaded in
  // the directory /sdcard/espeak-data before calling this.
  const int engineSampleRate =
      espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 4096, eSpeakDataPath, 0);
  if (engineSampleRate <= 0) {
    LOGE("eSpeak initialization failed!");
    return TTS_FAILURE;
  }

  espeak_SetSynthCallback(eSpeakCallback);

  // Start from an all-zero voice description and request US English as the default.
  espeak_VOICE defaultVoice;
  memset(&defaultVoice, 0, sizeof(defaultVoice));
  defaultVoice.languages = "en-us";
  defaultVoice.variant = 0;
  espeak_SetVoiceByProperties(&defaultVoice);

  hasInitialized = true;
  return TTS_SUCCESS;
}

/** init
 *  Stores the synthesis callback and the data path, then initializes the eSpeak system.
 *  synthDoneCBPtr - Pointer to callback function which will receive generated samples
 *  engineConfig - path of the directory that contains the "espeak-data" directory;
 *                 must be a non-empty string
 *  return tts_result - TTS_FAILURE when no path is given, when the path cannot be copied,
 *                      or when attemptInit() fails
 */
tts_result TtsEngine::init(synthDoneCB_t synthDoneCBPtr, const char *engineConfig) {
  if (DEBUG) LOGV("%s", __FUNCTION__);
  ttsSynthDoneCBPointer = synthDoneCBPtr;
  hasInitialized = false;

  if ((engineConfig == NULL) || (engineConfig[0] == '\0')) {
    eSpeakDataPath = NULL;
    LOGE("Data path not specified!");
    return TTS_FAILURE;
  }

  // +1: strcpy() also writes the terminating NUL, which strlen() does not count.
  char *pathCopy = (char *) malloc(strlen(engineConfig) + 1);
  if (pathCopy == NULL) {
    eSpeakDataPath = NULL;
    LOGE("Out of memory while copying the data path!");
    return TTS_FAILURE;
  }
  strcpy(pathCopy, engineConfig);

  // As before, a previously stored path is simply replaced here; shutdown() releases it.
  eSpeakDataPath = pathCopy;

  return attemptInit();
}

/** shutdown
 *  Releases the stored data path and terminates the eSpeak system.
 *  Safe to call more than once: the path pointer is cleared after it is freed, and the
 *  initialized flag is reset so that attemptInit() starts eSpeak again when needed.
 *  return tts_result
 */
tts_result TtsEngine::shutdown(void) {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  // free(NULL) is a no-op; clearing the pointer prevents a double free / stale path.
  free(eSpeakDataPath);
  eSpeakDataPath = NULL;

  espeak_Terminate();
  hasInitialized = false;

  return TTS_SUCCESS;
}

/** loadLanguage
 *  Logs the requested locale when DEBUG is enabled; nothing is loaded here.
 *  return tts_result - always TTS_FAILURE
 */
tts_result TtsEngine::loadLanguage(const char *lang, const char *country, const char *variant) {
  if (DEBUG) {
    LOGV("loadLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);
  }

  return TTS_FAILURE;
}

tts_support_result isLanguageSupported(const char *lang, const char *country, const char *variant,
                                       int *pindex) {
  if (DEBUG) LOGV("isLanguageSupported(\"%s\", \"%s\", \"%s\")", lang, country, variant);

  if ((lang == NULL) || (strlen(lang) == 0)) {
    LOGE("TtsEngine::isLanguageAvailable called with no language");
    return TTS_LANG_NOT_SUPPORTED;
  }

  if (pindex != NULL) {
    *pindex = -1;
  }

  int langIndex = -1;
  int countryIndex = -1;
  int variantIndex = -1;

  if (strlen(lang) == 3) {
    for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
      if (strncmp(lang, eSpeakSupportedVoices[i][1], 3) == 0) {
        LOGI("Found ISO3 language at index %d", i);
        langIndex = i;
        break;
      }
    }
  } else if (strlen(lang) == 2) {
    for (int i = 0; i < NUM_SUPPORTED_VOICES; i++) {
      if (strncmp(lang, eSpeakSupportedVoices[i][0], 2) == 0) {
        LOGI("Found ISO2 language at index %d", i);
        langIndex = i;
        break;
      }
    }
  }

  if (langIndex < 0) {
    LOGV("TtsEngine::isLanguageAvailable called with unsupported language");
    return TTS_LANG_NOT_SUPPORTED;
  }

  if ((country == NULL) || (strlen(country) == 0)) {
    // TODO: Check whether resources are available for this language.

    if (pindex != NULL) {
      *pindex = langIndex;
    }

    LOGI("No country specified, language is available");
    return TTS_LANG_AVAILABLE;
  }

  char lang_country[10];
  sprintf(lang_country, "%s-%s", lang, country);

  // Find country
  if (strlen(country) == 3) {
    for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
      if ((strcmp(lang_country, eSpeakSupportedVoices[i][1]) == 0)) {
        LOGI("Found ISO3 country at index %d", i);
        countryIndex = i;
        break;
      }
    }
  } else if (strlen(country) == 2) {
    for (int i = langIndex; i < NUM_SUPPORTED_VOICES; i++) {
      if ((strcmp(lang_country, eSpeakSupportedVoices[i][0]) == 0)) {
        LOGI("Found ISO2 country at index %d", i);
        countryIndex = i;
        break;
      }
    }
  }

  if (countryIndex < 0) {
    if (pindex != NULL) {
      *pindex = langIndex;
    }

    LOGI("No country found, language is available");
    return TTS_LANG_AVAILABLE;
  }

  if ((variant == NULL) || (strlen(variant) == 0)) {
    if (pindex != NULL) {
      *pindex = countryIndex;
    }

    LOGI("No variant specified, language and country are available");
    return TTS_LANG_COUNTRY_AVAILABLE;
  }

  char lang_country_variant[15];
  sprintf(lang_country_variant, "%s-%s-%s", lang, country, variant);

  // Find variant
  for (int i = countryIndex; i < NUM_SUPPORTED_VOICES; i++) {
    if ((strcmp(lang_country_variant, eSpeakSupportedVoices[i][1]) == 0)) {
      LOGI("Found variant at index %d", i);
      variantIndex = i;
      break;
    }
  }

  if (variantIndex < 0) {
    if (pindex != NULL) {
      *pindex = countryIndex;
    }

    LOGI("No variant found, language and country are available");
    return TTS_LANG_COUNTRY_AVAILABLE;
  }

  if (pindex != NULL) {
    *pindex = variantIndex;
  }

  LOGI("Language, country, and variant are available");
  return TTS_LANG_COUNTRY_VAR_AVAILABLE;
}

/** setLanguage
 *  Selects the eSpeak voice that best matches the given locale and records the locale.
 *  lang    - 2- or 3-letter language code (required)
 *  country - optional country code, may be NULL or empty
 *  variant - optional variant, may be NULL or empty
 *  return tts_result - TTS_FAILURE when the engine cannot be initialized, the language
 *                      is unsupported, or eSpeak rejects the voice
 */
tts_result TtsEngine::setLanguage(const char *lang, const char *country, const char *variant) {
  if (DEBUG) LOGV("setLanguage(\"%s\", \"%s\", \"%s\")", lang, country, variant);

  // Make sure the engine is initialized!
  if (attemptInit() != TTS_SUCCESS) {
    LOGE("setLanguage called but the engine could not be initialized");
    return TTS_FAILURE;
  }

  int index = -1;

  isLanguageSupported(lang, country, variant, &index);

  if (index < 0) {
    LOGE("setLanguage called with unsupported language");
    return TTS_FAILURE;
  }

  // country and variant are optional and may be NULL; the destination buffers are fixed
  // size, so use bounded writes (over-long values are truncated instead of overflowing).
  snprintf(currentLang, sizeof(currentLang), "%s", lang);
  snprintf(currentCountry, sizeof(currentCountry), "%s", (country != NULL) ? country : "");
  snprintf(currentVariant, sizeof(currentVariant), "%s", (variant != NULL) ? variant : "");

  espeak_VOICE voice;
  memset(&voice, 0, sizeof(espeak_VOICE)); // Zero out the voice first
  voice.variant = 0;
  // The table entry has static storage, so no temporary copy of the voice name is needed.
  voice.languages = eSpeakSupportedVoices[index][0];
  espeak_ERROR err = espeak_SetVoiceByProperties(&voice);

  // Keep the "language" property in a static buffer: nothing to allocate or leak, and
  // the pointer stays valid for getProperty() after this function returns.
  static char languageProperty[sizeof(currentLang)];
  snprintf(languageProperty, sizeof(languageProperty), "%s", lang);
  currentLanguage = languageProperty;

  if (err != EE_OK) {
    LOGE("Error code %d when setting voice properties!", err);
    return TTS_FAILURE;
  }

  return TTS_SUCCESS;
}

/** isLanguageAvailable
 *  Reports how much of the requested locale is supported.
 *  Delegates to isLanguageSupported() without requesting the matching voice index.
 *  return tts_support_result
 */
tts_support_result TtsEngine::isLanguageAvailable(const char *lang, const char *country,
                                                  const char *variant) {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  const tts_support_result support = isLanguageSupported(lang, country, variant, NULL);
  return support;
}

/** getLanguage
 *  Copies the locale recorded by setLanguage() into the caller's buffers.
 *  language, country, variant - destination buffers; each receives a NUL-terminated
 *                               copy of the corresponding stored string
 *  return tts_result - always TTS_SUCCESS
 */
tts_result TtsEngine::getLanguage(char *language, char *country, char *variant) {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  // Pair each stored component with its destination and copy them in order.
  const char *stored[] = { currentLang, currentCountry, currentVariant };
  char *destinations[] = { language, country, variant };

  for (int part = 0; part < 3; part++) {
    strcpy(destinations[part], stored[part]);
  }

  return TTS_SUCCESS;
}

/** setAudioFormat
 * Requests an audio format for synthesis. This implementation only logs the request:
 * none of the three arguments is modified.
 * @encoding - reference to encoding format
 * @rate - reference to sample rate
 * @channels - reference to number of channels
 * return tts_result - always TTS_SUCCESS
 * */
tts_result TtsEngine::setAudioFormat(tts_audio_format& encoding, uint32_t& rate, int& channels) {
  // Logged at error level because the request is not honoured.
  LOGE("setAudioFormat(%d, %d, %d) is unsupported", encoding, rate, channels);

  // TODO: Fix this!
  return TTS_SUCCESS;
}

/** setProperty
 *  Sets the property with the specified value.
 *  property - property name; matched by prefix against "language", "rate", "pitch", "volume"
 *  value    - new value as a decimal string (ignored for "language")
 *  size     - size of value; only logged
 *  return tts_result - TTS_PROPERTY_UNSUPPORTED for a NULL or unknown property,
 *                      TTS_VALUE_INVALID for a NULL value, TTS_FAILURE when eSpeak
 *                      rejects the value, TTS_SUCCESS otherwise
 */
tts_result TtsEngine::setProperty(const char *property, const char *value, const size_t size) {
  // NULL arguments are replaced for logging only; size is a size_t and needs %zu.
  if (DEBUG) LOGV("setProperty(\"%s\", \"%s\", %zu)", (property != NULL) ? property : "(null)",
                  (value != NULL) ? value : "(null)", size);

  /* Set a specific property for the engine.
   Supported properties include: language (locale), rate, pitch, volume.    */
  /* Sanity check */
  if (property == NULL) {
    LOGE("setProperty called with property NULL");
    return TTS_PROPERTY_UNSUPPORTED;
  }

  if (value == NULL) {
    LOGE("setProperty called with value NULL");
    return TTS_VALUE_INVALID;
  }

  espeak_ERROR result;

  if (strncmp(property, "language", 8) == 0) {
    // TODO: Set this property
    result = EE_OK;
  } else if (strncmp(property, "rate", 4) == 0) {
    // value is a percentage of the rate returned by espeak_GetParameter(espeakRATE, 0).
    int rate = atoi(value) * espeak_GetParameter(espeakRATE, 0) / 100;
    if (DEBUG) LOGV("setProperty rate : rate=%s, wpm=%d", value, rate);
    result = espeak_SetParameter(espeakRATE, rate, 0);
  } else if (strncmp(property, "pitch", 5) == 0) {
    int pitch = atoi(value);
    // The values of pitch from android range from 50 - 200, with 100 being normal.
    // The values espeak supports are from 0 - 100, with 50 being normal.
    // Therefore, halve the value to get the value that espeak supports:
    pitch = pitch / 2;
    if (DEBUG) LOGV("setProperty pitch : pitch=%d", pitch);
    result = espeak_SetParameter(espeakPITCH, pitch, 0);
  } else if (strncmp(property, "volume", 6) == 0) {
    int volume = atoi(value);
    result = espeak_SetParameter(espeakVOLUME, volume, 0);
  } else {
    return TTS_PROPERTY_UNSUPPORTED;
  }

  return (result == EE_OK) ? TTS_SUCCESS : TTS_FAILURE;
}

/*
 * Copies src into value when the caller's buffer (*iosize bytes) can hold it including the
 * terminating NUL. Otherwise stores the required size in *iosize and copies nothing.
 * Returns TTS_SUCCESS or TTS_PROPERTY_SIZE_TOO_SMALL.
 */
static tts_result copyPropertyValue(const char *src, char *value, size_t *iosize) {
  const size_t required = strlen(src) + 1;

  if (*iosize < required) {
    *iosize = required;
    return TTS_PROPERTY_SIZE_TOO_SMALL;
  }

  strcpy(value, src);
  return TTS_SUCCESS;
}

/** getProperty
 *  Gets the current value of the specified property as a string.
 *  property - property name; matched by prefix against "language", "rate", "pitch", "volume"
 *  value    - buffer that receives the NUL-terminated value
 *  iosize   - in: size of value in bytes; out: required size when
 *             TTS_PROPERTY_SIZE_TOO_SMALL is returned, unchanged otherwise
 *  return tts_result - TTS_PROPERTY_UNSUPPORTED for a NULL or unknown property,
 *                      TTS_VALUE_INVALID for a NULL value or iosize
 */
tts_result TtsEngine::getProperty(const char *property, char *value, size_t *iosize) {
  if (DEBUG) LOGV("getProperty(\"%s\", ...)", (property != NULL) ? property : "(null)");

  /* Get the property for the engine.
   This property was previously set by setProperty or by default.       */
  /* sanity check */
  if (property == NULL) {
    LOGE("getProperty called with property NULL");
    return TTS_PROPERTY_UNSUPPORTED;
  }

  if (value == NULL) {
    LOGE("getProperty called with value NULL");
    return TTS_VALUE_INVALID;
  }

  if (iosize == NULL) {
    LOGE("getProperty called with iosize NULL");
    return TTS_VALUE_INVALID;
  }

  // Large enough for the decimal form of any int, so formatting can never overflow it.
  char number[16];

  if (strncmp(property, "language", 8) == 0) {
    // An unset language is reported as the empty string, subject to the same size check.
    return copyPropertyValue((currentLanguage != NULL) ? currentLanguage : "", value, iosize);
  } else if (strncmp(property, "rate", 4) == 0) {
    // Reported as a percentage of the rate returned by espeak_GetParameter(espeakRATE, 0).
    const int defaultRate = espeak_GetParameter(espeakRATE, 0);
    if (defaultRate == 0) {
      LOGE("getProperty: default rate is 0, cannot compute relative rate");
      return TTS_FAILURE;
    }
    const int rate = espeak_GetParameter(espeakRATE, 1) * 100 / defaultRate;
    snprintf(number, sizeof(number), "%d", rate);
    return copyPropertyValue(number, value, iosize);
  } else if (strncmp(property, "pitch", 5) == 0) {
    // Inverse of setProperty(), which halves the incoming pitch value.
    snprintf(number, sizeof(number), "%d", (espeak_GetParameter(espeakPITCH, 1) * 2));
    return copyPropertyValue(number, value, iosize);
  } else if (strncmp(property, "volume", 6) == 0) {
    snprintf(number, sizeof(number), "%d", espeak_GetParameter(espeakVOLUME, 1));
    return copyPropertyValue(number, value, iosize);
  }

  LOGE("Unsupported property");
  return TTS_PROPERTY_UNSUPPORTED;
}

/** synthesizeText
 *  Synthesizes a text string.
 *  The text string could be annotated with SSML tags.
 *  @text     - text to synthesize (UTF-8); must not be NULL
 *  @buffer   - not used by this implementation; samples are delivered through the
 *              callback installed with espeak_SetSynthCallback()
 *  @bufferSize - size of buffer; not used by this implementation
 *  @userdata - pointer to user data which will be passed back to callback function
 *  return tts_result - TTS_FAILURE when text is NULL or eSpeak reports an error
 */
tts_result TtsEngine::synthesizeText(const char *text, int8_t *buffer, size_t bufferSize,
                                     void *userdata) {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  if (text == NULL) {
    LOGE("synthesizeText called with text NULL");
    return TTS_FAILURE;
  }

  espeak_SetSynthCallback(eSpeakCallback);

  unsigned int unique_identifier;

  const espeak_ERROR err = espeak_Synth(text, strlen(text), 0, // position
                     POS_CHARACTER, 0, // end position (0 means no end position)
                     espeakCHARS_UTF8 | espeakSSML, // use or ignore xml tags
                     &unique_identifier, userdata);

  // Do not report success when eSpeak refused the request.
  if (err != EE_OK) {
    LOGE("Error code %d when synthesizing text!", err);
    return TTS_FAILURE;
  }

  espeak_Synchronize();

  LOGI("Synthesis done");

  return TTS_SUCCESS;
}

/** stop
 *  Asks eSpeak to cancel the synthesis that is currently running.
 *  return tts_result - always TTS_SUCCESS
 */
tts_result TtsEngine::stop() {
  if (DEBUG) {
    LOGV("%s", __FUNCTION__);
  }

  espeak_Cancel();

  return TTS_SUCCESS;
}

#ifdef __cplusplus
extern "C" {
#endif

// Factory entry point (C linkage): returns a newly allocated TtsEngine instance.
TtsEngine* getTtsEngine() {
  if (DEBUG) LOGV("%s", __FUNCTION__);

  TtsEngine *engine = new TtsEngine();
  return engine;
}

#ifdef __cplusplus
}
#endif