Code cleanup: relocate functions & remove unused #defines
@@ -28,13 +28,15 @@ | |||
#include <stdlib.h> | |||
#include <string.h> | |||
#include <sys/stat.h> | |||
#include <wctype.h> | |||
#include <espeak-ng/espeak_ng.h> | |||
#include <espeak-ng/speak_lib.h> | |||
#include <espeak-ng/encoding.h> | |||
#include <ucd/ucd.h> | |||
#include "common.h" | |||
#include "translate.h" | |||
#pragma GCC visibility push(default) | |||
@@ -57,4 +59,254 @@ void strncpy0(char *to, const char *from, int size) | |||
strncpy(to, from, size); | |||
to[size-1] = 0; | |||
} | |||
int utf8_in(int *c, const char *buf)
{
	/* Read one unicode character from a UTF-8 string, moving forwards.
	 *   c:   receives the character value, built from the payload bits that
	 *        remain once the UTF-8 header bits of each byte are dropped
	 *   buf: the UTF-8 text to read
	 * Returns the number of UTF-8 bytes used.
	 *
	 * Examples:
	 *   2-byte "ā":  c4 81        ->  0x0101
	 *   3-byte "ꙅ":  ea 99 85     ->  0xA645
	 *   4-byte "𠜎":  f0 a0 9c 8e  ->  0x02070e
	 */
	int n_used = utf8_in2(c, buf, 0); // 0 = not reading backwards
	return n_used;
}
#pragma GCC visibility pop | |||
int utf8_out(unsigned int c, char *buf)
{
	// Store the character value c in buf as UTF-8 (no terminating zero is added).
	// Returns the number of bytes written (1 to 4).
	// A value of 0x110000 or above is out of range and is written as one space.
	static const unsigned char lead_marker[4] = { 0, 0xc0, 0xe0, 0xf0 };
	int n_trailing; // number of bytes which follow the lead byte
	int pos;

	if (c < 0x80) {
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' ';
		return 1;
	}

	n_trailing = (c < 0x0800) ? 1 : ((c < 0x10000) ? 2 : 3);

	// lead byte: length marker plus the most significant payload bits
	buf[0] = lead_marker[n_trailing] | (c >> (6 * n_trailing));

	// trailing bytes: 6 payload bits each, most significant group first
	for (pos = 1; pos <= n_trailing; pos++)
		buf[pos] = 0x80 + ((c >> (6 * (n_trailing - pos))) & 0x3f);

	return n_trailing + 1;
}
int utf8_in2(int *c, const char *buf, int backwards)
{
	// Read one unicode character from a UTF-8 string.
	//   c:         receives the integer value of the character
	//   buf:       where to start reading
	//   backwards: non-zero to step backwards (instead of forwards) over
	//              continuation bytes until the start of a character is found
	// Returns 1 + the number of continuation bytes read after the lead byte.
	static const unsigned char lead_mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };
	int value;
	int expected = 0; // continuation bytes announced by the lead byte
	int consumed = 0; // continuation bytes actually read

	// a 10xxxxxx byte is not the first byte of a character: move off it
	while ((*buf & 0xc0) == 0x80)
		buf += backwards ? -1 : 1;

	value = *buf++;
	if (value & 0x80) {
		if ((value & 0xe0) == 0xc0)
			expected = 1;
		else if ((value & 0xf0) == 0xe0)
			expected = 2;
		else if ((value & 0xf8) == 0xf0)
			expected = 3;

		value &= lead_mask[expected];
		// stop early if the string ends in the middle of the character
		while ((consumed < expected) && (*buf != 0)) {
			value = (value << 6) + (*buf++ & 0x3f);
			consumed++;
		}
	}

	*c = value;
	return consumed + 1;
}
int IsAlpha(unsigned int c)
{
	// Replacement for iswalpha() which also accepts some in-word symbols
	// (vowel marks, combining accents, ...). Returns 1 or 0.
	static const unsigned short extra_indic_alphas[] = {
		0xa70, 0xa71, // Gurmukhi: tippi, addak
		0
	};
	// inclusive ranges which are accepted even when iswalpha() rejects them
	static const struct {
		unsigned int first;
		unsigned int last;
	} extra_ranges[] = {
		{ 0x0300, 0x036f }, // combining accents
		{ 0x05b0, 0x05c2 }, // Hebrew vowel marks
		{ 0x0605, 0x0605 },
		{ 0x064b, 0x065e }, // Arabic vowel marks
		{ 0x0670, 0x0670 }, // Arabic vowel mark
		{ 0x0f40, 0x0fbc }, // Tibetan
		{ 0x1100, 0x11ff }, // Korean jamo
		{ 0x2800, 0x28ff }, // braille
		{ 0x3041, 0xa700 }, // Chinese/Japanese. Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
	};
	unsigned int ix;

	if (iswalpha(c))
		return 1;
	if (c < 0x300)
		return 0;

	if ((c >= 0x901) && (c <= 0xdf7)) {
		// Indic scripts: Devanagari, Tamil, etc. The answer is decided here.
		return ((c & 0x7f) < 0x64)
		       || (lookupwchar(extra_indic_alphas, c) != 0)
		       || ((c >= 0xd7a) && (c <= 0xd7f)); // Malayalam chillu characters
	}

	for (ix = 0; ix < sizeof(extra_ranges) / sizeof(extra_ranges[0]); ix++) {
		if ((c >= extra_ranges[ix].first) && (c <= extra_ranges[ix].last))
			return 1;
	}
	return 0;
}
// Bracket and quote characters. 0x2014 to 0x201f are brackets too, but
// IsBracket() tests that range itself, so they don't need to be in this list.
static const unsigned short brackets[] = {
	'(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`',
	0xab, 0xbb, // double angle brackets
	0x300a, 0x300b, // double angle brackets (ideograph)
	0xe000+'<', // private usage area
	0
};

int IsBracket(int c)
{
	// Returns 1 for 0x2014..0x201f, otherwise the result of looking c up in brackets[]
	int in_range = (c >= 0x2014) && (c <= 0x201f);

	return in_range ? 1 : lookupwchar(brackets, c);
}
int IsDigit09(unsigned int c)
{
	// Returns 1 only for the ASCII digits '0' to '9', otherwise 0
	return (c >= '0') && (c <= '9');
}
int IsDigit(unsigned int c)
{
	// Returns 1 if iswdigit() accepts c, or if c is in 0x966..0x96f; otherwise 0
	return iswdigit(c) || ((c >= 0x966) && (c <= 0x96f));
}
int IsSpace(unsigned int c)
{
	// Extended white-space test. Returns 0 for c == 0, 1 for the two extra
	// ranges below, and otherwise whatever iswspace() returns.
	int box_drawing = (c >= 0x2500) && (c < 0x25a0); // box drawing characters
	int specials = (c >= 0xfff9) && (c <= 0xffff); // unicode specials

	if (c == 0)
		return 0;
	if (box_drawing || specials)
		return 1;
	return iswspace(c);
}
int isspace2(unsigned int c)
{
	// can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
	// Returns 1 when c is no greater than ' ' and its low byte is not zero, otherwise 0.
	return ((c & 0xff) != 0) && (c <= ' ');
}
int is_str_totally_null(const char* str, int size) {
	// Tests if the first 'size' bytes of str are all null.
	// Returns 1 if they are, 0 as soon as a non-null byte is found.
	// For size <= 0 there is nothing to test: str is not read and 1 is returned
	// (the former memcmp(str, str+1, size-1) was handed a huge length in that case).
	// This should never be reimplemented with integers, because
	// this function has to work with unaligned char*
	// (casting to int when unaligned may result in unguaranteed behaviors)
	int ix;

	for (ix = 0; ix < size; ix++) {
		if (str[ix] != 0)
			return 0;
	}
	return 1;
}
int Read4Bytes(FILE *f)
{
	// Read 4 bytes (least significant first) into a word.
	// End-of-file and read errors are not reported: fgetc()'s EOF value is
	// masked to 0xff, so every byte that could not be read counts as 0xff.
	// The word is built as unsigned, because shifting a byte >= 0x80 left by
	// 24 bits in a signed int overflows, which is undefined behaviour.
	unsigned int acc = 0;
	int ix;

	for (ix = 0; ix < 4; ix++) {
		unsigned int byte = (unsigned int)fgetc(f) & 0xff;
		acc |= byte << (ix * 8);
	}
	return (int)acc;
}
int towlower2(unsigned int c, Translator *translator)
{
	// Lower-case conversion which allows for a non-standard mapping:
	// when the translator's dotless_i option is set, 'I' becomes 0x131 (ı).
	// Every other case is passed to ucd_tolower().
	if ((c != 'I') || !translator->langopts.dotless_i)
		return ucd_tolower(c);

	return 0x131; // I -> ı
}
@@ -21,10 +21,25 @@ | |||
#define ESPEAK_NG_COMMON_H | |||
#include "espeak-ng/espeak_ng.h" | |||
#include "translate.h" | |||
extern ESPEAK_NG_API int GetFileLength(const char *filename); | |||
extern ESPEAK_NG_API void strncpy0(char *to, const char *from, int size); | |||
int IsAlpha(unsigned int c); | |||
int IsBracket(int c); | |||
int IsDigit(unsigned int c); | |||
int IsDigit09(unsigned int c); | |||
int IsSpace(unsigned int c); | |||
int isspace2(unsigned int c); | |||
int is_str_totally_null(const char* str, int size); // Tests if all bytes of str up to size are null | |||
int Read4Bytes(FILE *f); | |||
int towlower2(unsigned int c, Translator *translator); // Supports Turkish I | |||
ESPEAK_NG_API int utf8_in(int *c, const char *buf); | |||
int utf8_in2(int *c, const char *buf, int backwards); | |||
int utf8_out(unsigned int c, char *buf); | |||
#ifdef __cplusplus | |||
} | |||
#endif |
@@ -35,11 +35,10 @@ | |||
#include <espeak-ng/speak_lib.h> | |||
#include <espeak-ng/encoding.h> | |||
#include "common.h" // for GetFileLength, strncpy0 | |||
#include "common.h" // for GetFileLength, strncpy0, ... | |||
#include "error.h" // for create_file_error_context | |||
#include "mnemonics.h" // for LookupMnemName, MNEM_TAB | |||
#include "phoneme.h" // for PHONEME_TAB, PHONEME_TAB_LIST | |||
#include "readclause.h" // for Read4Bytes | |||
#include "spect.h" // for SpectFrame, peak_t, SpectSeq | |||
#include "speech.h" // for path_home, GetFileLength | |||
#include "synthdata.h" // for LoadPhData |
@@ -39,10 +39,8 @@ | |||
#include "error.h" // for create_file_error_context | |||
#include "mnemonics.h" // for LookupMnemName, MNEM_TAB | |||
#include "phoneme.h" // for PHONEME_TAB_LIST, phonSWITCH, phone... | |||
#include "readclause.h" // for towlower2 | |||
#include "speech.h" // for path_home | |||
#include "synthesize.h" // for Write4Bytes | |||
#include "translate.h" // for isspace2, IsDigit09, utf8_in, utf8_out | |||
static FILE *f_log = NULL; | |||
@@ -36,13 +36,15 @@ | |||
#include "dictionary.h" | |||
#include "numbers.h" // for LookupAccentedLetter, Look... | |||
#include "phoneme.h" // for PHONEME_TAB, phVOWEL, phon... | |||
#include "readclause.h" // for WordToString2, is_str_tota... | |||
#include "readclause.h" // for WordToString2 | |||
#include "speech.h" // for path_home | |||
#include "compiledict.h" // for DecodeRule | |||
#include "synthdata.h" // for PhonemeCode, InterpretPhoneme | |||
#include "synthesize.h" // for STRESS_IS_PRIMARY, phoneme... | |||
#include "translate.h" // for Translator, utf8_in, LANGU... | |||
static int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out); | |||
typedef struct { | |||
int points; | |||
const char *phonemes; | |||
@@ -761,96 +763,7 @@ int IsVowel(Translator *tr, int letter) | |||
return IsLetter(tr, letter, LETTERGP_VOWEL2); | |||
} | |||
static int Unpronouncable2(Translator *tr, char *word)
{
	// Run the translation rules over word with FLAG_UNPRON_TEST.
	// Returns 1 if TranslateRules() returned 0 or a value with SUFX_UNPRON set, else 0.
	// The byte before word is temporarily replaced by a space and then restored.
	char ph_buf[N_WORD_PHONEMES];
	char saved_prev = word[-1];
	int end_flags;

	ph_buf[0] = 0;
	word[-1] = ' '; // ensure there is a space before the "word"
	end_flags = TranslateRules(tr, word, ph_buf, sizeof(ph_buf), NULL, FLAG_UNPRON_TEST, NULL);
	word[-1] = saved_prev;

	return (end_flags == 0) || ((end_flags & SUFX_UNPRON) != 0);
}
int Unpronouncable(Translator *tr, char *word, int posn)
{
	/* Determines whether a word is 'unpronouncable', i.e. whether it should
	   be spoken as individual letters. Returns 1 if so, otherwise 0.

	   This function may be language specific. This is a generic version.
	 */
	const int unpron_opt = tr->langopts.param[LOPT_UNPRONOUNCABLE];
	ALPHABET *alphabet;
	int letter;
	int first_letter = 0;
	int vowel_posn = 9; // stays at 9 if the scan ends without finding a vowel
	int n_counted = 0;
	int offset = 0;

	utf8_in(&letter, word);
	if ((tr->letter_bits_offset > 0) && (letter < 0x241)) {
		// Latin characters for a language with a non-latin alphabet
		return 0; // so we can re-translate the word as English
	}

	alphabet = AlphabetFromChar(letter);
	if ((alphabet != NULL) && (alphabet->offset != tr->letter_bits_offset)) {
		// Character is not in our alphabet
		return 0;
	}

	if (unpron_opt == 1)
		return 0;

	switch (*word)
	{
	case ' ':
	case 0:
	case '\'':
		return 0;
	default:
		break;
	}

	for (;;) {
		offset += utf8_in(&letter, &word[offset]);
		if ((letter == 0) || (letter == ' '))
			break;

		if (letter == '\'') {
			if ((n_counted > 1) || (posn > 0))
				break; // "tv'" but not "l'"
		}

		if (n_counted == 0)
			first_letter = letter;

		// with option value 3 an apostrophe is not counted
		if (!((letter == '\'') && (unpron_opt == 3)))
			n_counted++;

		if (IsVowel(tr, letter)) {
			vowel_posn = n_counted; // position of the first vowel
			break;
		}

		if ((letter != '\'') && !iswalpha(letter))
			return 0;
	}

	if ((vowel_posn > 2) && (unpron_opt == 2)) {
		// Lookup unpronounceable rules in *_rules
		return Unpronouncable2(tr, word);
	}

	if (first_letter == unpron_opt)
		vowel_posn--; // disregard this as the initial letter when counting

	// 1 = no vowel, or no vowel in first few letters
	return vowel_posn > (tr->langopts.max_initial_consonants+1);
}
static int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char *vowel_stress, int *vowel_count, int *stressed_syllable, int control) | |||
int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char *vowel_stress, int *vowel_count, int *stressed_syllable, int control) | |||
{ | |||
// control = 1, set stress to 1 for forced unstressed vowels | |||
unsigned char phcode; | |||
@@ -962,55 +875,11 @@ static int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char * | |||
return max_stress; | |||
} | |||
static char stress_phonemes[] = { | |||
const char stress_phonemes[] = { | |||
phonSTRESS_D, phonSTRESS_U, phonSTRESS_2, phonSTRESS_3, | |||
phonSTRESS_P, phonSTRESS_P2, phonSTRESS_TONIC | |||
}; | |||
void ChangeWordStress(Translator *tr, char *word, int new_stress)
{
	// Rewrite the phoneme string 'word' in place with a changed stress level.
	// new_stress >= STRESS_IS_PRIMARY: the first vowel holding the word's
	// maximum stress is given new_stress. Otherwise every vowel whose stress
	// is above new_stress is lowered to it.
	unsigned char phonetic[N_WORD_PHONEMES];
	signed char vowel_stress[N_WORD_PHONEMES/2];
	int vowel_count; // num of vowels + 1
	int stressed_syllable = 0; // position of stressed syllable
	int max_stress;
	int v;
	const unsigned char *src;
	char *dst = word;

	// work from a copy: 'word' is overwritten as the result is built
	strcpy((char *)phonetic, word);
	max_stress = GetVowelStress(tr, phonetic, vowel_stress, &vowel_count, &stressed_syllable, 0);

	if (new_stress >= STRESS_IS_PRIMARY) {
		// promote to primary stress
		for (v = 1; v < vowel_count; v++) {
			if (vowel_stress[v] >= max_stress) {
				vowel_stress[v] = new_stress;
				break;
			}
		}
	} else {
		// remove primary stress
		for (v = 1; v < vowel_count; v++) {
			if (vowel_stress[v] > new_stress) // >= allows for diminished stress (=1)
				vowel_stress[v] = new_stress;
		}
	}

	// write out phonemes, putting a stress phoneme before each syllabic vowel that needs one
	v = 1;
	for (src = phonetic; *src != 0; src++) {
		if ((phoneme_tab[*src]->type == phVOWEL) && !(phoneme_tab[*src]->phflags & phNONSYLLABIC)) {
			if ((vowel_stress[v] == STRESS_IS_DIMINISHED) || (vowel_stress[v] > STRESS_IS_UNSTRESSED))
				*dst++ = stress_phonemes[(unsigned char)vowel_stress[v]];
			v++;
		}
		*dst++ = *src;
	}
	*dst = 0;
}
void SetWordStress(Translator *tr, char *output, unsigned int *dictionary_flags, int tonic, int control) | |||
{ | |||
/* Guess stress pattern of word. This is language specific | |||
@@ -2345,7 +2214,7 @@ int TranslateRules(Translator *tr, char *p_start, char *phonemes, int ph_size, c | |||
if (letter == 0xe000+'(') { | |||
if (pre_pause < tr->langopts.param[LOPT_BRACKET_PAUSE_ANNOUNCED]) | |||
pre_pause = tr->langopts.param[LOPT_BRACKET_PAUSE_ANNOUNCED]; // a bracket, already spoken by AnnouncePunctuation() | |||
} | |||
} | |||
if (IsBracket(letter)) { | |||
if (pre_pause < tr->langopts.param[LOPT_BRACKET_PAUSE]) | |||
pre_pause = tr->langopts.param[LOPT_BRACKET_PAUSE]; | |||
@@ -2456,37 +2325,6 @@ int TranslateRules(Translator *tr, char *p_start, char *phonemes, int ph_size, c | |||
return 0; | |||
} | |||
void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags)
{
	// apply after the translation is complete
	// Only acts when bit 1 (value 2) of the LOPT_ALT option is set. The phoneme
	// after the first phonSTRESS_P is changed 'E'->'e' and 'O'->'o' if
	// dict_flags has FLAG_ALT2_TRANS, otherwise 'e'->'E' and 'o'->'O'.
	char *p;
	char *target;

	if (!(tr->langopts.param[LOPT_ALT] & 2))
		return;

	// a stress marker in the last position has no phoneme after it, so stop one short
	for (p = phonemes; (p[0] != 0) && (p[1] != 0); p++) {
		if (p[0] != phonSTRESS_P)
			continue;

		target = &p[1];
		if ((dict_flags & FLAG_ALT2_TRANS) != 0) {
			if (*target == PhonemeCode('E'))
				*target = PhonemeCode('e');
			if (*target == PhonemeCode('O'))
				*target = PhonemeCode('o');
		} else {
			if (*target == PhonemeCode('e'))
				*target = PhonemeCode('E');
			if (*target == PhonemeCode('o'))
				*target = PhonemeCode('O');
		}
		return; // only the first primary stress is considered
	}
}
int TransposeAlphabet(Translator *tr, char *text) | |||
{ | |||
// transpose cyrillic alphabet (for example) into ascii (single byte) character codes | |||
@@ -2856,6 +2694,21 @@ static const char *LookupDict2(Translator *tr, const char *word, const char *wor | |||
return 0; | |||
} | |||
static int utf8_nbytes(const char *buf)
{
	// Returns the number of bytes for the first UTF-8 character in buf,
	// judged from its first byte only: below 0x80 -> 1, below 0xe0 -> 2,
	// below 0xf0 -> 3, anything else -> 4
	unsigned char lead = (unsigned char)buf[0];

	return 1 + (lead >= 0x80) + (lead >= 0xe0) + (lead >= 0xf0);
}
/* Lookup a specified word in the word dictionary. | |||
Returns phonetic data in 'phonetic' and bits in 'flags' | |||
@@ -3022,7 +2875,7 @@ int Lookup(Translator *tr, const char *word, char *ph_out) | |||
return flags0; | |||
} | |||
int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out) | |||
static int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out) | |||
{ | |||
char buf[100]; | |||
static unsigned int flags[2]; |
@@ -31,22 +31,22 @@ extern "C" | |||
{ | |||
#endif | |||
extern const char stress_phonemes[]; | |||
int LoadDictionary(Translator *tr, const char *name, int no_error); | |||
int HashDictionary(const char *string); | |||
const char *EncodePhonemes(const char *p, char *outptr, int *bad_phoneme); | |||
void DecodePhonemes(const char *inptr, char *outptr); | |||
char *WritePhMnemonic(char *phon_out, PHONEME_TAB *ph, PHONEME_LIST *plist, int use_ipa, int *flags); | |||
const char *GetTranslatedPhonemeString(int phoneme_mode); | |||
int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char *vowel_stress, int *vowel_count, int *stressed_syllable, int control); | |||
int IsVowel(Translator *tr, int letter); | |||
int Unpronouncable(Translator *tr, char *word, int posn); | |||
void ChangeWordStress(Translator *tr, char *word, int new_stress); | |||
void SetWordStress(Translator *tr, char *output, unsigned int *dictionary_flags, int tonic, int control); | |||
void AppendPhonemes(Translator *tr, char *string, int size, const char *ph); | |||
int TranslateRules(Translator *tr, char *p_start, char *phonemes, int ph_size, char *end_phonemes, int word_flags, unsigned int *dict_flags); | |||
int TransposeAlphabet(Translator *tr, char *text); | |||
int Lookup(Translator *tr, const char *word, char *ph_out); | |||
int LookupDictList(Translator *tr, char **wordptr, char *ph_out, unsigned int *flags, int end_flags, WORD_TAB *wtab); | |||
int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out); | |||
int RemoveEnding(Translator *tr, char *word, int end_type, char *word_copy); | |||
#ifdef __cplusplus |
@@ -112,9 +112,6 @@ typedef struct { | |||
#define Rnpp 10 | |||
#define R1p 11 | |||
#define R2p 12 | |||
#define R3p 13 | |||
#define R4p 14 | |||
#define R5p 15 | |||
#define R6p 16 | |||
#define RGL 17 |
@@ -32,12 +32,13 @@ | |||
#include <espeak-ng/encoding.h> | |||
#include "numbers.h" | |||
#include "common.h" | |||
#include "dictionary.h" // for Lookup, TranslateRules, EncodePhonemes, Look... | |||
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonEND_WORD, phonP... | |||
#include "readclause.h" // for WordToString2, towlower2 | |||
#include "readclause.h" // for WordToString2 | |||
#include "synthdata.h" // for SelectPhonemeTable | |||
#include "synthesize.h" // for phoneme_tab | |||
#include "translate.h" // for Translator, LANGUAGE_OPTIONS, IsDigit09, WOR... | |||
#include "translate.h" // for Translator, LANGUAGE_OPTIONS, WOR... | |||
#include "voice.h" // for voice, voice_t | |||
#define M_LIGATURE 0x8000 |
@@ -131,23 +131,6 @@ int clause_type_from_codepoint(uint32_t c) | |||
return CLAUSE_NONE; | |||
} | |||
int is_str_totally_null(const char* str, int size) {
	// Tests if the first 'size' bytes of str are all null.
	// Returns 1 if they are, 0 as soon as a non-null byte is found.
	// For size <= 0 there is nothing to test: str is not read and 1 is returned
	// (the former memcmp(str, str+1, size-1) was handed a huge length in that case).
	// This should never be reimplemented with integers, because
	// this function has to work with unaligned char*
	// (casting to int when unaligned may result in unguaranteed behaviors)
	int ix;

	for (ix = 0; ix < size; ix++) {
		if (str[ix] != 0)
			return 0;
	}
	return 1;
}
int towlower2(unsigned int c, Translator *translator)
{
	// Lower-case conversion which allows for a non-standard mapping:
	// when the translator's dotless_i option is set, 'I' becomes 0x131 (ı).
	// Every other case is passed to ucd_tolower().
	if ((c != 'I') || !translator->langopts.dotless_i)
		return ucd_tolower(c);

	return 0x131; // I -> ı
}
static int IsRomanU(unsigned int c) | |||
{ | |||
if ((c == 'I') || (c == 'V') || (c == 'X') || (c == 'L')) | |||
@@ -288,20 +271,6 @@ static const char *LookupCharName(Translator *tr, int c, int only) | |||
return buf; | |||
} | |||
int Read4Bytes(FILE *f)
{
	// Read 4 bytes (least significant first) into a word.
	// End-of-file and read errors are not reported: fgetc()'s EOF value is
	// masked to 0xff, so every byte that could not be read counts as 0xff.
	// The word is built as unsigned, because shifting a byte >= 0x80 left by
	// 24 bits in a signed int overflows, which is undefined behaviour.
	unsigned int acc = 0;
	int ix;

	for (ix = 0; ix < 4; ix++) {
		unsigned int byte = (unsigned int)fgetc(f) & 0xff;
		acc |= byte << (ix * 8);
	}
	return (int)acc;
}
static int AnnouncePunctuation(Translator *tr, int c1, int *c2_ptr, char *output, int *bufix, int end_clause) | |||
{ | |||
// announce punctuation names |
@@ -34,14 +34,9 @@ typedef struct { | |||
extern PARAM_STACK param_stack[]; | |||
// Tests if all bytes of str up to size are null | |||
int is_str_totally_null(const char* str, int size); | |||
int clause_type_from_codepoint(uint32_t c); | |||
int towlower2(unsigned int c, Translator *translator); // Supports Turkish I | |||
int Eof(void); | |||
const char *WordToString2(unsigned int word); | |||
int Read4Bytes(FILE *f); | |||
int AddNameData(const char *name, | |||
int wide); | |||
int ReadClause(Translator *tr, |
@@ -38,7 +38,6 @@ | |||
#include "soundicon.h" | |||
#include "common.h" // for GetFileLength | |||
#include "error.h" // for create_file_error_context | |||
#include "readclause.h" // for Read4Bytes | |||
#include "speech.h" // for path_home, PATHSEP | |||
#include "synthesize.h" // for samplerate | |||
@@ -33,7 +33,6 @@ float polint(float xa[], float ya[], int n, float x); | |||
#define FRAME_WIDTH 1000 // max width for 8000kHz frame | |||
#define MAX_DISPLAY_FREQ 9500 | |||
#define FRAME_HEIGHT 240 | |||
#define T_AMPLITUDE 308 | |||
#define T_AV 312 |
@@ -45,7 +45,7 @@ | |||
#include "readclause.h" // for PARAM_STACK, param_stack, AddNameData | |||
#include "soundicon.h" // for LoadSoundFile2 | |||
#include "synthesize.h" // for SPEED_FACTORS, speed | |||
#include "translate.h" // for CTRL_EMBEDDED, IsDigit09, utf8_out | |||
#include "translate.h" // for CTRL_EMBEDDED | |||
#include "voice.h" // for SelectVoice, SelectVoiceByName | |||
#include "speech.h" // for MAKE_MEM_UNDEFINED | |||
@@ -34,7 +34,6 @@ | |||
#include "dictionary.h" | |||
#include "mbrola.h" | |||
#include "readclause.h" | |||
#include "setlengths.h" | |||
#include "synthdata.h" | |||
#include "wavegen.h" |
@@ -31,6 +31,7 @@ | |||
#include <espeak-ng/speak_lib.h> | |||
#include <espeak-ng/encoding.h> | |||
#include "common.h" | |||
#include "setlengths.h" // for SetLengthMods | |||
#include "translate.h" // for Translator, LANGUAGE_OPTIONS, L, NUM... | |||
@@ -33,6 +33,7 @@ | |||
#include <espeak-ng/encoding.h> | |||
#include "translate.h" | |||
#include "common.h" | |||
#include "dictionary.h" // for TranslateRules, LookupDictList, Cha... | |||
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonPAUSE_... | |||
#include "phonemelist.h" // for MakePhonemeList | |||
@@ -104,111 +105,9 @@ static char source[N_TR_SOURCE+40]; // extra space for embedded command & voice | |||
int n_replace_phonemes; | |||
REPLACE_PHONEMES replace_phonemes[N_REPLACE_PHONEMES]; | |||
// brackets, also 0x2014 to 0x201f which don't need to be in this list
static const unsigned short brackets[] = { | |||
'(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`', | |||
0xab, 0xbb, // double angle brackets | |||
0x300a, 0x300b, // double angle brackets (ideograph) | |||
0xe000+'<', // private usage area | |||
0 | |||
}; | |||
// other characters which break a word, but don't produce a pause | |||
static const unsigned short breaks[] = { '_', 0 }; | |||
int IsAlpha(unsigned int c)
{
	// Replacement for iswalpha() which also accepts some in-word symbols
	// (vowel marks, combining accents, ...). Returns 1 or 0.
	static const unsigned short extra_indic_alphas[] = {
		0xa70, 0xa71, // Gurmukhi: tippi, addak
		0
	};
	// inclusive ranges which are accepted even when iswalpha() rejects them
	static const struct {
		unsigned int first;
		unsigned int last;
	} extra_ranges[] = {
		{ 0x0300, 0x036f }, // combining accents
		{ 0x05b0, 0x05c2 }, // Hebrew vowel marks
		{ 0x0605, 0x0605 },
		{ 0x064b, 0x065e }, // Arabic vowel marks
		{ 0x0670, 0x0670 }, // Arabic vowel mark
		{ 0x0f40, 0x0fbc }, // Tibetan
		{ 0x1100, 0x11ff }, // Korean jamo
		{ 0x2800, 0x28ff }, // braille
		{ 0x3041, 0xa700 }, // Chinese/Japanese. Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
	};
	unsigned int ix;

	if (iswalpha(c))
		return 1;
	if (c < 0x300)
		return 0;

	if ((c >= 0x901) && (c <= 0xdf7)) {
		// Indic scripts: Devanagari, Tamil, etc. The answer is decided here.
		return ((c & 0x7f) < 0x64)
		       || (lookupwchar(extra_indic_alphas, c) != 0)
		       || ((c >= 0xd7a) && (c <= 0xd7f)); // Malayalam chillu characters
	}

	for (ix = 0; ix < sizeof(extra_ranges) / sizeof(extra_ranges[0]); ix++) {
		if ((c >= extra_ranges[ix].first) && (c <= extra_ranges[ix].last))
			return 1;
	}
	return 0;
}
int IsDigit09(unsigned int c)
{
	// Returns 1 only for the ASCII digits '0' to '9', otherwise 0
	return (c >= '0') && (c <= '9');
}
int IsDigit(unsigned int c)
{
	// Returns 1 if iswdigit() accepts c, or if c is in 0x966..0x96f; otherwise 0
	return iswdigit(c) || ((c >= 0x966) && (c <= 0x96f));
}
static int IsSpace(unsigned int c)
{
	// Extended white-space test. Returns 0 for c == 0, 1 for the two extra
	// ranges below, and otherwise whatever iswspace() returns.
	int box_drawing = (c >= 0x2500) && (c < 0x25a0); // box drawing characters
	int specials = (c >= 0xfff9) && (c <= 0xffff); // unicode specials

	if (c == 0)
		return 0;
	if (box_drawing || specials)
		return 1;
	return iswspace(c);
}
int isspace2(unsigned int c)
{
	// can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
	// Returns 1 when c is no greater than ' ' and its low byte is not zero, otherwise 0.
	return ((c & 0xff) != 0) && (c <= ' ');
}
void DeleteTranslator(Translator *tr) | |||
{ | |||
if (!tr) return; | |||
@@ -230,136 +129,6 @@ int lookupwchar(const unsigned short *list, int c) | |||
return 0; | |||
} | |||
int IsBracket(int c) | |||
{ | |||
if ((c >= 0x2014) && (c <= 0x201f)) | |||
return 1; | |||
return lookupwchar(brackets, c); | |||
} | |||
int utf8_nbytes(const char *buf)
{
	// Returns the number of bytes for the first UTF-8 character in buf,
	// judged from its first byte only: below 0x80 -> 1, below 0xe0 -> 2,
	// below 0xf0 -> 3, anything else -> 4
	unsigned char lead = (unsigned char)buf[0];

	return 1 + (lead >= 0x80) + (lead >= 0xe0) + (lead >= 0xf0);
}
int utf8_in2(int *c, const char *buf, int backwards)
{
	// Read one unicode character from a UTF-8 string.
	//   c:         receives the integer value of the character
	//   buf:       where to start reading
	//   backwards: non-zero to step backwards (instead of forwards) over
	//              continuation bytes until the start of a character is found
	// Returns 1 + the number of continuation bytes read after the lead byte.
	static const unsigned char lead_mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };
	int value;
	int expected = 0; // continuation bytes announced by the lead byte
	int consumed = 0; // continuation bytes actually read

	// a 10xxxxxx byte is not the first byte of a character: move off it
	while ((*buf & 0xc0) == 0x80)
		buf += backwards ? -1 : 1;

	value = *buf++;
	if (value & 0x80) {
		if ((value & 0xe0) == 0xc0)
			expected = 1;
		else if ((value & 0xf0) == 0xe0)
			expected = 2;
		else if ((value & 0xf8) == 0xf0)
			expected = 3;

		value &= lead_mask[expected];
		// stop early if the string ends in the middle of the character
		while ((consumed < expected) && (*buf != 0)) {
			value = (value << 6) + (*buf++ & 0x3f);
			consumed++;
		}
	}

	*c = value;
	return consumed + 1;
}
#pragma GCC visibility push(default) | |||
int utf8_in(int *c, const char *buf)
{
	/* Read one unicode character from a UTF-8 string, moving forwards.
	 *   c:   receives the character value, built from the payload bits that
	 *        remain once the UTF-8 header bits of each byte are dropped
	 *   buf: the UTF-8 text to read
	 * Returns the number of UTF-8 bytes used.
	 *
	 * Examples:
	 *   2-byte "ā":  c4 81        ->  0x0101
	 *   3-byte "ꙅ":  ea 99 85     ->  0xA645
	 *   4-byte "𠜎":  f0 a0 9c 8e  ->  0x02070e
	 */
	int n_used = utf8_in2(c, buf, 0); // 0 = not reading backwards
	return n_used;
}
#pragma GCC visibility pop | |||
int utf8_out(unsigned int c, char *buf)
{
	// Store the character value c in buf as UTF-8 (no terminating zero is added).
	// Returns the number of bytes written (1 to 4).
	// A value of 0x110000 or above is out of range and is written as one space.
	static const unsigned char lead_marker[4] = { 0, 0xc0, 0xe0, 0xf0 };
	int n_trailing; // number of bytes which follow the lead byte
	int pos;

	if (c < 0x80) {
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' ';
		return 1;
	}

	n_trailing = (c < 0x0800) ? 1 : ((c < 0x10000) ? 2 : 3);

	// lead byte: length marker plus the most significant payload bits
	buf[0] = lead_marker[n_trailing] | (c >> (6 * n_trailing));

	// trailing bytes: 6 payload bits each, most significant group first
	for (pos = 1; pos <= n_trailing; pos++)
		buf[pos] = 0x80 + ((c >> (6 * (n_trailing - pos))) & 0x3f);

	return n_trailing + 1;
}
char *strchr_w(const char *s, int c) | |||
{ | |||
// return NULL for any non-ascii character |
@@ -64,9 +64,6 @@ extern "C" | |||
#define FLAG_ALT_TRANS 0x8000 // language specific | |||
#define FLAG_ALT2_TRANS 0x10000 // language specific | |||
#define FLAG_ALT3_TRANS 0x20000 // language specific | |||
#define FLAG_ALT4_TRANS 0x40000 // language specific | |||
#define FLAG_ALT5_TRANS 0x80000 // language specific | |||
#define FLAG_ALT6_TRANS 0x100000 // language specific | |||
#define FLAG_ALT7_TRANS 0x200000 // language specific | |||
#define FLAG_COMBINE 0x800000 // combine with the next word | |||
@@ -661,21 +658,11 @@ extern int (*phoneme_callback)(const char *); | |||
#define LEADING_2_BITS 0xC0 // 0b11000000 | |||
#define UTF8_TAIL_BITS 0x80 // 0b10000000 | |||
ESPEAK_NG_API int utf8_in(int *c, const char *buf); | |||
int utf8_in2(int *c, const char *buf, int backwards); | |||
int utf8_out(unsigned int c, char *buf); | |||
int utf8_nbytes(const char *buf); | |||
int lookupwchar(const unsigned short *list, int c); | |||
char *strchr_w(const char *s, int c); | |||
int IsBracket(int c); | |||
void InitNamedata(void); | |||
void InitText(int flags); | |||
void InitText2(void); | |||
int IsDigit(unsigned int c); | |||
int IsDigit09(unsigned int c); | |||
int IsAlpha(unsigned int c); | |||
int isspace2(unsigned int c); | |||
ALPHABET *AlphabetFromChar(int c); | |||
Translator *SelectTranslator(const char *name); | |||
@@ -686,8 +673,6 @@ void ProcessLanguageOptions(LANGUAGE_OPTIONS *langopts); | |||
void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len); | |||
void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags); | |||
int TranslateWord(Translator *tr, char *word1, WORD_TAB *wtab, char *word_out); | |||
void TranslateClause(Translator *tr, int *tone, char **voice_change); | |||
@@ -36,7 +36,7 @@ | |||
#include "translate.h" | |||
#include "translateword.h" | |||
#include "common.h" // for strncpy0 | |||
#include "dictionary.h" // for TranslateRules, LookupDictList, Cha... | |||
#include "dictionary.h" // for TranslateRules, LookupDictList | |||
#include "numbers.h" // for SetSpellingStress, ... | |||
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonPAUSE_... | |||
#include "readclause.h" // for towlower2 | |||
@@ -46,10 +46,14 @@ | |||
// Forward declarations of helpers that are private to this translation unit
// (every one of them is declared static here).
static void addPluralSuffixes(int flags, Translator *tr, char last_char, char *word_phonemes); | |||
static void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags); | |||
static void ChangeWordStress(Translator *tr, char *word, int new_stress); | |||
static int CheckDottedAbbrev(char *word1); | |||
static int NonAsciiNumber(int letter); | |||
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word, ALPHABET *current_alphabet, char word_phonemes[]); | |||
static int TranslateLetter(Translator *tr, char *word, char *phonemes, int control, ALPHABET *current_alphabet); | |||
static int Unpronouncable(Translator *tr, char *word, int posn); | |||
static int Unpronouncable2(Translator *tr, char *word); | |||
int TranslateWord3(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_out, bool *any_stressed_words, ALPHABET *current_alphabet, char word_phonemes[], size_t size_word_phonemes) | |||
{ | |||
@@ -667,6 +671,82 @@ int TranslateWord3(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_ | |||
} | |||
void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags)
{
	// Post-processing step, applied once the translation of a word is complete.
	//
	// Only active when bit 1 (value 2) of the LOPT_ALT language option is set.
	// The phoneme that directly follows the first primary-stress marker is
	// rewritten in place:
	//   FLAG_ALT2_TRANS set in dict_flags:  'E' -> 'e',  'O' -> 'o'
	//   otherwise:                          'e' -> 'E',  'o' -> 'O'
	//
	// tr:         translator whose language options are consulted
	// phonemes:   zero-terminated phoneme string, modified in place
	// dict_flags: dictionary flags for the word

	int n_phonemes;
	int pos;
	char *target;

	n_phonemes = strlen(phonemes);

	if ((tr->langopts.param[LOPT_ALT] & 2) == 0)
		return;

	// Find the first primary-stress marker that still has a phoneme after it.
	pos = 0;
	while ((pos < (n_phonemes-1)) && (phonemes[pos] != phonSTRESS_P))
		pos++;

	if (pos >= (n_phonemes-1))
		return; // no stress marker found

	target = &phonemes[pos+1];

	if ((dict_flags & FLAG_ALT2_TRANS) == 0) {
		if (*target == PhonemeCode('e'))
			*target = PhonemeCode('E');
		if (*target == PhonemeCode('o'))
			*target = PhonemeCode('O');
	} else {
		if (*target == PhonemeCode('E'))
			*target = PhonemeCode('e');
		if (*target == PhonemeCode('O'))
			*target = PhonemeCode('o');
	}
}
static void ChangeWordStress(Translator *tr, char *word, int new_stress)
{
	// Changes the stress of the phoneme string 'word', rewriting it in place.
	//
	// new_stress >= STRESS_IS_PRIMARY: the first vowel whose stress is at least
	//     the maximum found in the word is given new_stress.
	// otherwise: every vowel whose stress is above new_stress is lowered to it.
	//
	// The string is then regenerated with a stress phoneme placed in front of
	// each syllabic vowel that needs one, so 'word' may become longer; the
	// caller's buffer must have room for that.

	unsigned char original[N_WORD_PHONEMES];      // untouched copy of the incoming phonemes
	signed char stress_level[N_WORD_PHONEMES/2];  // per-vowel stress, entries are used from index 1
	int n_vowels;                                 // num of vowels + 1
	int primary_syllable = 0;                     // position of stressed syllable
	int strongest;
	int v;
	const unsigned char *src;

	strcpy((char *)original, word);
	strongest = GetVowelStress(tr, original, stress_level, &n_vowels, &primary_syllable, 0);

	if (new_stress < STRESS_IS_PRIMARY) {
		// remove primary stress: cap every vowel at new_stress
		for (v = 1; v < n_vowels; v++) {
			if (stress_level[v] > new_stress)
				stress_level[v] = new_stress;
		}
	} else {
		// promote to primary stress: only the first vowel carrying the maximum stress
		v = 1;
		while ((v < n_vowels) && (stress_level[v] < strongest))
			v++;
		if (v < n_vowels)
			stress_level[v] = new_stress;
	}

	// write out phonemes, inserting stress markers before syllabic vowels
	v = 1;
	for (src = original; *src != 0; src++) {
		if ((phoneme_tab[*src]->type == phVOWEL) && ((phoneme_tab[*src]->phflags & phNONSYLLABIC) == 0)) {
			signed char level = stress_level[v];
			v++;
			if ((level == STRESS_IS_DIMINISHED) || (level > STRESS_IS_UNSTRESSED))
				*word++ = stress_phonemes[(unsigned char)level];
		}
		*word++ = *src;
	}
	*word = 0;
}
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word, ALPHABET *current_alphabet, char word_phonemes[]) | |||
{ | |||
int posn = 0; | |||
@@ -1034,3 +1114,92 @@ static int NonAsciiNumber(int letter) | |||
} | |||
return -1; | |||
} | |||
static int Unpronouncable(Translator *tr, char *word, int posn)
{
	/* Decides whether a word is 'unpronounceable', i.e. whether it should
	   be spoken as individual letters.
	   This is the generic version; the check may be language specific.

	   tr:    translator supplying the language options
	   word:  UTF-8 text, the word ends at a space or at the terminating zero
	   posn:  passed by the caller; an apostrophe ends the scan when posn > 0
	   Returns 1 if the word should be spelled out, otherwise 0.
	*/

	int letter;
	int first_letter = 0;
	int first_vowel = 9;  // remains 9 when the scan finds no vowel
	int offset = 0;
	int n_letters = 0;
	ALPHABET *script;

	utf8_in(&letter, word);

	if ((tr->letter_bits_offset > 0) && (letter < 0x241)) {
		// Latin characters for a language with a non-latin alphabet:
		// report as pronounceable so the word can be re-translated as English
		return 0;
	}

	script = AlphabetFromChar(letter);
	if ((script != NULL) && (script->offset != tr->letter_bits_offset)) {
		// the character does not belong to this language's alphabet
		return 0;
	}

	if (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 1)
		return 0;

	// nothing to examine if the word starts with a space, an apostrophe, or is empty
	letter = *word;
	if ((letter == ' ') || (letter == 0) || (letter == '\''))
		return 0;

	while (1) {
		offset += utf8_in(&letter, &word[offset]);

		if ((letter == 0) || (letter == ' '))
			break; // end of the word

		if (letter == '\'') {
			if ((n_letters > 1) || (posn > 0))
				break; // "tv'" but not "l'"
		}

		if (n_letters == 0)
			first_letter = letter;

		// with option value 3 an apostrophe is not counted as a letter
		if ((letter != '\'') || (tr->langopts.param[LOPT_UNPRONOUNCABLE] != 3))
			n_letters++;

		if (IsVowel(tr, letter)) {
			first_vowel = n_letters; // position of the first vowel
			break;
		}

		if (!((letter == '\'') || iswalpha(letter)))
			return 0; // something other than a letter or an apostrophe
	}

	if ((first_vowel > 2) && (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 2)) {
		// look up the unpronounceable rules in *_rules
		return Unpronouncable2(tr, word);
	}

	// the option value can also be a character code: if the word starts with
	// that letter, disregard it as the initial letter when counting
	if (first_letter == tr->langopts.param[LOPT_UNPRONOUNCABLE])
		first_vowel--;

	// no vowel, or no vowel within the first few letters
	return (first_vowel > (tr->langopts.max_initial_consonants+1)) ? 1 : 0;
}
static int Unpronouncable2(Translator *tr, char *word)
{
	// Runs the translation rules over 'word' with FLAG_UNPRON_TEST.
	// Returns 1 when TranslateRules() yields no end flags at all or yields
	// SUFX_UNPRON, otherwise 0.
	//
	// The byte just before 'word' is temporarily replaced by a space and then
	// restored, so the caller must pass a pointer that has a writable byte in
	// front of it.

	char phonemes[N_WORD_PHONEMES];
	char *preceding = &word[-1];
	char saved;
	int flags;

	phonemes[0] = 0;

	saved = *preceding;
	*preceding = ' '; // ensure there is a space before the "word"
	flags = TranslateRules(tr, word, phonemes, sizeof(phonemes), NULL, FLAG_UNPRON_TEST, NULL);
	*preceding = saved;

	return ((flags == 0) || (flags & SUFX_UNPRON)) ? 1 : 0;
}