Browse Source

Merge pull request #1379

Code cleanup: relocate functions & remove unused #defines
master
jaacoppi 2 years ago
parent
commit
df8af89ddd
No account linked to committer's email address

+ 253
- 1
src/libespeak-ng/common.c View File

@@ -28,13 +28,15 @@
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <wctype.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>
#include <ucd/ucd.h>

#include "common.h"
#include "translate.h"

#pragma GCC visibility push(default)

@@ -57,4 +59,254 @@ void strncpy0(char *to, const char *from, int size)
strncpy(to, from, size);
to[size-1] = 0;
}

int utf8_in(int *c, const char *buf)
{
/* Read one Unicode character from a UTF-8 string, scanning forwards.
* Thin wrapper: forwards to utf8_in2() with backwards = 0 and returns
* its result (the number of UTF-8 bytes used).
* buf: start of the UTF-8 text; passed by value, so the caller's own
* pointer is not modified here and must be advanced by the return value.
* c: receives the decoded character value, i.e. the code point (not a
* UTF-16 encoding: the 4-byte example below yields 0x02070e). It is
* assembled by skipping the UTF-8 header bits of each byte in the
* following way:
* 2-byte character "ā":
* hex binary
* c481 1100010010000001
* | 11000100 000001
* V \ \ | |
* 0101 0000000100000001
* 3-byte character "ꙅ":
* ea9985 111010101001100110000101
* 1010 011001 000101
* | + +--.\ \ | |
* V `--. \`. `.| |
* A645 1010011001000101
* 4-byte character "𠜎":
* f0a09c8e 11110000101000001001110010001110
* V 000 100000 011100 001110
* 02070e 000000100000011100001110
*/
return utf8_in2(c, buf, 0);
}
#pragma GCC visibility pop

int utf8_out(unsigned int c, char *buf)
{
	// Encode the character value c as UTF-8 into buf (no terminating NUL is written).
	// Returns the number of bytes written (1 to 4).
	// Values of 0x110000 and above are out of range and are written as a single space.

	// marker bits of the lead byte, indexed by the number of trailing bytes
	static const unsigned char lead_marker[4] = { 0, 0xc0, 0xe0, 0xf0 };
	int trailing;
	int pos;

	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}
	if (c < 0x80) {
		buf[0] = c; // plain ASCII, single byte
		return 1;
	}

	trailing = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;

	// lead byte: marker bits plus the most significant payload bits
	buf[0] = lead_marker[trailing] | (c >> (6 * trailing));

	// trailing bytes: 10xxxxxx, six payload bits each, most significant first
	for (pos = 1; pos <= trailing; pos++)
		buf[pos] = 0x80 + ((c >> (6 * (trailing - pos))) & 0x3f);

	return trailing + 1;
}

int utf8_in2(int *c, const char *buf, int backwards)
{
	// Decode one character from a UTF-8 string.
	// c: receives the decoded character value
	// buf: where to start looking; passed by value, the caller's pointer is not changed
	// backwards: non-zero to search backwards (instead of forwards) for the
	//            lead byte when buf points into the middle of a character
	// Returns the number of bytes of the decoded character (lead byte plus
	// the continuation bytes actually consumed); bytes skipped while
	// searching for the lead byte are not counted.

	// payload bits of the lead byte, indexed by the number of trailing bytes
	static const unsigned char lead_mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };
	const int step = backwards ? -1 : 1;
	int value;
	int expected = 0; // trailing bytes announced by the lead byte
	int consumed = 0; // trailing bytes actually read

	// step over continuation bytes (10xxxxxx) until a lead byte is found
	while ((*buf & 0xc0) == 0x80)
		buf += step;

	value = *buf++;
	if (value & 0x80) {
		if ((value & 0xe0) == 0xc0)
			expected = 1;
		else if ((value & 0xf0) == 0xe0)
			expected = 2;
		else if ((value & 0xf8) == 0xf0)
			expected = 3;

		value &= lead_mask[expected];

		// a NUL byte before all announced bytes were read means a truncated sequence
		while ((consumed < expected) && (*buf != 0)) {
			value = (value << 6) + (*buf++ & 0x3f);
			consumed++;
		}
	}

	*c = value;
	return consumed + 1;
}


int IsAlpha(unsigned int c)
{
	// Replacement for iswalpha() which also accepts some in-word symbols
	// (vowel marks, combining accents, ...). Returns 1 if c counts as
	// alphabetic, otherwise 0.

	static const unsigned short extra_indic_alphas[] = {
		0xa70, 0xa71, // Gurmukhi: tippi, addak
		0
	};

	// additional inclusive ranges { first, last } treated as alphabetic
	static const unsigned int extra_ranges[][2] = {
		{ 0x0300, 0x036f }, // combining accents
		{ 0x05b0, 0x05c2 }, // Hebrew vowel marks
		{ 0x0605, 0x0605 },
		{ 0x064b, 0x065e }, // arabic vowel marks
		{ 0x0670, 0x0670 }, // arabic vowel marks
		{ 0x0f40, 0x0fbc }, // tibetan
		{ 0x1100, 0x11ff }, // Korean jamo
		{ 0x2800, 0x28ff }, // braille
		{ 0x3041, 0xa700 }, // Chinese/Japanese. Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
	};
	unsigned int ix;

	if (iswalpha(c))
		return 1;

	if (c < 0x300)
		return 0;

	if ((c >= 0x901) && (c <= 0xdf7)) {
		// Indic scripts: Devanagari, Tamil, etc
		// This range is decided here alone; the table below is not consulted.
		return (((c & 0x7f) < 0x64)
		        || (lookupwchar(extra_indic_alphas, c) != 0)
		        || ((c >= 0xd7a) && (c <= 0xd7f))) // malayalam chillu characters
		       ? 1 : 0;
	}

	for (ix = 0; ix < sizeof(extra_ranges) / sizeof(extra_ranges[0]); ix++) {
		if ((c >= extra_ranges[ix][0]) && (c <= extra_ranges[ix][1]))
			return 1;
	}

	return 0;
}

// Characters treated as brackets by IsBracket().
// The range 0x2014 to 0x201f is also treated as brackets, but IsBracket()
// tests for it explicitly, so those values don't need to be in this list.
// The list ends with a 0 entry (presumably the terminator lookupwchar() expects - confirm).
static const unsigned short brackets[] = {
'(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`',
0xab, 0xbb, // double angle brackets
0x300a, 0x300b, // double angle brackets (ideograph)
0xe000+'<', // private usage area
0
};

int IsBracket(int c)
{
	// Returns 1 for the range 0x2014..0x201f, which is not stored in the
	// brackets[] list; otherwise returns whatever lookupwchar() reports
	// for c in brackets[].
	const int in_fixed_range = (c >= 0x2014) && (c <= 0x201f);

	return in_fixed_range ? 1 : lookupwchar(brackets, c);
}

int IsDigit09(unsigned int c)
{
	// Returns 1 only for the ASCII digits '0'..'9', otherwise 0.
	return (c >= '0') && (c <= '9');
}

int IsDigit(unsigned int c)
{
	// Returns 1 if iswdigit() accepts c, or if c lies in 0x966..0x96f
	// (the Devanagari digits in Unicode); otherwise 0.
	const int in_extra_range = (c >= 0x966) && (c <= 0x96f);

	return (iswdigit(c) || in_extra_range) ? 1 : 0;
}

int IsSpace(unsigned int c)
{
	// Extended whitespace test.
	// 0 is never a space; box drawing characters and the unicode specials
	// count as spaces (result 1); everything else is decided by
	// iswspace(), whose result is returned unchanged.
	const int box_drawing = (c >= 0x2500) && (c < 0x25a0);
	const int unicode_special = (c >= 0xfff9) && (c <= 0xffff);

	if (c == 0)
		return 0;
	if (box_drawing || unicode_special)
		return 1;
	return iswspace(c);
}

int isspace2(unsigned int c)
{
	// can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
	// Returns 1 when c is not above ' ' and its low byte is non-zero,
	// otherwise 0.
	const int low_byte_set = (c & 0xff) != 0;

	return (low_byte_set && (c <= ' ')) ? 1 : 0;
}

int is_str_totally_null(const char* str, int size) {
	// Tests if all bytes of str are null up to size.
	// Returns 1 if the first 'size' bytes are all zero, otherwise 0.
	// An empty range (size <= 0) is reported as all-null without reading
	// str: previously size-1 went negative and was converted to a huge
	// size_t length for memcmp().
	//
	// This should never be reimplemented with integers, because
	// this function has to work with unaligned char*
	// (casting to int when unaligned may result in unguaranteed behaviors)
	if (size <= 0)
		return 1;

	// str[0] is zero and every byte equals its successor => all bytes are zero
	return (*str == 0 && memcmp(str, str+1, size-1) == 0);
}

int Read4Bytes(FILE *f)
{
	// Read 4 bytes (least significant first) into a word.
	// f: stream to read from.
	// Returns the assembled 32-bit value as an int.
	// A failed read is not reported: fgetc()'s EOF value is masked to
	// 0xff, so each missing byte reads as 0xff (unchanged behaviour).
	//
	// The word is built in an unsigned accumulator, because shifting a
	// byte >= 0x80 left by 24 as a (promoted) signed int overflows,
	// which is undefined behaviour.
	unsigned int acc = 0;
	int ix;

	for (ix = 0; ix < 4; ix++) {
		unsigned int byte = (unsigned int)fgetc(f) & 0xff;
		acc |= byte << (ix * 8);
	}
	return (int)acc;
}

int towlower2(unsigned int c, Translator *translator)
{
	// Lower-case conversion which allows for non-standard upper to lower
	// case conversions: 'I' becomes 0x131 (dotless ı) when the
	// translator's langopts.dotless_i option is set. Every other case is
	// handled by ucd_tolower().
	// The translator is only consulted when c is 'I'.
	if (c != 'I')
		return ucd_tolower(c);

	return translator->langopts.dotless_i ? 0x131 : ucd_tolower(c);
}



+ 15
- 0
src/libespeak-ng/common.h View File

@@ -21,10 +21,25 @@
#define ESPEAK_NG_COMMON_H

#include "espeak-ng/espeak_ng.h"
#include "translate.h"

extern ESPEAK_NG_API int GetFileLength(const char *filename);
extern ESPEAK_NG_API void strncpy0(char *to, const char *from, int size);

int IsAlpha(unsigned int c);
int IsBracket(int c);
int IsDigit(unsigned int c);
int IsDigit09(unsigned int c);
int IsSpace(unsigned int c);
int isspace2(unsigned int c);
int is_str_totally_null(const char* str, int size); // Tests if all bytes of str up to size are null
int Read4Bytes(FILE *f);
int towlower2(unsigned int c, Translator *translator); // Supports Turkish I

ESPEAK_NG_API int utf8_in(int *c, const char *buf);
int utf8_in2(int *c, const char *buf, int backwards);
int utf8_out(unsigned int c, char *buf);

#ifdef __cplusplus
}
#endif

+ 1
- 2
src/libespeak-ng/compiledata.c View File

@@ -35,11 +35,10 @@
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>

#include "common.h" // for GetFileLength, strncpy0
#include "common.h" // for GetFileLength, strncpy0, ...
#include "error.h" // for create_file_error_context
#include "mnemonics.h" // for LookupMnemName, MNEM_TAB
#include "phoneme.h" // for PHONEME_TAB, PHONEME_TAB_LIST
#include "readclause.h" // for Read4Bytes
#include "spect.h" // for SpectFrame, peak_t, SpectSeq
#include "speech.h" // for path_home, GetFileLength
#include "synthdata.h" // for LoadPhData

+ 0
- 2
src/libespeak-ng/compiledict.c View File

@@ -39,10 +39,8 @@
#include "error.h" // for create_file_error_context
#include "mnemonics.h" // for LookupMnemName, MNEM_TAB
#include "phoneme.h" // for PHONEME_TAB_LIST, phonSWITCH, phone...
#include "readclause.h" // for towlower2
#include "speech.h" // for path_home
#include "synthesize.h" // for Write4Bytes
#include "translate.h" // for isspace2, IsDigit09, utf8_in, utf8_out

static FILE *f_log = NULL;


+ 22
- 169
src/libespeak-ng/dictionary.c View File

@@ -36,13 +36,15 @@
#include "dictionary.h"
#include "numbers.h" // for LookupAccentedLetter, Look...
#include "phoneme.h" // for PHONEME_TAB, phVOWEL, phon...
#include "readclause.h" // for WordToString2, is_str_tota...
#include "readclause.h" // for WordToString2
#include "speech.h" // for path_home
#include "compiledict.h" // for DecodeRule
#include "synthdata.h" // for PhonemeCode, InterpretPhoneme
#include "synthesize.h" // for STRESS_IS_PRIMARY, phoneme...
#include "translate.h" // for Translator, utf8_in, LANGU...

static int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out);

typedef struct {
int points;
const char *phonemes;
@@ -761,96 +763,7 @@ int IsVowel(Translator *tr, int letter)
return IsLetter(tr, letter, LETTERGP_VOWEL2);
}

static int Unpronouncable2(Translator *tr, char *word)
{
	// Runs TranslateRules() on the word with FLAG_UNPRON_TEST and returns 1
	// if the result is 0 or has SUFX_UNPRON set, otherwise 0.
	// The byte before the word is temporarily replaced by a space and
	// restored afterwards, so word[-1] must be writable.
	char ph_buf[N_WORD_PHONEMES];
	char saved_prev;
	int end_flags;

	ph_buf[0] = 0;

	saved_prev = word[-1];
	word[-1] = ' '; // ensure there is a space before the "word"
	end_flags = TranslateRules(tr, word, ph_buf, sizeof(ph_buf), NULL, FLAG_UNPRON_TEST, NULL);
	word[-1] = saved_prev;

	return ((end_flags == 0) || (end_flags & SUFX_UNPRON)) ? 1 : 0;
}

int Unpronouncable(Translator *tr, char *word, int posn)
{
/* Determines whether a word is 'unpronouncable', i.e. whether it should
be spoken as individual letters.

This function may be language specific. This is a generic version.

tr: translator whose letter_bits_offset and langopts are consulted
word: UTF-8 text of the word; scanning stops at a NUL or a space
posn: when > 0, an apostrophe ends the scan even at the start of the word
Returns 1 if the word is considered unpronouncable, otherwise 0.
*/

int c;
int c1 = 0; // first character of the word, recorded while count == 0
int vowel_posn = 9; // stays at 9 if no vowel is found
int index;
int count;
ALPHABET *alphabet;

// look at the first character only, to decide about alphabets
utf8_in(&c, word);
if ((tr->letter_bits_offset > 0) && (c < 0x241)) {
// Latin characters for a language with a non-latin alphabet
return 0; // so we can re-translate the word as English
}

if (((alphabet = AlphabetFromChar(c)) != NULL) && (alphabet->offset != tr->letter_bits_offset)) {
// Character is not in our alphabet
return 0;
}

// option value 1: never report a word as unpronouncable
if (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 1)
return 0;

// an empty word, or one starting with a space or an apostrophe, is not tested
if (((c = *word) == ' ') || (c == 0) || (c == '\''))
return 0;

// count characters up to and including the first vowel
index = 0;
count = 0;
for (;;) {
index += utf8_in(&c, &word[index]);
if ((c == 0) || (c == ' '))
break;

if ((c == '\'') && ((count > 1) || (posn > 0)))
break; // "tv'" but not "l'"

if (count == 0)
c1 = c;

if ((c == '\'') && (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 3)) {
// don't count apostrophe
} else
count++;

if (IsVowel(tr, c)) {
vowel_posn = count; // position of the first vowel
break;
}

// any other non-alphabetic character: don't treat the word as unpronouncable
if ((c != '\'') && !iswalpha(c))
return 0;
}

if ((vowel_posn > 2) && (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 2)) {
// Lookup unpronounable rules in *_rules
return Unpronouncable2(tr, word);
}

// the option value is also compared with the first character of the word
if (c1 == tr->langopts.param[LOPT_UNPRONOUNCABLE])
vowel_posn--; // disregard this as the initial letter when counting

if (vowel_posn > (tr->langopts.max_initial_consonants+1))
return 1; // no vowel, or no vowel in first few letters

return 0;
}

static int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char *vowel_stress, int *vowel_count, int *stressed_syllable, int control)
int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char *vowel_stress, int *vowel_count, int *stressed_syllable, int control)
{
// control = 1, set stress to 1 for forced unstressed vowels
unsigned char phcode;
@@ -962,55 +875,11 @@ static int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char *
return max_stress;
}

static char stress_phonemes[] = {
const char stress_phonemes[] = {
phonSTRESS_D, phonSTRESS_U, phonSTRESS_2, phonSTRESS_3,
phonSTRESS_P, phonSTRESS_P2, phonSTRESS_TONIC
};

void ChangeWordStress(Translator *tr, char *word, int new_stress)
{
	// Rewrites the phoneme string 'word' in place with changed stress.
	// new_stress >= STRESS_IS_PRIMARY: the first vowel whose stress is at
	//   least the word's maximum gets new_stress.
	// otherwise: every vowel with a stress above new_stress is lowered to it.
	// NOTE(review): the copy into phonetic[] is unbounded and the rewritten
	// string can be longer than the input - confirm callers guarantee room.

	unsigned char phonetic[N_WORD_PHONEMES];
	signed char vowel_stress[N_WORD_PHONEMES/2];
	int vowel_count; // num of vowels + 1
	int stressed_syllable = 0; // position of stressed syllable
	int max_stress;
	int vowel_ix;
	const unsigned char *in;
	char *out = word;

	strcpy((char *)phonetic, word);
	max_stress = GetVowelStress(tr, phonetic, vowel_stress, &vowel_count, &stressed_syllable, 0);

	if (new_stress < STRESS_IS_PRIMARY) {
		// remove primary stress
		for (vowel_ix = 1; vowel_ix < vowel_count; vowel_ix++) {
			if (vowel_stress[vowel_ix] > new_stress) // >= allows for diminished stress (=1)
				vowel_stress[vowel_ix] = new_stress;
		}
	} else {
		// promote to primary stress: only the first vowel at the maximum level
		vowel_ix = 1;
		while ((vowel_ix < vowel_count) && (vowel_stress[vowel_ix] < max_stress))
			vowel_ix++;
		if (vowel_ix < vowel_count)
			vowel_stress[vowel_ix] = new_stress;
	}

	// write out phonemes, with a stress marker in front of each marked vowel
	vowel_ix = 1;
	for (in = phonetic; *in != 0; in++) {
		const int syllabic_vowel = (phoneme_tab[*in]->type == phVOWEL) && !(phoneme_tab[*in]->phflags & phNONSYLLABIC);

		if (syllabic_vowel) {
			const signed char level = vowel_stress[vowel_ix++];

			if ((level == STRESS_IS_DIMINISHED) || (level > STRESS_IS_UNSTRESSED))
				*out++ = stress_phonemes[(unsigned char)level];
		}
		*out++ = *in;
	}
	*out = 0;
}

void SetWordStress(Translator *tr, char *output, unsigned int *dictionary_flags, int tonic, int control)
{
/* Guess stress pattern of word. This is language specific
@@ -2345,7 +2214,7 @@ int TranslateRules(Translator *tr, char *p_start, char *phonemes, int ph_size, c
if (letter == 0xe000+'(') {
if (pre_pause < tr->langopts.param[LOPT_BRACKET_PAUSE_ANNOUNCED])
pre_pause = tr->langopts.param[LOPT_BRACKET_PAUSE_ANNOUNCED]; // a bracket, already spoken by AnnouncePunctuation()
}
}
if (IsBracket(letter)) {
if (pre_pause < tr->langopts.param[LOPT_BRACKET_PAUSE])
pre_pause = tr->langopts.param[LOPT_BRACKET_PAUSE];
@@ -2456,37 +2325,6 @@ int TranslateRules(Translator *tr, char *p_start, char *phonemes, int ph_size, c
return 0;
}

void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags)
{
	// apply after the translation is complete
	//
	// Only acts when bit 1 (value 2) of langopts.param[LOPT_ALT] is set.
	// The phoneme following the first phonSTRESS_P marker is swapped
	// between the 'E'/'e' and 'O'/'o' phoneme codes: upper to lower when
	// dict_flags has FLAG_ALT2_TRANS, lower to upper otherwise.

	const int len = strlen(phonemes);
	const int alt2 = (dict_flags & FLAG_ALT2_TRANS) != 0;
	int ix;

	if (!(tr->langopts.param[LOPT_ALT] & 2))
		return;

	for (ix = 0; ix < (len-1); ix++) {
		char *target;

		if (phonemes[ix] != phonSTRESS_P)
			continue;

		target = &phonemes[ix+1];
		if (*target == PhonemeCode(alt2 ? 'E' : 'e'))
			*target = PhonemeCode(alt2 ? 'e' : 'E');
		if (*target == PhonemeCode(alt2 ? 'O' : 'o'))
			*target = PhonemeCode(alt2 ? 'o' : 'O');
		break; // only the first primary stress marker is handled
	}
}

int TransposeAlphabet(Translator *tr, char *text)
{
// transpose cyrillic alphabet (for example) into ascii (single byte) character codes
@@ -2856,6 +2694,21 @@ static const char *LookupDict2(Translator *tr, const char *word, const char *wor
return 0;
}


static int utf8_nbytes(const char *buf)
{
	// Returns the number of bytes for the first UTF-8 character in buf,
	// judged from its first byte only: below 0x80 -> 1, below 0xe0 -> 2,
	// below 0xf0 -> 3, anything else -> 4.
	const unsigned char lead = (unsigned char)buf[0];

	return (lead < 0x80) ? 1
	     : (lead < 0xe0) ? 2
	     : (lead < 0xf0) ? 3
	     : 4;
}

/* Lookup a specified word in the word dictionary.
Returns phonetic data in 'phonetic' and bits in 'flags'

@@ -3022,7 +2875,7 @@ int Lookup(Translator *tr, const char *word, char *ph_out)
return flags0;
}

int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out)
static int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out)
{
char buf[100];
static unsigned int flags[2];

+ 3
- 3
src/libespeak-ng/dictionary.h View File

@@ -31,22 +31,22 @@ extern "C"
{
#endif

extern const char stress_phonemes[];

int LoadDictionary(Translator *tr, const char *name, int no_error);
int HashDictionary(const char *string);
const char *EncodePhonemes(const char *p, char *outptr, int *bad_phoneme);
void DecodePhonemes(const char *inptr, char *outptr);
char *WritePhMnemonic(char *phon_out, PHONEME_TAB *ph, PHONEME_LIST *plist, int use_ipa, int *flags);
const char *GetTranslatedPhonemeString(int phoneme_mode);
int GetVowelStress(Translator *tr, unsigned char *phonemes, signed char *vowel_stress, int *vowel_count, int *stressed_syllable, int control);
int IsVowel(Translator *tr, int letter);
int Unpronouncable(Translator *tr, char *word, int posn);
void ChangeWordStress(Translator *tr, char *word, int new_stress);
void SetWordStress(Translator *tr, char *output, unsigned int *dictionary_flags, int tonic, int control);
void AppendPhonemes(Translator *tr, char *string, int size, const char *ph);
int TranslateRules(Translator *tr, char *p_start, char *phonemes, int ph_size, char *end_phonemes, int word_flags, unsigned int *dict_flags);
int TransposeAlphabet(Translator *tr, char *text);
int Lookup(Translator *tr, const char *word, char *ph_out);
int LookupDictList(Translator *tr, char **wordptr, char *ph_out, unsigned int *flags, int end_flags, WORD_TAB *wtab);
int LookupFlags(Translator *tr, const char *word, unsigned int **flags_out);
int RemoveEnding(Translator *tr, char *word, int end_type, char *word_copy);

#ifdef __cplusplus

+ 0
- 3
src/libespeak-ng/klatt.h View File

@@ -112,9 +112,6 @@ typedef struct {
#define Rnpp 10
#define R1p 11
#define R2p 12
#define R3p 13
#define R4p 14
#define R5p 15
#define R6p 16

#define RGL 17

+ 3
- 2
src/libespeak-ng/numbers.c View File

@@ -32,12 +32,13 @@
#include <espeak-ng/encoding.h>

#include "numbers.h"
#include "common.h"
#include "dictionary.h" // for Lookup, TranslateRules, EncodePhonemes, Look...
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonEND_WORD, phonP...
#include "readclause.h" // for WordToString2, towlower2
#include "readclause.h" // for WordToString2
#include "synthdata.h" // for SelectPhonemeTable
#include "synthesize.h" // for phoneme_tab
#include "translate.h" // for Translator, LANGUAGE_OPTIONS, IsDigit09, WOR...
#include "translate.h" // for Translator, LANGUAGE_OPTIONS, WOR...
#include "voice.h" // for voice, voice_t

#define M_LIGATURE 0x8000

+ 0
- 31
src/libespeak-ng/readclause.c View File

@@ -131,23 +131,6 @@ int clause_type_from_codepoint(uint32_t c)
return CLAUSE_NONE;
}

int is_str_totally_null(const char* str, int size) {
	// Tests if all bytes of str are null up to size.
	// Returns 1 if the first 'size' bytes are all zero, otherwise 0.
	// An empty range (size <= 0) is reported as all-null without reading
	// str: previously size-1 went negative and was converted to a huge
	// size_t length for memcmp().
	//
	// This should never be reimplemented with integers, because
	// this function has to work with unaligned char*
	// (casting to int when unaligned may result in unguaranteed behaviors)
	if (size <= 0)
		return 1;

	// str[0] is zero and every byte equals its successor => all bytes are zero
	return (*str == 0 && memcmp(str, str+1, size-1) == 0);
}

int towlower2(unsigned int c, Translator *translator)
{
	// Lower-case conversion which allows for non-standard upper to lower
	// case conversions: 'I' becomes 0x131 (dotless ı) when the
	// translator's langopts.dotless_i option is set. Every other case is
	// handled by ucd_tolower().
	// The translator is only consulted when c is 'I'.
	if (c != 'I')
		return ucd_tolower(c);

	return translator->langopts.dotless_i ? 0x131 : ucd_tolower(c);
}

static int IsRomanU(unsigned int c)
{
if ((c == 'I') || (c == 'V') || (c == 'X') || (c == 'L'))
@@ -288,20 +271,6 @@ static const char *LookupCharName(Translator *tr, int c, int only)
return buf;
}

int Read4Bytes(FILE *f)
{
	// Read 4 bytes (least significant first) into a word.
	// f: stream to read from.
	// Returns the assembled 32-bit value as an int.
	// A failed read is not reported: fgetc()'s EOF value is masked to
	// 0xff, so each missing byte reads as 0xff (unchanged behaviour).
	//
	// The word is built in an unsigned accumulator, because shifting a
	// byte >= 0x80 left by 24 as a (promoted) signed int overflows,
	// which is undefined behaviour.
	unsigned int acc = 0;
	int ix;

	for (ix = 0; ix < 4; ix++) {
		unsigned int byte = (unsigned int)fgetc(f) & 0xff;
		acc |= byte << (ix * 8);
	}
	return (int)acc;
}

static int AnnouncePunctuation(Translator *tr, int c1, int *c2_ptr, char *output, int *bufix, int end_clause)
{
// announce punctuation names

+ 0
- 5
src/libespeak-ng/readclause.h View File

@@ -34,14 +34,9 @@ typedef struct {

extern PARAM_STACK param_stack[];

// Tests if all bytes of str up to size are null
int is_str_totally_null(const char* str, int size);

int clause_type_from_codepoint(uint32_t c);
int towlower2(unsigned int c, Translator *translator); // Supports Turkish I
int Eof(void);
const char *WordToString2(unsigned int word);
int Read4Bytes(FILE *f);
int AddNameData(const char *name,
int wide);
int ReadClause(Translator *tr,

+ 0
- 1
src/libespeak-ng/soundicon.c View File

@@ -38,7 +38,6 @@
#include "soundicon.h"
#include "common.h" // for GetFileLength
#include "error.h" // for create_file_error_context
#include "readclause.h" // for Read4Bytes
#include "speech.h" // for path_home, PATHSEP
#include "synthesize.h" // for samplerate


+ 0
- 1
src/libespeak-ng/spect.h View File

@@ -33,7 +33,6 @@ float polint(float xa[], float ya[], int n, float x);

#define FRAME_WIDTH 1000 // max width for 8000kHz frame
#define MAX_DISPLAY_FREQ 9500
#define FRAME_HEIGHT 240

#define T_AMPLITUDE 308
#define T_AV 312

+ 1
- 1
src/libespeak-ng/ssml.c View File

@@ -45,7 +45,7 @@
#include "readclause.h" // for PARAM_STACK, param_stack, AddNameData
#include "soundicon.h" // for LoadSoundFile2
#include "synthesize.h" // for SPEED_FACTORS, speed
#include "translate.h" // for CTRL_EMBEDDED, IsDigit09, utf8_out
#include "translate.h" // for CTRL_EMBEDDED
#include "voice.h" // for SelectVoice, SelectVoiceByName
#include "speech.h" // for MAKE_MEM_UNDEFINED


+ 0
- 1
src/libespeak-ng/synth_mbrola.c View File

@@ -34,7 +34,6 @@

#include "dictionary.h"
#include "mbrola.h"
#include "readclause.h"
#include "setlengths.h"
#include "synthdata.h"
#include "wavegen.h"

+ 1
- 0
src/libespeak-ng/tr_languages.c View File

@@ -31,6 +31,7 @@
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>

#include "common.h"
#include "setlengths.h" // for SetLengthMods
#include "translate.h" // for Translator, LANGUAGE_OPTIONS, L, NUM...


+ 1
- 232
src/libespeak-ng/translate.c View File

@@ -33,6 +33,7 @@
#include <espeak-ng/encoding.h>

#include "translate.h"
#include "common.h"
#include "dictionary.h" // for TranslateRules, LookupDictList, Cha...
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonPAUSE_...
#include "phonemelist.h" // for MakePhonemeList
@@ -104,111 +105,9 @@ static char source[N_TR_SOURCE+40]; // extra space for embedded command & voice
int n_replace_phonemes;
REPLACE_PHONEMES replace_phonemes[N_REPLACE_PHONEMES];

// Characters treated as brackets by IsBracket().
// The range 0x2014 to 0x201f is also treated as brackets, but IsBracket()
// tests for it explicitly, so those values don't need to be in this list.
// The list ends with a 0 entry (presumably the terminator lookupwchar() expects - confirm).
static const unsigned short brackets[] = {
'(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`',
0xab, 0xbb, // double angle brackets
0x300a, 0x300b, // double angle brackets (ideograph)
0xe000+'<', // private usage area
0
};

// other characters which break a word, but don't produce a pause
// (same layout as brackets[]: values followed by a 0 entry)
static const unsigned short breaks[] = { '_', 0 };

int IsAlpha(unsigned int c)
{
	// Replacement for iswalpha() which also accepts some in-word symbols
	// (vowel marks, combining accents, ...). Returns 1 if c counts as
	// alphabetic, otherwise 0.

	static const unsigned short extra_indic_alphas[] = {
		0xa70, 0xa71, // Gurmukhi: tippi, addak
		0
	};

	// additional inclusive ranges { first, last } treated as alphabetic
	static const unsigned int extra_ranges[][2] = {
		{ 0x0300, 0x036f }, // combining accents
		{ 0x05b0, 0x05c2 }, // Hebrew vowel marks
		{ 0x0605, 0x0605 },
		{ 0x064b, 0x065e }, // arabic vowel marks
		{ 0x0670, 0x0670 }, // arabic vowel marks
		{ 0x0f40, 0x0fbc }, // tibetan
		{ 0x1100, 0x11ff }, // Korean jamo
		{ 0x2800, 0x28ff }, // braille
		{ 0x3041, 0xa700 }, // Chinese/Japanese. Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
	};
	unsigned int ix;

	if (iswalpha(c))
		return 1;

	if (c < 0x300)
		return 0;

	if ((c >= 0x901) && (c <= 0xdf7)) {
		// Indic scripts: Devanagari, Tamil, etc
		// This range is decided here alone; the table below is not consulted.
		return (((c & 0x7f) < 0x64)
		        || (lookupwchar(extra_indic_alphas, c) != 0)
		        || ((c >= 0xd7a) && (c <= 0xd7f))) // malayalam chillu characters
		       ? 1 : 0;
	}

	for (ix = 0; ix < sizeof(extra_ranges) / sizeof(extra_ranges[0]); ix++) {
		if ((c >= extra_ranges[ix][0]) && (c <= extra_ranges[ix][1]))
			return 1;
	}

	return 0;
}

int IsDigit09(unsigned int c)
{
	// Returns 1 only for the ASCII digits '0'..'9', otherwise 0.
	return (c >= '0') && (c <= '9');
}

int IsDigit(unsigned int c)
{
	// Returns 1 if iswdigit() accepts c, or if c lies in 0x966..0x96f
	// (the Devanagari digits in Unicode); otherwise 0.
	const int in_extra_range = (c >= 0x966) && (c <= 0x96f);

	return (iswdigit(c) || in_extra_range) ? 1 : 0;
}

static int IsSpace(unsigned int c)
{
	// Extended whitespace test.
	// 0 is never a space; box drawing characters and the unicode specials
	// count as spaces (result 1); everything else is decided by
	// iswspace(), whose result is returned unchanged.
	const int box_drawing = (c >= 0x2500) && (c < 0x25a0);
	const int unicode_special = (c >= 0xfff9) && (c <= 0xffff);

	if (c == 0)
		return 0;
	if (box_drawing || unicode_special)
		return 1;
	return iswspace(c);
}

int isspace2(unsigned int c)
{
	// can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
	// Returns 1 when c is not above ' ' and its low byte is non-zero,
	// otherwise 0.
	const int low_byte_set = (c & 0xff) != 0;

	return (low_byte_set && (c <= ' ')) ? 1 : 0;
}

void DeleteTranslator(Translator *tr)
{
if (!tr) return;
@@ -230,136 +129,6 @@ int lookupwchar(const unsigned short *list, int c)
return 0;
}

int IsBracket(int c)
{
	// Returns 1 for the range 0x2014..0x201f, which is not stored in the
	// brackets[] list; otherwise returns whatever lookupwchar() reports
	// for c in brackets[].
	const int in_fixed_range = (c >= 0x2014) && (c <= 0x201f);

	return in_fixed_range ? 1 : lookupwchar(brackets, c);
}

int utf8_nbytes(const char *buf)
{
	// Returns the number of bytes for the first UTF-8 character in buf,
	// judged from its first byte only: below 0x80 -> 1, below 0xe0 -> 2,
	// below 0xf0 -> 3, anything else -> 4.
	const unsigned char lead = (unsigned char)buf[0];

	return (lead < 0x80) ? 1
	     : (lead < 0xe0) ? 2
	     : (lead < 0xf0) ? 3
	     : 4;
}

int utf8_in2(int *c, const char *buf, int backwards)
{
	// Decode one character from a UTF-8 string.
	// c: receives the decoded character value
	// buf: where to start looking; passed by value, the caller's pointer is not changed
	// backwards: non-zero to search backwards (instead of forwards) for the
	//            lead byte when buf points into the middle of a character
	// Returns the number of bytes of the decoded character (lead byte plus
	// the continuation bytes actually consumed); bytes skipped while
	// searching for the lead byte are not counted.

	// payload bits of the lead byte, indexed by the number of trailing bytes
	static const unsigned char lead_mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };
	const int step = backwards ? -1 : 1;
	int value;
	int expected = 0; // trailing bytes announced by the lead byte
	int consumed = 0; // trailing bytes actually read

	// step over continuation bytes (10xxxxxx) until a lead byte is found
	while ((*buf & 0xc0) == 0x80)
		buf += step;

	value = *buf++;
	if (value & 0x80) {
		if ((value & 0xe0) == 0xc0)
			expected = 1;
		else if ((value & 0xf0) == 0xe0)
			expected = 2;
		else if ((value & 0xf8) == 0xf0)
			expected = 3;

		value &= lead_mask[expected];

		// a NUL byte before all announced bytes were read means a truncated sequence
		while ((consumed < expected) && (*buf != 0)) {
			value = (value << 6) + (*buf++ & 0x3f);
			consumed++;
		}
	}

	*c = value;
	return consumed + 1;
}

#pragma GCC visibility push(default)
int utf8_in(int *c, const char *buf)
{
/* Read one Unicode character from a UTF-8 string, scanning forwards.
* Thin wrapper: forwards to utf8_in2() with backwards = 0 and returns
* its result (the number of UTF-8 bytes used).
* buf: start of the UTF-8 text; passed by value, so the caller's own
* pointer is not modified here and must be advanced by the return value.
* c: receives the decoded character value, i.e. the code point (not a
* UTF-16 encoding: the 4-byte example below yields 0x02070e). It is
* assembled by skipping the UTF-8 header bits of each byte in the
* following way:
* 2-byte character "ā":
* hex binary
* c481 1100010010000001
* | 11000100 000001
* V \ \ | |
* 0101 0000000100000001
* 3-byte character "ꙅ":
* ea9985 111010101001100110000101
* 1010 011001 000101
* | + +--.\ \ | |
* V `--. \`. `.| |
* A645 1010011001000101
* 4-byte character "𠜎":
* f0a09c8e 11110000101000001001110010001110
* V 000 100000 011100 001110
* 02070e 000000100000011100001110
*/
return utf8_in2(c, buf, 0);
}
#pragma GCC visibility pop

int utf8_out(unsigned int c, char *buf)
{
	// Encode the character value c as UTF-8 into buf (no terminating NUL is written).
	// Returns the number of bytes written (1 to 4).
	// Values of 0x110000 and above are out of range and are written as a single space.

	// marker bits of the lead byte, indexed by the number of trailing bytes
	static const unsigned char lead_marker[4] = { 0, 0xc0, 0xe0, 0xf0 };
	int trailing;
	int pos;

	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}
	if (c < 0x80) {
		buf[0] = c; // plain ASCII, single byte
		return 1;
	}

	trailing = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;

	// lead byte: marker bits plus the most significant payload bits
	buf[0] = lead_marker[trailing] | (c >> (6 * trailing));

	// trailing bytes: 10xxxxxx, six payload bits each, most significant first
	for (pos = 1; pos <= trailing; pos++)
		buf[pos] = 0x80 + ((c >> (6 * (trailing - pos))) & 0x3f);

	return trailing + 1;
}

char *strchr_w(const char *s, int c)
{
// return NULL for any non-ascii character

+ 0
- 15
src/libespeak-ng/translate.h View File

@@ -64,9 +64,6 @@ extern "C"
#define FLAG_ALT_TRANS 0x8000 // language specific
#define FLAG_ALT2_TRANS 0x10000 // language specific
#define FLAG_ALT3_TRANS 0x20000 // language specific
#define FLAG_ALT4_TRANS 0x40000 // language specific
#define FLAG_ALT5_TRANS 0x80000 // language specific
#define FLAG_ALT6_TRANS 0x100000 // language specific
#define FLAG_ALT7_TRANS 0x200000 // language specific

#define FLAG_COMBINE 0x800000 // combine with the next word
@@ -661,21 +658,11 @@ extern int (*phoneme_callback)(const char *);
#define LEADING_2_BITS 0xC0 // 0b11000000
#define UTF8_TAIL_BITS 0x80 // 0b10000000

ESPEAK_NG_API int utf8_in(int *c, const char *buf);
int utf8_in2(int *c, const char *buf, int backwards);
int utf8_out(unsigned int c, char *buf);
int utf8_nbytes(const char *buf);

int lookupwchar(const unsigned short *list, int c);
char *strchr_w(const char *s, int c);
int IsBracket(int c);
void InitNamedata(void);
void InitText(int flags);
void InitText2(void);
int IsDigit(unsigned int c);
int IsDigit09(unsigned int c);
int IsAlpha(unsigned int c);
int isspace2(unsigned int c);
ALPHABET *AlphabetFromChar(int c);

Translator *SelectTranslator(const char *name);
@@ -686,8 +673,6 @@ void ProcessLanguageOptions(LANGUAGE_OPTIONS *langopts);

void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len);

void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags);

int TranslateWord(Translator *tr, char *word1, WORD_TAB *wtab, char *word_out);
void TranslateClause(Translator *tr, int *tone, char **voice_change);


+ 170
- 1
src/libespeak-ng/translateword.c View File

@@ -36,7 +36,7 @@
#include "translate.h"
#include "translateword.h"
#include "common.h" // for strncpy0
#include "dictionary.h" // for TranslateRules, LookupDictList, Cha...
#include "dictionary.h" // for TranslateRules, LookupDictList
#include "numbers.h" // for SetSpellingStress, ...
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonPAUSE_...
#include "readclause.h" // for towlower2
@@ -46,10 +46,14 @@


static void addPluralSuffixes(int flags, Translator *tr, char last_char, char *word_phonemes);
static void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags);
static void ChangeWordStress(Translator *tr, char *word, int new_stress);
static int CheckDottedAbbrev(char *word1);
static int NonAsciiNumber(int letter);
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word, ALPHABET *current_alphabet, char word_phonemes[]);
static int TranslateLetter(Translator *tr, char *word, char *phonemes, int control, ALPHABET *current_alphabet);
static int Unpronouncable(Translator *tr, char *word, int posn);
static int Unpronouncable2(Translator *tr, char *word);

int TranslateWord3(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_out, bool *any_stressed_words, ALPHABET *current_alphabet, char word_phonemes[], size_t size_word_phonemes)
{
@@ -667,6 +671,82 @@ int TranslateWord3(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_
}


static void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags)
{
	// apply after the translation is complete
	//
	// Declared 'static' to match this file's forward declaration; the
	// linkage was already internal, the definition now says so explicitly.
	//
	// Only acts when bit 1 (value 2) of langopts.param[LOPT_ALT] is set.
	// The phoneme following the first phonSTRESS_P marker is swapped
	// between the 'E'/'e' and 'O'/'o' phoneme codes: upper to lower when
	// dict_flags has FLAG_ALT2_TRANS, lower to upper otherwise.

	int ix;
	int len;
	char *p;

	len = strlen(phonemes);

	if (tr->langopts.param[LOPT_ALT] & 2) {
		for (ix = 0; ix < (len-1); ix++) {
			if (phonemes[ix] == phonSTRESS_P) {
				p = &phonemes[ix+1];
				if ((dict_flags & FLAG_ALT2_TRANS) != 0) {
					if (*p == PhonemeCode('E'))
						*p = PhonemeCode('e');
					if (*p == PhonemeCode('O'))
						*p = PhonemeCode('o');
				} else {
					if (*p == PhonemeCode('e'))
						*p = PhonemeCode('E');
					if (*p == PhonemeCode('o'))
						*p = PhonemeCode('O');
				}
				break; // only the first primary stress marker is handled
			}
		}
	}
}


static void ChangeWordStress(Translator *tr, char *word, int new_stress)
{
	// Rewrites the phoneme string 'word' in place with changed stress.
	// new_stress >= STRESS_IS_PRIMARY: the first vowel whose stress is at
	//   least the word's maximum gets new_stress.
	// otherwise: every vowel with a stress above new_stress is lowered to it.
	// NOTE(review): the copy into phonetic[] is unbounded and the rewritten
	// string can be longer than the input - confirm callers guarantee room.

	unsigned char phonetic[N_WORD_PHONEMES];
	signed char vowel_stress[N_WORD_PHONEMES/2];
	int vowel_count; // num of vowels + 1
	int stressed_syllable = 0; // position of stressed syllable
	int max_stress;
	int vowel_ix;
	const unsigned char *in;
	char *out = word;

	strcpy((char *)phonetic, word);
	max_stress = GetVowelStress(tr, phonetic, vowel_stress, &vowel_count, &stressed_syllable, 0);

	if (new_stress < STRESS_IS_PRIMARY) {
		// remove primary stress
		for (vowel_ix = 1; vowel_ix < vowel_count; vowel_ix++) {
			if (vowel_stress[vowel_ix] > new_stress) // >= allows for diminished stress (=1)
				vowel_stress[vowel_ix] = new_stress;
		}
	} else {
		// promote to primary stress: only the first vowel at the maximum level
		vowel_ix = 1;
		while ((vowel_ix < vowel_count) && (vowel_stress[vowel_ix] < max_stress))
			vowel_ix++;
		if (vowel_ix < vowel_count)
			vowel_stress[vowel_ix] = new_stress;
	}

	// write out phonemes, with a stress marker in front of each marked vowel
	vowel_ix = 1;
	for (in = phonetic; *in != 0; in++) {
		const int syllabic_vowel = (phoneme_tab[*in]->type == phVOWEL) && !(phoneme_tab[*in]->phflags & phNONSYLLABIC);

		if (syllabic_vowel) {
			const signed char level = vowel_stress[vowel_ix++];

			if ((level == STRESS_IS_DIMINISHED) || (level > STRESS_IS_UNSTRESSED))
				*out++ = stress_phonemes[(unsigned char)level];
		}
		*out++ = *in;
	}
	*out = 0;
}

static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word, ALPHABET *current_alphabet, char word_phonemes[])
{
int posn = 0;
@@ -1034,3 +1114,92 @@ static int NonAsciiNumber(int letter)
}
return -1;
}

static int Unpronouncable(Translator *tr, char *word, int posn)
{
/* Determines whether a word is 'unpronouncable', i.e. whether it should
be spoken as individual letters.

This function may be language specific. This is a generic version.

tr: translator whose letter_bits_offset and langopts are consulted
word: UTF-8 text of the word; scanning stops at a NUL or a space
posn: when > 0, an apostrophe ends the scan even at the start of the word
Returns 1 if the word is considered unpronouncable, otherwise 0.
*/

int c;
int c1 = 0; // first character of the word, recorded while count == 0
int vowel_posn = 9; // stays at 9 if no vowel is found
int index;
int count;
ALPHABET *alphabet;

// look at the first character only, to decide about alphabets
utf8_in(&c, word);
if ((tr->letter_bits_offset > 0) && (c < 0x241)) {
// Latin characters for a language with a non-latin alphabet
return 0; // so we can re-translate the word as English
}

if (((alphabet = AlphabetFromChar(c)) != NULL) && (alphabet->offset != tr->letter_bits_offset)) {
// Character is not in our alphabet
return 0;
}

// option value 1: never report a word as unpronouncable
if (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 1)
return 0;

// an empty word, or one starting with a space or an apostrophe, is not tested
if (((c = *word) == ' ') || (c == 0) || (c == '\''))
return 0;

// count characters up to and including the first vowel
index = 0;
count = 0;
for (;;) {
index += utf8_in(&c, &word[index]);
if ((c == 0) || (c == ' '))
break;

if ((c == '\'') && ((count > 1) || (posn > 0)))
break; // "tv'" but not "l'"

if (count == 0)
c1 = c;

if ((c == '\'') && (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 3)) {
// don't count apostrophe
} else
count++;

if (IsVowel(tr, c)) {
vowel_posn = count; // position of the first vowel
break;
}

// any other non-alphabetic character: don't treat the word as unpronouncable
if ((c != '\'') && !iswalpha(c))
return 0;
}

if ((vowel_posn > 2) && (tr->langopts.param[LOPT_UNPRONOUNCABLE] == 2)) {
// Lookup unpronounable rules in *_rules
return Unpronouncable2(tr, word);
}

// the option value is also compared with the first character of the word
if (c1 == tr->langopts.param[LOPT_UNPRONOUNCABLE])
vowel_posn--; // disregard this as the initial letter when counting

if (vowel_posn > (tr->langopts.max_initial_consonants+1))
return 1; // no vowel, or no vowel in first few letters

return 0;
}

static int Unpronouncable2(Translator *tr, char *word)
{
	// Runs TranslateRules() on the word with FLAG_UNPRON_TEST and returns 1
	// if the result is 0 or has SUFX_UNPRON set, otherwise 0.
	// The byte before the word is temporarily replaced by a space and
	// restored afterwards, so word[-1] must be writable.
	char ph_buf[N_WORD_PHONEMES];
	char saved_prev;
	int end_flags;

	ph_buf[0] = 0;

	saved_prev = word[-1];
	word[-1] = ' '; // ensure there is a space before the "word"
	end_flags = TranslateRules(tr, word, ph_buf, sizeof(ph_buf), NULL, FLAG_UNPRON_TEST, NULL);
	word[-1] = saved_prev;

	return ((end_flags == 0) || (end_flags & SUFX_UNPRON)) ? 1 : 0;
}

Loading…
Cancel
Save