Languages changes: it, pt. Added experimental suffix option to allow multiple suffixes to be removed from a word (eg. (_S2m ). git-svn-id: https://espeak.svn.sourceforge.net/svnroot/espeak/trunk@295 d46cf337-b52f-0410-862d-fd96e6ae7743master
cortile $2 | cortile $2 | ||||
cortisol $3 | cortisol $3 | ||||
cortocircuito kO@-*totSi@-*k'uito | cortocircuito kO@-*totSi@-*k'uito | ||||
cosa $alt | |||||
cos $alt | |||||
cosc $alt | cosc $alt | ||||
coscia $alt | coscia $alt | ||||
cosen $alt2 | cosen $alt2 | ||||
cosi $2 | |||||
cosm $alt | cosm $alt | ||||
cosmic $alt | cosmic $alt | ||||
cosm $alt | cosm $alt | ||||
ros $alt | ros $alt | ||||
rosalia $3 | rosalia $3 | ||||
roseo $1 $alt | roseo $1 $alt | ||||
rose $2 | |||||
rosolia $3 | rosolia $3 | ||||
rospigliosi $alt2 | rospigliosi $alt2 | ||||
rosp $alt | rosp $alt |
careta $alt2 | careta $alt2 | ||||
carreta $alt2 | carreta $alt2 | ||||
casebre $alt | casebre $alt | ||||
cateto $alt2 | |||||
catorze $alt2 | catorze $alt2 | ||||
cefaleia $alt | cefaleia $alt | ||||
cerca $alt $verb | cerca $alt $verb | ||||
desprezo $alt $verb | desprezo $alt $verb | ||||
deva $alt2 | deva $alt2 | ||||
devo $alt2 | devo $alt2 | ||||
deveras $alt | |||||
dez $alt | dez $alt | ||||
diarreia $alt | diarreia $alt | ||||
discordo $alt | discordo $alt | ||||
estiveres $alt | estiveres $alt | ||||
estrofe $alt | estrofe $alt | ||||
etiqueta $alt2 | etiqueta $alt2 | ||||
europa $alt | |||||
exagero $alt $verb | exagero $alt $verb | ||||
expeça $alt2 | expeça $alt2 | ||||
expeço $alt2 | expeço $alt2 | ||||
farofa $alt | farofa $alt | ||||
febre $alt | febre $alt | ||||
feitora $alt2 | |||||
fera $alt | fera $alt | ||||
fere $alt | fere $alt | ||||
ferem $alt | ferem $alt | ||||
ferozes $alt | |||||
fezes $alt | fezes $alt | ||||
folga $alt | folga $alt | ||||
fogos $alt | fogos $alt | ||||
martelo $alt | martelo $alt | ||||
merda $alt | merda $alt | ||||
megera $alt | megera $alt | ||||
mentora $alt2 | |||||
metro $alt | |||||
mexa m'eS& | mexa m'eS& | ||||
mexi meS'i | mexi meS'i | ||||
mexo m'eSU | mexo m'eSU | ||||
naquele $alt2 | naquele $alt2 | ||||
negro $alt2 | negro $alt2 | ||||
novos $alt | novos $alt | ||||
obstera $alt | |||||
obstetra $alt | |||||
obsoleta $alt2 | obsoleta $alt2 | ||||
obsoleto $alt2 | obsoleto $alt2 | ||||
odisseia $alt | odisseia $alt | ||||
onu $1 | onu $1 | ||||
opereta $alt2 | opereta $alt2 | ||||
ordens $alt | |||||
osso $alt2 | osso $alt2 | ||||
ovos $alt | ovos $alt | ||||
padeceste $alt2 | padeceste $alt2 | ||||
paexa paeSa | paexa paeSa | ||||
pangeia $alt | pangeia $alt | ||||
palheta $alt2 | palheta $alt2 | ||||
paralelo $alt | |||||
parede $alt2 | parede $alt2 | ||||
pastora $alt2 | pastora $alt2 | ||||
pedra $alt | pedra $alt | ||||
selvagem seUv'aZeIN | selvagem seUv'aZeIN | ||||
sincera $alt | sincera $alt | ||||
sincero $alt | sincero $alt | ||||
sinopse $alt | |||||
sobe $alt | sobe $alt | ||||
sobem $alt | sobem $alt | ||||
soco $alt2 | soco $alt2 | ||||
travesso $alt2 | travesso $alt2 | ||||
trevo $alt2 | trevo $alt2 | ||||
troco $alt2 $noun | troco $alt2 $noun | ||||
tropa $alt | |||||
tropeço $alt2 $noun | tropeço $alt2 $noun | ||||
trombeta $alt2 | trombeta $alt2 | ||||
valeta $alt2 | valeta $alt2 | ||||
vanessa $alt2 | vanessa $alt2 | ||||
velozes $alt | |||||
verbo $alt | verbo $alt | ||||
verme $alt | verme $alt | ||||
vierem $alt | vierem $alt | ||||
violeta $alt2 | violeta $alt2 | ||||
voga $alt | |||||
vozes $alt | vozes $alt | ||||
zelo $alt $verb | zelo $alt $verb | ||||
zero $alt | zero $alt |
.L03 a am o | .L03 a am o | ||||
.L04 a am e em o ue uem | .L04 a am e em o ue uem | ||||
.L05 r ra ram re rem rdes | .L05 r ra ram re rem rdes | ||||
.L06 ra ram ste re rem | |||||
.group a | .group a | ||||
?1 v) e (l_ ,E // eg: possível, amovível, disponível, etc... | ?1 v) e (l_ ,E // eg: possível, amovível, disponível, etc... | ||||
?1 l) e (ta 'E // eg: bicicleta, atleta. | ?1 l) e (ta 'E // eg: bicicleta, atleta. | ||||
sf) e (ra_ E // esfera, biosfera | |||||
//sort | //sort | ||||
qu) e (brL04_ E | qu) e (brL04_ E | ||||
_hosp) e (dL03_ E | _hosp) e (dL03_ E | ||||
_atr) e (vL03_ e | _atr) e (vL03_ e | ||||
_escr) e (vL03_ e | _escr) e (vL03_ e | ||||
_descr) e (vL03_ e | _descr) e (vL03_ e | ||||
_embel) e (zL01_ e | |||||
_embel) e (zL04_ E | |||||
_pr) e (zL04_ E | _pr) e (zL04_ E | ||||
_acont) e (çL01_ e | _acont) e (çL01_ e | ||||
_ado) e (çL03_ e | _ado) e (çL03_ e | ||||
_esclar) e (çL03_ e | _esclar) e (çL03_ e | ||||
_reapar) e (çL03_ e | _reapar) e (çL03_ e | ||||
_reconh) e (çL03_ e | _reconh) e (çL03_ e | ||||
_coop) e (rL04_ E | |||||
_sup) e (rL04_ E | |||||
_imp) e (rL03_ E | |||||
_temp) e (rL04_ E | |||||
//endsort | //endsort | ||||
_exag) e (L05_ E | _exag) e (L05_ E | ||||
_houv) e (L05_ E | _houv) e (L05_ E | ||||
_soub) e (L05_ E | _soub) e (L05_ E | ||||
_compreend) e (L06_ e | |||||
_correspond) e (L06_ e | |||||
_entend) e (L06_ e | |||||
_estend) e (L06_ e | |||||
_respond)e (L06_ e | |||||
_vend) e (L06_ e | |||||
.group é | .group é | ||||
é ''E | é ''E | ||||
c) o (rrL01_ o // escorra, incorra, corra, recorra | c) o (rrL01_ o // escorra, incorra, corra, recorra | ||||
_m) o (rrL01_ o | _m) o (rrL01_ o | ||||
p) ostos (_ Ost=Us# // dispostos etc. | p) ostos (_ Ost=Us# // dispostos etc. | ||||
_esn) o (bL04_ O | |||||
//endsort | //endsort | ||||
.group ô | .group ô | ||||
r) u (_A u | r) u (_A u | ||||
u (A_ 'u | u (A_ 'u | ||||
u (em_ 'u | |||||
ui uI | ui uI | ||||
u (iu w | u (iu w | ||||
u (iCK u | u (iCK u |
_) த (ீரரை d | _) த (ீரரை d | ||||
_) த (ீர்க d | _) த (ீர்க d | ||||
_) த (ீர்க்க t | _) த (ீர்க்க t | ||||
_) தீர்க்க (_சுமங்கலி dirggV | |||||
_) தீர்க்க (_சுமங்கலி di:rggV | |||||
_) தீர்க்க (தரிச di:rggV | _) தீர்க்க (தரிச di:rggV | ||||
_) தீர்க்க (த்துடன di:rggV# | _) தீர்க்க (த்துடன di:rggV# | ||||
_) த (ீர்க்கமா d | _) த (ீர்க்கமா d |
} | } | ||||
continue; | continue; | ||||
} | } | ||||
if(rb == RULE_DOLLAR) | if(rb == RULE_DOLLAR) | ||||
{ | { | ||||
value = *rule++ & 0xff; | value = *rule++ & 0xff; | ||||
int multiple_numeric_hyphen = 0; | int multiple_numeric_hyphen = 0; | ||||
char *multiple_string = NULL; | char *multiple_string = NULL; | ||||
char *multiple_string_end = NULL; | char *multiple_string_end = NULL; | ||||
int len_word; | int len_word; | ||||
int len_phonetic; | int len_phonetic; | ||||
int text_not_phonemes; // this word specifies replacement text, not phonemes | int text_not_phonemes; // this word specifies replacement text, not phonemes | ||||
unsigned int wc; | unsigned int wc; | ||||
int all_upper_case; | int all_upper_case; | ||||
char *mnemptr; | char *mnemptr; | ||||
unsigned char flag_codes[100]; | unsigned char flag_codes[100]; | ||||
char encoded_ph[200]; | char encoded_ph[200]; | ||||
#endif | #endif | ||||
step = 0; | step = 0; | ||||
c = 0; | c = 0; | ||||
while(c != '\n') | while(c != '\n') | ||||
{ | { | ||||
c = *p; | c = *p; | ||||
if((c == '?') && (step==0)) | if((c == '?') && (step==0)) | ||||
{ | { | ||||
// conditional rule, allow only if the numbered condition is set for the voice | // conditional rule, allow only if the numbered condition is set for the voice | ||||
flag_codes[n_flag_codes++] = ix + flag_offset; | flag_codes[n_flag_codes++] = ix + flag_offset; | ||||
c = *p; | c = *p; | ||||
} | } | ||||
if((c == '$') && isalnum(p[1])) | if((c == '$') && isalnum(p[1])) | ||||
{ | { | ||||
/* read keyword parameter */ | /* read keyword parameter */ | ||||
mnemptr = p; | mnemptr = p; | ||||
while(!isspace2(c = *p)) p++; | while(!isspace2(c = *p)) p++; | ||||
*p = 0; | *p = 0; | ||||
flagnum = LookupMnem(mnem_flags,mnemptr); | flagnum = LookupMnem(mnem_flags,mnemptr); | ||||
if(flagnum > 0) | if(flagnum > 0) | ||||
{ | { | ||||
error_count++; | error_count++; | ||||
} | } | ||||
} | } | ||||
if((c == '/') && (p[1] == '/') && (multiple_words==0)) | if((c == '/') && (p[1] == '/') && (multiple_words==0)) | ||||
{ | { | ||||
c = '\n'; /* "//" treat comment as end of line */ | c = '\n'; /* "//" treat comment as end of line */ | ||||
} | } | ||||
switch(step) | switch(step) | ||||
{ | { | ||||
case 0: | case 0: | ||||
step = 1; | step = 1; | ||||
} | } | ||||
break; | break; | ||||
case 1: | case 1: | ||||
if((c == '-') && multiple_words) | if((c == '-') && multiple_words) | ||||
{ | { | ||||
step = 3; | step = 3; | ||||
} | } | ||||
break; | break; | ||||
case 3: | case 3: | ||||
if(!isspace2(c)) | if(!isspace2(c)) | ||||
{ | { | ||||
step = 4; | step = 4; | ||||
} | } | ||||
break; | break; | ||||
case 4: | case 4: | ||||
if(isspace2(c)) | if(isspace2(c)) | ||||
{ | { | ||||
step = 5; | step = 5; | ||||
} | } | ||||
break; | break; | ||||
case 5: | case 5: | ||||
break; | break; | ||||
} | } | ||||
p++; | p++; | ||||
} | } | ||||
if(word[0] == 0) | if(word[0] == 0) | ||||
{ | { | ||||
return(0); /* blank line */ | return(0); /* blank line */ | ||||
*hash = HashDictionary(word); | *hash = HashDictionary(word); | ||||
len_phonetic = strlen(encoded_ph); | len_phonetic = strlen(encoded_ph); | ||||
dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed | dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed | ||||
len_word &= 0x3f; | len_word &= 0x3f; | ||||
length = len_word + len_phonetic + 3; | length = len_word + len_phonetic + 3; | ||||
strcpy(&dict_line[(len_word)+2],encoded_ph); | strcpy(&dict_line[(len_word)+2],encoded_ph); | ||||
} | } | ||||
for(ix=0; ix<n_flag_codes; ix++) | for(ix=0; ix<n_flag_codes; ix++) | ||||
{ | { | ||||
dict_line[ix+length] = flag_codes[ix]; | dict_line[ix+length] = flag_codes[ix]; | ||||
fflush(f_log); | fflush(f_log); | ||||
#endif | #endif | ||||
} | } | ||||
for(hash=0; hash<N_HASH_DICT; hash++) | for(hash=0; hash<N_HASH_DICT; hash++) | ||||
{ | { | ||||
p = hash_chains[hash]; | p = hash_chains[hash]; | ||||
hash_counts[hash] = (int)ftell(f_out); | hash_counts[hash] = (int)ftell(f_out); | ||||
while(p != NULL) | while(p != NULL) | ||||
{ | { | ||||
length = *(p+sizeof(char *)); | length = *(p+sizeof(char *)); | ||||
char buf[200]; | char buf[200]; | ||||
char fname[sizeof(path_home)+45]; | char fname[sizeof(path_home)+45]; | ||||
char dict_line[128]; | char dict_line[128]; | ||||
text_mode = 0; | text_mode = 0; | ||||
// try with and without '.txt' extension | // try with and without '.txt' extension | ||||
fprintf(f_log,"Compiling: '%s'\n",fname); | fprintf(f_log,"Compiling: '%s'\n",fname); | ||||
linenum=0; | linenum=0; | ||||
while(fgets(buf,sizeof(buf),f_in) != NULL) | while(fgets(buf,sizeof(buf),f_in) != NULL) | ||||
{ | { | ||||
linenum++; | linenum++; | ||||
if(length == 0) continue; /* blank line */ | if(length == 0) continue; /* blank line */ | ||||
hash_counts[hash]++; | hash_counts[hash]++; | ||||
p = (char *)malloc(length+sizeof(char *)); | p = (char *)malloc(length+sizeof(char *)); | ||||
if(p == NULL) | if(p == NULL) | ||||
{ | { | ||||
} | } | ||||
break; | break; | ||||
} | } | ||||
memcpy(p,&hash_chains[hash],sizeof(char *)); | memcpy(p,&hash_chains[hash],sizeof(char *)); | ||||
hash_chains[hash] = p; | hash_chains[hash] = p; | ||||
memcpy(p+sizeof(char *),dict_line,length); | memcpy(p+sizeof(char *),dict_line,length); | ||||
count++; | count++; | ||||
} | } | ||||
fprintf(f_log,"\t%d entries\n",count); | fprintf(f_log,"\t%d entries\n",count); | ||||
fclose(f_in); | fclose(f_in); | ||||
return(0); | return(0); | ||||
output = &rule_phonemes[len]; | output = &rule_phonemes[len]; | ||||
} | } | ||||
sxflags = 0x808000; // to ensure non-zero bytes | sxflags = 0x808000; // to ensure non-zero bytes | ||||
for(p=string,ix=0;;) | for(p=string,ix=0;;) | ||||
{ | { | ||||
literal = 0; | literal = 0; | ||||
case 'a': | case 'a': | ||||
sxflags |= SUFX_A; | sxflags |= SUFX_A; | ||||
break; | break; | ||||
case 'm': | |||||
sxflags |= SUFX_M; | |||||
break; | |||||
default: | default: | ||||
if(isdigit(c)) | if(isdigit(c)) | ||||
value = (value*10) + (c - '0'); | value = (value*10) + (c - '0'); | ||||
rule_phonemes[0]=0; | rule_phonemes[0]=0; | ||||
p = buf; | p = buf; | ||||
for(ix=0; finish==0; ix++) | for(ix=0; finish==0; ix++) | ||||
{ | { | ||||
c = input[ix]; | c = input[ix]; | ||||
copy_rule_string(buf,state); | copy_rule_string(buf,state); | ||||
p = buf; | p = buf; | ||||
break; | break; | ||||
case '(': // start of suffix section | case '(': // start of suffix section | ||||
*p = 0; | *p = 0; | ||||
state = 2; | state = 2; | ||||
error_count++; | error_count++; | ||||
} | } | ||||
break; | break; | ||||
case '\n': // end of line | case '\n': // end of line | ||||
case '\r': | case '\r': | ||||
case 0: // end of line | case 0: // end of line | ||||
copy_rule_string(buf,state); | copy_rule_string(buf,state); | ||||
finish=1; | finish=1; | ||||
break; | break; | ||||
case '\t': // end of section section | case '\t': // end of section section | ||||
case ' ': | case ' ': | ||||
*p = 0; | *p = 0; | ||||
copy_rule_string(buf,state); | copy_rule_string(buf,state); | ||||
p = buf; | p = buf; | ||||
break; | break; | ||||
case '?': | case '?': | ||||
if(state==2) | if(state==2) | ||||
state=0; | state=0; | ||||
break; | break; | ||||
} | } | ||||
} | } | ||||
if(strcmp(rule_match,"$group")==0) | if(strcmp(rule_match,"$group")==0) | ||||
strcpy(rule_match,group_name); | strcpy(rule_match,group_name); | ||||
} | } | ||||
strcpy(output,buf); | strcpy(output,buf); | ||||
len = strlen(buf)+1; | len = strlen(buf)+1; | ||||
len_name = strlen(group_name); | len_name = strlen(group_name); | ||||
if((len_name > 0) && (memcmp(rule_match,group_name,len_name) != 0)) | if((len_name > 0) && (memcmp(rule_match,group_name,len_name) != 0)) | ||||
{ | { | ||||
len1 = strlen(p) + 1; | len1 = strlen(p) + 1; | ||||
p = &p[len1]; | p = &p[len1]; | ||||
len2 = strlen(p); | len2 = strlen(p); | ||||
rule_match[0]=0; | rule_match[0]=0; | ||||
rule_pre[0]=0; | rule_pre[0]=0; | ||||
rule_post[0]=0; | rule_post[0]=0; | ||||
} | } | ||||
} | } | ||||
*pout = 0; | *pout = 0; | ||||
spaces = 12; | spaces = 12; | ||||
if(condition > 0) | if(condition > 0) | ||||
{ | { | ||||
fprintf(f_out,"%s",buf); | fprintf(f_out,"%s",buf); | ||||
spaces = 0; | spaces = 0; | ||||
} | } | ||||
for(ix=0; ix<spaces; ix++) | for(ix=0; ix<spaces; ix++) | ||||
fputc(' ',f_out); | fputc(' ',f_out); | ||||
spaces = 14; | spaces = 14; | ||||
sprintf(buf," %s ",rule_match); | sprintf(buf," %s ",rule_match); | ||||
if(rule_post[0] != 0) | if(rule_post[0] != 0) | ||||
int n_rgroups = 0; | int n_rgroups = 0; | ||||
int n_groups3 = 0; | int n_groups3 = 0; | ||||
RGROUP rgroup[N_RULE_GROUP2]; | RGROUP rgroup[N_RULE_GROUP2]; | ||||
linenum = 0; | linenum = 0; | ||||
group_name[0] = 0; | group_name[0] = 0; | ||||
if((p = (unsigned char *)strstr(buf,"//")) != NULL) | if((p = (unsigned char *)strstr(buf,"//")) != NULL) | ||||
*p = 0; | *p = 0; | ||||
if(buf[0] == '\r') buf++; // ignore extra \r in \r\n | |||||
if(buf[0] == '\r') buf++; // ignore extra \r in \r\n | |||||
} | } | ||||
if((buf == NULL) || (buf[0] == '.')) | if((buf == NULL) || (buf[0] == '.')) | ||||
{ | { | ||||
// group character is given as a character code (max 16 bits) | // group character is given as a character code (max 16 bits) | ||||
p = (unsigned char *)group_name; | p = (unsigned char *)group_name; | ||||
if(char_code > 0x100) | if(char_code > 0x100) | ||||
{ | { | ||||
*p++ = (char_code >> 8); | *p++ = (char_code >> 8); | ||||
} | } | ||||
} | } | ||||
} | } | ||||
if((group3_ix == 0) && (strlen(group_name) > 2)) | if((group3_ix == 0) && (strlen(group_name) > 2)) | ||||
{ | { | ||||
if(utf8_in(&c,group_name) < 2) | if(utf8_in(&c,group_name) < 2) | ||||
fprintf(f_log,"%5d: Group name longer than 2 bytes (UTF8)",linenum); | fprintf(f_log,"%5d: Group name longer than 2 bytes (UTF8)",linenum); | ||||
error_count++; | error_count++; | ||||
} | } | ||||
group_name[2] = 0; | group_name[2] = 0; | ||||
} | } | ||||
} | } | ||||
continue; | continue; | ||||
} | } | ||||
switch(compile_mode) | switch(compile_mode) | ||||
{ | { | ||||
case 1: // .group | case 1: // .group | ||||
compile_dictlist_file(path,"list"); | compile_dictlist_file(path,"list"); | ||||
} | } | ||||
compile_dictlist_file(path,"extra"); | compile_dictlist_file(path,"extra"); | ||||
compile_dictlist_end(f_out); | compile_dictlist_end(f_out); | ||||
offset_rules = ftell(f_out); | offset_rules = ftell(f_out); | ||||
fprintf(f_log,"Compiling: '%s'\n",fname_in); | fprintf(f_log,"Compiling: '%s'\n",fname_in); | ||||
compile_dictrules(f_in,f_out,fname_temp); | compile_dictrules(f_in,f_out,fname_temp); |
{ | { | ||||
if(vowel_stress[ix] == 4) | if(vowel_stress[ix] == 4) | ||||
vowel_stress[ix] = 3; // change marked stress (consonant clusters) to secondary (except the last) | vowel_stress[ix] = 3; // change marked stress (consonant clusters) to secondary (except the last) | ||||
if(vowel_length[ix] > 0) | if(vowel_length[ix] > 0) | ||||
{ | { | ||||
long_vowel = ix; | long_vowel = ix; | ||||
vowel_stress[ix] = 3; // give secondary stress to all long vowels | vowel_stress[ix] = 3; // give secondary stress to all long vowels | ||||
} | } | ||||
} | } | ||||
// 'stressed_syllable' gives the last marked stress | // 'stressed_syllable' gives the last marked stress | ||||
if(stressed_syllable == 0) | if(stressed_syllable == 0) | ||||
{ | { | ||||
*word_end = 'e'; | *word_end = 'e'; | ||||
} | } | ||||
i = word_end - word; | i = word_end - word; | ||||
memcpy(word_copy,word,i); | |||||
word_copy[i] = 0; | |||||
if(word_copy != NULL) | |||||
{ | |||||
memcpy(word_copy,word,i); | |||||
word_copy[i] = 0; | |||||
} | |||||
// look for multibyte characters to increase the number of bytes to remove | // look for multibyte characters to increase the number of bytes to remove | ||||
for(len_ending = i = (end_type & 0x3f); i>0 ;i--) // num.of characters of the suffix | for(len_ending = i = (end_type & 0x3f); i>0 ;i--) // num.of characters of the suffix |
int utf8_in2(int *c, const char *buf, int backwards) | int utf8_in2(int *c, const char *buf, int backwards) | ||||
{//================================================= | {//================================================= | ||||
// Read a unicode characater from a UTF8 string | |||||
// Read a unicode characater from a UTF8 string | |||||
// Returns the number of UTF8 bytes used. | // Returns the number of UTF8 bytes used. | ||||
// backwards: set if we are moving backwards through the UTF8 string | // backwards: set if we are moving backwards through the UTF8 string | ||||
int c1; | int c1; | ||||
int utf8_in(int *c, const char *buf) | int utf8_in(int *c, const char *buf) | ||||
{//================================= | {//================================= | ||||
// Read a unicode characater from a UTF8 string | |||||
// Read a unicode characater from a UTF8 string | |||||
// Returns the number of UTF8 bytes used. | // Returns the number of UTF8 bytes used. | ||||
return(utf8_in2(c,buf,0)); | return(utf8_in2(c,buf,0)); | ||||
} | } | ||||
unsigned int dictionary_flags[2]; | unsigned int dictionary_flags[2]; | ||||
unsigned int dictionary_flags2[2]; | unsigned int dictionary_flags2[2]; | ||||
int end_type=0; | int end_type=0; | ||||
int end_type1=0; | |||||
int prefix_type=0; | int prefix_type=0; | ||||
int prefix_stress; | int prefix_stress; | ||||
char *wordx; | char *wordx; | ||||
char phonemes[N_WORD_PHONEMES]; | char phonemes[N_WORD_PHONEMES]; | ||||
char phonemes2[N_WORD_PHONEMES]; | |||||
char prefix_phonemes[N_WORD_PHONEMES]; | char prefix_phonemes[N_WORD_PHONEMES]; | ||||
char unpron_phonemes[N_WORD_PHONEMES]; | char unpron_phonemes[N_WORD_PHONEMES]; | ||||
char end_phonemes[N_WORD_PHONEMES]; | char end_phonemes[N_WORD_PHONEMES]; | ||||
char end_phonemes2[N_WORD_PHONEMES]; | |||||
char word_copy[N_WORD_BYTES]; | char word_copy[N_WORD_BYTES]; | ||||
char word_copy2[N_WORD_BYTES]; | char word_copy2[N_WORD_BYTES]; | ||||
int word_copy_length; | int word_copy_length; | ||||
char prefix_chars[0x3f + 2]; | |||||
char prefix_chars[0x3f + 2]; | |||||
int found=0; | int found=0; | ||||
int end_flags; | |||||
int end_flags; | |||||
char c_temp; // save a character byte while we temporarily replace it with space | char c_temp; // save a character byte while we temporarily replace it with space | ||||
int first_char; | int first_char; | ||||
int last_char = 0; | int last_char = 0; | ||||
int add_plural_suffix = 0; | int add_plural_suffix = 0; | ||||
int prefix_flags = 0; | int prefix_flags = 0; | ||||
int more_suffixes; | |||||
int confirm_prefix; | int confirm_prefix; | ||||
int spell_word; | int spell_word; | ||||
int stress_bits; | int stress_bits; | ||||
// the word has $abbrev flag, but no pronunciation specified. Speak as individual letters | // the word has $abbrev flag, but no pronunciation specified. Speak as individual letters | ||||
spell_word = 1; | spell_word = 1; | ||||
} | } | ||||
if(!found && iswdigit(first_char)) | if(!found && iswdigit(first_char)) | ||||
{ | { | ||||
Lookup(tr,"_0lang",word_phonemes); | Lookup(tr,"_0lang",word_phonemes); | ||||
if(confirm_prefix && !(end_type & SUFX_B)) | if(confirm_prefix && !(end_type & SUFX_B)) | ||||
{ | { | ||||
int end2; | int end2; | ||||
char phonemes2[N_WORD_PHONEMES]; | |||||
char end_phonemes2[N_WORD_PHONEMES]; | char end_phonemes2[N_WORD_PHONEMES]; | ||||
// remove any standard suffix and confirm that the prefix is still recognised | // remove any standard suffix and confirm that the prefix is still recognised | ||||
for(ix=0; ix < n_chars; ix++) // num. of bytes to remove | for(ix=0; ix < n_chars; ix++) // num. of bytes to remove | ||||
{ | { | ||||
prefix_chars[pfix++] = *wordx++; | prefix_chars[pfix++] = *wordx++; | ||||
if((prefix_type & SUFX_B) && (ix == (n_chars-1))) | if((prefix_type & SUFX_B) && (ix == (n_chars-1))) | ||||
{ | { | ||||
prefix_chars[pfix-1] = 0; // discard the last character of the prefix, this is the separator character | prefix_chars[pfix-1] = 0; // discard the last character of the prefix, this is the separator character | ||||
} | } | ||||
} | } | ||||
if((end_type != 0) && !(end_type & SUFX_P)) | if((end_type != 0) && !(end_type & SUFX_P)) | ||||
{ | { | ||||
char phonemes2[N_WORD_PHONEMES]; | |||||
strcpy(phonemes2,phonemes); | |||||
end_type1 = end_type; | |||||
strcpy(phonemes2,phonemes); | |||||
// The word has a standard ending, re-translate without this ending | // The word has a standard ending, re-translate without this ending | ||||
end_flags = RemoveEnding(tr, wordx, end_type, word_copy); | end_flags = RemoveEnding(tr, wordx, end_type, word_copy); | ||||
phonemes[0] = 0; | |||||
if(prefix_phonemes[0] != 0) | |||||
{ | |||||
// lookup the stem without the prefix removed | |||||
wordx[-1] = c_temp; | |||||
found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab); // include prefix, but not suffix | |||||
wordx[-1] = ' '; | |||||
if(phonemes[0] == phonSWITCH) | |||||
{ | |||||
// change to another language in order to translate this word | |||||
memcpy(wordx,word_copy,strlen(word_copy)); | |||||
strcpy(word_phonemes,phonemes); | |||||
return(0); | |||||
} | |||||
if(dictionary_flags[0]==0) | |||||
{ | |||||
dictionary_flags[0] = dictionary_flags2[0]; | |||||
dictionary_flags[1] = dictionary_flags2[1]; | |||||
} | |||||
if(found) | |||||
prefix_phonemes[0] = 0; // matched whole word, don't need prefix now | |||||
if((found==0) && (dictionary_flags2[0] != 0)) | |||||
prefix_flags = 1; | |||||
} | |||||
if(found == 0) | |||||
{ | |||||
found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, end_flags, wtab); // without prefix and suffix | |||||
if(phonemes[0] == phonSWITCH) | |||||
{ | |||||
// change to another language in order to translate this word | |||||
memcpy(wordx,word_copy,strlen(word_copy)); | |||||
strcpy(word_phonemes,phonemes); | |||||
return(0); | |||||
} | |||||
if(dictionary_flags2[0] & FLAG_ABBREV) | |||||
{ | |||||
// Removing the suffix leaves a word which should be spoken as individual letters | |||||
// Not yet implemented | |||||
} | |||||
if(dictionary_flags[0]==0) | |||||
{ | |||||
dictionary_flags[0] = dictionary_flags2[0]; | |||||
dictionary_flags[1] = dictionary_flags2[1]; | |||||
} | |||||
} | |||||
if(found == 0) | |||||
{ | |||||
if(end_type & SUFX_Q) | |||||
{ | |||||
// don't retranslate, use the original lookup result | |||||
strcpy(phonemes,phonemes2); | |||||
// language specific changes | |||||
ApplySpecialAttribute(tr,phonemes,dictionary_flags[0]); | |||||
} | |||||
else | |||||
{ | |||||
if(end_flags & FLAG_SUFX) | |||||
wflags |= FLAG_SUFFIX_REMOVED; | |||||
if(end_type & SUFX_A) | |||||
wflags |= FLAG_SUFFIX_VOWEL; | |||||
TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, NULL, wflags, dictionary_flags); | |||||
if(phonemes[0] == phonSWITCH) | |||||
{ | |||||
// change to another language in order to translate this word | |||||
strcpy(word_phonemes,phonemes); | |||||
memcpy(wordx,word_copy,strlen(word_copy)); | |||||
wordx[-1] = c_temp; | |||||
return(0); | |||||
} | |||||
} | |||||
} | |||||
if((end_type & SUFX_T) == 0) | |||||
more_suffixes = 1; | |||||
while(more_suffixes) | |||||
{ | |||||
more_suffixes = 0; | |||||
phonemes[0] = 0; | |||||
if(prefix_phonemes[0] != 0) | |||||
{ | |||||
// lookup the stem without the prefix removed | |||||
wordx[-1] = c_temp; | |||||
found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab); // include prefix, but not suffix | |||||
wordx[-1] = ' '; | |||||
if(phonemes[0] == phonSWITCH) | |||||
{ | |||||
// change to another language in order to translate this word | |||||
memcpy(wordx,word_copy,strlen(word_copy)); | |||||
strcpy(word_phonemes,phonemes); | |||||
return(0); | |||||
} | |||||
if(dictionary_flags[0]==0) | |||||
{ | |||||
dictionary_flags[0] = dictionary_flags2[0]; | |||||
dictionary_flags[1] = dictionary_flags2[1]; | |||||
} | |||||
if(found) | |||||
prefix_phonemes[0] = 0; // matched whole word, don't need prefix now | |||||
if((found==0) && (dictionary_flags2[0] != 0)) | |||||
prefix_flags = 1; | |||||
} | |||||
if(found == 0) | |||||
{ | |||||
found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, end_flags, wtab); // without prefix and suffix | |||||
if(phonemes[0] == phonSWITCH) | |||||
{ | |||||
// change to another language in order to translate this word | |||||
memcpy(wordx,word_copy,strlen(word_copy)); | |||||
strcpy(word_phonemes,phonemes); | |||||
return(0); | |||||
} | |||||
if(dictionary_flags2[0] & FLAG_ABBREV) | |||||
{ | |||||
// Removing the suffix leaves a word which should be spoken as individual letters | |||||
// Not yet implemented | |||||
} | |||||
if(dictionary_flags[0]==0) | |||||
{ | |||||
dictionary_flags[0] = dictionary_flags2[0]; | |||||
dictionary_flags[1] = dictionary_flags2[1]; | |||||
} | |||||
} | |||||
if(found == 0) | |||||
{ | |||||
if(end_type & SUFX_Q) | |||||
{ | |||||
// don't retranslate, use the original lookup result | |||||
strcpy(phonemes,phonemes2); | |||||
// language specific changes | |||||
ApplySpecialAttribute(tr,phonemes,dictionary_flags[0]); | |||||
} | |||||
else | |||||
{ | |||||
if(end_flags & FLAG_SUFX) | |||||
wflags |= FLAG_SUFFIX_REMOVED; | |||||
if(end_type & SUFX_A) | |||||
wflags |= FLAG_SUFFIX_VOWEL; | |||||
if(end_type & SUFX_M) | |||||
{ | |||||
// allow more suffixes before this suffix | |||||
strcpy(end_phonemes2, end_phonemes); | |||||
end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags); | |||||
strcat(end_phonemes, end_phonemes2); // add the phonemes for the previous suffixes after this one | |||||
if((end_type != 0) && !(end_type & SUFX_P)) | |||||
{ | |||||
// there is another suffix | |||||
end_flags = RemoveEnding(tr, wordx, end_type, NULL); | |||||
more_suffixes = 1; | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
// don't remove any previous suffix | |||||
TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, NULL, wflags, dictionary_flags); | |||||
end_type = 0; | |||||
} | |||||
if(phonemes[0] == phonSWITCH) | |||||
{ | |||||
// change to another language in order to translate this word | |||||
strcpy(word_phonemes,phonemes); | |||||
memcpy(wordx,word_copy,strlen(word_copy)); | |||||
wordx[-1] = c_temp; | |||||
return(0); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
if((end_type1 & SUFX_T) == 0) | |||||
{ | { | ||||
// the default is to add the suffix and then determine the word's stress pattern | // the default is to add the suffix and then determine the word's stress pattern | ||||
AppendPhonemes(tr,phonemes, N_WORD_PHONEMES, end_phonemes); | AppendPhonemes(tr,phonemes, N_WORD_PHONEMES, end_phonemes); | ||||
} | } | ||||
memcpy(wordx,word_copy,strlen(word_copy)); | memcpy(wordx,word_copy,strlen(word_copy)); | ||||
} | } | ||||
wordx[-1] = c_temp; | wordx[-1] = c_temp; | ||||
} | } | ||||
} | } | ||||
// dictionary flags for this word give a clue about which alternative pronunciations of | // dictionary flags for this word give a clue about which alternative pronunciations of | ||||
// following words to use. | // following words to use. | ||||
if(end_type & SUFX_F) | |||||
if(end_type1 & SUFX_F) | |||||
{ | { | ||||
// expect a verb form, with or without -s suffix | // expect a verb form, with or without -s suffix | ||||
tr->expect_verb = 2; | tr->expect_verb = 2; | ||||
ok = 0; | ok = 0; | ||||
} | } | ||||
} | } | ||||
if((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD)) | if((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD)) | ||||
{ | { | ||||
// not if the next word is end-of-sentence | // not if the next word is end-of-sentence | ||||
else | else | ||||
if(ph_code == phonX1) | if(ph_code == phonX1) | ||||
{ | { | ||||
// a language specific action | |||||
// a language specific action | |||||
if(tr->langopts.param[LOPT_IT_DOUBLING]) | if(tr->langopts.param[LOPT_IT_DOUBLING]) | ||||
{ | { | ||||
flags |= FLAG_DOUBLING; | flags |= FLAG_DOUBLING; | ||||
{ | { | ||||
if(first_phoneme && tr->langopts.param[LOPT_IT_DOUBLING]) | if(first_phoneme && tr->langopts.param[LOPT_IT_DOUBLING]) | ||||
{ | { | ||||
if(((tr->prev_dict_flags & FLAG_DOUBLING) && (tr->langopts.param[LOPT_IT_DOUBLING] & 1)) || | |||||
if(((tr->prev_dict_flags & FLAG_DOUBLING) && (tr->langopts.param[LOPT_IT_DOUBLING] & 1)) || | |||||
(tr->end_stressed_vowel && (tr->langopts.param[LOPT_IT_DOUBLING] & 2))) | (tr->end_stressed_vowel && (tr->langopts.param[LOPT_IT_DOUBLING] & 2))) | ||||
{ | { | ||||
// italian, double the initial consonant if the previous word ends with a | // italian, double the initial consonant if the previous word ends with a | ||||
} | } | ||||
#endif | #endif | ||||
if((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032)) | if((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032)) | ||||
c = '\''; // 'microsoft' quote or sexed closing single quote, or prime - possibly used as apostrophe | |||||
c = '\''; // 'microsoft' quote or sexed closing single quote, or prime - possibly used as apostrophe | |||||
if(((c == 0x2018) || (c == '?')) && IsAlpha(prev_out) && IsAlpha(next_in)) | if(((c == 0x2018) || (c == '?')) && IsAlpha(prev_out) && IsAlpha(next_in)) | ||||
{ | { | ||||
{ | { | ||||
if((next_in == '\002') || ((next_in == '[') && option_phoneme_input)) | if((next_in == '\002') || ((next_in == '[') && option_phoneme_input)) | ||||
{ | { | ||||
// "[\002" is used internally to start phoneme mode | |||||
// "[\002" is used internally to start phoneme mode | |||||
phoneme_mode = FLAG_PHONEMES; | phoneme_mode = FLAG_PHONEMES; | ||||
source_index++; | source_index++; | ||||
continue; | continue; | ||||
if(!IsSpace(prev_in) && IsAlpha(next_in)) | if(!IsSpace(prev_in) && IsAlpha(next_in)) | ||||
{ | { | ||||
if(prev_out != ' ') | if(prev_out != ' ') | ||||
{ | |||||
{ | |||||
// previous 'word' not yet ended (not alpha or numeric), start new word now. | // previous 'word' not yet ended (not alpha or numeric), start new word now. | ||||
c = ' '; | c = ' '; | ||||
space_inserted = 1; | space_inserted = 1; | ||||
if((word_count==0) && (embedded_count > 0)) | if((word_count==0) && (embedded_count > 0)) | ||||
{ | { | ||||
// add a null 'word' to carry the embedded command flag | // add a null 'word' to carry the embedded command flag | ||||
embedded_list[embedded_ix-1] |= 0x80; | |||||
embedded_list[embedded_ix-1] |= 0x80; | |||||
words[word_count].flags |= FLAG_EMBEDDED; | words[word_count].flags |= FLAG_EMBEDDED; | ||||
word_count = 1; | word_count = 1; | ||||
} | } | ||||
ix--; // the last word is a bracket, mark the previous word as last | ix--; // the last word is a bracket, mark the previous word as last | ||||
words[ix].flags |= FLAG_LAST_WORD; | words[ix].flags |= FLAG_LAST_WORD; | ||||
// FLAG_NOSPACE check to avoid recognizing .mr -mr | |||||
// FLAG_NOSPACE check to avoid recognizing .mr -mr | |||||
if((terminator & CLAUSE_DOT) && !(words[word_count-1].flags & FLAG_NOSPACE)) | if((terminator & CLAUSE_DOT) && !(words[word_count-1].flags & FLAG_NOSPACE)) | ||||
words[word_count-1].flags |= FLAG_HAS_DOT; | words[word_count-1].flags |= FLAG_HAS_DOT; | ||||
} | } |
#define SUFX_T 0x10000 // don't affect the stress position in the stem | #define SUFX_T 0x10000 // don't affect the stress position in the stem | ||||
#define SUFX_B 0x20000 // break, this character breaks the word into stem and suffix (used with SUFX_P) | #define SUFX_B 0x20000 // break, this character breaks the word into stem and suffix (used with SUFX_P) | ||||
#define SUFX_A 0x40000 // remember that the suffix starts with a vowel | #define SUFX_A 0x40000 // remember that the suffix starts with a vowel | ||||
#define SUFX_M 0x80000 // bit 19, allow multiple suffixes | |||||
#define SUFX_UNPRON 0x8000 // used to return $unpron flag from *_rules | #define SUFX_UNPRON 0x8000 // used to return $unpron flag from *_rules | ||||