cortile $2
cortisol $3
cortocircuito kO@-*totSi@-*k'uito
cosa $alt
cos $alt
cosc $alt
coscia $alt
cosen $alt2
cosi $2
cosm $alt
cosmic $alt
cosm $alt
ros $alt
rosalia $3
roseo $1 $alt
rose $2
rosolia $3
rospigliosi $alt2
rosp $alt
careta $alt2
carreta $alt2
casebre $alt
cateto $alt2
catorze $alt2
cefaleia $alt
cerca $alt $verb
desprezo $alt $verb
deva $alt2
devo $alt2
deveras $alt
dez $alt
diarreia $alt
discordo $alt
estiveres $alt
estrofe $alt
etiqueta $alt2
europa $alt
exagero $alt $verb
expeça $alt2
expeço $alt2
farofa $alt
febre $alt
feitora $alt2
fera $alt
fere $alt
ferem $alt
ferozes $alt
fezes $alt
folga $alt
fogos $alt
martelo $alt
merda $alt
megera $alt
mentora $alt2
metro $alt
mexa m'eS&
mexi meS'i
mexo m'eSU
naquele $alt2
negro $alt2
novos $alt
obstetra $alt
obsoleta $alt2
obsoleto $alt2
odisseia $alt
onu $1
opereta $alt2
ordens $alt
osso $alt2
ovos $alt
padeceste $alt2
paexa paeSa
pangeia $alt
palheta $alt2
paralelo $alt
parede $alt2
pastora $alt2
pedra $alt
selvagem seUv'aZeIN
sincera $alt
sincero $alt
sinopse $alt
sobe $alt
sobem $alt
soco $alt2
travesso $alt2
trevo $alt2
troco $alt2 $noun
tropa $alt
tropeço $alt2 $noun
trombeta $alt2
valeta $alt2
vanessa $alt2
velozes $alt
verbo $alt
verme $alt
vierem $alt
violeta $alt2
voga $alt
vozes $alt
zelo $alt $verb
zero $alt
.L03 a am o
.L04 a am e em o ue uem
.L05 r ra ram re rem rdes
.L06 ra ram ste re rem
.group a
?1 v) e (l_ ,E // eg: possível, amovível, disponível, etc...
?1 l) e (ta 'E // eg: bicicleta, atleta.
sf) e (ra_ E // esfera, biosfera
//sort
qu) e (brL04_ E
_hosp) e (dL03_ E
_atr) e (vL03_ e
_escr) e (vL03_ e
_descr) e (vL03_ e
_embel) e (zL01_ e
_embel) e (zL04_ E
_pr) e (zL04_ E
_acont) e (çL01_ e
_ado) e (çL03_ e
_esclar) e (çL03_ e
_reapar) e (çL03_ e
_reconh) e (çL03_ e
_coop) e (rL04_ E
_sup) e (rL04_ E
_imp) e (rL03_ E
_temp) e (rL04_ E
//endsort
_exag) e (L05_ E
_houv) e (L05_ E
_soub) e (L05_ E
_compreend) e (L06_ e
_correspond) e (L06_ e
_entend) e (L06_ e
_estend) e (L06_ e
_respond) e (L06_ e
_vend) e (L06_ e
.group é
é ''E
c) o (rrL01_ o // escorra, incorra, corra, recorra
_m) o (rrL01_ o
p) ostos (_ Ost=Us# // dispostos etc.
_esn) o (bL04_ O
//endsort
.group ô
r) u (_A u
u (A_ 'u
u (em_ 'u
ui uI
u (iu w
u (iCK u
_) த (ீரரை d
_) த (ீர்க d
_) த (ீர்க்க t
_) தீர்க்க (_சுமங்கலி di:rggV
_) தீர்க்க (தரிச di:rggV
_) தீர்க்க (த்துடன di:rggV#
_) த (ீர்க்கமா d
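The *_rules excerpts above follow eSpeak's rule syntax: an optional ?n voice condition, a pre-context ending in ')', the letters to match, an optional post-context introduced by '(' (which may reference letter groups such as L03-L06 defined by the .Lnn lines), and finally the phoneme string, with // starting a comment. As a rough, self-contained illustration of how one such line splits into its parts - this is not eSpeak's actual compiler, which is the state machine shown further below, and the helper names and buffer sizes here are arbitrary:

#include <stdio.h>
#include <string.h>

// Rough sketch only: split a line of the form  pre) match (post  phonemes
// into its four fields.  The real compiler (compile_rule/copy_rule_string below)
// is state-driven and also handles '?' conditions, '$' keywords and letter groups.
static void split_rule(char *line, char *pre, char *match, char *post, char *phonemes)
{
	char *p;
	char *rest = line;

	pre[0] = match[0] = post[0] = phonemes[0] = 0;

	if((p = strstr(line, "//")) != NULL)    // strip a trailing comment
		*p = 0;

	if((p = strchr(line, ')')) != NULL)     // pre-context ends with ')'
	{
		*p = 0;
		strcpy(pre, line);
		rest = p + 1;
	}

	if((p = strrchr(rest, ' ')) != NULL)    // the phoneme string is the last field
	{
		strcpy(phonemes, p + 1);
		*p = 0;
	}

	if((p = strchr(rest, '(')) != NULL)     // post-context starts with '('
	{
		strcpy(post, p + 1);
		*p = 0;
	}

	while(*rest == ' ') rest++;             // what remains, trimmed, is the match
	p = rest + strlen(rest);
	while((p > rest) && (p[-1] == ' ')) *--p = 0;
	strcpy(match, rest);
}

int main(void)
{
	char line[] = "qu) e (brL04_ E";        // taken from the pt_rules excerpt above
	char pre[40], match[40], post[40], ph[40];

	split_rule(line, pre, match, post, ph);
	printf("pre='%s' match='%s' post='%s' phonemes='%s'\n", pre, match, post, ph);
	return 0;
}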
}
continue;
}
if(rb == RULE_DOLLAR)
{
value = *rule++ & 0xff;
int multiple_numeric_hyphen = 0;
char *multiple_string = NULL;
char *multiple_string_end = NULL;
int len_word;
int len_phonetic;
int text_not_phonemes; // this word specifies replacement text, not phonemes
unsigned int wc;
int all_upper_case;
char *mnemptr;
unsigned char flag_codes[100];
char encoded_ph[200];
#endif
step = 0;
c = 0;
while(c != '\n')
{
c = *p;
if((c == '?') && (step==0))
{
// conditional rule, allow only if the numbered condition is set for the voice
flag_codes[n_flag_codes++] = ix + flag_offset;
c = *p;
}
if((c == '$') && isalnum(p[1]))
{
/* read keyword parameter */
mnemptr = p;
while(!isspace2(c = *p)) p++;
*p = 0;
flagnum = LookupMnem(mnem_flags,mnemptr);
if(flagnum > 0)
{
error_count++;
}
}
if((c == '/') && (p[1] == '/') && (multiple_words==0))
{
c = '\n'; /* "//" treat comment as end of line */
}
switch(step)
{
case 0:
step = 1;
}
break;
case 1:
if((c == '-') && multiple_words)
{
step = 3;
}
break;
case 3:
if(!isspace2(c))
{
step = 4;
}
break;
case 4:
if(isspace2(c))
{
step = 5;
}
break;
case 5:
break;
}
p++;
}
if(word[0] == 0)
{
return(0); /* blank line */
*hash = HashDictionary(word);
len_phonetic = strlen(encoded_ph);
dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed
len_word &= 0x3f;
length = len_word + len_phonetic + 3;
strcpy(&dict_line[(len_word)+2],encoded_ph);
}
for(ix=0; ix<n_flag_codes; ix++)
{
dict_line[ix+length] = flag_codes[ix];
fflush(f_log);
#endif
}
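The code above assembles one dictionary entry into dict_line: byte 1 holds the word length with bit 6 reserved as a "compressed" marker, the word text starts at offset 2, the encoded phoneme string follows it, and any flag codes are appended after that. A minimal sketch of that packing, with hypothetical names; storing the total entry length in byte 0 is an assumption, since that line is not part of this excerpt:

#include <string.h>

// Sketch only: pack one dictionary entry as
//   [total length][word length | flags][word][phonemes + 0][flag codes]
// Field meanings follow the code above; the byte-0 total length is an assumption.
static int pack_entry(char *dict_line, const char *word, const char *encoded_ph,
                      const unsigned char *flag_codes, int n_flag_codes, int compressed)
{
	int len_word = (int)strlen(word) & 0x3f;        // low 6 bits only ("len_word &= 0x3f")
	int len_phonetic = (int)strlen(encoded_ph);
	int length = len_word + len_phonetic + 3;       // 2 header bytes + word + phonemes + terminating 0
	int ix;

	dict_line[1] = len_word | (compressed ? 0x40 : 0);   // bit 6: word has been compressed
	memcpy(&dict_line[2], word, len_word);
	strcpy(&dict_line[len_word + 2], encoded_ph);

	for(ix = 0; ix < n_flag_codes; ix++)
		dict_line[ix + length] = flag_codes[ix];

	dict_line[0] = length + n_flag_codes;           // assumed total-length byte
	return length + n_flag_codes;
}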
for(hash=0; hash<N_HASH_DICT; hash++)
{
p = hash_chains[hash];
hash_counts[hash] = (int)ftell(f_out);
while(p != NULL)
{
length = *(p+sizeof(char *));
char buf[200];
char fname[sizeof(path_home)+45];
char dict_line[128];
text_mode = 0;
// try with and without '.txt' extension
fprintf(f_log,"Compiling: '%s'\n",fname);
linenum=0;
while(fgets(buf,sizeof(buf),f_in) != NULL)
{
linenum++;
if(length == 0) continue; /* blank line */
hash_counts[hash]++;
p = (char *)malloc(length+sizeof(char *));
if(p == NULL)
{
}
break;
}
memcpy(p,&hash_chains[hash],sizeof(char *));
hash_chains[hash] = p;
memcpy(p+sizeof(char *),dict_line,length);
count++;
}
fprintf(f_log,"\t%d entries\n",count);
fclose(f_in);
return(0);
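The loops above keep compiled entries on per-hash chains: each block is malloc'd with room for a pointer, the previous chain head is copied into the first sizeof(char *) bytes, the entry data follows, and the new block becomes the head; the writer then walks each chain. A self-contained sketch of that prepend-and-walk pattern, with generic helper names that are not eSpeak's own functions:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define N_HASH_DICT 1024                    // table size is an assumption for this sketch

static char *hash_chains[N_HASH_DICT];

// Prepend one entry to the chain for 'hash', mirroring the pattern above:
// the first sizeof(char *) bytes of every block hold the previous head pointer,
// the entry bytes follow immediately after it.
static int add_entry(int hash, const char *dict_line, int length)
{
	char *p = (char *)malloc(length + sizeof(char *));
	if(p == NULL)
		return -1;
	memcpy(p, &hash_chains[hash], sizeof(char *));
	memcpy(p + sizeof(char *), dict_line, length);
	hash_chains[hash] = p;
	return 0;
}

// Walk one chain the way the writer loop above does, following the stored pointer.
static void dump_chain(int hash)
{
	char *p = hash_chains[hash];
	while(p != NULL)
	{
		char *next;
		int length = (unsigned char)*(p + sizeof(char *));   // first entry byte = length, as above
		memcpy(&next, p, sizeof(char *));
		printf("hash %d: entry of %d bytes\n", hash, length);
		p = next;
	}
}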
output = &rule_phonemes[len];
}
sxflags = 0x808000; // to ensure non-zero bytes
for(p=string,ix=0;;)
{
literal = 0;
case 'a':
sxflags |= SUFX_A;
break;
case 'm':
sxflags |= SUFX_M;
break;
default:
if(isdigit(c))
value = (value*10) + (c - '0');
rule_phonemes[0]=0;
p = buf;
for(ix=0; finish==0; ix++)
{
c = input[ix];
copy_rule_string(buf,state);
p = buf;
break;
case '(': // start of suffix section
*p = 0;
state = 2;
error_count++;
}
break;
case '\n': // end of line
case '\r':
case 0: // end of line
copy_rule_string(buf,state);
finish=1;
break;
case '\t': // end of section
case ' ':
*p = 0;
copy_rule_string(buf,state);
p = buf;
break;
case '?':
if(state==2)
state=0;
break;
}
}
if(strcmp(rule_match,"$group")==0)
strcpy(rule_match,group_name);
}
strcpy(output,buf);
len = strlen(buf)+1;
len_name = strlen(group_name);
if((len_name > 0) && (memcmp(rule_match,group_name,len_name) != 0))
{
len1 = strlen(p) + 1;
p = &p[len1];
len2 = strlen(p);
rule_match[0]=0;
rule_pre[0]=0;
rule_post[0]=0;
}
}
*pout = 0;
spaces = 12;
if(condition > 0)
{
fprintf(f_out,"%s",buf);
spaces = 0;
}
for(ix=0; ix<spaces; ix++)
fputc(' ',f_out);
spaces = 14;
sprintf(buf," %s ",rule_match);
if(rule_post[0] != 0)
int n_rgroups = 0;
int n_groups3 = 0;
RGROUP rgroup[N_RULE_GROUP2];
linenum = 0;
group_name[0] = 0;
if((p = (unsigned char *)strstr(buf,"//")) != NULL)
*p = 0;
if(buf[0] == '\r') buf++; // ignore extra \r in \r\n
}
if((buf == NULL) || (buf[0] == '.'))
{
// group character is given as a character code (max 16 bits)
p = (unsigned char *)group_name;
if(char_code > 0x100)
{
*p++ = (char_code >> 8);
}
}
}
if((group3_ix == 0) && (strlen(group_name) > 2))
{
if(utf8_in(&c,group_name) < 2)
fprintf(f_log,"%5d: Group name longer than 2 bytes (UTF8)",linenum);
error_count++;
}
group_name[2] = 0;
}
}
continue;
}
switch(compile_mode)
{
case 1: // .group
compile_dictlist_file(path,"list");
}
compile_dictlist_file(path,"extra");
compile_dictlist_end(f_out);
offset_rules = ftell(f_out);
fprintf(f_log,"Compiling: '%s'\n",fname_in);
compile_dictrules(f_in,f_out,fname_temp);
{
if(vowel_stress[ix] == 4)
vowel_stress[ix] = 3; // change marked stress (consonant clusters) to secondary (except the last)
if(vowel_length[ix] > 0)
{
long_vowel = ix;
vowel_stress[ix] = 3; // give secondary stress to all long vowels
}
}
// 'stressed_syllable' gives the last marked stress
if(stressed_syllable == 0)
{
*word_end = 'e';
}
i = word_end - word;
if(word_copy != NULL)
{
memcpy(word_copy,word,i);
word_copy[i] = 0;
}
// look for multibyte characters to increase the number of bytes to remove
for(len_ending = i = (end_type & 0x3f); i>0 ;i--) // num. of characters of the suffix
int utf8_in2(int *c, const char *buf, int backwards)
{//=================================================
// Read a unicode character from a UTF8 string
// Returns the number of UTF8 bytes used.
// backwards: set if we are moving backwards through the UTF8 string
int c1;
int utf8_in(int *c, const char *buf)
{//=================================
// Read a unicode character from a UTF8 string
// Returns the number of UTF8 bytes used.
return(utf8_in2(c,buf,0));
}
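The loop above counts the suffix in characters (the low 6 bits of end_type) and then inspects the bytes, because a multibyte UTF-8 character means more than one byte has to be removed from the word. A small sketch of converting a trailing character count into a byte count by skipping UTF-8 continuation bytes; it illustrates the idea rather than reproducing the exact loop:

// Sketch: how many bytes do the last n_chars UTF-8 characters of 'word' occupy?
// Continuation bytes have the form 10xxxxxx (0x80..0xbf) and never start a character,
// so only non-continuation bytes count towards n_chars.
static int utf8_tail_bytes(const char *word, int len, int n_chars)
{
	int bytes = 0;

	while((len > 0) && (n_chars > 0))
	{
		len--;
		bytes++;
		if(((unsigned char)word[len] & 0xc0) != 0x80)   // reached the start byte of a character
			n_chars--;
	}
	return bytes;
}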
unsigned int dictionary_flags[2];
unsigned int dictionary_flags2[2];
int end_type=0;
int end_type1=0;
int prefix_type=0;
int prefix_stress;
char *wordx;
char phonemes[N_WORD_PHONEMES];
char phonemes2[N_WORD_PHONEMES];
char prefix_phonemes[N_WORD_PHONEMES];
char unpron_phonemes[N_WORD_PHONEMES];
char end_phonemes[N_WORD_PHONEMES];
char end_phonemes2[N_WORD_PHONEMES];
char word_copy[N_WORD_BYTES];
char word_copy2[N_WORD_BYTES];
int word_copy_length;
char prefix_chars[0x3f + 2];
int found=0;
int end_flags;
char c_temp; // save a character byte while we temporarily replace it with space
int first_char;
int last_char = 0;
int add_plural_suffix = 0;
int prefix_flags = 0;
int more_suffixes;
int confirm_prefix;
int spell_word;
int stress_bits;
// the word has $abbrev flag, but no pronunciation specified. Speak as individual letters
spell_word = 1;
}
if(!found && iswdigit(first_char))
{
Lookup(tr,"_0lang",word_phonemes);
if(confirm_prefix && !(end_type & SUFX_B))
{
int end2;
char phonemes2[N_WORD_PHONEMES];
char end_phonemes2[N_WORD_PHONEMES];
// remove any standard suffix and confirm that the prefix is still recognised
for(ix=0; ix < n_chars; ix++) // num. of bytes to remove
{
prefix_chars[pfix++] = *wordx++;
if((prefix_type & SUFX_B) && (ix == (n_chars-1)))
{
prefix_chars[pfix-1] = 0; // discard the last character of the prefix, this is the separator character
}
}
if((end_type != 0) && !(end_type & SUFX_P))
{
end_type1 = end_type;
strcpy(phonemes2,phonemes);
// The word has a standard ending, re-translate without this ending
end_flags = RemoveEnding(tr, wordx, end_type, word_copy);
more_suffixes = 1;
while(more_suffixes)
{
more_suffixes = 0;
phonemes[0] = 0;
if(prefix_phonemes[0] != 0)
{
// lookup the stem without the prefix removed
wordx[-1] = c_temp;
found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab); // include prefix, but not suffix
wordx[-1] = ' ';
if(phonemes[0] == phonSWITCH)
{
// change to another language in order to translate this word
memcpy(wordx,word_copy,strlen(word_copy));
strcpy(word_phonemes,phonemes);
return(0);
}
if(dictionary_flags[0]==0)
{
dictionary_flags[0] = dictionary_flags2[0];
dictionary_flags[1] = dictionary_flags2[1];
}
if(found)
prefix_phonemes[0] = 0; // matched whole word, don't need prefix now
if((found==0) && (dictionary_flags2[0] != 0))
prefix_flags = 1;
}
if(found == 0)
{
found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, end_flags, wtab); // without prefix and suffix
if(phonemes[0] == phonSWITCH)
{
// change to another language in order to translate this word
memcpy(wordx,word_copy,strlen(word_copy));
strcpy(word_phonemes,phonemes);
return(0);
}
if(dictionary_flags2[0] & FLAG_ABBREV)
{
// Removing the suffix leaves a word which should be spoken as individual letters
// Not yet implemented
}
if(dictionary_flags[0]==0)
{
dictionary_flags[0] = dictionary_flags2[0];
dictionary_flags[1] = dictionary_flags2[1];
}
}
if(found == 0)
{
if(end_type & SUFX_Q)
{
// don't retranslate, use the original lookup result
strcpy(phonemes,phonemes2);
// language specific changes
ApplySpecialAttribute(tr,phonemes,dictionary_flags[0]);
}
else
{
if(end_flags & FLAG_SUFX)
wflags |= FLAG_SUFFIX_REMOVED;
if(end_type & SUFX_A)
wflags |= FLAG_SUFFIX_VOWEL;
if(end_type & SUFX_M)
{
// allow more suffixes before this suffix
strcpy(end_phonemes2, end_phonemes);
end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
strcat(end_phonemes, end_phonemes2); // add the phonemes for the previous suffixes after this one
if((end_type != 0) && !(end_type & SUFX_P))
{
// there is another suffix
end_flags = RemoveEnding(tr, wordx, end_type, NULL);
more_suffixes = 1;
}
}
else
{
// don't remove any previous suffix
TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, NULL, wflags, dictionary_flags);
end_type = 0;
}
if(phonemes[0] == phonSWITCH)
{
// change to another language in order to translate this word
strcpy(word_phonemes,phonemes);
memcpy(wordx,word_copy,strlen(word_copy));
wordx[-1] = c_temp;
return(0);
}
}
}
}
if((end_type1 & SUFX_T) == 0)
{
// the default is to add the suffix and then determine the word's stress pattern
AppendPhonemes(tr,phonemes, N_WORD_PHONEMES, end_phonemes);
}
memcpy(wordx,word_copy,strlen(word_copy));
}
wordx[-1] = c_temp;
}
}
// dictionary flags for this word give a clue about which alternative pronunciations of
// following words to use.
if(end_type1 & SUFX_F)
{
// expect a verb form, with or without -s suffix
tr->expect_verb = 2;
ok = 0;
}
}
if((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD))
{
// not if the next word is end-of-sentence
else
if(ph_code == phonX1)
{
// a language specific action
if(tr->langopts.param[LOPT_IT_DOUBLING])
{
flags |= FLAG_DOUBLING;
{
if(first_phoneme && tr->langopts.param[LOPT_IT_DOUBLING])
{
if(((tr->prev_dict_flags & FLAG_DOUBLING) && (tr->langopts.param[LOPT_IT_DOUBLING] & 1)) ||
(tr->end_stressed_vowel && (tr->langopts.param[LOPT_IT_DOUBLING] & 2)))
{
// italian, double the initial consonant if the previous word ends with a
}
#endif
if((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032))
c = '\''; // 'microsoft' quote or sexed closing single quote, or prime - possibly used as apostrophe
if(((c == 0x2018) || (c == '?')) && IsAlpha(prev_out) && IsAlpha(next_in))
{
{
if((next_in == '\002') || ((next_in == '[') && option_phoneme_input))
{
// "[\002" is used internally to start phoneme mode
phoneme_mode = FLAG_PHONEMES;
source_index++;
continue;
if(!IsSpace(prev_in) && IsAlpha(next_in))
{
if(prev_out != ' ')
{
// previous 'word' not yet ended (not alpha or numeric), start new word now.
c = ' ';
space_inserted = 1;
if((word_count==0) && (embedded_count > 0))
{
// add a null 'word' to carry the embedded command flag
embedded_list[embedded_ix-1] |= 0x80;
words[word_count].flags |= FLAG_EMBEDDED;
word_count = 1;
}
ix--; // the last word is a bracket, mark the previous word as last
words[ix].flags |= FLAG_LAST_WORD;
// FLAG_NOSPACE check to avoid recognizing .mr -mr
if((terminator & CLAUSE_DOT) && !(words[word_count-1].flags & FLAG_NOSPACE))
words[word_count-1].flags |= FLAG_HAS_DOT;
}
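Earlier in this excerpt, several typographic apostrophe look-alikes (0x92, 0xb4, 0x2019, 0x2032) are folded into a plain ASCII apostrophe before the text is split into words. The same check as a small stand-alone helper, for illustration only:

// Map apostrophe look-alikes onto '\'' as in the check above:
// 'microsoft' quote (0x92), acute accent (0xb4), right single quote (0x2019), prime (0x2032).
static int normalize_apostrophe(int c)
{
	if((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032))
		return '\'';
	return c;
}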
#define SUFX_T 0x10000 // don't affect the stress position in the stem
#define SUFX_B 0x20000 // break, this character breaks the word into stem and suffix (used with SUFX_P)
#define SUFX_A 0x40000 // remember that the suffix starts with a vowel
#define SUFX_M 0x80000 // bit 19, allow multiple suffixes
#define SUFX_UNPRON 0x8000 // used to return $unpron flag from *_rules
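These flags are combined with a character count in the value that TranslateRules() returns as end_type: the low 6 bits give the number of characters in the suffix (see the "end_type & 0x3f" usage above), while bits such as SUFX_A and SUFX_M describe its properties, with SUFX_M driving the more_suffixes loop in the translation code. A hedged sketch of composing and testing such a value, reusing the definitions directly above:

// Sketch: an end_type value describing a 3-character suffix that starts with a
// vowel and allows further suffixes to be removed before it.
int example_end_type = 3 | SUFX_A | SUFX_M;

// The translator can then test the properties and the length separately, e.g.:
//   if(example_end_type & SUFX_M)  { /* keep stripping suffixes (more_suffixes loop) */ }
//   len_ending = example_end_type & 0x3f;   // number of characters to remove from the word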