/* * Copyright (C) 2005 to 2014 by Jonathan Duddington * email: jonsd@users.sourceforge.net * Copyright (C) 2015-2017 Reece H. Dunn * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write see: * . */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include "common.h" // for strncpy0 #include "compiledict.h" #include "dictionary.h" // for EncodePhonemes, HashDicti... #include "error.h" // for create_file_error_context #include "mnemonics.h" // for LookupMnemName, MNEM_TAB #include "phoneme.h" // for PHONEME_TAB_LIST, phonSWITCH, phone... 
#include "speech.h"                   // for path_home
#include "synthesize.h"               // for Write4Bytes

// Keywords which may follow '$' inside the pre/post context of a *_rules entry.
// The table is scanned with strncmp(), so a longer name must appear before
// any name that is a prefix of it.
static const MNEM_TAB mnem_rules[] = {
	{ "unpr",     DOLLAR_UNPR },
	{ "noprefix", DOLLAR_NOPREFIX }, // rule fails if a prefix has been removed
	{ "list",     DOLLAR_LIST },     // a pronunciation is given in the *_list file

	{ "w_alt1", 0x11 },
	{ "w_alt2", 0x12 },
	{ "w_alt3", 0x13 },
	{ "w_alt4", 0x14 },
	{ "w_alt5", 0x15 },
	{ "w_alt6", 0x16 },
	{ "w_alt",  0x11 }, // note: put longer names before their sub-strings

	{ "p_alt1", 0x21 },
	{ "p_alt2", 0x22 },
	{ "p_alt3", 0x23 },
	{ "p_alt4", 0x24 },
	{ "p_alt5", 0x25 },
	{ "p_alt6", 0x26 },
	{ "p_alt",  0x21 },

	{ NULL, -1 }
};

// Keywords which may appear in a *_list entry, and the flag code stored for each.
static const MNEM_TAB mnem_flags[] = {
	// these in the first group put a value in bits0-3 of dictionary_flags
	{ "$1",   0x41 }, // stress on 1st syllable
	{ "$2",   0x42 }, // stress on 2nd syllable
	{ "$3",   0x43 },
	{ "$4",   0x44 },
	{ "$5",   0x45 },
	{ "$6",   0x46 },
	{ "$7",   0x47 },
	{ "$u",   0x48 }, // reduce to unstressed
	{ "$u1",  0x49 },
	{ "$u2",  0x4a },
	{ "$u3",  0x4b },
	{ "$u+",  0x4c }, // reduce to unstressed, but stress at end of clause
	{ "$u1+", 0x4d },
	{ "$u2+", 0x4e },
	{ "$u3+", 0x4f },

	// these set the corresponding numbered bit in dictionary_flags
	{ "$pause",          8 }, // ensure pause before this word
	{ "$strend",         9 }, // full stress if at end of clause
	{ "$strend2",       10 }, // full stress if at end of clause, or only followed by unstressed
	{ "$unstressend",   11 }, // reduce stress at end of clause
	{ "$accent_before", 12 }, // used with accent names, say this accent name before the letter name
	{ "$abbrev",        13 }, // use this pronunciation rather than split into letters

	// language specific
	{ "$double",        14 }, // IT double the initial consonant of next word
	{ "$alt",           15 }, // use alternative pronunciation
	{ "$alt1",          15 }, // synonym for $alt
	{ "$alt2",          16 },
	{ "$alt3",          17 },
	{ "$alt4",          18 },
	{ "$alt5",          19 },
	{ "$alt6",          20 },
	{ "$alt7",          21 },

	{ "$combine",       23 }, // Combine with the next word

	{ "$dot",           24 }, // ignore '.' after this word (abbreviation)
	{ "$hasdot",        25 }, // use this pronunciation if there is a dot after the word

	{ "$max3",          27 }, // limit to 3 repetitions
	{ "$brk",           28 }, // a shorter $pause
	{ "$text",          29 }, // word translates to replacement text, not phonemes

	// flags in dictionary word 2
	{ "$verbf",      0x20 }, // verb follows
	{ "$verbsf",     0x21 }, // verb follows, allow -s suffix
	{ "$nounf",      0x22 }, // noun follows
	{ "$pastf",      0x23 }, // past tense follows
	{ "$verb",       0x24 }, // use this pronunciation when its a verb
	{ "$noun",       0x25 }, // use this pronunciation when its a noun
	{ "$past",       0x26 }, // use this pronunciation when its past tense
	{ "$verbextend", 0x28 }, // extend influence of 'verb follows'
	{ "$capital",    0x29 }, // use this pronunciation if initial letter is upper case
	{ "$allcaps",    0x2a }, // use this pronunciation if the word is all upper case
	{ "$accent",     0x2b }, // character name is base-character name + accent name
	{ "$sentence",   0x2d }, // only if this clause is a sentence (i.e. terminator is {. ? !} not {, ; :}

	{ "$only",       0x2e }, // only match on this word without suffix
	{ "$onlys",      0x2f }, // only match with none, or with 's' suffix
	{ "$stem",       0x30 }, // must have a suffix
	{ "$atend",      0x31 }, // use this pronunciation if at end of clause
	{ "$atstart",    0x32 }, // use this pronunciation at start of clause
	{ "$native",     0x33 }, // not if we've switched translators

	// doesn't set dictionary_flags
	{ "$?",           100 }, // conditional rule, followed by byte giving the condition number

	{ "$textmode",    200 },
	{ "$phonememode", 201 },

	{ NULL, -1 }
};

#define LEN_GROUP_NAME  12

// One compiled rule group, as collected by compile_dictrules().
typedef struct {
	char name[LEN_GROUP_NAME+1];
	void *start;    // compiled rule data, allocated by output_rule_group()
	size_t length;  // number of bytes at 'start'
	int group3_ix;  // non-zero: group is identified by this index rather than by name
	int index;      // order in which the group was read, used as the final sort key
} RGROUP;

// States of the *_list line parser in compile_line().
typedef enum
{
	LINE_PARSER_WORD = 0,
	LINE_PARSER_END_OF_WORD = 1,
	LINE_PARSER_MULTIPLE_WORDS = 2,
	LINE_PARSER_END_OF_WORDS = 3,
	LINE_PARSER_PRONUNCIATION = 4,
	LINE_PARSER_END_OF_PRONUNCIATION = 5,
} LINE_PARSER_STATES;

// All state shared between the functions which compile one dictionary.
typedef struct {
	FILE *f_log;
	char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes
	int linenum;
	int error_count;
	bool text_mode;
	int debug_flag;
	int error_need_dictionary;

	// A hash chain is a linked-list of hash chain entry objects:
	//     struct hash_chain_entry {
	//         hash_chain_entry *next_entry;
	//         // dict_line output from compile_line:
	//         uint8_t length;
	//         char contents[length];
	//     };
	char *hash_chains[N_HASH_DICT];

	char letterGroupsDefined[N_LETTER_GROUPS];

	// The five sections of the rule being compiled. copy_rule_string() relies
	// on all five buffers having the same size.
	char rule_cond[80];
	char rule_pre[80];
	char rule_post[80];
	char rule_match[80];
	char rule_phonemes[80];

	char group_name[LEN_GROUP_NAME+1];
	int group3_ix;
} CompileContext;

// Free every entry of every hash chain and leave all chains empty.
// The first sizeof(char *) bytes of an entry hold the pointer to the next entry.
static void free_hash_chains(CompileContext *ctx)
{
	for (int hash = 0; hash < N_HASH_DICT; hash++) {
		char *entry = ctx->hash_chains[hash];
		while (entry != NULL) {
			char *next;
			memcpy(&next, entry, sizeof(char *));
			free(entry);
			entry = next;
		}
		ctx->hash_chains[hash] = NULL;
	}
}

// Release the hash chains and then the context itself.
static void clean_context(CompileContext *ctx)
{
	free_hash_chains(ctx);
	free(ctx);
}

// Write the names of the dictionary flags which are set in flags[0] and
// flags[1] into buf, separated by spaces.
//   flags:    two words of dictionary flags
//   buf:      receives the NUL terminated text
//   buf_len:  size of buf in bytes; names which would not fit are omitted
void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len)
{
	int stress;
	int ix;
	const char *name;
	int len;
	int total = 0;

	if ((buf == NULL) || (buf_len <= 0))
		return;

	buf[0] = 0;
	if ((stress = flags[0] & 0xf) != 0) {
		name = LookupMnemName(mnem_flags, stress + 0x40);
		len = strlen(name);
		if (len < buf_len) {
			memcpy(buf, name, len + 1);
			total = len;
			buf += len;
		}
	}

	for (ix = 8; ix < 64; ix++) {
		// unsigned shifts: (1 << 31) on a signed int is undefined behaviour
		if (((ix < 30) && (flags[0] & (1u << ix))) || ((ix >= 0x20) && (flags[1] & (1u << (ix-0x20))))) {
			name = LookupMnemName(mnem_flags, ix);
			len = strlen(name) + 1;
			total += len;
			if (total >= buf_len)
				continue;
			sprintf(buf, " %s", name);
			buf += len;
		}
	}
}

// Convert a compiled rule back into readable text.
//   group_chars, group_length:  the letters of the rule group, which start the match string
//   rule:     the compiled rule
//   control:  FLAG_UNPRON_TEST makes "$unpr" (value 0x01) visible in the output
//   output:   receives the text; also the return value
char *DecodeRule(const char *group_chars, int group_length, char *rule, int control, char *output)
{
	// Convert compiled match template to ascii

	unsigned char rb;
	unsigned char c;
	char *p;
	char *p_end;
	int ix;
	int match_type;
	bool finished = false;
	int value;
	int linenum = 0;
	int flags;
	int suffix_char;
	int condition_num = 0;
	bool at_start = false;
	const char *name;
	char buf[200];
	char buf_pre[200];
	char suffix[20];
	static const char symbols[] = {
		' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
		' ', ' ', '&', '%', '+', '#', 'S', 'D',
		'Z', 'A', 'L', '!', ' ', '@', '?', 'J',
		'N', 'K', 'V', '?', 'T', 'X', '?', 'W'
	};

	static const char symbols_lg[] = { 'A', 'B', 'C', 'H', 'F', 'G', 'Y' };

	match_type = 0;
	buf_pre[0] = 0;

	for (ix = 0; ix < group_length; ix++)
		buf[ix] = group_chars[ix];
	buf[ix] = 0;

	p = &buf[strlen(buf)];
	while (!finished) {
		rb = *rule++;

		if (rb <= RULE_LINENUM) {
			switch (rb)
			{
			case 0:
			case RULE_PHONEMES:
				finished = true;
				break;
			case RULE_PRE_ATSTART:
				at_start = true;
				// fallthrough:
			case RULE_PRE:
				match_type = RULE_PRE;
				*p = 0;
				p = buf_pre;
				break;
			case RULE_POST:
				match_type = RULE_POST;
				*p = 0;
				strcat(buf, " (");
				p = &buf[strlen(buf)];
				break;
			case RULE_PH_COMMON:
				break;
			case RULE_CONDITION:
				// conditional rule, next byte gives condition number
				condition_num = *rule++;
				break;
			case RULE_LINENUM:
				// two bytes, each offset by 1 so that neither is zero
				value = (rule[1] & 0xff) - 1;
				linenum = (rule[0] & 0xff) - 1 + (value * 255);
				rule += 2;
				break;
			}
			continue;
		}

		if (rb == RULE_DOLLAR) {
			value = *rule++ & 0xff;
			if ((value != 0x01) || (control & FLAG_UNPRON_TEST)) {
				// TODO write the string backwards if in RULE_PRE
				p[0] = '$';
				name = LookupMnemName(mnem_rules, value);
				strcpy(&p[1], name);
				p += (strlen(name)+1);
			}
			c = ' ';
		} else if (rb == RULE_ENDING) {
			static const char flag_chars[] = "eipvdfq tba ";
			flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f);
			suffix_char = 'S';
			if (flags & (SUFX_P >> 8))
				suffix_char = 'P';
			sprintf(suffix, "%c%d", suffix_char, rule[2] & 0x7f);
			rule += 3;
			for (ix = 0; ix < 9; ix++) {
				if (flags & 1)
					sprintf(&suffix[strlen(suffix)], "%c", flag_chars[ix]);
				flags = (flags >> 1);
			}
			strcpy(p, suffix);
			p += strlen(suffix);
			c = ' ';
		} else if (rb == RULE_LETTERGP)
			c = symbols_lg[*rule++ - 'A'];
		else if (rb == RULE_LETTERGP2) {
			value = *rule++ - 'A';
			if (value < 0)
				value += 256;
			p[0] = 'L';
			p[1] = (value / 10) + '0';
			c = (value % 10) + '0';

			if (match_type == RULE_PRE) {
				// the pre string is printed in reverse, so swap the ends
				p[0] = c;
				c = 'L';
			}
			p += 2;
		} else if (rb <= RULE_LAST_RULE)
			c = symbols[rb];
		else if (rb == RULE_SPACE)
			c = '_';
		else
			c = rb;
		*p++ = c;
	}
	*p = 0;

	p = output;
	// NOTE(review): 'output' is a pointer parameter, so sizeof(output) is the
	// size of a pointer and not the capacity of the caller's buffer. The text
	// is therefore truncated much earlier than was probably intended. The real
	// capacity is not visible here; confirm it against the callers before
	// replacing this limit.
	p_end = p + sizeof(output) - 1;

	if (linenum > 0) {
		sprintf(p, "%5d:\t", linenum);
		p += 7;
	}
	if (condition_num > 0) {
		sprintf(p, "?%d ", condition_num);
		p = &p[strlen(p)];
	}
	if (((ix = strlen(buf_pre)) > 0) || at_start) {
		if (at_start)
			*p++ = '_';
		while ((--ix >= 0) && (p < p_end-3))
			*p++ = buf_pre[ix];
		*p++ = ')';
		*p++ = ' ';
	}
	*p = 0;

	// prevent overflow in output[]. The line number, condition and prefix
	// may already have passed p_end, so never use a negative index here.
	if (p_end > p)
		buf[p_end - p] = 0;
	else
		buf[0] = 0;
	strcat(p, buf);
	ix = strlen(output);
	while (ix < 8)
		output[ix++] = ' ';
	output[ix] = 0;
	return output;
}

// Compile one line of a *_list file into a dictionary entry.
//   linebuf:      the source line; it is modified during parsing
//   dict_line:    receives the entry; byte 0 is the total entry length
//   n_dict_line:  size of dict_line in bytes
//   hash:         receives the hash chain number for the word
// Returns the entry length, or 0 if the line produces no entry (a blank line,
// or an entry which would not fit in dict_line).
static int compile_line(CompileContext *ctx, char *linebuf, char *dict_line, int n_dict_line, int *hash)
{
	// Compile a line in the language_list file
	unsigned char c;
	char *p;
	char *word;
	char *phonetic;
	char *phonetic_end = NULL;
	unsigned int ix;
	LINE_PARSER_STATES step;
	unsigned int n_flag_codes = 0;
	int flagnum;
	int flag_offset;
	int length;
	int max_length;
	int multiple_words = 0;
	bool multiple_numeric_hyphen = false;
	char *multiple_string = NULL;
	char *multiple_string_end = NULL;

	int len_word;
	int len_phonetic;
	bool text_not_phonemes = false; // this word specifies replacement text, not phonemes
	unsigned int wc;
	bool all_upper_case;

	char *mnemptr;
	unsigned char flag_codes[100];
	char encoded_ph[200];
	char bad_phoneme_str[4];
	int bad_phoneme;
	static const char nullstring[] = { 0 };

	phonetic = word = (char *)nullstring;

	p = linebuf;

	step = LINE_PARSER_WORD;

	c = *p;
	while (c != '\n' && c != '\0') {
		c = *p;

		if ((c == '?') && (step == LINE_PARSER_WORD)) {
			// conditional rule, allow only if the numbered condition is set for the voice
			flag_offset = 100;

			p++;
			if (*p == '!') {
				// allow only if the numbered condition is NOT set
				flag_offset = 132;
				p++;
			}

			ix = 0;
			if (IsDigit09(*p)) {
				ix += (*p-'0');
				p++;
			}
			if (IsDigit09(*p)) {
				ix = ix*10 + (*p-'0');
				p++;
			}
			flag_codes[n_flag_codes++] = ix + flag_offset;
			c = *p;
		}

		if ((c == '$') && isalnum(p[1])) {
			// read keyword parameter
			mnemptr = p;
			// stop at the end of the string as well as at white-space
			while (((c = *p) != 0) && !isspace2(c)) p++;
			*p = 0;

			flagnum = LookupMnem(mnem_flags, mnemptr);
			if (flagnum > 0) {
				if (flagnum == 200)
					ctx->text_mode = true;
				else if (flagnum == 201)
					ctx->text_mode = false;
				else if (flagnum == BITNUM_FLAG_TEXTMODE)
					text_not_phonemes = true;
				else
					flag_codes[n_flag_codes++] = flagnum;
			} else {
				fprintf(ctx->f_log, "%5d: Unknown keyword: %s\n", ctx->linenum, mnemptr);
				ctx->error_count++;
			}
		}

		if ((c == '/') && (p[1] == '/') && (multiple_words == 0))
			c = '\n'; // "//" treat comment as end of line

		switch (step)
		{
		case LINE_PARSER_WORD:
			if (c == '(') {
				multiple_words = 1;
				word = p+1;
				step = LINE_PARSER_END_OF_WORD;
			} else if (!isspace2(c)) {
				word = p;
				step = LINE_PARSER_END_OF_WORD;
			}
			break;
		case LINE_PARSER_END_OF_WORD:
			if ((c == '-') && multiple_words) {
				if (IsDigit09(word[0]))
					multiple_numeric_hyphen = true;
				flag_codes[n_flag_codes++] = BITNUM_FLAG_HYPHENATED;
				c = ' ';
			}
			if (isspace2(c)) {
				p[0] = 0; // terminate english word

				if (multiple_words) {
					multiple_string = multiple_string_end = p+1;
					step = LINE_PARSER_MULTIPLE_WORDS;
				} else
					step = LINE_PARSER_END_OF_WORDS;
			} else if (c == ')') {
				if (multiple_words) {
					p[0] = 0;
					multiple_words = 0;
					step = LINE_PARSER_END_OF_WORDS;
				} else if (word[0] != '_') {
					fprintf(ctx->f_log, "%5d: Missing '('\n", ctx->linenum);
					ctx->error_count++;
					step = LINE_PARSER_END_OF_WORDS;
				}
			}
			break;
		case LINE_PARSER_MULTIPLE_WORDS:
			if (isspace2(c))
				multiple_words++;
			else if (c == ')') {
				p[0] = ' '; // terminate extra string
				multiple_string_end = p+1;
				step = LINE_PARSER_END_OF_WORDS;
			}
			break;
		case LINE_PARSER_END_OF_WORDS:
			if (!isspace2(c)) {
				phonetic = p;
				step = LINE_PARSER_PRONUNCIATION;
			}
			break;
		case LINE_PARSER_PRONUNCIATION:
			if (isspace2(c)) {
				phonetic_end = p;
				p[0] = 0; // terminate phonetic
				step = LINE_PARSER_END_OF_PRONUNCIATION;
			}
			break;
		case LINE_PARSER_END_OF_PRONUNCIATION:
			if (!isspace2(c)) {
				// more text follows, so the pronunciation continues: restore the space
				*phonetic_end = ' ';
				step = LINE_PARSER_PRONUNCIATION;
			}
			break;
		}
		p++;
	}

	if (word[0] == 0)
		return 0; // blank line

	if (ctx->text_mode)
		text_not_phonemes = true;

	if (text_not_phonemes) {
		if (word[0] == '_') {
			if (phonetic == nullstring) {
				// No replacement text was given. 'phonetic' still points to a
				// constant empty string which must not be written to, and
				// there is nothing to translate.
				text_not_phonemes = false;
				encoded_ph[0] = 0;
			} else {
				// This is a special word, used by eSpeak.  Translate this into phonemes now
				strcat(phonetic, " "); // need a space to indicate word-boundary

				// PROBLEM  vowel reductions are not applied to the translated phonemes
				// condition rules are not applied
				TranslateWord(translator, phonetic, NULL, NULL, NULL, NULL, 0);
				text_not_phonemes = false;
				strncpy0(encoded_ph, ctx->word_phonemes, N_WORD_BYTES-4);

				if ((ctx->word_phonemes[0] == 0) && (ctx->error_need_dictionary < 3)) {
					// the dictionary was not loaded, we need a second attempt
					ctx->error_need_dictionary++;
					fprintf(ctx->f_log, "%5d: Need to compile dictionary again\n", ctx->linenum);
				}
			}
		} else
			// this is replacement text, so don't encode as phonemes. Restrict the length of the replacement word
			strncpy0(encoded_ph, phonetic, N_WORD_BYTES-4);
	} else {
		EncodePhonemes(phonetic, encoded_ph, &bad_phoneme);
		if (strchr(encoded_ph, phonSWITCH) != 0)
			flag_codes[n_flag_codes++] = BITNUM_FLAG_ONLY_S; // don't match on suffixes (except 's') when switching languages

		// check for errors in the phonemes codes
		if (bad_phoneme != 0) {
			// unrecognised phoneme, report error
			bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
			fprintf(ctx->f_log, "%5d: Bad phoneme [%s] (U+%x) in: %s  %s\n", ctx->linenum, bad_phoneme_str, bad_phoneme, word, phonetic);
			ctx->error_count++;
		}
	}

	if (text_not_phonemes != translator->langopts.textmode)
		flag_codes[n_flag_codes++] = BITNUM_FLAG_TEXTMODE;

	if (sscanf(word, "U+%x", &wc) == 1) {
		// Character code
		ix = utf8_out(wc, word);
		word[ix] = 0;
	} else if (word[0] != '_') {
		// convert to lower case, and note if the word is all-capitals
		int c2;

		all_upper_case = true;
		for (p = word;;) {
			// this assumes that the lower case char is the same length as the upper case char
			// OK, except for Turkish "I", but use towlower() rather than towlower2()
			ix = utf8_in(&c2, p);
			if (c2 == 0) break;
			if (iswupper(c2))
				utf8_out(towlower2(c2, translator), p);
			else
				all_upper_case = false;
			p += ix;
		}
		if (all_upper_case)
			flag_codes[n_flag_codes++] = BITNUM_FLAG_ALLCAPS;
	}

	len_word = strlen(word);

	if (translator->transpose_min > 0)
		len_word = TransposeAlphabet(translator, word);

	*hash = HashDictionary(word);
	len_phonetic = strlen(encoded_ph);

	dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed
	len_word &= 0x3f;

	memcpy(&dict_line[2], word, len_word);

	if (len_phonetic == 0) {
		// no phonemes specified. set bit 7
		dict_line[1] |= 0x80;
		length = len_word + 2;
	} else {
		length = len_word + len_phonetic + 3;
		if (length < n_dict_line) {
			strcpy(&dict_line[(len_word)+2], encoded_ph);
		} else {
			fprintf(ctx->f_log, "%5d: Dictionary line length would overflow the data buffer: %d\n", ctx->linenum, length);
			ctx->error_count++;
			// no phonemes specified. set bit 7
			dict_line[1] |= 0x80;
			length = len_word + 2;
		}
	}

	// The entry must fit in dict_line, and its length must fit in the single
	// length byte at dict_line[0].
	max_length = (n_dict_line < 255) ? n_dict_line : 255;

	if (length + (int)n_flag_codes > max_length) {
		fprintf(ctx->f_log, "%5d: Dictionary line length would overflow the data buffer: %d\n", ctx->linenum, length + (int)n_flag_codes);
		ctx->error_count++;
		return 0;
	}

	for (ix = 0; ix < n_flag_codes; ix++)
		dict_line[ix+length] = flag_codes[ix];
	length += n_flag_codes;

	if ((multiple_string != NULL) && (multiple_words > 0)) {
		if (multiple_words > 10) {
			fprintf(ctx->f_log, "%5d: Two many parts in a multi-word entry: %d\n", ctx->linenum, multiple_words);
			ctx->error_count++;
		} else {
			int extra = (int)(multiple_string_end - multiple_string);
			int needed = 1 + (multiple_numeric_hyphen ? 1 : 0) + extra;

			if (length + needed > max_length) {
				fprintf(ctx->f_log, "%5d: Dictionary line length would overflow the data buffer: %d\n", ctx->linenum, length + needed);
				ctx->error_count++;
				return 0;
			}

			dict_line[length++] = 80 + multiple_words;
			if (multiple_numeric_hyphen)
				dict_line[length++] = ' '; // ???
			memcpy(&dict_line[length], multiple_string, extra);
			length += extra;
		}
	}

	*((uint8_t *)dict_line) = (uint8_t)length;

	return length;
}

// Discard any previously compiled entries, leaving every hash chain empty.
static void compile_dictlist_start(CompileContext *ctx)
{
	// initialise dictionary list
	free_hash_chains(ctx);
}

// Write every hash chain to f_out. Each chain is the sequence of its entries
// (each starts with its own length byte) followed by a zero byte.
static void compile_dictlist_end(CompileContext *ctx, FILE *f_out)
{
	// Write out the compiled dictionary list
	for (int hash = 0; hash < N_HASH_DICT; hash++) {
		char *entry = ctx->hash_chains[hash];

		while (entry != NULL) {
			const char *data = entry + sizeof(char *);
			int length = *(const uint8_t *)data;

			fwrite(data, length, 1, f_out);

			// the link to the next entry is stored at the start of this one
			memcpy(&entry, entry, sizeof(char *));
		}
		fputc(0, f_out);
	}
}

// Compile one *_list style source file and add its entries to the hash chains.
//   path:      directory and dictionary-name prefix of the source files
//   filename:  base name, tried first with and then without a ".txt" extension
// Returns 0 on success, or -1 if the file can't be opened.
static int compile_dictlist_file(CompileContext *ctx, const char *path, const char *filename)
{
	int length;
	int hash;
	char *p;
	int count = 0;
	FILE *f_in;
	char buf[200];
	char fname[sizeof(path_home)+45];
	char dict_line[256]; // length is uint8_t, so an entry can't take up more than 256 bytes

	ctx->text_mode = false;

	// try with and without '.txt' extension
	// snprintf: path plus filename plus extension can be longer than fname
	snprintf(fname, sizeof(fname), "%s%s.txt", path, filename);
	if ((f_in = fopen(fname, "r")) == NULL) {
		snprintf(fname, sizeof(fname), "%s%s", path, filename);
		if ((f_in = fopen(fname, "r")) == NULL)
			return -1;
	}

	if (ctx->f_log != NULL)
		fprintf(ctx->f_log, "Compiling: '%s'\n", fname);

	ctx->linenum = 0;

	while (fgets(buf, sizeof(buf), f_in) != NULL) {
		ctx->linenum++;

		length = compile_line(ctx, buf, dict_line, sizeof(dict_line), &hash);
		if (length == 0) continue; // blank line

		p = malloc(length + sizeof(char *));
		if (p == NULL) {
			if (ctx->f_log != NULL) {
				fprintf(ctx->f_log, "Can't allocate memory\n");
				ctx->error_count++;
			}
			break;
		}

		// push the new entry onto the front of its hash chain
		memcpy(p, &ctx->hash_chains[hash], sizeof(char *));
		ctx->hash_chains[hash] = p;

		// NOTE: dict_line[0] is the entry length (0-255)
		memcpy(p+sizeof(char *), dict_line, length);
		count++;
	}

	if (ctx->f_log != NULL)
		fprintf(ctx->f_log, "\t%d entries\n", count);
	fclose(f_in);
	return 0;
}

#define N_RULES 3000 // max rules for each group

// Return the value (0 to 15) of a hexadecimal digit character, or -1 if c is
// not a hexadecimal digit.
static int isHexDigit(int c)
{
	if ((c >= '0') && (c <= '9'))
		return c - '0';
	if ((c >= 'a') && (c <= 'f'))
		return c - 'a' + 10;
	if ((c >= 'A') && (c <= 'F'))
		return c - 'A' + 10;
	return -1;
}

// Translate one white-space delimited section of a rule into its compiled
// form and store it in the rule buffer selected by *state_out.
//   string:     the source text of the section
//   state_out:  on entry, the section which 'string' belongs to;
//               on return, the section which the next string belongs to
// A section which doesn't fit in its buffer is reported and stored as empty.
static void copy_rule_string(CompileContext *ctx, char *string, int *state_out)
{
	// state 0: conditional, 1=pre, 2=match, 3=post, 4=phonemes
	char * const outbuf[5] = { ctx->rule_cond, ctx->rule_pre, ctx->rule_match, ctx->rule_post, ctx->rule_phonemes };
	static const int next_state[5] = { 2, 2, 4, 4, 4 };
	char *output;
	char *p;
	int ix;
	int len;
	char c;
	int c2, c3;
	int sxflags;
	int value;
	bool literal;
	bool hexdigit_input = false;
	bool overflow = false;
	int state = *state_out;
	const MNEM_TAB *mr;
	size_t room = sizeof(ctx->rule_phonemes); // all five rule buffers have this size
	char scratch[sizeof(ctx->rule_phonemes) + 4]; // one loop pass appends at most 4 bytes

	if (string[0] == 0) return;

	output = outbuf[state];
	if (state == 4) {
		// append to any previous phoneme string, i.e. allow spaces in the phoneme string
		len = strlen(ctx->rule_phonemes);
		if ((len > 0) && ((size_t)len + 1 < sizeof(ctx->rule_phonemes)))
			ctx->rule_phonemes[len++] = ' ';
		output = &ctx->rule_phonemes[len];
		room -= len;
	}
	sxflags = 0x808000; // to ensure non-zero bytes

	// The section is built in 'scratch' and copied to 'output' only if it fits.
	for (p = string, ix = 0;;) {
		if ((size_t)ix >= room) {
			overflow = true;
			break;
		}

		literal = false;
		c = *p++;
		if ((c == '0') && (p[0] == 'x') && (isHexDigit(p[1]) >= 0) && (isHexDigit(p[2]) >= 0)) {
			hexdigit_input = true;
			c = p[1];
			p += 2;
		}
		if (c == '\\') {
			c = *p++; // treat next character literally
			if ((c >= '0') && (c <= '3') && (p[0] >= '0') && (p[0] <= '7') && (p[1] >= '0') && (p[1] <= '7')) {
				// character code given by 3 digit octal value;
				c = (c-'0')*64 + (p[0]-'0')*8 + (p[1]-'0');
				p += 2;
			}
			literal = true;
		}
		if (hexdigit_input) {
			if (((c2 = isHexDigit(c)) >= 0) && ((c3 = isHexDigit(p[0])) >= 0)) {
				c = c2 * 16 + c3;
				literal = true;
				p++;
			} else
				hexdigit_input = false;
		}
		if ((state == 1) || (state == 3)) {
			// replace special characters (note: 'E' is reserved for a replaced silent 'e')
			if (literal == false) {
				static const char lettergp_letters[9] = { LETTERGP_A, LETTERGP_B, LETTERGP_C, 0, 0, LETTERGP_F, LETTERGP_G, LETTERGP_H, LETTERGP_Y };
				switch (c)
				{
				case '_':
					c = RULE_SPACE;
					break;

				case 'Y':
					c = 'I';
					// fallthrough:
				case 'A': // vowel
				case 'B':
				case 'C':
				case 'H':
				case 'F':
				case 'G':
					if (state == 1) {
						// pre-rule, put the number before the RULE_LETTERGP;
						scratch[ix++] = lettergp_letters[c-'A'] + 'A';
						c = RULE_LETTERGP;
					} else {
						scratch[ix++] = RULE_LETTERGP;
						c = lettergp_letters[c-'A'] + 'A';
					}
					break;
				case 'D':
					c = RULE_DIGIT;
					break;
				case 'K':
					c = RULE_NOTVOWEL;
					break;
				case 'N':
					c = RULE_NO_SUFFIX;
					break;
				case 'V':
					c = RULE_IFVERB;
					break;
				case 'Z':
					c = RULE_NONALPHA;
					break;
				case '+':
					c = RULE_INC_SCORE;
					break;
				case '<': // Can't use - as opposite for + because it is used literally as part of word
					c = RULE_DEC_SCORE;
					break;
				case '@':
					c = RULE_SYLLABLE;
					break;
				case '&':
					c = RULE_STRESSED;
					break;
				case '%':
					c = RULE_DOUBLE;
					break;
				case '#':
					c = RULE_DEL_FWD;
					break;
				case '!':
					c = RULE_CAPITAL;
					break;
				case 'T':
					scratch[ix++] = RULE_DOLLAR;
					c = 0x11;
					break;
				case 'W':
					c = RULE_SPELLING;
					break;
				case 'X':
					c = RULE_NOVOWELS;
					break;
				case 'J':
					c = RULE_SKIPCHARS;
					break;
				case 'L':
					// expect two digits. Don't step past the end of the string
					// if they are missing
					c = *p - '0';
					if (*p != 0) p++;
					value = *p - '0';
					if (*p != 0) p++;
					c = c * 10 + value;
					if ((value < 0) || (value > 9)) {
						c = 0;
						fprintf(ctx->f_log, "%5d: Expected 2 digits after 'L'\n", ctx->linenum);
						ctx->error_count++;
					} else if ((c <= 0) || (c >= N_LETTER_GROUPS) || (ctx->letterGroupsDefined[(int)c] == 0)) {
						fprintf(ctx->f_log, "%5d: Letter group L%.2d not defined\n", ctx->linenum, c);
						ctx->error_count++;
					}
					c += 'A';
					if (state == 1) {
						// pre-rule, put the group number before the RULE_LETTERGP command
						scratch[ix++] = c;
						c = RULE_LETTERGP2;
					} else
						scratch[ix++] = RULE_LETTERGP2;
					break;
				case '$':
					value = 0;
					mr = mnem_rules;
					while (mr->mnem != NULL) {
						len = strlen(mr->mnem);
						if (strncmp(p, mr->mnem, len) == 0) {
							value = mr->value;
							p += len;
							break;
						}
						mr++;
					}

					if (state == 1) {
						// pre-rule, put the number before the RULE_DOLLAR
						scratch[ix++] = value;
						c = RULE_DOLLAR;
					} else {
						scratch[ix++] = RULE_DOLLAR;
						c = value;
					}

					if (value == 0) {
						fprintf(ctx->f_log, "%5d: $ command not recognized\n", ctx->linenum);
						ctx->error_count++;
					}
					break;
				case 'P': // Prefix
					sxflags |= SUFX_P;
					// fallthrough
				case 'S': // Suffix
					scratch[ix++] = RULE_ENDING;
					value = 0;
					while (!isspace2(c = *p++) && (c != 0)) {
						switch (c)
						{
						case 'e':
							sxflags |= SUFX_E;
							break;
						case 'i':
							sxflags |= SUFX_I;
							break;
						case 'p': // obsolete, replaced by 'P' above
							sxflags |= SUFX_P;
							break;
						case 'v':
							sxflags |= SUFX_V;
							break;
						case 'd':
							sxflags |= SUFX_D;
							break;
						case 'f':
							sxflags |= SUFX_F;
							break;
						case 'q':
							sxflags |= SUFX_Q;
							break;
						case 't':
							sxflags |= SUFX_T;
							break;
						case 'b':
							sxflags |= SUFX_B;
							break;
						case 'a':
							sxflags |= SUFX_A;
							break;
						case 'm':
							sxflags |= SUFX_M;
							break;
						default:
							if (IsDigit09(c))
								value = (value*10) + (c - '0');
							break;
						}
					}
					p--;
					scratch[ix++] = sxflags >> 16;
					scratch[ix++] = sxflags >> 8;
					c = value | 0x80;
					break;
				}
			}
		}
		scratch[ix++] = c;
		if (c == 0) break;
	}

	if (overflow || ((size_t)ix > room)) {
		fprintf(ctx->f_log, "%5d: Rule section is too long (max %d bytes)\n", ctx->linenum, (int)sizeof(ctx->rule_phonemes) - 1);
		ctx->error_count++;
		output[0] = 0;
	} else
		memcpy(output, scratch, ix);

	*state_out = next_state[state];
}

// Compile one line of a rule group.
//   input:  the source line
// Returns a newly allocated compiled rule which the caller must free(), or
// NULL if the line contains no rule or memory can't be allocated.
// The compiled rule is the phoneme string and its terminating zero, followed
// by the match string and any line number, condition, pre and post sections,
// and a final zero.
static char *compile_rule(CompileContext *ctx, char *input)
{
	int ix;
	unsigned char c;
	int wc;
	char *p;
	char *prule;
	int len;
	int len_name;
	int start;
	int state = 2;
	bool finish = false;
	bool too_long = false;
	char buf[80];
	char * const buf_end = &buf[sizeof(buf) - 1]; // keeps room for the terminator
	char output[400]; // enough for all five 80 byte rule sections plus their separators
	int bad_phoneme;
	char bad_phoneme_str[4];

	buf[0] = 0;
	ctx->rule_cond[0] = 0;
	ctx->rule_pre[0] = 0;
	ctx->rule_post[0] = 0;
	ctx->rule_match[0] = 0;
	ctx->rule_phonemes[0] = 0;

	p = buf;

	for (ix = 0; finish == false; ix++) {
		switch (c = input[ix])
		{
		case ')': // end of prefix section
			*p = 0;
			state = 1;
			copy_rule_string(ctx, buf, &state);
			p = buf;
			break;
		case '(': // start of suffix section
			*p = 0;
			state = 2;
			copy_rule_string(ctx, buf, &state);
			state = 3;
			p = buf;
			if (input[ix+1] == ' ') {
				fprintf(ctx->f_log, "%5d: Syntax error. Space after (, or negative score for previous rule\n", ctx->linenum);
				ctx->error_count++;
			}
			break;
		case '\n': // end of line
		case '\r':
		case 0: // end of line
			*p = 0;
			copy_rule_string(ctx, buf, &state);
			finish = true;
			break;
		case '\t': // end of section section
		case ' ':
			*p = 0;
			copy_rule_string(ctx, buf, &state);
			p = buf;
			break;
		case '?':
			if (state == 2)
				state = 0;
			else if (p < buf_end)
				*p++ = c;
			else
				too_long = true;
			break;
		default:
			if (p < buf_end)
				*p++ = c;
			else
				too_long = true;
			break;
		}
	}

	if (too_long) {
		fprintf(ctx->f_log, "%5d: Rule section is too long (max %d bytes)\n", ctx->linenum, (int)sizeof(buf) - 1);
		ctx->error_count++;
	}

	if (strcmp(ctx->rule_match, "$group") == 0)
		strcpy(ctx->rule_match, ctx->group_name);

	if (ctx->rule_match[0] == 0) {
		if (ctx->rule_post[0] != 0) {
			fprintf(ctx->f_log, "%5d: Syntax error\n", ctx->linenum);
			ctx->error_count++;
		}
		return NULL;
	}

	EncodePhonemes(ctx->rule_phonemes, buf, &bad_phoneme);
	if (bad_phoneme != 0) {
		bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
		fprintf(ctx->f_log, "%5d: Bad phoneme [%s] (U+%x) in: %s\n", ctx->linenum, bad_phoneme_str, bad_phoneme, input);
		ctx->error_count++;
	}
	strcpy(output, buf);
	len = strlen(buf)+1;

	len_name = strlen(ctx->group_name);
	if ((len_name > 0) && (memcmp(ctx->rule_match, ctx->group_name, len_name) != 0)) {
		utf8_in(&wc, ctx->rule_match);
		if ((ctx->group_name[0] == '9') && IsDigit(wc)) {
			// numeric group, rule_match starts with a digit, so OK
		} else {
			fprintf(ctx->f_log, "%5d: Wrong initial letters '%s' for group '%s'\n", ctx->linenum, ctx->rule_match, ctx->group_name);
			ctx->error_count++;
		}
	}
	strcpy(&output[len], ctx->rule_match);
	len += strlen(ctx->rule_match);

	if (ctx->debug_flag) {
		// two bytes, each offset by 1 so that neither is zero
		output[len] = RULE_LINENUM;
		output[len+1] = (ctx->linenum % 255) + 1;
		output[len+2] = (ctx->linenum / 255) + 1;
		len += 3;
	}

	if (ctx->rule_cond[0] != 0) {
		if (ctx->rule_cond[0] == '!') {
			// allow the rule only if the condition number is NOT set for the voice
			ix = atoi(&ctx->rule_cond[1]) + 32;
		} else {
			// allow the rule only if the condition number is set for the voice
			ix = atoi(ctx->rule_cond);
		}

		if ((ix > 0) && (ix < 255)) {
			output[len++] = RULE_CONDITION;
			output[len++] = ix;
		} else {
			fprintf(ctx->f_log, "%5d: bad condition number ?%d\n", ctx->linenum, ix);
			ctx->error_count++;
		}
	}
	if (ctx->rule_pre[0] != 0) {
		start = 0;
		if (ctx->rule_pre[0] == RULE_SPACE) {
			// omit '_' at the beginning of the pre-string and imply it by using RULE_PRE_ATSTART
			c = RULE_PRE_ATSTART;
			start = 1;
		} else
			c = RULE_PRE;
		output[len++] = c;
		// output PRE string in reverse order
		for (ix = strlen(ctx->rule_pre)-1; ix >= start; ix--)
			output[len++] = ctx->rule_pre[ix];
	}

	if (ctx->rule_post[0] != 0) {
		sprintf(&output[len], "%c%s", RULE_POST, ctx->rule_post);
		len += (strlen(ctx->rule_post)+1);
	}
	output[len++] = 0;
	if ((prule = malloc(len)) != NULL)
		memcpy(prule, output, len);
	return prule;
}

// qsort() comparison for compiled rules: order by the phoneme string, then by
// the match string which follows it.
static int __cdecl string_sorter(char **a, char **b)
{
	char *pa = *a;
	char *pb = *b;
	int ix = strcmp(pa, pb);

	if (ix != 0)
		return ix;

	pa += (strlen(pa)+1);
	pb += (strlen(pb)+1);
	return strcmp(pa, pb);
}

// qsort() comparison for rule groups.
static int __cdecl rgroup_sorter(RGROUP *a, RGROUP *b)
{
	// Sort long names before short names
	int ix;

	ix = (int)strlen(b->name) - (int)strlen(a->name);
	if (ix != 0) return ix;
	ix = strcmp(a->name, b->name);
	if (ix != 0) return ix;
	return a->index-b->index;
}

// Resize 'old' to 'size' bytes. If that fails, 'old' is freed and NULL is
// returned, so the caller only has to give up.
static char *grow_rule_output(char *old, size_t size)
{
	char *resized = realloc(old, size);
	if (resized == NULL)
		free(old);
	return resized;
}

// Build the compiled data for one rule group.
//   n_rules, rules:  the compiled rules of the group; they are sorted in place
//   name:     the group name, which is removed from the start of each match string
//   outsize:  if not NULL, receives the number of bytes in the result
// Returns newly allocated data which the caller must free(). Returns NULL,
// with *outsize set to 0, if there are no rules or memory can't be allocated.
static void *output_rule_group(int n_rules, char **rules, char *name, size_t *outsize)
{
	int ix;
	int len1;
	int len2;
	int len_name;
	int len_match;
	char *p;
	char *p2, *p3;
	const char *common;
	char *outptr = NULL;
	size_t outpos, outlen = 0;

	len_name = strlen(name);

	// sort the rules in this group by their phoneme string
	common = "";
	qsort((void *)rules, n_rules, sizeof(char *), (int(__cdecl *)(const void *, const void *))string_sorter);

	if (strcmp(name, "9") == 0)
		len_name = 0; // don't remove characters from numeric match strings

	for (ix = 0; ix < n_rules; ix++) {
		p = rules[ix];
		len1 = strlen(p) + 1; // phoneme string
		p3 = &p[len1];

		// remove group name from start of match string. A match string with
		// the wrong initial letters may be shorter than the group name, so
		// don't skip beyond its end
		len_match = strlen(p3);
		p2 = p3 + ((len_match < len_name) ? len_match : len_name);
		len2 = strlen(p2);

		outpos = outlen;
		if ((common[0] != 0) && (strcmp(p, common) == 0)) {
			// same phoneme string as the previous rule: only the match string is written
			outlen += len2 + 1;
			if ((outptr = grow_rule_output(outptr, outlen)) == NULL)
				goto fail;
			memmove(outptr + outpos, p2, len2);
			outptr[outlen-1] = 0;
		} else {
			bool starts_common = (ix < n_rules-1) && (strcmp(p, rules[ix+1]) == 0);

			outlen += len2 + 1 + len1;
			if (starts_common)
				outlen++;
			if ((outptr = grow_rule_output(outptr, outlen)) == NULL)
				goto fail;

			if (starts_common) {
				common = rules[ix]; // phoneme string is same as next, set as common
				outptr[outpos++] = RULE_PH_COMMON;
			}
			memmove(outptr + outpos, p2, len2);
			outpos += len2;
			outptr[outpos++] = RULE_PHONEMES;
			memmove(outptr + outpos, p, len1);
		}
	}
	if (outsize)
		*outsize = outlen;
	return outptr;

fail:
	if (outsize)
		*outsize = 0;
	return NULL;
}

#define N_LETTERGP_ITEMS 200

// Compile a ".L" letter group definition and write it to f_out.
//   input:  the text after ".L": a two digit group number, then the items
// Returns 0 on success, or 1 if the group number is not valid.
static int compile_lettergroup(CompileContext *ctx, char *input, FILE *f_out)
{
	char *p;
	char *p_start;
	int group;
	int ix;
	int n_items;
	int length;
	int max_length = 0;

	char *items[N_LETTERGP_ITEMS];
	int item_length[N_LETTERGP_ITEMS]; // an item can be longer than a char can hold

	p = input;
	if (!IsDigit09(p[0]) || !IsDigit09(p[1])) {
		fprintf(ctx->f_log, "%5d: Expected 2 digits after '.L'\n", ctx->linenum);
		ctx->error_count++;
		return 1;
	}

	group = atoi(&p[0]);
	if (group >= N_LETTER_GROUPS) {
		fprintf(ctx->f_log, "%5d: lettergroup out of range (01-%.2d)\n", ctx->linenum, N_LETTER_GROUPS-1);
		ctx->error_count++;
		return 1;
	}

	// skip the group number, but stop at the end of the line
	while ((*p != 0) && !isspace2(*p)) p++;

	fputc(RULE_GROUP_START, f_out);
	fputc(RULE_LETTERGP2, f_out);
	fputc(group + 'A', f_out);
	if (ctx->letterGroupsDefined[group] != 0) {
		fprintf(ctx->f_log, "%5d: lettergroup L%.2d is already defined\n", ctx->linenum, group);
		ctx->error_count++;
	}
	ctx->letterGroupsDefined[group] = 1;

	n_items = 0;
	while (n_items < N_LETTERGP_ITEMS) {
		while (isspace2(*p)) p++;
		if (*p == 0)
			break;

		items[n_items] = p_start = p;
		while ((*p & 0xff) > ' ') {
			if (*p == '_') *p = ' '; // allow '_' for word break
			p++;
		}
		length = (p - p_start) + 1; // includes the terminator
		if (*p != 0)
			*p++ = 0; // don't step past the end of the line
		if (length > max_length)
			max_length = length;
		item_length[n_items++] = length;
	}

	// write out the items, longest first
	while (max_length > 1) {
		for (ix = 0; ix < n_items; ix++) {
			if (item_length[ix] == max_length)
				fwrite(items[ix], 1, max_length, f_out);
		}
		max_length--;
	}

	fputc(RULE_GROUP_END, f_out);

	return 0;
}

// Free the first n_rules compiled rules and clear their pointers.
static void free_rules(char **rules, int n_rules)
{
	for (int i = 0; i < n_rules; i++) {
		free(rules[i]);
		rules[i] = NULL;
	}
}

// Compile a *_rules file.
//   f_in:   the rules source file
//   f_out:  the dictionary file; letter groups and character replacements
//           are written as they are read, the rule groups are written at the end
// Errors are reported to ctx->f_log and counted in ctx->error_count.
// Always returns ENS_OK.
static espeak_ng_STATUS compile_dictrules(CompileContext *ctx, FILE *f_in, FILE *f_out)
{
	char *prule;
	unsigned char *p;
	int ix;
	int c;
	int gp;
	int n_rules = 0;
	int count = 0;
	int wc;
	int err_n_rules = 0;
	const char *prev_rgroup_name;
	unsigned int char_code;
	int compile_mode = 0;
	char *buf;
	char buf1[500];
	char *rules[N_RULES];

	int n_rgroups = 0;
	int n_groups3 = 0;
	RGROUP rgroup[N_RULE_GROUP2];

	ctx->linenum = 0;
	ctx->group_name[0] = 0;

	for (;;) {
		ctx->linenum++;
		buf = fgets(buf1, sizeof(buf1), f_in);
		if (buf != NULL) {
			if ((p = (unsigned char *)strstr(buf, "//")) != NULL)
				*p = 0;

			if (buf[0] == '\r') buf++; // ignore extra \r in \r\n
		}

		if ((buf == NULL) || (buf[0] == '.')) {
			// next .group or end of file, write out the previous group

			if (n_rules > 0) {
				if (n_rgroups < N_RULE_GROUP2) {
					strcpy(rgroup[n_rgroups].name, ctx->group_name);
					rgroup[n_rgroups].group3_ix = ctx->group3_ix;
					rgroup[n_rgroups].start = output_rule_group(n_rules, rules, ctx->group_name, &rgroup[n_rgroups].length);
					rgroup[n_rgroups].index = n_rgroups;
					n_rgroups++;

					count += n_rules;
				} else {
					// rgroup[] is full: report it rather than write beyond the array
					fprintf(ctx->f_log, "%5d: Too many rule groups (max %d)\n", ctx->linenum, N_RULE_GROUP2);
					ctx->error_count++;
				}
				free_rules(rules, n_rules);
			}
			n_rules = 0;
			err_n_rules = 0;

			if (compile_mode == 2) {
				// end of the character replacements section
				fwrite(&n_rules, 1, 4, f_out); // write a zero word to terminate the replacement list
				fputc(RULE_GROUP_END, f_out);
				compile_mode = 0;
			}

			if (buf == NULL) break; // end of file

			if (memcmp(buf, ".L", 2) == 0) {
				compile_lettergroup(ctx, &buf[2], f_out);
				continue;
			}

			if (memcmp(buf, ".replace", 8) == 0) {
				compile_mode = 2;
				fputc(RULE_GROUP_START, f_out);
				fputc(RULE_REPLACEMENTS, f_out);

				// advance to next word boundary
				while ((ftell(f_out) & 3) != 0)
					fputc(0, f_out);
			}

			if (memcmp(buf, ".group", 6) == 0) {
				compile_mode = 1;

				p = (unsigned char *)&buf[6];
				while ((p[0] == ' ') || (p[0] == '\t')) p++; // Note: Windows isspace(0xe1) gives TRUE !
				ix = 0;
				while ((*p > ' ') && (ix < LEN_GROUP_NAME))
					ctx->group_name[ix++] = *p++;
				ctx->group_name[ix] = 0;
				ctx->group3_ix = 0;

				if (sscanf(ctx->group_name, "0x%x", &char_code) == 1) {
					// group character is given as a character code (max 16 bits)
					p = (unsigned char *)ctx->group_name;

					if (char_code > 0x100)
						*p++ = (char_code >> 8);
					*p++ = char_code;
					*p = 0;
				} else {
					if (translator->letter_bits_offset > 0) {
						utf8_in(&wc, ctx->group_name);
						if (((ix = (wc - translator->letter_bits_offset)) >= 0) && (ix < 128))
							ctx->group3_ix = ix+1; // not zero
					}
				}

				if ((ctx->group3_ix == 0) && (strlen(ctx->group_name) > 2)) {
					if (utf8_in(&c, ctx->group_name) < 2) {
						fprintf(ctx->f_log, "%5d: Group name longer than 2 bytes (UTF8)", ctx->linenum);
						ctx->error_count++;
					}

					ctx->group_name[2] = 0;
				}
			}

			continue;
		}

		switch (compile_mode)
		{
		case 1: // .group
			prule = compile_rule(ctx, buf);
			if (prule != NULL) {
				if (n_rules < N_RULES)
					rules[n_rules++] = prule;
				else {
					if (err_n_rules == 0) {
						fprintf(stderr, "\nExceeded limit of rules (%d) in group '%s'\n", N_RULES, ctx->group_name);
						ctx->error_count++;
						err_n_rules = 1;
					}
				}
			}
			break;
		case 2: // .replace
			// two white-space separated strings, each written with a terminating zero
			p = (unsigned char *)buf;

			while (isspace2(*p)) p++;

			if ((unsigned char)(*p) > 0x20) {
				while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
					fputc(*p, f_out);
					p++;
				}
				fputc(0, f_out);

				while (isspace2(*p)) p++;
				while ((unsigned char)(*p) > 0x20) {
					fputc(*p, f_out);
					p++;
				}
				fputc(0, f_out);
			}
			break;
		}
	}

	qsort((void *)rgroup, n_rgroups, sizeof(rgroup[0]), (int(__cdecl *)(const void *, const void *))rgroup_sorter);

	prev_rgroup_name = "\n";

	for (gp = 0; gp < n_rgroups; gp++) {
		if (strcmp(rgroup[gp].name, prev_rgroup_name) != 0) {
			// not the same as the previous group
			if (gp > 0)
				fputc(RULE_GROUP_END, f_out);
			fputc(RULE_GROUP_START, f_out);

			if (rgroup[gp].group3_ix != 0) {
				n_groups3++;
				fputc(1, f_out);
				fputc(rgroup[gp].group3_ix, f_out);
			} else
				fprintf(f_out, "%s", prev_rgroup_name = rgroup[gp].name);

			fputc(0, f_out);
		}

		// start is NULL if the memory for the group couldn't be allocated
		if (rgroup[gp].start != NULL)
			fwrite(rgroup[gp].start, rgroup[gp].length, 1, f_out);
	}
	fputc(RULE_GROUP_END, f_out);
	fputc(0, f_out);

	fprintf(ctx->f_log, "\t%d rules, %d groups (%d)\n\n", count, n_rgroups, n_groups3);

	free_rules(rules, n_rules);

	for (gp = 0; gp < n_rgroups; gp++) {
		free(rgroup[gp].start);
	}

	return ENS_OK;
}

#pragma GCC visibility push(default)

ESPEAK_NG_API espeak_ng_STATUS espeak_ng_CompileDictionary(const char *dsource, const char *dict_name, FILE *log, int flags, espeak_ng_ERROR_CONTEXT *context)
{
	if (!log) log = stderr;
	if (!dict_name) dict_name = dictionary_name;

	// fname:  space to write the filename in case of error
	// flags: bit 0:  include source line number information, for debug purposes.
FILE *f_in; FILE *f_out; int offset_rules = 0; int value; char fname_in[sizeof(path_home)+45]; char fname_out[sizeof(path_home)+15]; char path[sizeof(path_home)+40]; // path_dsource+20 CompileContext *ctx = calloc(1, sizeof(CompileContext)); ctx->error_count = 0; ctx->error_need_dictionary = 0; memset(ctx->letterGroupsDefined, 0, sizeof(ctx->letterGroupsDefined)); ctx->debug_flag = flags & 1; if (dsource == NULL) dsource = ""; ctx->f_log = log; if (ctx->f_log == NULL) ctx->f_log = stderr; // try with and without '.txt' extension sprintf(path, "%s%s_", dsource, dict_name); sprintf(fname_in, "%srules.txt", path); if ((f_in = fopen(fname_in, "r")) == NULL) { sprintf(fname_in, "%srules", path); if ((f_in = fopen(fname_in, "r")) == NULL) { clean_context(ctx); return create_file_error_context(context, errno, fname_in); } } sprintf(fname_out, "%s%c%s_dict", path_home, PATHSEP, dict_name); if ((f_out = fopen(fname_out, "wb+")) == NULL) { int error = errno; fclose(f_in); clean_context(ctx); return create_file_error_context(context, error, fname_out); } value = N_HASH_DICT; Write4Bytes(f_out, value); Write4Bytes(f_out, offset_rules); compile_dictlist_start(ctx); fprintf(ctx->f_log, "Using phonemetable: '%s'\n", phoneme_tab_list[phoneme_tab_number].name); compile_dictlist_file(ctx, path, "roots"); if (translator->langopts.listx) { compile_dictlist_file(ctx, path, "list"); compile_dictlist_file(ctx, path, "listx"); } else { compile_dictlist_file(ctx, path, "listx"); compile_dictlist_file(ctx, path, "list"); } compile_dictlist_file(ctx, path, "emoji"); compile_dictlist_file(ctx, path, "extra"); compile_dictlist_end(ctx, f_out); offset_rules = ftell(f_out); fprintf(ctx->f_log, "Compiling: '%s'\n", fname_in); espeak_ng_STATUS status = compile_dictrules(ctx, f_in, f_out); fclose(f_in); fseek(f_out, 4, SEEK_SET); Write4Bytes(f_out, offset_rules); fclose(f_out); fflush(ctx->f_log); if (status != ENS_OK) { clean_context(ctx); return status; } LoadDictionary(translator, dict_name, 0); 
status = ctx->error_count > 0 ? ENS_COMPILE_ERROR : ENS_OK; clean_context(ctx); return status; } #pragma GCC visibility pop