eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

compiledict.c 40KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980
  1. /***************************************************************************
  2. * Copyright (C) 2005 to 2014 by Jonathan Duddington *
  3. * email: [email protected] *
  4. * Copyright (C) 2015 Reece H. Dunn *
  5. * *
  6. * This program is free software; you can redistribute it and/or modify *
  7. * it under the terms of the GNU General Public License as published by *
  8. * the Free Software Foundation; either version 3 of the License, or *
  9. * (at your option) any later version. *
  10. * *
  11. * This program is distributed in the hope that it will be useful, *
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  14. * GNU General Public License for more details. *
  15. * *
  16. * You should have received a copy of the GNU General Public License *
  17. * along with this program; if not, write see: *
  18. * <http://www.gnu.org/licenses/>. *
  19. ***************************************************************************/
  20. #include <stdio.h>
  21. #include <ctype.h>
  22. #include <stdlib.h>
  23. #include <string.h>
  24. #include <wctype.h>
  25. #include "speak_lib.h"
  26. #include "speech.h"
  27. #include "phoneme.h"
  28. #include "synthesize.h"
  29. #include "translate.h"
  30. extern void Write4Bytes(FILE *f, int value);
  31. int HashDictionary(const char *string);
  32. static FILE *f_log = NULL;
  33. extern char *dir_dictionary;
  34. extern char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes
  35. static int linenum;
  36. static int error_count;
  37. static int text_mode = 0;
  38. static int debug_flag = 0;
  39. static int error_need_dictionary = 0;
  40. static int hash_counts[N_HASH_DICT];
  41. static char *hash_chains[N_HASH_DICT];
  42. static char letterGroupsDefined[N_LETTER_GROUPS];
  43. MNEM_TAB mnem_rules[] = {
  44. {"unpr", DOLLAR_UNPR},
  45. {"noprefix", DOLLAR_NOPREFIX}, // rule fails if a prefix has been removed
  46. {"list", DOLLAR_LIST}, // a pronunciation is given in the *_list file
  47. {"w_alt1", 0x11},
  48. {"w_alt2", 0x12},
  49. {"w_alt3", 0x13},
  50. {"w_alt4", 0x14},
  51. {"w_alt5", 0x15},
  52. {"w_alt6", 0x16},
  53. {"w_alt", 0x11}, // note: put longer names before their sub-strings
  54. {"p_alt1", 0x21},
  55. {"p_alt2", 0x22},
  56. {"p_alt3", 0x23},
  57. {"p_alt4", 0x24},
  58. {"p_alt5", 0x25},
  59. {"p_alt6", 0x26},
  60. {"p_alt", 0x21},
  61. {NULL, -1}
  62. };
  63. MNEM_TAB mnem_flags[] = {
  64. // these in the first group put a value in bits0-3 of dictionary_flags
  65. {"$1", 0x41}, // stress on 1st syllable
  66. {"$2", 0x42}, // stress on 2nd syllable
  67. {"$3", 0x43},
  68. {"$4", 0x44},
  69. {"$5", 0x45},
  70. {"$6", 0x46},
  71. {"$7", 0x47},
  72. {"$u", 0x48}, // reduce to unstressed
  73. {"$u1", 0x49},
  74. {"$u2", 0x4a},
  75. {"$u3", 0x4b},
  76. {"$u+", 0x4c}, // reduce to unstressed, but stress at end of clause
  77. {"$u1+", 0x4d},
  78. {"$u2+", 0x4e},
  79. {"$u3+", 0x4f},
  80. // these set the corresponding numbered bit if dictionary_flags
  81. {"$pause", 8}, // ensure pause before this word
  82. {"$strend", 9}, // full stress if at end of clause
  83. {"$strend2", 10}, // full stress if at end of clause, or only followed by unstressed
  84. {"$unstressend",11}, // reduce stress at end of clause
  85. {"$accent_before",12}, // used with accent names, say this accent name before the letter name
  86. {"$abbrev", 13}, // use this pronuciation rather than split into letters
  87. // language specific
  88. {"$double", 14}, // IT double the initial consonant of next word
  89. {"$alt", 15}, // use alternative pronunciation
  90. {"$alt1", 15}, // synonym for $alt
  91. {"$alt2", 16},
  92. {"$alt3", 17},
  93. {"$alt4", 18},
  94. {"$alt5", 19},
  95. {"$alt6", 20},
  96. {"$alt7", 21},
  97. {"$combine", 23}, // Combine with the next word
  98. {"$dot", 24}, // ignore '.' after this word (abbreviation)
  99. {"$hasdot", 25}, // use this pronunciation if there is a dot after the word
  100. {"$max3", 27}, // limit to 3 repetitions
  101. {"$brk", 28}, // a shorter $pause
  102. {"$text", 29}, // word translates to replcement text, not phonemes
  103. // flags in dictionary word 2
  104. {"$verbf", 0x20}, // verb follows
  105. {"$verbsf", 0x21}, // verb follows, allow -s suffix
  106. {"$nounf", 0x22}, // noun follows
  107. {"$pastf", 0x23}, // past tense follows
  108. {"$verb", 0x24}, // use this pronunciation when its a verb
  109. {"$noun", 0x25}, // use this pronunciation when its a noun
  110. {"$past", 0x26}, // use this pronunciation when its past tense
  111. {"$verbextend",0x28}, // extend influence of 'verb follows'
  112. {"$capital", 0x29}, // use this pronunciation if initial letter is upper case
  113. {"$allcaps", 0x2a}, // use this pronunciation if initial letter is upper case
  114. {"$accent", 0x2b}, // character name is base-character name + accent name
  115. {"$sentence",0x2d}, // only if this clause is a sentence (i.e. terminator is {. ? !} not {, ; :}
  116. {"$only", 0x2e}, // only match on this word without suffix
  117. {"$onlys", 0x2f}, // only match with none, or with 's' suffix
  118. {"$stem", 0x30}, // must have a suffix
  119. {"$atend", 0x31}, // use this pronunciation if at end of clause
  120. {"$atstart", 0x32}, // use this pronunciation at start of clause
  121. {"$native", 0x33}, // not if we've switched translators
  122. // doesn't set dictionary_flags
  123. {"$?", 100}, // conditional rule, followed by byte giving the condition number
  124. {"$textmode", 200},
  125. {"$phonememode", 201},
  126. {NULL, -1}
  127. };
  #define LEN_GROUP_NAME 12   /* longest rule-group name, excluding the terminating NUL */

  /* One rule group. The roles of start/length/group3_ix are not visible in this
     part of the file; NOTE(review): confirm against the code that fills them in. */
  typedef struct {
      char         name[LEN_GROUP_NAME + 1];   /* group name, NUL terminated */
      unsigned int start;
      unsigned int length;
      int          group3_ix;
  } RGROUP;
  /*
   * Whitespace test used instead of isspace(), because on Windows
   * isspace(0xe1) gives TRUE.
   *
   * Returns 1 when c is at most ' ' and its low byte is non-zero, otherwise 0.
   * In particular NUL is NOT treated as a space.
   */
  int isspace2(unsigned int c)
  {
      if ((c & 0xff) == 0)
          return 0;

      return (c <= ' ') ? 1 : 0;
  }
  143. static FILE *fopen_log(const char *fname,const char *access)
  144. {
  145. // performs fopen, but produces error message to f_log if it fails
  146. FILE *f;
  147. if((f = fopen(fname,access)) == NULL)
  148. {
  149. if(f_log != NULL)
  150. fprintf(f_log,"Can't access (%s) file '%s'\n",access,fname);
  151. }
  152. return(f);
  153. }
  154. /* Lookup a mnemonic string in a table, return its name */
  155. const char *LookupMnemName(MNEM_TAB *table, const int value)
  156. {
  157. while(table->mnem != NULL)
  158. {
  159. if(table->value==value)
  160. return(table->mnem);
  161. table++;
  162. }
  163. return(""); /* not found */
  164. }
  165. void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len)
  166. {
  167. int stress;
  168. int ix;
  169. const char *name;
  170. int len;
  171. int total = 0;
  172. buf[0] = 0;
  173. if((stress = flags[0] & 0xf) != 0)
  174. {
  175. sprintf(buf, "%s", LookupMnemName(mnem_flags, stress + 0x40));
  176. total = strlen(buf);
  177. buf += total;
  178. }
  179. for(ix=8; ix<64; ix++)
  180. {
  181. if(((ix < 30) && (flags[0] & (1 << ix))) || ((ix >= 0x20) && (flags[1] & (1 << (ix-0x20)))))
  182. {
  183. name = LookupMnemName(mnem_flags, ix);
  184. len = strlen(name) + 1;
  185. total += len;
  186. if(total >= buf_len)
  187. continue;
  188. sprintf(buf, " %s", name);
  189. buf += len;
  190. }
  191. }
  192. }
  193. char *DecodeRule(const char *group_chars, int group_length, char *rule, int control)
  194. {
  195. /* Convert compiled match template to ascii */
  196. unsigned char rb;
  197. unsigned char c;
  198. char *p;
  199. char *p_end;
  200. int ix;
  201. int match_type;
  202. int finished=0;
  203. int value;
  204. int linenum=0;
  205. int flags;
  206. int suffix_char;
  207. int condition_num=0;
  208. int at_start = 0;
  209. const char *name;
  210. char buf[200];
  211. char buf_pre[200];
  212. char suffix[20];
  213. static char output[80];
  214. static char symbols[] =
  215. {' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
  216. '&','%','+','#','S','D','Z','A','L','!',' ','@','?','J','N','K','V','?','T','X','?','W'
  217. };
  218. static char symbols_lg[] = {'A','B','C','H','F','G','Y'};
  219. match_type = 0;
  220. buf_pre[0] = 0;
  221. for(ix=0; ix<group_length; ix++)
  222. {
  223. buf[ix] = group_chars[ix];
  224. }
  225. buf[ix] = 0;
  226. p = &buf[strlen(buf)];
  227. while(!finished)
  228. {
  229. rb = *rule++;
  230. if(rb <= RULE_LINENUM)
  231. {
  232. switch(rb)
  233. {
  234. case 0:
  235. case RULE_PHONEMES:
  236. finished=1;
  237. break;
  238. case RULE_PRE_ATSTART:
  239. at_start = 1; // drop through to next case
  240. case RULE_PRE:
  241. match_type = RULE_PRE;
  242. *p = 0;
  243. p = buf_pre;
  244. break;
  245. case RULE_POST:
  246. match_type = RULE_POST;
  247. *p = 0;
  248. strcat(buf," (");
  249. p = &buf[strlen(buf)];
  250. break;
  251. case RULE_PH_COMMON:
  252. break;
  253. case RULE_CONDITION:
  254. /* conditional rule, next byte gives condition number */
  255. condition_num = *rule++;
  256. break;
  257. case RULE_LINENUM:
  258. value = (rule[1] & 0xff) - 1;
  259. linenum = (rule[0] & 0xff) - 1 + (value * 255);
  260. rule+=2;
  261. break;
  262. }
  263. continue;
  264. }
  265. if(rb == RULE_DOLLAR)
  266. {
  267. value = *rule++ & 0xff;
  268. if((value != 0x01) || (control & FLAG_UNPRON_TEST))
  269. {
  270. // TODO write the string backwards if in RULE_PRE
  271. p[0] = '$';
  272. name = LookupMnemName(mnem_rules, value);
  273. strcpy(&p[1],name);
  274. p += (strlen(name)+1);
  275. }
  276. c = ' ';
  277. }
  278. else if(rb == RULE_ENDING)
  279. {
  280. static const char *flag_chars = "eipvdfq tba ";
  281. flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f);
  282. suffix_char = 'S';
  283. if(flags & (SUFX_P >> 8))
  284. suffix_char = 'P';
  285. sprintf(suffix,"%c%d",suffix_char,rule[2] & 0x7f);
  286. rule += 3;
  287. for(ix=0; ix<9; ix++)
  288. {
  289. if(flags & 1)
  290. sprintf(&suffix[strlen(suffix)],"%c",flag_chars[ix]);
  291. flags = (flags >> 1);
  292. }
  293. strcpy(p,suffix);
  294. p += strlen(suffix);
  295. c = ' ';
  296. }
  297. else if(rb == RULE_LETTERGP)
  298. {
  299. c = symbols_lg[*rule++ - 'A'];
  300. }
  301. else if(rb == RULE_LETTERGP2)
  302. {
  303. value = *rule++ - 'A';
  304. p[0] = 'L';
  305. p[1] = (value / 10) + '0';
  306. c = (value % 10) + '0';
  307. if(match_type == RULE_PRE)
  308. {
  309. p[0] = c;
  310. c = 'L';
  311. }
  312. p+=2;
  313. }
  314. else if(rb <= RULE_LAST_RULE)
  315. c = symbols[rb];
  316. else if(rb == RULE_SPACE)
  317. c = '_';
  318. else
  319. c = rb;
  320. *p++ = c;
  321. }
  322. *p = 0;
  323. p = output;
  324. p_end = p + sizeof(output) - 1;
  325. if(linenum > 0)
  326. {
  327. sprintf(p,"%5d:\t",linenum);
  328. p += 7;
  329. }
  330. if(condition_num > 0)
  331. {
  332. sprintf(p,"?%d ",condition_num);
  333. p = &p[strlen(p)];
  334. }
  335. if(((ix = strlen(buf_pre)) > 0) || at_start)
  336. {
  337. if(at_start)
  338. *p++ = '_';
  339. while((--ix >= 0) && (p < p_end-3))
  340. *p++ = buf_pre[ix];
  341. *p++ = ')';
  342. *p++ = ' ';
  343. }
  344. *p = 0;
  345. buf[p_end - p] = 0; // prevent overflow in output[]
  346. strcat(p,buf);
  347. ix = strlen(output);
  348. while(ix < 8)
  349. output[ix++]=' ';
  350. output[ix]=0;
  351. return(output);
  352. }
  353. static int compile_line(char *linebuf, char *dict_line, int *hash)
  354. {
  355. // Compile a line in the language_list file
  356. unsigned char c;
  357. char *p;
  358. char *word;
  359. char *phonetic;
  360. unsigned int ix;
  361. int step;
  362. unsigned int n_flag_codes = 0;
  363. int flagnum;
  364. int flag_offset;
  365. int length;
  366. int multiple_words = 0;
  367. int multiple_numeric_hyphen = 0;
  368. char *multiple_string = NULL;
  369. char *multiple_string_end = NULL;
  370. int len_word;
  371. int len_phonetic;
  372. int text_not_phonemes; // this word specifies replacement text, not phonemes
  373. unsigned int wc;
  374. int all_upper_case;
  375. char *mnemptr;
  376. unsigned char flag_codes[100];
  377. char encoded_ph[200];
  378. char bad_phoneme_str[4];
  379. int bad_phoneme;
  380. static char nullstring[] = {0};
  381. text_not_phonemes = 0;
  382. phonetic = word = nullstring;
  383. p = linebuf;
  384. step = 0;
  385. c = 0;
  386. while(c != '\n')
  387. {
  388. c = *p;
  389. if((c == '?') && (step==0))
  390. {
  391. // conditional rule, allow only if the numbered condition is set for the voice
  392. flag_offset = 100;
  393. p++;
  394. if(*p == '!')
  395. {
  396. // allow only if the numbered condition is NOT set
  397. flag_offset = 132;
  398. p++;
  399. }
  400. ix = 0;
  401. if(IsDigit09(*p))
  402. {
  403. ix += (*p-'0');
  404. p++;
  405. }
  406. if(IsDigit09(*p))
  407. {
  408. ix = ix*10 + (*p-'0');
  409. p++;
  410. }
  411. flag_codes[n_flag_codes++] = ix + flag_offset;
  412. c = *p;
  413. }
  414. if((c == '$') && isalnum(p[1]))
  415. {
  416. /* read keyword parameter */
  417. mnemptr = p;
  418. while(!isspace2(c = *p)) p++;
  419. *p = 0;
  420. flagnum = LookupMnem(mnem_flags,mnemptr);
  421. if(flagnum > 0)
  422. {
  423. if(flagnum == 200)
  424. {
  425. text_mode = 1;
  426. }
  427. else if(flagnum == 201)
  428. {
  429. text_mode = 0;
  430. }
  431. else if(flagnum == BITNUM_FLAG_TEXTMODE)
  432. {
  433. text_not_phonemes = 1;
  434. }
  435. else
  436. {
  437. flag_codes[n_flag_codes++] = flagnum;
  438. }
  439. }
  440. else
  441. {
  442. fprintf(f_log,"%5d: Unknown keyword: %s\n",linenum,mnemptr);
  443. error_count++;
  444. }
  445. }
  446. if((c == '/') && (p[1] == '/') && (multiple_words==0))
  447. {
  448. c = '\n'; /* "//" treat comment as end of line */
  449. }
  450. switch(step)
  451. {
  452. case 0:
  453. if(c == '(')
  454. {
  455. multiple_words = 1;
  456. word = p+1;
  457. step = 1;
  458. }
  459. else if(!isspace2(c))
  460. {
  461. word = p;
  462. step = 1;
  463. }
  464. break;
  465. case 1:
  466. if((c == '-') && multiple_words)
  467. {
  468. if(IsDigit09(word[0]))
  469. {
  470. multiple_numeric_hyphen = 1;
  471. }
  472. flag_codes[n_flag_codes++] = BITNUM_FLAG_HYPHENATED;
  473. c = ' ';
  474. }
  475. if(isspace2(c))
  476. {
  477. p[0] = 0; /* terminate english word */
  478. if(multiple_words)
  479. {
  480. multiple_string = multiple_string_end = p+1;
  481. step = 2;
  482. }
  483. else
  484. {
  485. step = 3;
  486. }
  487. }
  488. else if(c == ')')
  489. {
  490. if(multiple_words)
  491. {
  492. p[0] = 0;
  493. multiple_words = 0;
  494. step = 3;
  495. }
  496. else if(word[0] != '_')
  497. {
  498. fprintf(f_log, "%5d: Missing '('\n", linenum);
  499. error_count++;
  500. step = 3;
  501. }
  502. }
  503. break;
  504. case 2:
  505. if(isspace2(c))
  506. {
  507. multiple_words++;
  508. }
  509. else if(c == ')')
  510. {
  511. p[0] = ' '; // terminate extra string
  512. multiple_string_end = p+1;
  513. step = 3;
  514. }
  515. break;
  516. case 3:
  517. if(!isspace2(c))
  518. {
  519. phonetic = p;
  520. step = 4;
  521. }
  522. break;
  523. case 4:
  524. if(isspace2(c))
  525. {
  526. p[0] = 0; /* terminate phonetic */
  527. step = 5;
  528. }
  529. break;
  530. case 5:
  531. break;
  532. }
  533. p++;
  534. }
  535. if(word[0] == 0)
  536. {
  537. return(0); /* blank line */
  538. }
  539. if(text_mode)
  540. text_not_phonemes = 1;
  541. if(text_not_phonemes)
  542. {
  543. if(word[0] == '_')
  544. {
  545. // This is a special word, used by eSpeak. Translate this into phonemes now
  546. strcat(phonetic, " "); // need a space to indicate word-boundary
  547. // PROBLEM vowel reductions are not applied to the translated phonemes
  548. // condition rules are not applied
  549. TranslateWord(translator,phonetic,0,NULL,NULL);
  550. text_not_phonemes = 0;
  551. strncpy0(encoded_ph, word_phonemes, N_WORD_BYTES-4);
  552. if((word_phonemes[0] == 0) && (error_need_dictionary < 3))
  553. {
  554. // the dictionary was not loaded, we need a second attempt
  555. error_need_dictionary++;
  556. fprintf(f_log,"%5d: Need to compile dictionary again\n",linenum);
  557. }
  558. }
  559. else
  560. {
  561. // this is replacement text, so don't encode as phonemes. Restrict the length of the replacement word
  562. strncpy0(encoded_ph,phonetic,N_WORD_BYTES-4);
  563. }
  564. }
  565. else
  566. {
  567. EncodePhonemes(phonetic,encoded_ph,&bad_phoneme);
  568. if(strchr(encoded_ph,phonSWITCH) != 0)
  569. {
  570. flag_codes[n_flag_codes++] = BITNUM_FLAG_ONLY_S; // don't match on suffixes (except 's') when switching languages
  571. }
  572. // check for errors in the phonemes codes
  573. if(bad_phoneme != 0)
  574. {
  575. // unrecognised phoneme, report error
  576. bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
  577. fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s %s\n",linenum,bad_phoneme_str,bad_phoneme,word,phonetic);
  578. error_count++;
  579. }
  580. }
  581. if(text_not_phonemes != translator->langopts.textmode)
  582. {
  583. flag_codes[n_flag_codes++] = BITNUM_FLAG_TEXTMODE;
  584. }
  585. if(sscanf(word,"U+%x",&wc) == 1)
  586. {
  587. // Character code
  588. ix = utf8_out(wc, word);
  589. word[ix] = 0;
  590. }
  591. else if(word[0] != '_')
  592. {
  593. // convert to lower case, and note if the word is all-capitals
  594. int c2;
  595. all_upper_case = 1;
  596. p = word;
  597. for(p=word;;)
  598. {
  599. // this assumes that the lower case char is the same length as the upper case char
  600. // OK, except for Turkish "I", but use towlower() rather than towlower2()
  601. ix = utf8_in(&c2,p);
  602. if(c2 == 0)
  603. break;
  604. if(iswupper2(c2))
  605. {
  606. utf8_out(towlower2(c2),p);
  607. }
  608. else
  609. {
  610. all_upper_case = 0;
  611. }
  612. p += ix;
  613. }
  614. if(all_upper_case)
  615. {
  616. flag_codes[n_flag_codes++] = BITNUM_FLAG_ALLCAPS;
  617. }
  618. }
  619. len_word = strlen(word);
  620. if(translator->transpose_min > 0)
  621. {
  622. len_word = TransposeAlphabet(translator, word);
  623. }
  624. *hash = HashDictionary(word);
  625. len_phonetic = strlen(encoded_ph);
  626. dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed
  627. len_word &= 0x3f;
  628. memcpy(&dict_line[2],word,len_word);
  629. if(len_phonetic == 0)
  630. {
  631. // no phonemes specified. set bit 7
  632. dict_line[1] |= 0x80;
  633. length = len_word + 2;
  634. }
  635. else
  636. {
  637. length = len_word + len_phonetic + 3;
  638. strcpy(&dict_line[(len_word)+2],encoded_ph);
  639. }
  640. for(ix=0; ix<n_flag_codes; ix++)
  641. {
  642. dict_line[ix+length] = flag_codes[ix];
  643. }
  644. length += n_flag_codes;
  645. if((multiple_string != NULL) && (multiple_words > 0))
  646. {
  647. if(multiple_words > 10)
  648. {
  649. fprintf(f_log,"%5d: Two many parts in a multi-word entry: %d\n",linenum,multiple_words);
  650. error_count++;
  651. }
  652. else
  653. {
  654. dict_line[length++] = 80 + multiple_words;
  655. ix = multiple_string_end - multiple_string;
  656. if(multiple_numeric_hyphen)
  657. {
  658. dict_line[length++] = ' '; // ???
  659. }
  660. memcpy(&dict_line[length],multiple_string,ix);
  661. length += ix;
  662. }
  663. }
  664. dict_line[0] = length;
  665. return(length);
  666. }
  667. static void compile_dictlist_start(void)
  668. {
  669. // initialise dictionary list
  670. int ix;
  671. char *p;
  672. char *p2;
  673. for(ix=0; ix<N_HASH_DICT; ix++)
  674. {
  675. p = hash_chains[ix];
  676. while(p != NULL)
  677. {
  678. memcpy(&p2,p,sizeof(char *));
  679. free(p);
  680. p = p2;
  681. }
  682. hash_chains[ix] = NULL;
  683. hash_counts[ix]=0;
  684. }
  685. }
  686. static void compile_dictlist_end(FILE *f_out)
  687. {
  688. // Write out the compiled dictionary list
  689. int hash;
  690. int length;
  691. char *p;
  692. if(f_log != NULL)
  693. {
  694. #ifdef OUTPUT_FORMAT
  695. for(hash=0; hash<N_HASH_DICT; hash++)
  696. {
  697. fprintf(f_log,"%8d",hash_counts[hash]);
  698. if((hash & 7) == 7)
  699. fputc('\n',f_log);
  700. }
  701. fflush(f_log);
  702. #endif
  703. }
  704. for(hash=0; hash<N_HASH_DICT; hash++)
  705. {
  706. p = hash_chains[hash];
  707. hash_counts[hash] = (int)ftell(f_out);
  708. while(p != NULL)
  709. {
  710. length = *(p+sizeof(char *));
  711. fwrite(p+sizeof(char *),length,1,f_out);
  712. memcpy(&p,p,sizeof(char *));
  713. }
  714. fputc(0,f_out);
  715. }
  716. }
  717. static int compile_dictlist_file(const char *path, const char* filename)
  718. {
  719. int length;
  720. int hash;
  721. char *p;
  722. int count=0;
  723. FILE *f_in;
  724. char buf[200];
  725. char fname[sizeof(path_home)+45];
  726. char dict_line[128];
  727. text_mode = 0;
  728. // try with and without '.txt' extension
  729. sprintf(fname,"%s%s.txt",path,filename);
  730. if((f_in = fopen(fname,"r")) == NULL)
  731. {
  732. sprintf(fname,"%s%s",path,filename);
  733. if((f_in = fopen(fname,"r")) == NULL)
  734. return(-1);
  735. }
  736. if(f_log != NULL)
  737. fprintf(f_log,"Compiling: '%s'\n",fname);
  738. linenum=0;
  739. while(fgets(buf,sizeof(buf),f_in) != NULL)
  740. {
  741. linenum++;
  742. length = compile_line(buf,dict_line,&hash);
  743. if(length == 0) continue; /* blank line */
  744. hash_counts[hash]++;
  745. p = (char *)malloc(length+sizeof(char *));
  746. if(p == NULL)
  747. {
  748. if(f_log != NULL)
  749. {
  750. fprintf(f_log,"Can't allocate memory\n");
  751. error_count++;
  752. }
  753. break;
  754. }
  755. memcpy(p,&hash_chains[hash],sizeof(char *));
  756. hash_chains[hash] = p;
  757. memcpy(p+sizeof(char *),dict_line,length);
  758. count++;
  759. }
  760. if(f_log != NULL)
  761. fprintf(f_log,"\t%d entries\n",count);
  762. fclose(f_in);
  763. return(0);
  764. }
  765. static char rule_cond[80];
  766. static char rule_pre[80];
  767. static char rule_post[80];
  768. static char rule_match[80];
  769. static char rule_phonemes[80];
  770. static char group_name[LEN_GROUP_NAME+1];
  771. static int group3_ix;
  772. #define N_RULES 3000 // max rules for each group
  /*
   * Value of a hexadecimal digit character.
   * Returns 0..15 for '0'-'9', 'a'-'f' and 'A'-'F', or -1 if c is not a hex digit.
   */
  int isHexDigit(int c)
  {
      int digit = -1;

      if ((c >= '0') && (c <= '9'))
          digit = c - '0';
      else if ((c >= 'a') && (c <= 'f'))
          digit = 10 + (c - 'a');
      else if ((c >= 'A') && (c <= 'F'))
          digit = 10 + (c - 'A');

      return digit;
  }
  783. static void copy_rule_string(char *string, int *state_out)
  784. {
  785. // state 0: conditional, 1=pre, 2=match, 3=post, 4=phonemes
  786. static char *outbuf[5] = {rule_cond, rule_pre, rule_match, rule_post, rule_phonemes};
  787. static int next_state[5] = {2,2,4,4,4};
  788. char *output;
  789. char *p;
  790. int ix;
  791. int len;
  792. char c;
  793. int c2, c3;
  794. int sxflags;
  795. int value;
  796. int literal;
  797. int hexdigit_input = 0;
  798. int state = *state_out;
  799. MNEM_TAB *mr;
  800. if(string[0] == 0) return;
  801. output = outbuf[state];
  802. if(state==4)
  803. {
  804. // append to any previous phoneme string, i.e. allow spaces in the phoneme string
  805. len = strlen(rule_phonemes);
  806. if(len > 0)
  807. rule_phonemes[len++] = ' ';
  808. output = &rule_phonemes[len];
  809. }
  810. sxflags = 0x808000; // to ensure non-zero bytes
  811. for(p=string,ix=0;;)
  812. {
  813. literal = 0;
  814. c = *p++;
  815. if((c == '0') && (p[0] == 'x') && (isHexDigit(p[1]) >= 0) && (isHexDigit(p[2]) >= 0))
  816. {
  817. hexdigit_input = 1;
  818. c = p[1];
  819. p+= 2;
  820. }
  821. if(c == '\\')
  822. {
  823. c = *p++; // treat next character literally
  824. if((c >= '0') && (c <= '3') && (p[0] >= '0') && (p[0] <= '7') && (p[1] >= '0') && (p[1] <= '7'))
  825. {
  826. // character code given by 3 digit octal value;
  827. c = (c-'0')*64 + (p[0]-'0')*8 + (p[1]-'0');
  828. p += 2;
  829. }
  830. literal = 1;
  831. }
  832. if(hexdigit_input)
  833. {
  834. if(((c2 = isHexDigit(c)) >= 0) && ((c3 = isHexDigit(p[0])) >= 0))
  835. {
  836. c = c2 * 16 + c3;
  837. literal = 1;
  838. p++;
  839. }
  840. else
  841. {
  842. hexdigit_input = 0;
  843. }
  844. }
  845. if((state==1) || (state==3))
  846. {
  847. // replace special characters (note: 'E' is reserved for a replaced silent 'e')
  848. if(literal == 0)
  849. {
  850. static const char lettergp_letters[9] = {LETTERGP_A,LETTERGP_B,LETTERGP_C,0,0,LETTERGP_F,LETTERGP_G,LETTERGP_H,LETTERGP_Y};
  851. switch(c)
  852. {
  853. case '_':
  854. c = RULE_SPACE;
  855. break;
  856. case 'Y':
  857. c = 'I'; // drop through to next case
  858. case 'A': // vowel
  859. case 'B':
  860. case 'C':
  861. case 'H':
  862. case 'F':
  863. case 'G':
  864. if(state == 1)
  865. {
  866. // pre-rule, put the number before the RULE_LETTERGP;
  867. output[ix++] = lettergp_letters[c-'A'] + 'A';
  868. c = RULE_LETTERGP;
  869. }
  870. else
  871. {
  872. output[ix++] = RULE_LETTERGP;
  873. c = lettergp_letters[c-'A'] + 'A';
  874. }
  875. break;
  876. case 'D':
  877. c = RULE_DIGIT;
  878. break;
  879. case 'K':
  880. c = RULE_NOTVOWEL;
  881. break;
  882. case 'N':
  883. c = RULE_NO_SUFFIX;
  884. break;
  885. case 'V':
  886. c = RULE_IFVERB;
  887. break;
  888. case 'Z':
  889. c = RULE_NONALPHA;
  890. break;
  891. case '+':
  892. c = RULE_INC_SCORE;
  893. break;
  894. case '@':
  895. c = RULE_SYLLABLE;
  896. break;
  897. case '&':
  898. c = RULE_STRESSED;
  899. break;
  900. case '%':
  901. c = RULE_DOUBLE;
  902. break;
  903. case '#':
  904. c = RULE_DEL_FWD;
  905. break;
  906. case '!':
  907. c = RULE_CAPITAL;
  908. break;
  909. case 'T':
  910. output[ix++] = RULE_DOLLAR;
  911. c = 0x11;
  912. break;
  913. case 'W':
  914. c = RULE_SPELLING;
  915. break;
  916. case 'X':
  917. c = RULE_NOVOWELS;
  918. break;
  919. case 'J':
  920. c = RULE_SKIPCHARS;
  921. break;
  922. case 'L':
  923. // expect two digits
  924. c = *p++ - '0';
  925. value = *p++ - '0';
  926. c = c * 10 + value;
  927. if((value < 0) || (value > 9))
  928. {
  929. c = 0;
  930. fprintf(f_log,"%5d: Expected 2 digits after 'L'\n",linenum);
  931. error_count++;
  932. }
  933. else if((c <= 0) || (c >= N_LETTER_GROUPS) || (letterGroupsDefined[(int)c] == 0))
  934. {
  935. fprintf(f_log,"%5d: Letter group L%.2d not defined\n",linenum,c);
  936. error_count++;
  937. }
  938. c += 'A';
  939. if(state == 1)
  940. {
  941. // pre-rule, put the group number before the RULE_LETTERGP command
  942. output[ix++] = c;
  943. c = RULE_LETTERGP2;
  944. }
  945. else
  946. {
  947. output[ix++] = RULE_LETTERGP2;
  948. }
  949. break;
  950. case '$':
  951. value = 0;
  952. mr = mnem_rules;
  953. while(mr->mnem != NULL)
  954. {
  955. len = strlen(mr->mnem);
  956. if(memcmp(p, mr->mnem, len) == 0)
  957. {
  958. value = mr->value;
  959. p += len;
  960. break;
  961. }
  962. mr++;
  963. }
  964. if(state == 1)
  965. {
  966. // pre-rule, put the number before the RULE_DOLLAR
  967. output[ix++] = value;
  968. c = RULE_DOLLAR;
  969. }
  970. else
  971. {
  972. output[ix++] = RULE_DOLLAR;
  973. c = value;
  974. }
  975. if(value == 0)
  976. {
  977. fprintf(f_log,"%5d: $ command not recognized\n",linenum);
  978. error_count++;
  979. }
  980. break;
  981. case 'P':
  982. sxflags |= SUFX_P; // Prefix, now drop through to Suffix
  983. case 'S':
  984. output[ix++] = RULE_ENDING;
  985. value = 0;
  986. while(!isspace2(c = *p++) && (c != 0))
  987. {
  988. switch(c)
  989. {
  990. case 'e':
  991. sxflags |= SUFX_E;
  992. break;
  993. case 'i':
  994. sxflags |= SUFX_I;
  995. break;
  996. case 'p': // obsolete, replaced by 'P' above
  997. sxflags |= SUFX_P;
  998. break;
  999. case 'v':
  1000. sxflags |= SUFX_V;
  1001. break;
  1002. case 'd':
  1003. sxflags |= SUFX_D;
  1004. break;
  1005. case 'f':
  1006. sxflags |= SUFX_F;
  1007. break;
  1008. case 'q':
  1009. sxflags |= SUFX_Q;
  1010. break;
  1011. case 't':
  1012. sxflags |= SUFX_T;
  1013. break;
  1014. case 'b':
  1015. sxflags |= SUFX_B;
  1016. break;
  1017. case 'a':
  1018. sxflags |= SUFX_A;
  1019. break;
  1020. case 'm':
  1021. sxflags |= SUFX_M;
  1022. break;
  1023. default:
  1024. if(IsDigit09(c))
  1025. value = (value*10) + (c - '0');
  1026. break;
  1027. }
  1028. }
  1029. p--;
  1030. output[ix++] = sxflags >> 16;
  1031. output[ix++] = sxflags >> 8;
  1032. c = value | 0x80;
  1033. break;
  1034. }
  1035. }
  1036. }
  1037. output[ix++] = c;
  1038. if(c == 0) break;
  1039. }
  1040. *state_out = next_state[state];
  1041. }
  1042. static char *compile_rule(char *input)
  1043. {
  1044. int ix;
  1045. unsigned char c;
  1046. int wc;
  1047. char *p;
  1048. char *prule;
  1049. int len;
  1050. int len_name;
  1051. int start;
  1052. int state=2;
  1053. int finish=0;
  1054. char buf[80];
  1055. char output[150];
  1056. int bad_phoneme;
  1057. char bad_phoneme_str[4];
  1058. buf[0]=0;
  1059. rule_cond[0]=0;
  1060. rule_pre[0]=0;
  1061. rule_post[0]=0;
  1062. rule_match[0]=0;
  1063. rule_phonemes[0]=0;
  1064. p = buf;
  1065. for(ix=0; finish==0; ix++)
  1066. {
  1067. c = input[ix];
  1068. switch(c = input[ix])
  1069. {
  1070. case ')': // end of prefix section
  1071. *p = 0;
  1072. state = 1;
  1073. copy_rule_string(buf,&state);
  1074. p = buf;
  1075. break;
  1076. case '(': // start of suffix section
  1077. *p = 0;
  1078. state = 2;
  1079. copy_rule_string(buf,&state);
  1080. state = 3;
  1081. p = buf;
  1082. if(input[ix+1] == ' ')
  1083. {
  1084. fprintf(f_log,"%5d: Syntax error. Space after (\n",linenum);
  1085. error_count++;
  1086. }
  1087. break;
  1088. case '\n': // end of line
  1089. case '\r':
  1090. case 0: // end of line
  1091. *p = 0;
  1092. copy_rule_string(buf,&state);
  1093. finish=1;
  1094. break;
  1095. case '\t': // end of section section
  1096. case ' ':
  1097. *p = 0;
  1098. copy_rule_string(buf,&state);
  1099. p = buf;
  1100. break;
  1101. case '?':
  1102. if(state==2)
  1103. state=0;
  1104. else
  1105. *p++ = c;
  1106. break;
  1107. default:
  1108. *p++ = c;
  1109. break;
  1110. }
  1111. }
  1112. if(strcmp(rule_match,"$group")==0)
  1113. strcpy(rule_match,group_name);
  1114. if(rule_match[0]==0)
  1115. {
  1116. if(rule_post[0] != 0)
  1117. {
  1118. fprintf(f_log,"%5d: Syntax error\n",linenum);
  1119. error_count++;
  1120. }
  1121. return(NULL);
  1122. }
  1123. EncodePhonemes(rule_phonemes,buf,&bad_phoneme);
  1124. if(bad_phoneme != 0)
  1125. {
  1126. bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
  1127. fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s\n",linenum,bad_phoneme_str,bad_phoneme,input);
  1128. error_count++;
  1129. }
  1130. strcpy(output,buf);
  1131. len = strlen(buf)+1;
  1132. len_name = strlen(group_name);
  1133. if((len_name > 0) && (memcmp(rule_match,group_name,len_name) != 0))
  1134. {
  1135. utf8_in(&wc,rule_match);
  1136. if((group_name[0] == '9') && IsDigit(wc))
  1137. {
  1138. // numeric group, rule_match starts with a digit, so OK
  1139. }
  1140. else
  1141. {
  1142. fprintf(f_log,"%5d: Wrong initial letters '%s' for group '%s'\n",linenum,rule_match,group_name);
  1143. error_count++;
  1144. }
  1145. }
  1146. strcpy(&output[len],rule_match);
  1147. len += strlen(rule_match);
  1148. if(debug_flag)
  1149. {
  1150. output[len] = RULE_LINENUM;
  1151. output[len+1] = (linenum % 255) + 1;
  1152. output[len+2] = (linenum / 255) + 1;
  1153. len+=3;
  1154. }
  1155. if(rule_cond[0] != 0)
  1156. {
  1157. ix = -1;
  1158. if(rule_cond[0] == '!')
  1159. {
  1160. // allow the rule only if the condition number is NOT set for the voice
  1161. ix = atoi(&rule_cond[1]) + 32;
  1162. }
  1163. else
  1164. {
  1165. // allow the rule only if the condition number is set for the voice
  1166. ix = atoi(rule_cond);
  1167. }
  1168. if((ix > 0) && (ix < 255))
  1169. {
  1170. output[len++] = RULE_CONDITION;
  1171. output[len++] = ix;
  1172. }
  1173. else
  1174. {
  1175. fprintf(f_log,"%5d: bad condition number ?%d\n",linenum,ix);
  1176. error_count++;
  1177. }
  1178. }
  1179. if(rule_pre[0] != 0)
  1180. {
  1181. start = 0;
  1182. if(rule_pre[0] == RULE_SPACE)
  1183. {
  1184. // omit '_' at the beginning of the pre-string and imply it by using RULE_PRE_ATSTART
  1185. c = RULE_PRE_ATSTART;
  1186. start = 1;
  1187. }
  1188. else
  1189. {
  1190. c = RULE_PRE;
  1191. }
  1192. output[len++] = c;
  1193. // output PRE string in reverse order
  1194. for(ix = strlen(rule_pre)-1; ix>=start; ix--)
  1195. output[len++] = rule_pre[ix];
  1196. }
  1197. if(rule_post[0] != 0)
  1198. {
  1199. sprintf(&output[len],"%c%s",RULE_POST,rule_post);
  1200. len += (strlen(rule_post)+1);
  1201. }
  1202. output[len++]=0;
  1203. prule = (char *)malloc(len);
  1204. memcpy(prule,output,len);
  1205. return(prule);
  1206. }
  // qsort comparison for compiled rules.  Each element points to a string that
  // is immediately followed, after its terminating NUL, by a second string
  // (for rules: the phoneme string, then the match string).  Elements are
  // ordered by the first string, and by the second string when the first
  // strings are equal.
  int __cdecl string_sorter(char **a, char **b)
  {
      char *first = *a;
      char *second = *b;
      int order = strcmp(first,second);

      if(order != 0)
          return(order);

      // first strings are identical: step over them and compare what follows
      first += (strlen(first)+1);
      second += (strlen(second)+1);
      return(strcmp(first,second));
  }
  1217. static int __cdecl rgroup_sorter(RGROUP *a, RGROUP *b)
  1218. {
  1219. // Sort long names before short names
  1220. int ix;
  1221. ix = strlen(b->name) - strlen(a->name);
  1222. if(ix != 0) return(ix);
  1223. ix = strcmp(a->name,b->name);
  1224. if(ix != 0) return(ix);
  1225. return(a->start-b->start);
  1226. }
  #ifdef OUTPUT_FORMAT
  // Debug listing: writes a readable form of every compiled rule of one group.
  //   f_out    stream that receives the listing
  //   n_rules  number of entries in rules[]
  //   rules    compiled rules; each is a phoneme string, a NUL, then the match data
  //   name     group name printed in the "$group" heading
  // The file-level rule_match, rule_pre and rule_post buffers are used as
  // scratch space and are overwritten.
  static void print_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
  {
      static unsigned char symbols[] = {'@','&','%','+','#','$','D','Z','A','B','C','F'};
      char line[80];
      char ending[12];
      char *data;
      char *dest;
      char *q;
      unsigned char ch;
      int n_data;
      int cond;
      int pad;
      int r;
      int k;

      fprintf(f_out,"\n$group %s\n",name);

      for(r=0; r<n_rules; r++)
      {
          // the match data starts after the phoneme string and its NUL
          data = rules[r];
          data = &data[strlen(data) + 1];
          n_data = strlen(data);

          rule_match[0]=0;
          rule_pre[0]=0;
          rule_post[0]=0;
          cond = 0;

          // split the match data into match / pre / post strings
          dest = rule_match;
          for(k=0; k<n_data; k++)
          {
              ch = data[k];
              if(ch == RULE_PRE)
              {
                  *dest = 0;
                  dest = rule_pre;
              }
              else if(ch == RULE_POST)
              {
                  *dest = 0;
                  dest = rule_post;
              }
              else if(ch == RULE_CONDITION)
              {
                  k++;
                  cond = data[k];
              }
              else if(ch == RULE_ENDING)
              {
                  sprintf(ending,"$%d[%x]",(data[k+2]),data[k+1] & 0x7f);
                  k += 2;
                  strcpy(dest,ending);
                  dest += strlen(ending);
              }
              else
              {
                  // rule command codes are shown as their source symbols
                  if(ch <= RULE_LETTER7)
                      ch = symbols[ch-RULE_SYLLABLE];
                  if(ch == ' ')
                      ch = '_';
                  *dest++ = ch;
              }
          }
          *dest = 0;

          // condition and pre-context share a 12 character column
          pad = 12;
          if(cond > 0)
          {
              sprintf(line,"?%d ",cond);
              pad -= strlen(line);
              fputs(line,f_out);
          }

          if(rule_pre[0] != 0)
          {
              // the pre-context is stored reversed; restore reading order
              q = line;
              for(k=strlen(rule_pre)-1; k>=0; k--)
                  *q++ = rule_pre[k];
              strcpy(q,") ");

              pad -= strlen(line);
              for(k=0; k<pad; k++)
                  fputc(' ',f_out);
              fputs(line,f_out);
              pad = 0;
          }
          for(k=0; k<pad; k++)
              fputc(' ',f_out);

          // match string and post-context share a 14 character column
          pad = 14;
          sprintf(line," %s ",rule_match);
          if(rule_post[0] != 0)
          {
              q = &line[strlen(line)];
              sprintf(q,"(%s ",rule_post);
          }
          fputs(line,f_out);
          pad -= strlen(line);
          for(k=0; k<pad; k++)
              fputc(' ',f_out);

          // finally the phoneme string
          DecodePhonemes(rules[r],line);
          fprintf(f_out,"%s\n",line);
      }
  }
  #endif
  1322. static void output_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
  1323. {
  1324. int ix;
  1325. int len1;
  1326. int len2;
  1327. int len_name;
  1328. char *p;
  1329. char *p2, *p3;
  1330. const char *common;
  1331. short nextchar_count[256];
  1332. memset(nextchar_count,0,sizeof(nextchar_count));
  1333. len_name = strlen(name);
  1334. #ifdef OUTPUT_FORMAT
  1335. print_rule_group(f_log,n_rules,rules,name);
  1336. #endif
  1337. // sort the rules in this group by their phoneme string
  1338. common = "";
  1339. qsort((void *)rules,n_rules,sizeof(char *),(int (__cdecl *)(const void *,const void *))string_sorter);
  1340. if(strcmp(name,"9")==0)
  1341. len_name = 0; // don't remove characters from numeric match strings
  1342. for(ix=0; ix<n_rules; ix++)
  1343. {
  1344. p = rules[ix];
  1345. len1 = strlen(p) + 1; // phoneme string
  1346. p3 = &p[len1];
  1347. p2 = p3 + len_name; // remove group name from start of match string
  1348. len2 = strlen(p2);
  1349. nextchar_count[(unsigned char)(p2[0])]++; // the next byte after the group name
  1350. if((common[0] != 0) && (strcmp(p,common)==0))
  1351. {
  1352. fwrite(p2,len2,1,f_out);
  1353. fputc(0,f_out); // no phoneme string, it's the same as previous rule
  1354. }
  1355. else
  1356. {
  1357. if((ix < n_rules-1) && (strcmp(p,rules[ix+1])==0))
  1358. {
  1359. common = rules[ix]; // phoneme string is same as next, set as common
  1360. fputc(RULE_PH_COMMON,f_out);
  1361. }
  1362. fwrite(p2,len2,1,f_out);
  1363. fputc(RULE_PHONEMES,f_out);
  1364. fwrite(p,len1,1,f_out);
  1365. }
  1366. }
  1367. #ifdef LIST_GROUP_INFO
  1368. for(ix=32; ix<256; ix++)
  1369. {
  1370. if(nextchar_count[ix] > 30)
  1371. printf("Group %s %c %d\n",name,ix,nextchar_count[ix]);
  1372. }
  1373. #endif
  1374. }
  1375. static int compile_lettergroup(char *input, FILE *f_out)
  1376. {
  1377. char *p;
  1378. char *p_start;
  1379. int group;
  1380. int ix;
  1381. int n_items;
  1382. int length;
  1383. int max_length = 0;
  1384. #define N_LETTERGP_ITEMS 200
  1385. char *items[N_LETTERGP_ITEMS];
  1386. char item_length[N_LETTERGP_ITEMS];
  1387. p = input;
  1388. if(!IsDigit09(p[0]) || !IsDigit09(p[1]))
  1389. {
  1390. fprintf(f_log,"%5d: Expected 2 digits after '.L'\n",linenum);
  1391. error_count++;
  1392. return(1);
  1393. }
  1394. group = atoi(&p[0]);
  1395. if(group >= N_LETTER_GROUPS)
  1396. {
  1397. fprintf(f_log,"%5d: lettergroup out of range (01-%.2d)\n",linenum,N_LETTER_GROUPS-1);
  1398. error_count++;
  1399. return(1);
  1400. }
  1401. while(!isspace2(*p)) p++;
  1402. fputc(RULE_GROUP_START,f_out);
  1403. fputc(RULE_LETTERGP2,f_out);
  1404. fputc(group + 'A', f_out);
  1405. if(letterGroupsDefined[group] != 0)
  1406. {
  1407. fprintf(f_log,"%5d: lettergroup L%.2d is already defined\n",linenum,group);
  1408. error_count++;
  1409. }
  1410. letterGroupsDefined[group] = 1;
  1411. n_items = 0;
  1412. while(n_items < N_LETTERGP_ITEMS)
  1413. {
  1414. while(isspace2(*p)) p++;
  1415. if(*p == 0)
  1416. break;
  1417. items[n_items] = p_start = p;
  1418. while((*p & 0xff) > ' ')
  1419. {
  1420. if (*p == '_') *p = ' '; // allow '_' for word break
  1421. p++;
  1422. }
  1423. *p++ = 0;
  1424. length = p - p_start;
  1425. if(length > max_length)
  1426. max_length = length;
  1427. item_length[n_items++] = length;
  1428. }
  1429. // write out the items, longest first
  1430. while(max_length > 1)
  1431. {
  1432. for(ix=0; ix < n_items; ix++)
  1433. {
  1434. if(item_length[ix] == max_length)
  1435. {
  1436. fwrite(items[ix],1,max_length,f_out);
  1437. }
  1438. }
  1439. max_length--;
  1440. }
  1441. fputc(RULE_GROUP_END,f_out);
  1442. return(0);
  1443. }
  1444. static int compile_dictrules(FILE *f_in, FILE *f_out, char *fname_temp)
  1445. {
  1446. char *prule;
  1447. unsigned char *p;
  1448. int ix;
  1449. int c;
  1450. int gp;
  1451. FILE *f_temp;
  1452. int n_rules=0;
  1453. int count=0;
  1454. int different;
  1455. int wc;
  1456. int err_n_rules=0;
  1457. const char *prev_rgroup_name;
  1458. unsigned int char_code;
  1459. int compile_mode=0;
  1460. char *buf;
  1461. char buf1[500];
  1462. char *rules[N_RULES];
  1463. int n_rgroups = 0;
  1464. int n_groups3 = 0;
  1465. RGROUP rgroup[N_RULE_GROUP2];
  1466. linenum = 0;
  1467. group_name[0] = 0;
  1468. if((f_temp = fopen_log(fname_temp,"wb")) == NULL)
  1469. return(1);
  1470. for(;;)
  1471. {
  1472. linenum++;
  1473. buf = fgets(buf1,sizeof(buf1),f_in);
  1474. if(buf != NULL)
  1475. {
  1476. if((p = (unsigned char *)strstr(buf,"//")) != NULL)
  1477. *p = 0;
  1478. if(buf[0] == '\r') buf++; // ignore extra \r in \r\n
  1479. }
  1480. if((buf == NULL) || (buf[0] == '.'))
  1481. {
  1482. // next .group or end of file, write out the previous group
  1483. if(n_rules > 0)
  1484. {
  1485. strcpy(rgroup[n_rgroups].name,group_name);
  1486. rgroup[n_rgroups].group3_ix = group3_ix;
  1487. rgroup[n_rgroups].start = ftell(f_temp);
  1488. output_rule_group(f_temp,n_rules,rules,group_name);
  1489. rgroup[n_rgroups].length = ftell(f_temp) - rgroup[n_rgroups].start;
  1490. n_rgroups++;
  1491. count += n_rules;
  1492. }
  1493. n_rules = 0;
  1494. err_n_rules = 0;
  1495. if(compile_mode == 2)
  1496. {
  1497. // end of the character replacements section
  1498. fwrite(&n_rules,1,4,f_out); // write a zero word to terminate the replacemenmt list
  1499. compile_mode = 0;
  1500. }
  1501. if(buf == NULL) break; // end of file
  1502. if(memcmp(buf,".L",2)==0)
  1503. {
  1504. compile_lettergroup(&buf[2], f_out);
  1505. continue;
  1506. }
  1507. if(memcmp(buf,".replace",8)==0)
  1508. {
  1509. compile_mode = 2;
  1510. fputc(RULE_GROUP_START,f_out);
  1511. fputc(RULE_REPLACEMENTS,f_out);
  1512. // advance to next word boundary
  1513. while((ftell(f_out) & 3) != 0)
  1514. fputc(0,f_out);
  1515. }
  1516. if(memcmp(buf,".group",6)==0)
  1517. {
  1518. compile_mode = 1;
  1519. p = (unsigned char *)&buf[6];
  1520. while((p[0]==' ') || (p[0]=='\t')) p++; // Note: Windows isspace(0xe1) gives TRUE !
  1521. ix = 0;
  1522. while((*p > ' ') && (ix < LEN_GROUP_NAME))
  1523. group_name[ix++] = *p++;
  1524. group_name[ix]=0;
  1525. group3_ix = 0;
  1526. if(sscanf(group_name,"0x%x",&char_code)==1)
  1527. {
  1528. // group character is given as a character code (max 16 bits)
  1529. p = (unsigned char *)group_name;
  1530. if(char_code > 0x100)
  1531. {
  1532. *p++ = (char_code >> 8);
  1533. }
  1534. *p++ = char_code;
  1535. *p = 0;
  1536. }
  1537. else
  1538. {
  1539. if(translator->letter_bits_offset > 0)
  1540. {
  1541. utf8_in(&wc, group_name);
  1542. if(((ix = (wc - translator->letter_bits_offset)) >= 0) && (ix < 128))
  1543. {
  1544. group3_ix = ix+1; // not zero
  1545. }
  1546. }
  1547. }
  1548. if((group3_ix == 0) && (strlen(group_name) > 2))
  1549. {
  1550. if(utf8_in(&c,group_name) < 2)
  1551. {
  1552. fprintf(f_log,"%5d: Group name longer than 2 bytes (UTF8)",linenum);
  1553. error_count++;
  1554. }
  1555. group_name[2] = 0;
  1556. }
  1557. }
  1558. continue;
  1559. }
  1560. switch(compile_mode)
  1561. {
  1562. case 1: // .group
  1563. prule = compile_rule(buf);
  1564. if(prule != NULL)
  1565. {
  1566. if(n_rules < N_RULES)
  1567. {
  1568. rules[n_rules++] = prule;
  1569. }
  1570. else
  1571. {
  1572. if(err_n_rules == 0)
  1573. {
  1574. fprintf(stderr, "\nExceeded limit of rules (%d) in group '%s'\n", N_RULES, group_name);
  1575. error_count++;
  1576. err_n_rules = 1;
  1577. }
  1578. }
  1579. }
  1580. break;
  1581. case 2: // .replace
  1582. {
  1583. int replace1;
  1584. int replace2;
  1585. char *p;
  1586. p = buf;
  1587. replace1 = 0;
  1588. replace2 = 0;
  1589. while(isspace2(*p)) p++;
  1590. ix = 0;
  1591. while((unsigned char)(*p) > 0x20) // not space or zero-byte
  1592. {
  1593. p += utf8_in(&c,p);
  1594. replace1 += (c << ix);
  1595. ix += 16;
  1596. }
  1597. while(isspace2(*p)) p++;
  1598. ix = 0;
  1599. while((unsigned char)(*p) > 0x20)
  1600. {
  1601. p += utf8_in(&c,p);
  1602. replace2 += (c << ix);
  1603. ix += 16;
  1604. }
  1605. if(replace1 != 0)
  1606. {
  1607. Write4Bytes(f_out,replace1); // write as little-endian
  1608. Write4Bytes(f_out,replace2); // if big-endian, reverse the bytes in LoadDictionary()
  1609. }
  1610. }
  1611. break;
  1612. }
  1613. }
  1614. fclose(f_temp);
  1615. qsort((void *)rgroup,n_rgroups,sizeof(rgroup[0]),(int (__cdecl *)(const void *,const void *))rgroup_sorter);
  1616. if((f_temp = fopen(fname_temp,"rb"))==NULL)
  1617. return(2);
  1618. prev_rgroup_name = "\n";
  1619. for(gp = 0; gp < n_rgroups; gp++)
  1620. {
  1621. fseek(f_temp,rgroup[gp].start,SEEK_SET);
  1622. if((different = strcmp(rgroup[gp].name, prev_rgroup_name)) != 0)
  1623. {
  1624. // not the same as the previous group
  1625. if(gp > 0)
  1626. fputc(RULE_GROUP_END,f_out);
  1627. fputc(RULE_GROUP_START,f_out);
  1628. if(rgroup[gp].group3_ix != 0)
  1629. {
  1630. n_groups3++;
  1631. fputc(1,f_out);
  1632. fputc(rgroup[gp].group3_ix, f_out);
  1633. }
  1634. else
  1635. {
  1636. fprintf(f_out, "%s", prev_rgroup_name = rgroup[gp].name);
  1637. }
  1638. fputc(0,f_out);
  1639. }
  1640. for(ix=rgroup[gp].length; ix>0; ix--)
  1641. {
  1642. c = fgetc(f_temp);
  1643. fputc(c,f_out);
  1644. }
  1645. if(different)
  1646. {
  1647. }
  1648. }
  1649. fputc(RULE_GROUP_END,f_out);
  1650. fputc(0,f_out);
  1651. fclose(f_temp);
  1652. remove(fname_temp);
  1653. fprintf(f_log,"\t%d rules, %d groups (%d)\n\n",count,n_rgroups,n_groups3);
  1654. return(0);
  1655. }
  1656. int CompileDictionary(const char *dsource, const char *dict_name, FILE *log, char *fname_err, int flags)
  1657. {
  1658. // fname: space to write the filename in case of error
  1659. // flags: bit 0: include source line number information, for debug purposes.
  1660. FILE *f_in;
  1661. FILE *f_out;
  1662. int offset_rules=0;
  1663. int value;
  1664. char fname_in[sizeof(path_home)+45];
  1665. char fname_out[sizeof(path_home)+15];
  1666. char fname_temp[sizeof(path_home)+15];
  1667. char path[sizeof(path_home)+40]; // path_dsource+20
  1668. error_count = 0;
  1669. error_need_dictionary = 0;
  1670. memset(letterGroupsDefined,0,sizeof(letterGroupsDefined));
  1671. debug_flag = flags & 1;
  1672. if(dsource == NULL)
  1673. dsource = "";
  1674. f_log = log;
  1675. if(f_log == NULL)
  1676. f_log = stderr;
  1677. // try with and without '.txt' extension
  1678. sprintf(path,"%s%s_",dsource,dict_name);
  1679. sprintf(fname_in,"%srules.txt",path);
  1680. if((f_in = fopen(fname_in,"r")) == NULL)
  1681. {
  1682. sprintf(fname_in,"%srules",path);
  1683. if((f_in = fopen_log(fname_in,"r")) == NULL)
  1684. {
  1685. if(fname_err)
  1686. strcpy(fname_err,fname_in);
  1687. return(-1);
  1688. }
  1689. }
  1690. sprintf(fname_out,"%s%c%s_dict",path_home,PATHSEP,dict_name);
  1691. if((f_out = fopen_log(fname_out,"wb+")) == NULL)
  1692. {
  1693. if(fname_err)
  1694. strcpy(fname_err,fname_out);
  1695. fclose(f_in);
  1696. return(-1);
  1697. }
  1698. sprintf(fname_temp,"%s%ctemp",path_home,PATHSEP);
  1699. value = N_HASH_DICT;
  1700. Write4Bytes(f_out,value);
  1701. Write4Bytes(f_out,offset_rules);
  1702. compile_dictlist_start();
  1703. fprintf(f_log,"Using phonemetable: '%s'\n",phoneme_tab_list[phoneme_tab_number].name);
  1704. compile_dictlist_file(path,"roots");
  1705. if(translator->langopts.listx)
  1706. {
  1707. compile_dictlist_file(path,"list");
  1708. compile_dictlist_file(path,"listx");
  1709. }
  1710. else
  1711. {
  1712. compile_dictlist_file(path,"listx");
  1713. compile_dictlist_file(path,"list");
  1714. }
  1715. compile_dictlist_file(path,"extra");
  1716. compile_dictlist_end(f_out);
  1717. offset_rules = ftell(f_out);
  1718. fprintf(f_log,"Compiling: '%s'\n",fname_in);
  1719. compile_dictrules(f_in,f_out,fname_temp);
  1720. fclose(f_in);
  1721. fseek(f_out,4,SEEK_SET);
  1722. Write4Bytes(f_out,offset_rules);
  1723. fclose(f_out);
  1724. fflush(f_log);
  1725. LoadDictionary(translator, dict_name, 0);
  1726. return(error_count);
  1727. }