eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

compiledict.cpp 42KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013
  1. /***************************************************************************
  2. * Copyright (C) 2005 to 2014 by Jonathan Duddington *
  3. * email: [email protected] *
  4. * Copyright (C) 2015 by Reece H. Dunn *
  5. * *
  6. * This program is free software; you can redistribute it and/or modify *
  7. * it under the terms of the GNU General Public License as published by *
  8. * the Free Software Foundation; either version 3 of the License, or *
  9. * (at your option) any later version. *
  10. * *
  11. * This program is distributed in the hope that it will be useful, *
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  14. * GNU General Public License for more details. *
  15. * *
  16. * You should have received a copy of the GNU General Public License *
  17. * along with this program; if not, write see: *
  18. * <http://www.gnu.org/licenses/>. *
  19. ***************************************************************************/
  20. #include "StdAfx.h"
  21. #include <stdio.h>
  22. #include <ctype.h>
  23. #include <stdlib.h>
  24. #include <string.h>
  25. #include <wctype.h>
  26. #include "speak_lib.h"
  27. #include "speech.h"
  28. #include "phoneme.h"
  29. #include "synthesize.h"
  30. #include "translate.h"
  // ---- declarations of helpers and data defined in other source files ----
  31. extern void Write4Bytes(FILE *f, int value);
  32. int HashDictionary(const char *string);
  // Stream that diagnostics are written to (see fopen_log and compile_line).
  33. static FILE *f_log = NULL;
  34. extern char *dir_dictionary;
  35. extern char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes
  // Number of the line currently being compiled; reset by compile_dictlist_file
  // and printed in front of each diagnostic.
  36. static int linenum;
  // Incremented each time an error is reported.
  37. static int error_count;
  // Set to 1 by the $textmode keyword and to 0 by $phonememode (see compile_line);
  // reset to 0 at the start of each list file.
  38. static int text_mode = 0;
  39. static int debug_flag = 0;
  // Counts (up to 3) the times compile_line got an empty translation back
  // from TranslateWord and logged "Need to compile dictionary again".
  40. static int error_need_dictionary = 0;
  // Per hash value: number of entries added while reading the list files;
  // compile_dictlist_end later overwrites each slot with a file offset.
  41. static int hash_counts[N_HASH_DICT];
  // Per hash value: head of a singly linked list of compiled entries. Each node
  // is a malloc'd block holding the next pointer followed by the entry bytes.
  42. static char *hash_chains[N_HASH_DICT];
  43. static char letterGroupsDefined[N_LETTER_GROUPS];
  // Names of the '$' keywords that can appear inside a rule, with the byte
  // value that represents each one. DecodeRule() uses this table (through
  // LookupMnemName) to turn the byte that follows RULE_DOLLAR back into text.
  // The table is terminated by an entry whose name is NULL.
  44. MNEM_TAB mnem_rules[] = {
  45. {"unpr", DOLLAR_UNPR},
  46. {"noprefix", DOLLAR_NOPREFIX}, // rule fails if a prefix has been removed
  47. {"list", DOLLAR_LIST}, // a pronunciation is given in the *_list file
  48. {"w_alt1", 0x11},
  49. {"w_alt2", 0x12},
  50. {"w_alt3", 0x13},
  51. {"w_alt4", 0x14},
  52. {"w_alt5", 0x15},
  53. {"w_alt6", 0x16},
  54. {"w_alt", 0x11}, // note: put longer names before their sub-strings
  55. {"p_alt1", 0x21},
  56. {"p_alt2", 0x22},
  57. {"p_alt3", 0x23},
  58. {"p_alt4", 0x24},
  59. {"p_alt5", 0x25},
  60. {"p_alt6", 0x26},
  61. {"p_alt", 0x21}, // same value as p_alt1; listed after the longer names (see note on w_alt)
  62. {NULL, -1}
  63. };
  // Keywords recognised in a dictionary list line, with the flag code each one
  // produces. compile_line() looks keywords up here (LookupMnem) and
  // print_dictionary_flags() maps codes back to names (LookupMnemName).
  // The table is terminated by an entry whose name is NULL.
  64. MNEM_TAB mnem_flags[] = {
  65. // these in the first group put a value in bits0-3 of dictionary_flags
  66. {"$1", 0x41}, // stress on 1st syllable
  67. {"$2", 0x42}, // stress on 2nd syllable
  68. {"$3", 0x43},
  69. {"$4", 0x44},
  70. {"$5", 0x45},
  71. {"$6", 0x46},
  72. {"$7", 0x47},
  73. {"$u", 0x48}, // reduce to unstressed
  74. {"$u1", 0x49},
  75. {"$u2", 0x4a},
  76. {"$u3", 0x4b},
  77. {"$u+", 0x4c}, // reduce to unstressed, but stress at end of clause
  78. {"$u1+", 0x4d},
  79. {"$u2+", 0x4e},
  80. {"$u3+", 0x4f},
  81. // these set the corresponding numbered bit of dictionary_flags
  82. {"$pause", 8}, // ensure pause before this word
  83. {"$strend", 9}, // full stress if at end of clause
  84. {"$strend2", 10}, // full stress if at end of clause, or only followed by unstressed
  85. {"$unstressend",11}, // reduce stress at end of clause
  86. {"$accent_before",12}, // used with accent names, say this accent name before the letter name
  87. {"$abbrev", 13}, // use this pronunciation rather than split into letters
  88. // language specific
  89. {"$double", 14}, // IT double the initial consonant of next word
  90. {"$alt", 15}, // use alternative pronunciation
  91. {"$alt1", 15}, // synonym for $alt
  92. {"$alt2", 16},
  93. {"$alt3", 17},
  94. {"$alt4", 18},
  95. {"$alt5", 19},
  96. {"$alt6", 20},
  97. {"$alt7", 21},
  98. {"$combine", 23}, // Combine with the next word
  99. {"$dot", 24}, // ignore '.' after this word (abbreviation)
  100. {"$hasdot", 25}, // use this pronunciation if there is a dot after the word
  101. {"$max3", 27}, // limit to 3 repetitions
  102. {"$brk", 28}, // a shorter $pause
  103. {"$text", 29}, // word translates to replacement text, not phonemes
  104. // flags in dictionary word 2
  105. {"$verbf", 0x20}, // verb follows
  106. {"$verbsf", 0x21}, // verb follows, allow -s suffix
  107. {"$nounf", 0x22}, // noun follows
  108. {"$pastf", 0x23}, // past tense follows
  109. {"$verb", 0x24}, // use this pronunciation when it's a verb
  110. {"$noun", 0x25}, // use this pronunciation when it's a noun
  111. {"$past", 0x26}, // use this pronunciation when it's past tense
  112. {"$verbextend",0x28}, // extend influence of 'verb follows'
  113. {"$capital", 0x29}, // use this pronunciation if initial letter is upper case
  114. {"$allcaps", 0x2a}, // use this pronunciation if the whole word is upper case
  115. {"$accent", 0x2b}, // character name is base-character name + accent name
  116. {"$sentence",0x2d}, // only if this clause is a sentence (i.e. terminator is {. ? !} not {, ; :})
  117. {"$only", 0x2e}, // only match on this word without suffix
  118. {"$onlys", 0x2f}, // only match with none, or with 's' suffix
  119. {"$stem", 0x30}, // must have a suffix
  120. {"$atend", 0x31}, // use this pronunciation if at end of clause
  121. {"$atstart", 0x32}, // use this pronunciation at start of clause
  122. {"$native", 0x33}, // not if we've switched translators
  123. // doesn't set dictionary_flags
  124. {"$?", 100}, // conditional rule, followed by byte giving the condition number
  125. {"$textmode", 200}, // compile_line: following entries give replacement text
  126. {"$phonememode", 201}, // compile_line: following entries give phonemes again
  127. {NULL, -1}
  128. };
  // Maximum number of characters in a rule-group name; buffers that hold a
  // name are declared with LEN_GROUP_NAME+1 bytes to leave room for the NUL.
  129. #define LEN_GROUP_NAME 12
  // Record describing one rule group. It is not used in this part of the file.
  // NOTE(review): start/length presumably locate the group's compiled data and
  // group3_ix presumably indexes a secondary group table - confirm against the
  // code that fills this struct.
  130. typedef struct {
  131. char name[LEN_GROUP_NAME+1];
  132. unsigned int start;
  133. unsigned int length;
  134. int group3_ix;
  135. } RGROUP;
  /* Whitespace test used instead of the C library's isspace(), because on
   * Windows isspace(0xe1) gives TRUE.
   *
   * Returns 1 when c is no greater than ' ' and its low byte is non-zero,
   * otherwise 0 (so both 0 and every value above ' ' count as non-space). */
  int isspace2(unsigned int c)
  {
      unsigned int low_byte = c & 0xff;

      if ((low_byte != 0) && (c <= ' '))
          return 1;

      return 0;
  }
  144. static FILE *fopen_log(const char *fname,const char *access)
  145. {//==================================================
  146. // performs fopen, but produces error message to f_log if it fails
  147. FILE *f;
  148. if((f = fopen(fname,access)) == NULL)
  149. {
  150. if(f_log != NULL)
  151. fprintf(f_log,"Can't access (%s) file '%s'\n",access,fname);
  152. }
  153. return(f);
  154. }
  155. const char *LookupMnemName(MNEM_TAB *table, const int value)
  156. //==========================================================
  157. /* Lookup a mnemonic string in a table, return its name */
  158. {
  159. while(table->mnem != NULL)
  160. {
  161. if(table->value==value)
  162. return(table->mnem);
  163. table++;
  164. }
  165. return(""); /* not found */
  166. } /* end of LookupMnemValue */
  167. void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len)
  168. {//========================================================================
  169. int stress;
  170. int ix;
  171. const char *name;
  172. int len;
  173. int total = 0;
  174. buf[0] = 0;
  175. if((stress = flags[0] & 0xf) != 0)
  176. {
  177. sprintf(buf, "%s", LookupMnemName(mnem_flags, stress + 0x40));
  178. total = strlen(buf);
  179. buf += total;
  180. }
  181. for(ix=8; ix<64; ix++)
  182. {
  183. if(((ix < 30) && (flags[0] & (1 << ix))) || ((ix >= 0x20) && (flags[1] & (1 << (ix-0x20)))))
  184. {
  185. name = LookupMnemName(mnem_flags, ix);
  186. len = strlen(name) + 1;
  187. total += len;
  188. if(total >= buf_len)
  189. continue;
  190. sprintf(buf, " %s", name);
  191. buf += len;
  192. }
  193. }
  194. }
  /* Turn one compiled rule back into readable text.
   *
   *   group_chars, group_length  characters copied to the start of the match
   *                              text before the rule bytes are decoded
   *   rule                       compiled rule bytes; decoding stops at a 0 or
   *                              RULE_PHONEMES byte
   *   control                    only FLAG_UNPRON_TEST is examined: without it
   *                              a '$' keyword with value 0x01 is not printed
   *
   * Returns a pointer to a static 80-byte buffer that is overwritten by the
   * next call. Layout of the text: optional "<line>:\t", optional "?<cond> ",
   * optional pre-context followed by ") ", the match text, and an optional
   * " (" followed by the post-context. The result is padded with spaces to at
   * least 8 characters.
   *
   * NOTE(review): writes into buf, buf_pre and suffix are not bounds-checked;
   * only the final copy into output[] is limited. */
  195. char *DecodeRule(const char *group_chars, int group_length, char *rule, int control)
  196. {//=================================================================================
  197. /* Convert compiled match template to ascii */
  198. unsigned char rb;
  199. unsigned char c;
  200. char *p;
  201. char *p_end;
  202. int ix;
  203. int match_type;
  204. int finished=0;
  205. int value;
  // local line number decoded from the rule; hides the file-scope linenum
  206. int linenum=0;
  207. int flags;
  208. int suffix_char;
  209. int condition_num=0;
  210. int at_start = 0;
  211. const char *name;
  212. char buf[200];
  213. char buf_pre[200];
  214. char suffix[20];
  215. static char output[80];
  // display character for each rule code up to RULE_LAST_RULE, indexed by the code
  216. static char symbols[] =
  217. {' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
  218. '&','%','+','#','S','D','Z','A','L','!',' ','@','?','J','N','K','V','?','T','X','?','W'
  219. };
  // display letter for a RULE_LETTERGP argument, indexed by (argument - 'A')
  220. static char symbols_lg[] = {'A','B','C','H','F','G','Y'};
  221. match_type = 0;
  222. buf_pre[0] = 0;
  // start the match text with the group's own characters
  223. for(ix=0; ix<group_length; ix++)
  224. {
  225. buf[ix] = group_chars[ix];
  226. }
  227. buf[ix] = 0;
  228. p = &buf[strlen(buf)];
  229. while(!finished)
  230. {
  231. rb = *rule++;
  // codes up to RULE_LINENUM are control codes; they emit no character themselves
  232. if(rb <= RULE_LINENUM)
  233. {
  234. switch(rb)
  235. {
  236. case 0:
  237. case RULE_PHONEMES:
  238. finished=1;
  239. break;
  240. case RULE_PRE_ATSTART:
  241. at_start = 1; // drop through to next case
  242. case RULE_PRE:
  // terminate the current text and collect what follows in buf_pre
  243. match_type = RULE_PRE;
  244. *p = 0;
  245. p = buf_pre;
  246. break;
  247. case RULE_POST:
  // terminate the current text, then append " (" to buf and continue writing there
  248. match_type = RULE_POST;
  249. *p = 0;
  250. strcat(buf," (");
  251. p = &buf[strlen(buf)];
  252. break;
  253. case RULE_PH_COMMON:
  254. break;
  255. case RULE_CONDITION:
  256. /* conditional rule, next byte gives condition number */
  257. condition_num = *rule++;
  258. break;
  259. case RULE_LINENUM:
  // two bytes, each stored with 1 added: low part first, then the multiple of 255
  260. value = (rule[1] & 0xff) - 1;
  261. linenum = (rule[0] & 0xff) - 1 + (value * 255);
  262. rule+=2;
  263. break;
  264. }
  265. continue;
  266. }
  267. if(rb == RULE_DOLLAR)
  268. {
  // '$' keyword: the next byte is a value looked up in mnem_rules
  269. value = *rule++ & 0xff;
  270. if((value != 0x01) || (control & FLAG_UNPRON_TEST))
  271. {
  272. // TODO write the string backwards if in RULE_PRE
  273. p[0] = '$';
  274. name = LookupMnemName(mnem_rules, value);
  275. strcpy(&p[1],name);
  276. p += (strlen(name)+1);
  277. }
  278. c = ' ';
  279. }
  280. else if(rb == RULE_ENDING)
  281. {
  // suffix rule: three bytes follow - two flag bytes (7 bits each) and a number
  282. static const char *flag_chars = "eipvdfq tba ";
  283. flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f);
  284. suffix_char = 'S';
  285. if(flags & (SUFX_P >> 8))
  286. suffix_char = 'P';
  287. sprintf(suffix,"%c%d",suffix_char,rule[2] & 0x7f);
  288. rule += 3;
  // append one letter from flag_chars for each of the low 9 flag bits that is set
  289. for(ix=0; ix<9; ix++)
  290. {
  291. if(flags & 1)
  292. sprintf(&suffix[strlen(suffix)],"%c",flag_chars[ix]);
  293. flags = (flags >> 1);
  294. }
  295. strcpy(p,suffix);
  296. p += strlen(suffix);
  297. c = ' ';
  298. }
  299. else if(rb == RULE_LETTERGP)
  300. {
  // NOTE(review): the index (*rule - 'A') is not range-checked against symbols_lg
  301. c = symbols_lg[*rule++ - 'A'];
  302. }
  303. else if(rb == RULE_LETTERGP2)
  304. {
  // shown as 'L' followed by a two-digit group number
  305. value = *rule++ - 'A';
  306. p[0] = 'L';
  307. p[1] = (value / 10) + '0';
  308. c = (value % 10) + '0';
  309. if(match_type == RULE_PRE)
  310. {
  // the pre-context is reversed on output, so store units, tens, 'L' here
  311. p[0] = c;
  312. c = 'L';
  313. }
  314. p+=2;
  315. }
  316. else if(rb <= RULE_LAST_RULE)
  317. c = symbols[rb];
  318. else if(rb == RULE_SPACE)
  319. c = '_';
  320. else
  321. c = rb;
  322. *p++ = c;
  323. }
  324. *p = 0;
  // assemble the result in output[]; p_end is the last usable byte
  325. p = output;
  326. p_end = p + sizeof(output) - 1;
  327. if(linenum > 0)
  328. {
  // NOTE(review): p advances by a fixed 7, which assumes linenum prints in at most 5 digits
  329. sprintf(p,"%5d:\t",linenum);
  330. p += 7;
  331. }
  332. if(condition_num > 0)
  333. {
  334. sprintf(p,"?%d ",condition_num);
  335. p = &p[strlen(p)];
  336. }
  337. if(((ix = strlen(buf_pre)) > 0) || at_start)
  338. {
  339. if(at_start)
  340. *p++ = '_';
  // copy the pre-context in reverse order, leaving room for ") " and the terminator
  341. while((--ix >= 0) && (p < p_end-3))
  342. *p++ = buf_pre[ix];
  343. *p++ = ')';
  344. *p++ = ' ';
  345. }
  346. *p = 0;
  347. buf[p_end - p] = 0; // prevent overflow in output[]
  348. strcat(p,buf);
  // pad short results with spaces to a width of 8
  349. ix = strlen(output);
  350. while(ix < 8)
  351. output[ix++]=' ';
  352. output[ix]=0;
  353. return(output);
  354. } /* end of DecodeRule */
  355. static int compile_line(char *linebuf, char *dict_line, int *hash)
  356. {//===============================================================
  357. // Compile a line in the language_list file
  358. unsigned char c;
  359. char *p;
  360. char *word;
  361. char *phonetic;
  362. unsigned int ix;
  363. int step;
  364. unsigned int n_flag_codes = 0;
  365. int flagnum;
  366. int flag_offset;
  367. int length;
  368. int multiple_words = 0;
  369. int multiple_numeric_hyphen = 0;
  370. char *multiple_string = NULL;
  371. char *multiple_string_end = NULL;
  372. int len_word;
  373. int len_phonetic;
  374. int text_not_phonemes; // this word specifies replacement text, not phonemes
  375. unsigned int wc;
  376. int all_upper_case;
  377. char *mnemptr;
  378. unsigned char flag_codes[100];
  379. char encoded_ph[200];
  380. char bad_phoneme_str[4];
  381. int bad_phoneme;
  382. static char nullstring[] = {0};
  383. text_not_phonemes = 0;
  384. phonetic = word = nullstring;
  385. p = linebuf;
  386. // while(isspace2(*p)) p++;
  387. #ifdef deleted
  388. if(*p == '$')
  389. {
  390. if(memcmp(p,"$textmode",9) == 0)
  391. {
  392. text_mode = 1;
  393. return(0);
  394. }
  395. if(memcmp(p,"$phonememode",12) == 0)
  396. {
  397. text_mode = 0;
  398. return(0);
  399. }
  400. }
  401. #endif
  402. step = 0;
  403. c = 0;
  404. while(c != '\n')
  405. {
  406. c = *p;
  407. if((c == '?') && (step==0))
  408. {
  409. // conditional rule, allow only if the numbered condition is set for the voice
  410. flag_offset = 100;
  411. p++;
  412. if(*p == '!')
  413. {
  414. // allow only if the numbered condition is NOT set
  415. flag_offset = 132;
  416. p++;
  417. }
  418. ix = 0;
  419. if(IsDigit09(*p))
  420. {
  421. ix += (*p-'0');
  422. p++;
  423. }
  424. if(IsDigit09(*p))
  425. {
  426. ix = ix*10 + (*p-'0');
  427. p++;
  428. }
  429. flag_codes[n_flag_codes++] = ix + flag_offset;
  430. c = *p;
  431. }
  432. if((c == '$') && isalnum(p[1]))
  433. {
  434. /* read keyword parameter */
  435. mnemptr = p;
  436. while(!isspace2(c = *p)) p++;
  437. *p = 0;
  438. flagnum = LookupMnem(mnem_flags,mnemptr);
  439. if(flagnum > 0)
  440. {
  441. if(flagnum == 200)
  442. {
  443. text_mode = 1;
  444. }
  445. else if(flagnum == 201)
  446. {
  447. text_mode = 0;
  448. }
  449. else if(flagnum == BITNUM_FLAG_TEXTMODE)
  450. {
  451. text_not_phonemes = 1;
  452. }
  453. else
  454. {
  455. flag_codes[n_flag_codes++] = flagnum;
  456. }
  457. }
  458. else
  459. {
  460. fprintf(f_log,"%5d: Unknown keyword: %s\n",linenum,mnemptr);
  461. error_count++;
  462. }
  463. }
  464. if((c == '/') && (p[1] == '/') && (multiple_words==0))
  465. {
  466. c = '\n'; /* "//" treat comment as end of line */
  467. }
  468. switch(step)
  469. {
  470. case 0:
  471. if(c == '(')
  472. {
  473. multiple_words = 1;
  474. word = p+1;
  475. step = 1;
  476. }
  477. else if(!isspace2(c))
  478. {
  479. word = p;
  480. step = 1;
  481. }
  482. break;
  483. case 1:
  484. if((c == '-') && multiple_words)
  485. {
  486. if(IsDigit09(word[0]))
  487. {
  488. multiple_numeric_hyphen = 1;
  489. }
  490. // else // ???
  491. {
  492. flag_codes[n_flag_codes++] = BITNUM_FLAG_HYPHENATED;
  493. }
  494. c = ' ';
  495. }
  496. if(isspace2(c))
  497. {
  498. p[0] = 0; /* terminate english word */
  499. if(multiple_words)
  500. {
  501. multiple_string = multiple_string_end = p+1;
  502. step = 2;
  503. }
  504. else
  505. {
  506. step = 3;
  507. }
  508. }
  509. else if(c == ')')
  510. {
  511. if(multiple_words)
  512. {
  513. p[0] = 0;
  514. multiple_words = 0;
  515. step = 3;
  516. }
  517. else if(word[0] != '_')
  518. {
  519. fprintf(f_log, "%5d: Missing '('\n", linenum);
  520. error_count++;
  521. step = 3;
  522. }
  523. }
  524. break;
  525. case 2:
  526. if(isspace2(c))
  527. {
  528. multiple_words++;
  529. }
  530. else if(c == ')')
  531. {
  532. p[0] = ' '; // terminate extra string
  533. multiple_string_end = p+1;
  534. step = 3;
  535. }
  536. break;
  537. case 3:
  538. if(!isspace2(c))
  539. {
  540. phonetic = p;
  541. step = 4;
  542. }
  543. break;
  544. case 4:
  545. if(isspace2(c))
  546. {
  547. p[0] = 0; /* terminate phonetic */
  548. step = 5;
  549. }
  550. break;
  551. case 5:
  552. break;
  553. }
  554. p++;
  555. }
  556. if(word[0] == 0)
  557. {
  558. return(0); /* blank line */
  559. }
  560. if(text_mode)
  561. text_not_phonemes = 1;
  562. if(text_not_phonemes)
  563. {
  564. if(word[0] == '_')
  565. {
  566. // This is a special word, used by eSpeak. Translate this into phonemes now
  567. strcat(phonetic, " "); // need a space to indicate word-boundary
  568. // PROBLEM vowel reductions are not applied to the translated phonemes
  569. // condition rules are not applied
  570. TranslateWord(translator,phonetic,0,NULL,NULL);
  571. text_not_phonemes = 0;
  572. strncpy0(encoded_ph, word_phonemes, N_WORD_BYTES-4);
  573. if((word_phonemes[0] == 0) && (error_need_dictionary < 3))
  574. {
  575. // the dictionary was not loaded, we need a second attempt
  576. error_need_dictionary++;
  577. fprintf(f_log,"%5d: Need to compile dictionary again\n",linenum);
  578. }
  579. {
  580. //char decoded_phonemes[128];
  581. //DecodePhonemes(word_phonemes,decoded_phonemes);
  582. //printf("Translator %x %s [%s] [%s]\n",translator->translator_name,word,phonetic,decoded_phonemes);
  583. }
  584. }
  585. else
  586. {
  587. // this is replacement text, so don't encode as phonemes. Restrict the length of the replacement word
  588. strncpy0(encoded_ph,phonetic,N_WORD_BYTES-4);
  589. }
  590. }
  591. else
  592. {
  593. EncodePhonemes(phonetic,encoded_ph,&bad_phoneme);
  594. if(strchr(encoded_ph,phonSWITCH) != 0)
  595. {
  596. flag_codes[n_flag_codes++] = BITNUM_FLAG_ONLY_S; // don't match on suffixes (except 's') when switching languages
  597. }
  598. // check for errors in the phonemes codes
  599. if(bad_phoneme != 0)
  600. {
  601. // unrecognised phoneme, report error
  602. bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
  603. fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s %s\n",linenum,bad_phoneme_str,bad_phoneme,word,phonetic);
  604. error_count++;
  605. }
  606. }
  607. if(text_not_phonemes != translator->langopts.textmode)
  608. {
  609. flag_codes[n_flag_codes++] = BITNUM_FLAG_TEXTMODE;
  610. }
  611. if(sscanf(word,"U+%x",&wc) == 1)
  612. {
  613. // Character code
  614. ix = utf8_out(wc, word);
  615. word[ix] = 0;
  616. }
  617. else if(word[0] != '_')
  618. {
  619. // convert to lower case, and note if the word is all-capitals
  620. int c2;
  621. all_upper_case = 1;
  622. p = word;
  623. for(p=word;;)
  624. {
  625. // this assumes that the lower case char is the same length as the upper case char
  626. // OK, except for Turkish "I", but use towlower() rather than towlower2()
  627. ix = utf8_in(&c2,p);
  628. if(c2 == 0)
  629. break;
  630. if(iswupper2(c2))
  631. {
  632. utf8_out(towlower2(c2),p);
  633. }
  634. else
  635. {
  636. all_upper_case = 0;
  637. }
  638. p += ix;
  639. }
  640. if(all_upper_case)
  641. {
  642. flag_codes[n_flag_codes++] = BITNUM_FLAG_ALLCAPS;
  643. }
  644. }
  645. len_word = strlen(word);
  646. if(translator->transpose_min > 0)
  647. {
  648. len_word = TransposeAlphabet(translator, word);
  649. }
  650. *hash = HashDictionary(word);
  651. len_phonetic = strlen(encoded_ph);
  652. dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed
  653. len_word &= 0x3f;
  654. memcpy(&dict_line[2],word,len_word);
  655. if(len_phonetic == 0)
  656. {
  657. // no phonemes specified. set bit 7
  658. dict_line[1] |= 0x80;
  659. length = len_word + 2;
  660. }
  661. else
  662. {
  663. length = len_word + len_phonetic + 3;
  664. strcpy(&dict_line[(len_word)+2],encoded_ph);
  665. }
  666. for(ix=0; ix<n_flag_codes; ix++)
  667. {
  668. dict_line[ix+length] = flag_codes[ix];
  669. }
  670. length += n_flag_codes;
  671. if((multiple_string != NULL) && (multiple_words > 0))
  672. {
  673. if(multiple_words > 10)
  674. {
  675. fprintf(f_log,"%5d: Two many parts in a multi-word entry: %d\n",linenum,multiple_words);
  676. error_count++;
  677. }
  678. else
  679. {
  680. dict_line[length++] = 80 + multiple_words;
  681. ix = multiple_string_end - multiple_string;
  682. if(multiple_numeric_hyphen)
  683. {
  684. dict_line[length++] = ' '; // ???
  685. }
  686. memcpy(&dict_line[length],multiple_string,ix);
  687. length += ix;
  688. }
  689. }
  690. dict_line[0] = length;
  691. return(length);
  692. } /* end of compile_line */
  693. static void compile_dictlist_start(void)
  694. {//=====================================
  695. // initialise dictionary list
  696. int ix;
  697. char *p;
  698. char *p2;
  699. for(ix=0; ix<N_HASH_DICT; ix++)
  700. {
  701. p = hash_chains[ix];
  702. while(p != NULL)
  703. {
  704. memcpy(&p2,p,sizeof(char *));
  705. free(p);
  706. p = p2;
  707. }
  708. hash_chains[ix] = NULL;
  709. hash_counts[ix]=0;
  710. }
  711. }
  712. static void compile_dictlist_end(FILE *f_out)
  713. {//==========================================
  714. // Write out the compiled dictionary list
  715. int hash;
  716. int length;
  717. char *p;
  718. if(f_log != NULL)
  719. {
  720. #ifdef OUTPUT_FORMAT
  721. for(hash=0; hash<N_HASH_DICT; hash++)
  722. {
  723. fprintf(f_log,"%8d",hash_counts[hash]);
  724. if((hash & 7) == 7)
  725. fputc('\n',f_log);
  726. }
  727. fflush(f_log);
  728. #endif
  729. }
  730. for(hash=0; hash<N_HASH_DICT; hash++)
  731. {
  732. p = hash_chains[hash];
  733. hash_counts[hash] = (int)ftell(f_out);
  734. while(p != NULL)
  735. {
  736. length = *(p+sizeof(char *));
  737. fwrite(p+sizeof(char *),length,1,f_out);
  738. memcpy(&p,p,sizeof(char *));
  739. }
  740. fputc(0,f_out);
  741. }
  742. }
  743. static int compile_dictlist_file(const char *path, const char* filename)
  744. {//=====================================================================
  745. int length;
  746. int hash;
  747. char *p;
  748. int count=0;
  749. FILE *f_in;
  750. char buf[200];
  751. char fname[sizeof(path_home)+45];
  752. char dict_line[128];
  753. text_mode = 0;
  754. // try with and without '.txt' extension
  755. sprintf(fname,"%s%s.txt",path,filename);
  756. if((f_in = fopen(fname,"r")) == NULL)
  757. {
  758. sprintf(fname,"%s%s",path,filename);
  759. if((f_in = fopen(fname,"r")) == NULL)
  760. return(-1);
  761. }
  762. if(f_log != NULL)
  763. fprintf(f_log,"Compiling: '%s'\n",fname);
  764. linenum=0;
  765. while(fgets(buf,sizeof(buf),f_in) != NULL)
  766. {
  767. linenum++;
  768. length = compile_line(buf,dict_line,&hash);
  769. if(length == 0) continue; /* blank line */
  770. hash_counts[hash]++;
  771. p = (char *)malloc(length+sizeof(char *));
  772. if(p == NULL)
  773. {
  774. if(f_log != NULL)
  775. {
  776. fprintf(f_log,"Can't allocate memory\n");
  777. error_count++;
  778. }
  779. break;
  780. }
  781. memcpy(p,&hash_chains[hash],sizeof(char *));
  782. hash_chains[hash] = p;
  783. memcpy(p+sizeof(char *),dict_line,length);
  784. count++;
  785. }
  786. if(f_log != NULL)
  787. fprintf(f_log,"\t%d entries\n",count);
  788. fclose(f_in);
  789. return(0);
  790. } /* end of compile_dictlist_file */
  // Work buffers for the parts of the rule currently being compiled.
  // copy_rule_string() selects one of the first five by its state number:
  // 0 = rule_cond, 1 = rule_pre, 2 = rule_match, 3 = rule_post, 4 = rule_phonemes.
  791. static char rule_cond[80];
  792. static char rule_pre[80];
  793. static char rule_match[80];
  794. static char rule_post[80];
  795. static char rule_phonemes[80];
  // NOTE(review): presumably the name of the rule group being compiled and its
  // index in a secondary group table; neither is used in this part of the file.
  796. static char group_name[LEN_GROUP_NAME+1];
  797. static int group3_ix;
  798. #define N_RULES 3000 // max rules for each group
  /* Value of a hexadecimal digit character.
   * Returns 0..15 for '0'-'9', 'a'-'f' and 'A'-'F', or -1 for anything else. */
  int isHexDigit(int c)
  {
      int digit = -1;

      if (('0' <= c) && (c <= '9'))
          digit = c - '0';
      else if (('a' <= c) && (c <= 'f'))
          digit = 10 + (c - 'a');
      else if (('A' <= c) && (c <= 'F'))
          digit = 10 + (c - 'A');

      return digit;
  }
  809. static void copy_rule_string(char *string, int *state_out)
  810. {//=======================================================
  811. // state 0: conditional, 1=pre, 2=match, 3=post, 4=phonemes
  812. static char *outbuf[5] = {rule_cond, rule_pre, rule_match, rule_post, rule_phonemes};
  813. static int next_state[5] = {2,2,4,4,4};
  814. char *output;
  815. char *p;
  816. int ix;
  817. int len;
  818. char c;
  819. int c2, c3;
  820. int sxflags;
  821. int value;
  822. int literal;
  823. int hexdigit_input = 0;
  824. int state = *state_out;
  825. MNEM_TAB *mr;
  826. if(string[0] == 0) return;
  827. output = outbuf[state];
  828. if(state==4)
  829. {
  830. // append to any previous phoneme string, i.e. allow spaces in the phoneme string
  831. len = strlen(rule_phonemes);
  832. if(len > 0)
  833. rule_phonemes[len++] = ' ';
  834. output = &rule_phonemes[len];
  835. }
  836. sxflags = 0x808000; // to ensure non-zero bytes
  837. for(p=string,ix=0;;)
  838. {
  839. literal = 0;
  840. c = *p++;
  841. if((c == '0') && (p[0] == 'x') && (isHexDigit(p[1]) >= 0) && (isHexDigit(p[2]) >= 0))
  842. {
  843. hexdigit_input = 1;
  844. c = p[1];
  845. p+= 2;
  846. }
  847. if(c == '\\')
  848. {
  849. c = *p++; // treat next character literally
  850. //#ifdef deleted
  851. if((c >= '0') && (c <= '3') && (p[0] >= '0') && (p[0] <= '7') && (p[1] >= '0') && (p[1] <= '7'))
  852. {
  853. // character code given by 3 digit octal value;
  854. c = (c-'0')*64 + (p[0]-'0')*8 + (p[1]-'0');
  855. p += 2;
  856. }
  857. //endif
  858. literal = 1;
  859. }
  860. if(hexdigit_input)
  861. {
  862. if(((c2 = isHexDigit(c)) >= 0) && ((c3 = isHexDigit(p[0])) >= 0))
  863. {
  864. c = c2 * 16 + c3;
  865. literal = 1;
  866. p++;
  867. }
  868. else
  869. {
  870. hexdigit_input = 0;
  871. }
  872. }
  873. if((state==1) || (state==3))
  874. {
  875. // replace special characters (note: 'E' is reserved for a replaced silent 'e')
  876. if(literal == 0)
  877. {
  878. static const char lettergp_letters[9] = {LETTERGP_A,LETTERGP_B,LETTERGP_C,0,0,LETTERGP_F,LETTERGP_G,LETTERGP_H,LETTERGP_Y};
  879. switch(c)
  880. {
  881. case '_':
  882. c = RULE_SPACE;
  883. break;
  884. case 'Y':
  885. c = 'I'; // drop through to next case
  886. case 'A': // vowel
  887. case 'B':
  888. case 'C':
  889. case 'H':
  890. case 'F':
  891. case 'G':
  892. if(state == 1)
  893. {
  894. // pre-rule, put the number before the RULE_LETTERGP;
  895. output[ix++] = lettergp_letters[c-'A'] + 'A';
  896. c = RULE_LETTERGP;
  897. }
  898. else
  899. {
  900. output[ix++] = RULE_LETTERGP;
  901. c = lettergp_letters[c-'A'] + 'A';
  902. }
  903. break;
  904. case 'D':
  905. c = RULE_DIGIT;
  906. break;
  907. case 'K':
  908. c = RULE_NOTVOWEL;
  909. break;
  910. case 'N':
  911. c = RULE_NO_SUFFIX;
  912. break;
  913. case 'V':
  914. c = RULE_IFVERB;
  915. break;
  916. case 'Z':
  917. c = RULE_NONALPHA;
  918. break;
  919. case '+':
  920. c = RULE_INC_SCORE;
  921. break;
  922. case '@':
  923. c = RULE_SYLLABLE;
  924. break;
  925. case '&':
  926. c = RULE_STRESSED;
  927. break;
  928. case '%':
  929. c = RULE_DOUBLE;
  930. break;
  931. case '#':
  932. c = RULE_DEL_FWD;
  933. break;
  934. case '!':
  935. c = RULE_CAPITAL;
  936. break;
  937. case 'T':
  938. output[ix++] = RULE_DOLLAR;
  939. c = 0x11;
  940. break;
  941. case 'W':
  942. c = RULE_SPELLING;
  943. break;
  944. case 'X':
  945. c = RULE_NOVOWELS;
  946. break;
  947. case 'J':
  948. c = RULE_SKIPCHARS;
  949. break;
  950. case 'L':
  951. // expect two digits
  952. c = *p++ - '0';
  953. value = *p++ - '0';
  954. c = c * 10 + value;
  955. if((value < 0) || (value > 9))
  956. {
  957. c = 0;
  958. fprintf(f_log,"%5d: Expected 2 digits after 'L'\n",linenum);
  959. error_count++;
  960. }
  961. else if((c <= 0) || (c >= N_LETTER_GROUPS) || (letterGroupsDefined[(int)c] == 0))
  962. {
  963. fprintf(f_log,"%5d: Letter group L%.2d not defined\n",linenum,c);
  964. error_count++;
  965. }
  966. c += 'A';
  967. if(state == 1)
  968. {
  969. // pre-rule, put the group number before the RULE_LETTERGP command
  970. output[ix++] = c;
  971. c = RULE_LETTERGP2;
  972. }
  973. else
  974. {
  975. output[ix++] = RULE_LETTERGP2;
  976. }
  977. break;
  978. case '$':
  979. value = 0;
  980. mr = mnem_rules;
  981. while(mr->mnem != NULL)
  982. {
  983. len = strlen(mr->mnem);
  984. if(memcmp(p, mr->mnem, len) == 0)
  985. {
  986. value = mr->value;
  987. p += len;
  988. break;
  989. }
  990. mr++;
  991. }
  992. if(state == 1)
  993. {
  994. // pre-rule, put the number before the RULE_DOLLAR
  995. output[ix++] = value;
  996. c = RULE_DOLLAR;
  997. }
  998. else
  999. {
  1000. output[ix++] = RULE_DOLLAR;
  1001. c = value;
  1002. }
  1003. if(value == 0)
  1004. {
  1005. fprintf(f_log,"%5d: $ command not recognized\n",linenum);
  1006. error_count++;
  1007. }
  1008. break;
  1009. case 'P':
  1010. sxflags |= SUFX_P; // Prefix, now drop through to Suffix
  1011. case 'S':
  1012. output[ix++] = RULE_ENDING;
  1013. value = 0;
  1014. while(!isspace2(c = *p++) && (c != 0))
  1015. {
  1016. switch(c)
  1017. {
  1018. case 'e':
  1019. sxflags |= SUFX_E;
  1020. break;
  1021. case 'i':
  1022. sxflags |= SUFX_I;
  1023. break;
  1024. case 'p': // obsolete, replaced by 'P' above
  1025. sxflags |= SUFX_P;
  1026. break;
  1027. case 'v':
  1028. sxflags |= SUFX_V;
  1029. break;
  1030. case 'd':
  1031. sxflags |= SUFX_D;
  1032. break;
  1033. case 'f':
  1034. sxflags |= SUFX_F;
  1035. break;
  1036. case 'q':
  1037. sxflags |= SUFX_Q;
  1038. break;
  1039. case 't':
  1040. sxflags |= SUFX_T;
  1041. break;
  1042. case 'b':
  1043. sxflags |= SUFX_B;
  1044. break;
  1045. case 'a':
  1046. sxflags |= SUFX_A;
  1047. break;
  1048. case 'm':
  1049. sxflags |= SUFX_M;
  1050. break;
  1051. default:
  1052. if(IsDigit09(c))
  1053. value = (value*10) + (c - '0');
  1054. break;
  1055. }
  1056. }
  1057. p--;
  1058. output[ix++] = sxflags >> 16;
  1059. output[ix++] = sxflags >> 8;
  1060. c = value | 0x80;
  1061. break;
  1062. }
  1063. }
  1064. }
  1065. output[ix++] = c;
  1066. if(c == 0) break;
  1067. }
  1068. *state_out = next_state[state];
  1069. } // end of copy_rule_string
  1070. static char *compile_rule(char *input)
  1071. {//===================================
  1072. int ix;
  1073. unsigned char c;
  1074. int wc;
  1075. char *p;
  1076. char *prule;
  1077. int len;
  1078. int len_name;
  1079. int start;
  1080. int state=2;
  1081. int finish=0;
  1082. char buf[80];
  1083. char output[150];
  1084. int bad_phoneme;
  1085. char bad_phoneme_str[4];
  1086. buf[0]=0;
  1087. rule_cond[0]=0;
  1088. rule_pre[0]=0;
  1089. rule_post[0]=0;
  1090. rule_match[0]=0;
  1091. rule_phonemes[0]=0;
  1092. p = buf;
  1093. for(ix=0; finish==0; ix++)
  1094. {
  1095. c = input[ix];
  1096. switch(c = input[ix])
  1097. {
  1098. case ')': // end of prefix section
  1099. *p = 0;
  1100. state = 1;
  1101. copy_rule_string(buf,&state);
  1102. p = buf;
  1103. break;
  1104. case '(': // start of suffix section
  1105. *p = 0;
  1106. state = 2;
  1107. copy_rule_string(buf,&state);
  1108. state = 3;
  1109. p = buf;
  1110. if(input[ix+1] == ' ')
  1111. {
  1112. fprintf(f_log,"%5d: Syntax error. Space after (\n",linenum);
  1113. error_count++;
  1114. }
  1115. break;
  1116. case '\n': // end of line
  1117. case '\r':
  1118. case 0: // end of line
  1119. *p = 0;
  1120. copy_rule_string(buf,&state);
  1121. finish=1;
  1122. break;
  1123. case '\t': // end of section section
  1124. case ' ':
  1125. *p = 0;
  1126. copy_rule_string(buf,&state);
  1127. p = buf;
  1128. break;
  1129. case '?':
  1130. if(state==2)
  1131. state=0;
  1132. else
  1133. *p++ = c;
  1134. break;
  1135. default:
  1136. *p++ = c;
  1137. break;
  1138. }
  1139. }
  1140. if(strcmp(rule_match,"$group")==0)
  1141. strcpy(rule_match,group_name);
  1142. if(rule_match[0]==0)
  1143. {
  1144. if(rule_post[0] != 0)
  1145. {
  1146. fprintf(f_log,"%5d: Syntax error\n",linenum);
  1147. error_count++;
  1148. }
  1149. return(NULL);
  1150. }
  1151. EncodePhonemes(rule_phonemes,buf,&bad_phoneme);
  1152. if(bad_phoneme != 0)
  1153. {
  1154. bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
  1155. fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s\n",linenum,bad_phoneme_str,bad_phoneme,input);
  1156. error_count++;
  1157. }
  1158. strcpy(output,buf);
  1159. len = strlen(buf)+1;
  1160. len_name = strlen(group_name);
  1161. if((len_name > 0) && (memcmp(rule_match,group_name,len_name) != 0))
  1162. {
  1163. utf8_in(&wc,rule_match);
  1164. if((group_name[0] == '9') && IsDigit(wc))
  1165. {
  1166. // numeric group, rule_match starts with a digit, so OK
  1167. }
  1168. else
  1169. {
  1170. fprintf(f_log,"%5d: Wrong initial letters '%s' for group '%s'\n",linenum,rule_match,group_name);
  1171. error_count++;
  1172. }
  1173. }
  1174. strcpy(&output[len],rule_match);
  1175. len += strlen(rule_match);
  1176. if(debug_flag)
  1177. {
  1178. output[len] = RULE_LINENUM;
  1179. output[len+1] = (linenum % 255) + 1;
  1180. output[len+2] = (linenum / 255) + 1;
  1181. len+=3;
  1182. }
  1183. if(rule_cond[0] != 0)
  1184. {
  1185. ix = -1;
  1186. if(rule_cond[0] == '!')
  1187. {
  1188. // allow the rule only if the condition number is NOT set for the voice
  1189. ix = atoi(&rule_cond[1]) + 32;
  1190. }
  1191. else
  1192. {
  1193. // allow the rule only if the condition number is set for the voice
  1194. ix = atoi(rule_cond);
  1195. }
  1196. if((ix > 0) && (ix < 255))
  1197. {
  1198. output[len++] = RULE_CONDITION;
  1199. output[len++] = ix;
  1200. }
  1201. else
  1202. {
  1203. fprintf(f_log,"%5d: bad condition number ?%d\n",linenum,ix);
  1204. error_count++;
  1205. }
  1206. }
  1207. if(rule_pre[0] != 0)
  1208. {
  1209. start = 0;
  1210. if(rule_pre[0] == RULE_SPACE)
  1211. {
  1212. // omit '_' at the beginning of the pre-string and imply it by using RULE_PRE_ATSTART
  1213. c = RULE_PRE_ATSTART;
  1214. start = 1;
  1215. }
  1216. else
  1217. {
  1218. c = RULE_PRE;
  1219. }
  1220. output[len++] = c;
  1221. // output PRE string in reverse order
  1222. for(ix = strlen(rule_pre)-1; ix>=start; ix--)
  1223. output[len++] = rule_pre[ix];
  1224. }
  1225. if(rule_post[0] != 0)
  1226. {
  1227. sprintf(&output[len],"%c%s",RULE_POST,rule_post);
  1228. len += (strlen(rule_post)+1);
  1229. }
  1230. output[len++]=0;
  1231. prule = (char *)malloc(len);
  1232. memcpy(prule,output,len);
  1233. return(prule);
  1234. } // end of compile_rule
  // qsort() comparison for compiled rules. Each element points at two
  // consecutive strings: the rules are ordered by the first string, and by the
  // string that follows its terminator when the first strings are equal.
  int __cdecl string_sorter(char **a, char **b)
  {//===========================================
      char *first = *a;
      char *second = *b;
      int order = strcmp(first, second);

      if(order == 0)
      {
          // same leading string, so decide on the string stored after it
          order = strcmp(first + strlen(first) + 1, second + strlen(second) + 1);
      }
      return(order);
  } /* end of string_sorter */
  1245. static int __cdecl rgroup_sorter(RGROUP *a, RGROUP *b)
  1246. {//===================================================
  1247. // Sort long names before short names
  1248. int ix;
  1249. ix = strlen(b->name) - strlen(a->name);
  1250. if(ix != 0) return(ix);
  1251. ix = strcmp(a->name,b->name);
  1252. if(ix != 0) return(ix);
  1253. return(a->start-b->start);
  1254. }
  #ifdef OUTPUT_FORMAT
  // Write 'count' spaces to f_out; nothing is written when count is not positive.
  static void print_padding(FILE *f_out, int count)
  {
      while(count-- > 0)
          fputc(' ',f_out);
  }

  // List the compiled rules of one group in a readable form on f_out: the
  // optional "?n" condition, the pre string (restored to reading order), the
  // match string, the post string and the decoded phonemes.
  // Only built when OUTPUT_FORMAT is defined.
  static void print_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
  {//=============================================================================
      static unsigned char symbols[] = {'@','&','%','+','#','$','D','Z','A','B','C','F'};
      char line[80];
      char suffix[12];
      int rule;

      fprintf(f_out,"\n$group %s\n",name);

      for(rule=0; rule<n_rules; rule++)
      {
          // the match data follows the terminator of the phoneme string
          char *code = rules[rule] + strlen(rules[rule]) + 1;
          int code_len = strlen(code);
          char *dest = rule_match;
          char *w;
          int condition = 0;
          int column;
          int ix;
          unsigned char c;

          rule_match[0]=0;
          rule_pre[0]=0;
          rule_post[0]=0;

          for(ix=0; ix<code_len; ix++)
          {
              c = code[ix];
              if(c == RULE_PRE)
              {
                  *dest = 0;
                  dest = rule_pre;
              }
              else if(c == RULE_POST)
              {
                  *dest = 0;
                  dest = rule_post;
              }
              else if(c == RULE_CONDITION)
              {
                  condition = code[++ix];
              }
              else if(c == RULE_ENDING)
              {
                  sprintf(suffix,"$%d[%x]",(code[ix+2]),code[ix+1] & 0x7f);
                  ix += 2;
                  strcpy(dest,suffix);
                  dest += strlen(suffix);
              }
              else
              {
                  if(c <= RULE_LETTER7)
                      c = symbols[c-RULE_SYLLABLE];
                  if(c == ' ')
                      c = '_';
                  *dest++ = c;
              }
          }
          *dest = 0;

          column = 12;
          if(condition > 0)
          {
              sprintf(line,"?%d ",condition);
              column -= strlen(line);
              fprintf(f_out,"%s",line);
          }

          if(rule_pre[0] != 0)
          {
              // the pre string is stored reversed, so copy it back to front
              w = line;
              for(ix=strlen(rule_pre)-1; ix>=0; ix--)
                  *w++ = rule_pre[ix];
              sprintf(w,") ");

              column -= strlen(line);
              print_padding(f_out,column);
              fprintf(f_out,"%s",line);
              column = 0;
          }
          print_padding(f_out,column);

          column = 14;
          sprintf(line," %s ",rule_match);
          if(rule_post[0] != 0)
          {
              w = &line[strlen(line)];
              sprintf(w,"(%s ",rule_post);
          }
          fprintf(f_out,"%s",line);
          column -= strlen(line);
          print_padding(f_out,column);

          DecodePhonemes(rules[rule],line);
          fprintf(f_out,"%s\n",line); // phonemes
      }
  }
  #endif
  1350. //#define LIST_GROUP_INFO
  1351. static void output_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
  1352. {//==============================================================================
  1353. int ix;
  1354. int len1;
  1355. int len2;
  1356. int len_name;
  1357. char *p;
  1358. char *p2, *p3;
  1359. const char *common;
  1360. short nextchar_count[256];
  1361. memset(nextchar_count,0,sizeof(nextchar_count));
  1362. len_name = strlen(name);
  1363. #ifdef OUTPUT_FORMAT
  1364. print_rule_group(f_log,n_rules,rules,name);
  1365. #endif
  1366. // sort the rules in this group by their phoneme string
  1367. common = "";
  1368. qsort((void *)rules,n_rules,sizeof(char *),(int (__cdecl *)(const void *,const void *))string_sorter);
  1369. if(strcmp(name,"9")==0)
  1370. len_name = 0; // don't remove characters from numeric match strings
  1371. for(ix=0; ix<n_rules; ix++)
  1372. {
  1373. p = rules[ix];
  1374. len1 = strlen(p) + 1; // phoneme string
  1375. p3 = &p[len1];
  1376. p2 = p3 + len_name; // remove group name from start of match string
  1377. len2 = strlen(p2);
  1378. nextchar_count[(unsigned char)(p2[0])]++; // the next byte after the group name
  1379. if((common[0] != 0) && (strcmp(p,common)==0))
  1380. {
  1381. fwrite(p2,len2,1,f_out);
  1382. fputc(0,f_out); // no phoneme string, it's the same as previous rule
  1383. }
  1384. else
  1385. {
  1386. if((ix < n_rules-1) && (strcmp(p,rules[ix+1])==0))
  1387. {
  1388. common = rules[ix]; // phoneme string is same as next, set as common
  1389. fputc(RULE_PH_COMMON,f_out);
  1390. }
  1391. fwrite(p2,len2,1,f_out);
  1392. fputc(RULE_PHONEMES,f_out);
  1393. fwrite(p,len1,1,f_out);
  1394. }
  1395. }
  1396. #ifdef LIST_GROUP_INFO
  1397. for(ix=32; ix<256; ix++)
  1398. {
  1399. if(nextchar_count[ix] > 30)
  1400. printf("Group %s %c %d\n",name,ix,nextchar_count[ix]);
  1401. }
  1402. #endif
  1403. } // end of output_rule_group
  1404. static int compile_lettergroup(char *input, FILE *f_out)
  1405. {//=====================================================
  1406. char *p;
  1407. char *p_start;
  1408. int group;
  1409. int ix;
  1410. int n_items;
  1411. int length;
  1412. int max_length = 0;
  1413. #define N_LETTERGP_ITEMS 200
  1414. char *items[N_LETTERGP_ITEMS];
  1415. char item_length[N_LETTERGP_ITEMS];
  1416. p = input;
  1417. if(!IsDigit09(p[0]) || !IsDigit09(p[1]))
  1418. {
  1419. fprintf(f_log,"%5d: Expected 2 digits after '.L'\n",linenum);
  1420. error_count++;
  1421. return(1);
  1422. }
  1423. group = atoi(&p[0]);
  1424. if(group >= N_LETTER_GROUPS)
  1425. {
  1426. fprintf(f_log,"%5d: lettergroup out of range (01-%.2d)\n",linenum,N_LETTER_GROUPS-1);
  1427. error_count++;
  1428. return(1);
  1429. }
  1430. while(!isspace2(*p)) p++;
  1431. fputc(RULE_GROUP_START,f_out);
  1432. fputc(RULE_LETTERGP2,f_out);
  1433. fputc(group + 'A', f_out);
  1434. if(letterGroupsDefined[group] != 0)
  1435. {
  1436. fprintf(f_log,"%5d: lettergroup L%.2d is already defined\n",linenum,group);
  1437. error_count++;
  1438. }
  1439. letterGroupsDefined[group] = 1;
  1440. n_items = 0;
  1441. while(n_items < N_LETTERGP_ITEMS)
  1442. {
  1443. while(isspace2(*p)) p++;
  1444. if(*p == 0)
  1445. break;
  1446. items[n_items] = p_start = p;
  1447. while((*p & 0xff) > ' ')
  1448. {
  1449. if (*p == '_') *p = ' '; // allow '_' for word break
  1450. p++;
  1451. }
  1452. *p++ = 0;
  1453. length = p - p_start;
  1454. if(length > max_length)
  1455. max_length = length;
  1456. item_length[n_items++] = length;
  1457. }
  1458. // write out the items, longest first
  1459. while(max_length > 1)
  1460. {
  1461. for(ix=0; ix < n_items; ix++)
  1462. {
  1463. if(item_length[ix] == max_length)
  1464. {
  1465. fwrite(items[ix],1,max_length,f_out);
  1466. }
  1467. }
  1468. max_length--;
  1469. }
  1470. fputc(RULE_GROUP_END,f_out);
  1471. return(0);
  1472. }
  1473. static int compile_dictrules(FILE *f_in, FILE *f_out, char *fname_temp)
  1474. {//====================================================================
  1475. char *prule;
  1476. unsigned char *p;
  1477. int ix;
  1478. int c;
  1479. int gp;
  1480. FILE *f_temp;
  1481. int n_rules=0;
  1482. int count=0;
  1483. int different;
  1484. int wc;
  1485. int err_n_rules=0;
  1486. const char *prev_rgroup_name;
  1487. unsigned int char_code;
  1488. int compile_mode=0;
  1489. char *buf;
  1490. char buf1[500];
  1491. char *rules[N_RULES];
  1492. int n_rgroups = 0;
  1493. int n_groups3 = 0;
  1494. RGROUP rgroup[N_RULE_GROUP2];
  1495. linenum = 0;
  1496. group_name[0] = 0;
  1497. if((f_temp = fopen_log(fname_temp,"wb")) == NULL)
  1498. return(1);
  1499. for(;;)
  1500. {
  1501. linenum++;
  1502. buf = fgets(buf1,sizeof(buf1),f_in);
  1503. if(buf != NULL)
  1504. {
  1505. if((p = (unsigned char *)strstr(buf,"//")) != NULL)
  1506. *p = 0;
  1507. if(buf[0] == '\r') buf++; // ignore extra \r in \r\n
  1508. }
  1509. if((buf == NULL) || (buf[0] == '.'))
  1510. {
  1511. // next .group or end of file, write out the previous group
  1512. if(n_rules > 0)
  1513. {
  1514. strcpy(rgroup[n_rgroups].name,group_name);
  1515. rgroup[n_rgroups].group3_ix = group3_ix;
  1516. rgroup[n_rgroups].start = ftell(f_temp);
  1517. output_rule_group(f_temp,n_rules,rules,group_name);
  1518. rgroup[n_rgroups].length = ftell(f_temp) - rgroup[n_rgroups].start;
  1519. n_rgroups++;
  1520. count += n_rules;
  1521. }
  1522. n_rules = 0;
  1523. err_n_rules = 0;
  1524. if(compile_mode == 2)
  1525. {
  1526. // end of the character replacements section
  1527. fwrite(&n_rules,1,4,f_out); // write a zero word to terminate the replacemenmt list
  1528. compile_mode = 0;
  1529. }
  1530. if(buf == NULL) break; // end of file
  1531. if(memcmp(buf,".L",2)==0)
  1532. {
  1533. compile_lettergroup(&buf[2], f_out);
  1534. continue;
  1535. }
  1536. if(memcmp(buf,".replace",8)==0)
  1537. {
  1538. compile_mode = 2;
  1539. fputc(RULE_GROUP_START,f_out);
  1540. fputc(RULE_REPLACEMENTS,f_out);
  1541. // advance to next word boundary
  1542. while((ftell(f_out) & 3) != 0)
  1543. fputc(0,f_out);
  1544. }
  1545. if(memcmp(buf,".group",6)==0)
  1546. {
  1547. compile_mode = 1;
  1548. p = (unsigned char *)&buf[6];
  1549. while((p[0]==' ') || (p[0]=='\t')) p++; // Note: Windows isspace(0xe1) gives TRUE !
  1550. ix = 0;
  1551. while((*p > ' ') && (ix < LEN_GROUP_NAME))
  1552. group_name[ix++] = *p++;
  1553. group_name[ix]=0;
  1554. group3_ix = 0;
  1555. if(sscanf(group_name,"0x%x",&char_code)==1)
  1556. {
  1557. // group character is given as a character code (max 16 bits)
  1558. p = (unsigned char *)group_name;
  1559. if(char_code > 0x100)
  1560. {
  1561. *p++ = (char_code >> 8);
  1562. }
  1563. *p++ = char_code;
  1564. *p = 0;
  1565. }
  1566. else
  1567. {
  1568. if(translator->letter_bits_offset > 0)
  1569. {
  1570. utf8_in(&wc, group_name);
  1571. if(((ix = (wc - translator->letter_bits_offset)) >= 0) && (ix < 128))
  1572. {
  1573. group3_ix = ix+1; // not zero
  1574. }
  1575. }
  1576. }
  1577. if((group3_ix == 0) && (strlen(group_name) > 2))
  1578. {
  1579. if(utf8_in(&c,group_name) < 2)
  1580. {
  1581. fprintf(f_log,"%5d: Group name longer than 2 bytes (UTF8)",linenum);
  1582. error_count++;
  1583. }
  1584. group_name[2] = 0;
  1585. }
  1586. }
  1587. continue;
  1588. }
  1589. switch(compile_mode)
  1590. {
  1591. case 1: // .group
  1592. prule = compile_rule(buf);
  1593. if(prule != NULL)
  1594. {
  1595. if(n_rules < N_RULES)
  1596. {
  1597. rules[n_rules++] = prule;
  1598. }
  1599. else
  1600. {
  1601. if(err_n_rules == 0)
  1602. {
  1603. fprintf(stderr, "\nExceeded limit of rules (%d) in group '%s'\n", N_RULES, group_name);
  1604. error_count++;
  1605. err_n_rules = 1;
  1606. }
  1607. }
  1608. }
  1609. break;
  1610. case 2: // .replace
  1611. {
  1612. int replace1;
  1613. int replace2;
  1614. char *p;
  1615. p = buf;
  1616. replace1 = 0;
  1617. replace2 = 0;
  1618. while(isspace2(*p)) p++;
  1619. ix = 0;
  1620. while((unsigned char)(*p) > 0x20) // not space or zero-byte
  1621. {
  1622. p += utf8_in(&c,p);
  1623. replace1 += (c << ix);
  1624. ix += 16;
  1625. }
  1626. while(isspace2(*p)) p++;
  1627. ix = 0;
  1628. while((unsigned char)(*p) > 0x20)
  1629. {
  1630. p += utf8_in(&c,p);
  1631. replace2 += (c << ix);
  1632. ix += 16;
  1633. }
  1634. if(replace1 != 0)
  1635. {
  1636. Write4Bytes(f_out,replace1); // write as little-endian
  1637. Write4Bytes(f_out,replace2); // if big-endian, reverse the bytes in LoadDictionary()
  1638. }
  1639. }
  1640. break;
  1641. }
  1642. }
  1643. fclose(f_temp);
  1644. qsort((void *)rgroup,n_rgroups,sizeof(rgroup[0]),(int (__cdecl *)(const void *,const void *))rgroup_sorter);
  1645. if((f_temp = fopen(fname_temp,"rb"))==NULL)
  1646. return(2);
  1647. prev_rgroup_name = "\n";
  1648. for(gp = 0; gp < n_rgroups; gp++)
  1649. {
  1650. fseek(f_temp,rgroup[gp].start,SEEK_SET);
  1651. if((different = strcmp(rgroup[gp].name, prev_rgroup_name)) != 0)
  1652. {
  1653. // not the same as the previous group
  1654. if(gp > 0)
  1655. fputc(RULE_GROUP_END,f_out);
  1656. fputc(RULE_GROUP_START,f_out);
  1657. if(rgroup[gp].group3_ix != 0)
  1658. {
  1659. n_groups3++;
  1660. fputc(1,f_out);
  1661. fputc(rgroup[gp].group3_ix, f_out);
  1662. }
  1663. else
  1664. {
  1665. fprintf(f_out, "%s", prev_rgroup_name = rgroup[gp].name);
  1666. }
  1667. fputc(0,f_out);
  1668. }
  1669. for(ix=rgroup[gp].length; ix>0; ix--)
  1670. {
  1671. c = fgetc(f_temp);
  1672. fputc(c,f_out);
  1673. }
  1674. if(different)
  1675. {
  1676. }
  1677. }
  1678. fputc(RULE_GROUP_END,f_out);
  1679. fputc(0,f_out);
  1680. fclose(f_temp);
  1681. remove(fname_temp);
  1682. fprintf(f_log,"\t%d rules, %d groups (%d)\n\n",count,n_rgroups,n_groups3);
  1683. return(0);
  1684. } // end of compile_dictrules
  1685. int CompileDictionary(const char *dsource, const char *dict_name, FILE *log, char *fname_err, int flags)
  1686. {//=====================================================================================================
  1687. // fname: space to write the filename in case of error
  1688. // flags: bit 0: include source line number information, for debug purposes.
  1689. FILE *f_in;
  1690. FILE *f_out;
  1691. int offset_rules=0;
  1692. int value;
  1693. char fname_in[sizeof(path_home)+45];
  1694. char fname_out[sizeof(path_home)+15];
  1695. char fname_temp[sizeof(path_home)+15];
  1696. char path[sizeof(path_home)+40]; // path_dsource+20
  1697. error_count = 0;
  1698. error_need_dictionary = 0;
  1699. memset(letterGroupsDefined,0,sizeof(letterGroupsDefined));
  1700. debug_flag = flags & 1;
  1701. if(dsource == NULL)
  1702. dsource = "";
  1703. f_log = log;
  1704. //f_log = fopen("log2.txt","w");
  1705. if(f_log == NULL)
  1706. f_log = stderr;
  1707. // try with and without '.txt' extension
  1708. sprintf(path,"%s%s_",dsource,dict_name);
  1709. sprintf(fname_in,"%srules.txt",path);
  1710. if((f_in = fopen(fname_in,"r")) == NULL)
  1711. {
  1712. sprintf(fname_in,"%srules",path);
  1713. if((f_in = fopen_log(fname_in,"r")) == NULL)
  1714. {
  1715. if(fname_err)
  1716. strcpy(fname_err,fname_in);
  1717. return(-1);
  1718. }
  1719. }
  1720. sprintf(fname_out,"%s%c%s_dict",path_home,PATHSEP,dict_name);
  1721. if((f_out = fopen_log(fname_out,"wb+")) == NULL)
  1722. {
  1723. if(fname_err)
  1724. strcpy(fname_err,fname_out);
  1725. fclose(f_in);
  1726. return(-1);
  1727. }
  1728. sprintf(fname_temp,"%s%ctemp",path_home,PATHSEP);
  1729. value = N_HASH_DICT;
  1730. Write4Bytes(f_out,value);
  1731. Write4Bytes(f_out,offset_rules);
  1732. compile_dictlist_start();
  1733. fprintf(f_log,"Using phonemetable: '%s'\n",phoneme_tab_list[phoneme_tab_number].name);
  1734. compile_dictlist_file(path,"roots");
  1735. if(translator->langopts.listx)
  1736. {
  1737. compile_dictlist_file(path,"list");
  1738. compile_dictlist_file(path,"listx");
  1739. }
  1740. else
  1741. {
  1742. compile_dictlist_file(path,"listx");
  1743. compile_dictlist_file(path,"list");
  1744. }
  1745. compile_dictlist_file(path,"extra");
  1746. compile_dictlist_end(f_out);
  1747. offset_rules = ftell(f_out);
  1748. fprintf(f_log,"Compiling: '%s'\n",fname_in);
  1749. compile_dictrules(f_in,f_out,fname_temp);
  1750. fclose(f_in);
  1751. fseek(f_out,4,SEEK_SET);
  1752. Write4Bytes(f_out,offset_rules);
  1753. fclose(f_out);
  1754. fflush(f_log);
  1755. LoadDictionary(translator, dict_name, 0);
  1756. return(error_count);
  1757. } // end of compile_dictionary