eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

compiledict.c 39KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604
  1. /*
  2. * Copyright (C) 2005 to 2014 by Jonathan Duddington
  3. * email: [email protected]
  4. * Copyright (C) 2015-2017 Reece H. Dunn
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, see:
  18. * <http://www.gnu.org/licenses/>.
  19. */
  20. #include "config.h"
  21. #include <ctype.h>
  22. #include <errno.h>
  23. #include <stdbool.h>
  24. #include <stdint.h>
  25. #include <stdio.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. #include <wctype.h>
  29. #include <espeak-ng/espeak_ng.h>
  30. #include <espeak-ng/speak_lib.h>
  31. #include <espeak-ng/encoding.h>
  32. #include "common.h" // for strncpy0
  33. #include "compiledict.h"
  34. #include "dictionary.h" // for EncodePhonemes, HashDicti...
  35. #include "error.h" // for create_file_error_context
  36. #include "mnemonics.h" // for LookupMnemName, MNEM_TAB
  37. #include "phoneme.h" // for PHONEME_TAB_LIST, phonSWITCH, phone...
  38. #include "speech.h" // for path_home
  39. #include "synthesize.h" // for Write4Bytes
  // File-scope state shared by the dictionary-compiling functions below.
  40. static FILE *f_log = NULL; // diagnostics stream; compile_dictlist_file tests it for NULL before writing
  41. extern char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes
  42. static int linenum; // number of the source line being compiled; quoted in log messages
  43. static int error_count; // incremented each time a problem is reported
  44. static bool text_mode = false; // switched by the $textmode / $phonememode keywords in compile_line
  45. static int debug_flag = 0; // NOTE(review): not referenced in the part of the file visible here
  46. static int error_need_dictionary = 0; // counts "Need to compile dictionary again" reports; compile_line stops reporting at 3
  47. // A hash chain is a linked-list of hash chain entry objects:
  48. // struct hash_chain_entry {
  49. // hash_chain_entry *next_entry;
  50. // // dict_line output from compile_line:
  51. // uint8_t length;
  52. // char contents[length];
  53. // };
  // Head pointer of each chain, indexed by the hash value that compile_line returns.
  54. static char *hash_chains[N_HASH_DICT];
  // Non-zero entry = that letter group is defined; consulted when an "Lnn" item is parsed in a rule.
  55. static char letterGroupsDefined[N_LETTER_GROUPS];
  // Names of the "$" commands that may appear inside a rule, with the byte value
  // stored after RULE_DOLLAR. DecodeRule maps values back to names with
  // LookupMnemName; the rule compiler matches names by prefix (strncmp), which is
  // why a longer name must precede any name that is its sub-string.
  56. static const MNEM_TAB mnem_rules[] = {
  57. { "unpr", DOLLAR_UNPR },
  58. { "noprefix", DOLLAR_NOPREFIX }, // rule fails if a prefix has been removed
  59. { "list", DOLLAR_LIST }, // a pronunciation is given in the *_list file
  60. { "w_alt1", 0x11 },
  61. { "w_alt2", 0x12 },
  62. { "w_alt3", 0x13 },
  63. { "w_alt4", 0x14 },
  64. { "w_alt5", 0x15 },
  65. { "w_alt6", 0x16 },
  66. { "w_alt", 0x11 }, // note: put longer names before their sub-strings
  67. { "p_alt1", 0x21 },
  68. { "p_alt2", 0x22 },
  69. { "p_alt3", 0x23 },
  70. { "p_alt4", 0x24 },
  71. { "p_alt5", 0x25 },
  72. { "p_alt6", 0x26 },
  73. { "p_alt", 0x21 }, // same value as p_alt1
  74. { NULL, -1 } // end-of-table marker
  75. };
  // Keywords accepted on a dictionary list line, with the flag code each one produces.
  // compile_line looks keywords up here with LookupMnem; print_dictionary_flags uses
  // the same table (via LookupMnemName) to turn flag bits back into names.
  76. static const MNEM_TAB mnem_flags[] = {
  77. // these in the first group put a value in bits0-3 of dictionary_flags
  78. { "$1", 0x41 }, // stress on 1st syllable
  79. { "$2", 0x42 }, // stress on 2nd syllable
  80. { "$3", 0x43 },
  81. { "$4", 0x44 },
  82. { "$5", 0x45 },
  83. { "$6", 0x46 },
  84. { "$7", 0x47 },
  85. { "$u", 0x48 }, // reduce to unstressed
  86. { "$u1", 0x49 },
  87. { "$u2", 0x4a },
  88. { "$u3", 0x4b },
  89. { "$u+", 0x4c }, // reduce to unstressed, but stress at end of clause
  90. { "$u1+", 0x4d },
  91. { "$u2+", 0x4e },
  92. { "$u3+", 0x4f },
  93. // these set the corresponding numbered bit of dictionary_flags
  94. { "$pause", 8 }, // ensure pause before this word
  95. { "$strend", 9 }, // full stress if at end of clause
  96. { "$strend2", 10 }, // full stress if at end of clause, or only followed by unstressed
  97. { "$unstressend", 11 }, // reduce stress at end of clause
  98. { "$accent_before", 12 }, // used with accent names, say this accent name before the letter name
  99. { "$abbrev", 13 }, // use this pronunciation rather than split into letters
  100. // language specific
  101. { "$double", 14 }, // IT double the initial consonant of next word
  102. { "$alt", 15 }, // use alternative pronunciation
  103. { "$alt1", 15 }, // synonym for $alt
  104. { "$alt2", 16 },
  105. { "$alt3", 17 },
  106. { "$alt4", 18 },
  107. { "$alt5", 19 },
  108. { "$alt6", 20 },
  109. { "$alt7", 21 },
  110. { "$combine", 23 }, // Combine with the next word
  111. { "$dot", 24 }, // ignore '.' after this word (abbreviation)
  112. { "$hasdot", 25 }, // use this pronunciation if there is a dot after the word
  113. { "$max3", 27 }, // limit to 3 repetitions
  114. { "$brk", 28 }, // a shorter $pause
  115. { "$text", 29 }, // word translates to replacement text, not phonemes
  116. // flags in dictionary word 2
  117. { "$verbf", 0x20 }, // verb follows
  118. { "$verbsf", 0x21 }, // verb follows, allow -s suffix
  119. { "$nounf", 0x22 }, // noun follows
  120. { "$pastf", 0x23 }, // past tense follows
  121. { "$verb", 0x24 }, // use this pronunciation when it's a verb
  122. { "$noun", 0x25 }, // use this pronunciation when it's a noun
  123. { "$past", 0x26 }, // use this pronunciation when it's past tense
  124. { "$verbextend", 0x28 }, // extend influence of 'verb follows'
  125. { "$capital", 0x29 }, // use this pronunciation if initial letter is upper case
  126. { "$allcaps", 0x2a }, // presumably: use this pronunciation if the whole word is upper case (old comment duplicated $capital) - TODO confirm
  127. { "$accent", 0x2b }, // character name is base-character name + accent name
  128. { "$sentence", 0x2d }, // only if this clause is a sentence (i.e. terminator is {. ? !} not {, ; :}
  129. { "$only", 0x2e }, // only match on this word without suffix
  130. { "$onlys", 0x2f }, // only match with none, or with 's' suffix
  131. { "$stem", 0x30 }, // must have a suffix
  132. { "$atend", 0x31 }, // use this pronunciation if at end of clause
  133. { "$atstart", 0x32 }, // use this pronunciation at start of clause
  134. { "$native", 0x33 }, // not if we've switched translators
  135. // doesn't set dictionary_flags
  136. { "$?", 100 }, // conditional rule, followed by byte giving the condition number
  137. { "$textmode", 200 }, // compile_line: following entries are replacement text
  138. { "$phonememode", 201 }, // compile_line: following entries are phonemes again
  139. { NULL, -1 } // end-of-table marker
  140. };
  // Maximum number of characters in a rule group name (excluding the terminator).
  141. #define LEN_GROUP_NAME 12
  // Descriptor of one rule group.
  // NOTE(review): no use of this type is visible in this part of the file; the field
  // notes below are inferred from the names only - confirm against the rule compiler.
  142. typedef struct {
  143. char name[LEN_GROUP_NAME+1]; // group name, room for LEN_GROUP_NAME characters plus NUL
  144. unsigned int start; // presumably the group's start offset in the compiled output
  145. unsigned int length; // presumably the group's size in the compiled output
  146. int group3_ix; // presumably an index used for three-byte group names
  147. } RGROUP;
  148. void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len)
  149. {
  150. int stress;
  151. int ix;
  152. const char *name;
  153. int len;
  154. int total = 0;
  155. buf[0] = 0;
  156. if ((stress = flags[0] & 0xf) != 0) {
  157. sprintf(buf, "%s", LookupMnemName(mnem_flags, stress + 0x40));
  158. total = strlen(buf);
  159. buf += total;
  160. }
  161. for (ix = 8; ix < 64; ix++) {
  162. if (((ix < 30) && (flags[0] & (1 << ix))) || ((ix >= 0x20) && (flags[1] & (1 << (ix-0x20))))) {
  163. name = LookupMnemName(mnem_flags, ix);
  164. len = strlen(name) + 1;
  165. total += len;
  166. if (total >= buf_len)
  167. continue;
  168. sprintf(buf, " %s", name);
  169. buf += len;
  170. }
  171. }
  172. }
  173. char *DecodeRule(const char *group_chars, int group_length, char *rule, int control)
  174. {
  175. // Convert compiled match template to ascii
  176. unsigned char rb;
  177. unsigned char c;
  178. char *p;
  179. char *p_end;
  180. int ix;
  181. int match_type;
  182. bool finished = false;
  183. int value;
  184. int linenum = 0;
  185. int flags;
  186. int suffix_char;
  187. int condition_num = 0;
  188. bool at_start = false;
  189. const char *name;
  190. char buf[200];
  191. char buf_pre[200];
  192. char suffix[20];
  193. static char output[80];
  194. MAKE_MEM_UNDEFINED(&output, sizeof(output));
  195. static const char symbols[] = {
  196. ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
  197. '&', '%', '+', '#', 'S', 'D', 'Z', 'A', 'L', '!',
  198. ' ', '@', '?', 'J', 'N', 'K', 'V', '?', 'T', 'X',
  199. '?', 'W'
  200. };
  201. static const char symbols_lg[] = { 'A', 'B', 'C', 'H', 'F', 'G', 'Y' };
  202. match_type = 0;
  203. buf_pre[0] = 0;
  204. for (ix = 0; ix < group_length; ix++)
  205. buf[ix] = group_chars[ix];
  206. buf[ix] = 0;
  207. p = &buf[strlen(buf)];
  208. while (!finished) {
  209. rb = *rule++;
  210. if (rb <= RULE_LINENUM) {
  211. switch (rb)
  212. {
  213. case 0:
  214. case RULE_PHONEMES:
  215. finished = true;
  216. break;
  217. case RULE_PRE_ATSTART:
  218. at_start = true;
  219. // fallthrough:
  220. case RULE_PRE:
  221. match_type = RULE_PRE;
  222. *p = 0;
  223. p = buf_pre;
  224. break;
  225. case RULE_POST:
  226. match_type = RULE_POST;
  227. *p = 0;
  228. strcat(buf, " (");
  229. p = &buf[strlen(buf)];
  230. break;
  231. case RULE_PH_COMMON:
  232. break;
  233. case RULE_CONDITION:
  234. // conditional rule, next byte gives condition number
  235. condition_num = *rule++;
  236. break;
  237. case RULE_LINENUM:
  238. value = (rule[1] & 0xff) - 1;
  239. linenum = (rule[0] & 0xff) - 1 + (value * 255);
  240. rule += 2;
  241. break;
  242. }
  243. continue;
  244. }
  245. if (rb == RULE_DOLLAR) {
  246. value = *rule++ & 0xff;
  247. if ((value != 0x01) || (control & FLAG_UNPRON_TEST)) {
  248. // TODO write the string backwards if in RULE_PRE
  249. p[0] = '$';
  250. name = LookupMnemName(mnem_rules, value);
  251. strcpy(&p[1], name);
  252. p += (strlen(name)+1);
  253. }
  254. c = ' ';
  255. } else if (rb == RULE_ENDING) {
  256. static const char *flag_chars = "eipvdfq tba ";
  257. flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f);
  258. suffix_char = 'S';
  259. if (flags & (SUFX_P >> 8))
  260. suffix_char = 'P';
  261. sprintf(suffix, "%c%d", suffix_char, rule[2] & 0x7f);
  262. rule += 3;
  263. for (ix = 0; ix < 9; ix++) {
  264. if (flags & 1)
  265. sprintf(&suffix[strlen(suffix)], "%c", flag_chars[ix]);
  266. flags = (flags >> 1);
  267. }
  268. strcpy(p, suffix);
  269. p += strlen(suffix);
  270. c = ' ';
  271. } else if (rb == RULE_LETTERGP)
  272. c = symbols_lg[*rule++ - 'A'];
  273. else if (rb == RULE_LETTERGP2) {
  274. value = *rule++ - 'A';
  275. if (value < 0)
  276. value += 256;
  277. p[0] = 'L';
  278. p[1] = (value / 10) + '0';
  279. c = (value % 10) + '0';
  280. if (match_type == RULE_PRE) {
  281. p[0] = c;
  282. c = 'L';
  283. }
  284. p += 2;
  285. } else if (rb <= RULE_LAST_RULE)
  286. c = symbols[rb];
  287. else if (rb == RULE_SPACE)
  288. c = '_';
  289. else
  290. c = rb;
  291. *p++ = c;
  292. }
  293. *p = 0;
  294. p = output;
  295. p_end = p + sizeof(output) - 1;
  296. if (linenum > 0) {
  297. sprintf(p, "%5d:\t", linenum);
  298. p += 7;
  299. }
  300. if (condition_num > 0) {
  301. sprintf(p, "?%d ", condition_num);
  302. p = &p[strlen(p)];
  303. }
  304. if (((ix = strlen(buf_pre)) > 0) || at_start) {
  305. if (at_start)
  306. *p++ = '_';
  307. while ((--ix >= 0) && (p < p_end-3))
  308. *p++ = buf_pre[ix];
  309. *p++ = ')';
  310. *p++ = ' ';
  311. }
  312. *p = 0;
  313. buf[p_end - p] = 0; // prevent overflow in output[]
  314. strcat(p, buf);
  315. ix = strlen(output);
  316. while (ix < 8)
  317. output[ix++] = ' ';
  318. output[ix] = 0;
  319. return output;
  320. }
  // States of the per-character scanner in compile_line.
  321. typedef enum
  322. {
  323. LINE_PARSER_WORD = 0, // looking for the start of the word, or '(' opening a multi-word entry
  324. LINE_PARSER_END_OF_WORD = 1, // inside the (first) word, looking for its end
  325. LINE_PARSER_MULTIPLE_WORDS = 2, // inside "( ... )" after the first word, counting further words
  326. LINE_PARSER_END_OF_WORDS = 3, // word part finished, skipping space before the pronunciation
  327. LINE_PARSER_PRONUNCIATION = 4, // inside the pronunciation text
  328. LINE_PARSER_END_OF_PRONUNCIATION = 5, // after whitespace that ended the pronunciation; more text re-opens it
  329. } LINE_PARSER_STATES;
  330. static int compile_line(char *linebuf, char *dict_line, int n_dict_line, int *hash)
  331. {
  332. // Compile a line in the language_list file
  333. unsigned char c;
  334. char *p;
  335. char *word;
  336. char *phonetic;
  337. char *phonetic_end;
  338. unsigned int ix;
  339. LINE_PARSER_STATES step;
  340. unsigned int n_flag_codes = 0;
  341. int flagnum;
  342. int flag_offset;
  343. int length;
  344. int multiple_words = 0;
  345. bool multiple_numeric_hyphen = false;
  346. char *multiple_string = NULL;
  347. char *multiple_string_end = NULL;
  348. int len_word;
  349. int len_phonetic;
  350. bool text_not_phonemes = false; // this word specifies replacement text, not phonemes
  351. unsigned int wc;
  352. bool all_upper_case;
  353. char *mnemptr;
  354. unsigned char flag_codes[100];
  355. char encoded_ph[200];
  356. char bad_phoneme_str[4];
  357. int bad_phoneme;
  358. static char nullstring[] = { 0 };
  359. phonetic = word = nullstring;
  360. p = linebuf;
  361. step = LINE_PARSER_WORD;
  362. c = *p;
  363. while (c != '\n' && c != '\0') {
  364. c = *p;
  365. if ((c == '?') && (step == 0)) {
  366. // conditional rule, allow only if the numbered condition is set for the voice
  367. flag_offset = 100;
  368. p++;
  369. if (*p == '!') {
  370. // allow only if the numbered condition is NOT set
  371. flag_offset = 132;
  372. p++;
  373. }
  374. ix = 0;
  375. if (IsDigit09(*p)) {
  376. ix += (*p-'0');
  377. p++;
  378. }
  379. if (IsDigit09(*p)) {
  380. ix = ix*10 + (*p-'0');
  381. p++;
  382. }
  383. flag_codes[n_flag_codes++] = ix + flag_offset;
  384. c = *p;
  385. }
  386. if ((c == '$') && isalnum(p[1])) {
  387. // read keyword parameter
  388. mnemptr = p;
  389. while (!isspace2(c = *p)) p++;
  390. *p = 0;
  391. flagnum = LookupMnem(mnem_flags, mnemptr);
  392. if (flagnum > 0) {
  393. if (flagnum == 200)
  394. text_mode = true;
  395. else if (flagnum == 201)
  396. text_mode = false;
  397. else if (flagnum == BITNUM_FLAG_TEXTMODE)
  398. text_not_phonemes = true;
  399. else
  400. flag_codes[n_flag_codes++] = flagnum;
  401. } else {
  402. fprintf(f_log, "%5d: Unknown keyword: %s\n", linenum, mnemptr);
  403. error_count++;
  404. }
  405. }
  406. if ((c == '/') && (p[1] == '/') && (multiple_words == 0))
  407. c = '\n'; // "//" treat comment as end of line
  408. switch (step)
  409. {
  410. case LINE_PARSER_WORD:
  411. if (c == '(') {
  412. multiple_words = 1;
  413. word = p+1;
  414. step = LINE_PARSER_END_OF_WORD;
  415. } else if (!isspace2(c)) {
  416. word = p;
  417. step = LINE_PARSER_END_OF_WORD;
  418. }
  419. break;
  420. case LINE_PARSER_END_OF_WORD:
  421. if ((c == '-') && multiple_words) {
  422. if (IsDigit09(word[0]))
  423. multiple_numeric_hyphen = true;
  424. flag_codes[n_flag_codes++] = BITNUM_FLAG_HYPHENATED;
  425. c = ' ';
  426. }
  427. if (isspace2(c)) {
  428. p[0] = 0; // terminate english word
  429. if (multiple_words) {
  430. multiple_string = multiple_string_end = p+1;
  431. step = LINE_PARSER_MULTIPLE_WORDS;
  432. } else
  433. step = LINE_PARSER_END_OF_WORDS;
  434. } else if (c == ')') {
  435. if (multiple_words) {
  436. p[0] = 0;
  437. multiple_words = 0;
  438. step = LINE_PARSER_END_OF_WORDS;
  439. } else if (word[0] != '_') {
  440. fprintf(f_log, "%5d: Missing '('\n", linenum);
  441. error_count++;
  442. step = LINE_PARSER_END_OF_WORDS;
  443. }
  444. }
  445. break;
  446. case LINE_PARSER_MULTIPLE_WORDS:
  447. if (isspace2(c))
  448. multiple_words++;
  449. else if (c == ')') {
  450. p[0] = ' '; // terminate extra string
  451. multiple_string_end = p+1;
  452. step = LINE_PARSER_END_OF_WORDS;
  453. }
  454. break;
  455. case LINE_PARSER_END_OF_WORDS:
  456. if (!isspace2(c)) {
  457. phonetic = p;
  458. step = LINE_PARSER_PRONUNCIATION;
  459. }
  460. break;
  461. case LINE_PARSER_PRONUNCIATION:
  462. if (isspace2(c)) {
  463. phonetic_end = p;
  464. p[0] = 0; // terminate phonetic
  465. step = LINE_PARSER_END_OF_PRONUNCIATION;
  466. }
  467. break;
  468. case LINE_PARSER_END_OF_PRONUNCIATION:
  469. if (!isspace2(c)) {
  470. *phonetic_end = ' ';
  471. step = LINE_PARSER_PRONUNCIATION;
  472. }
  473. break;
  474. }
  475. p++;
  476. }
  477. if (word[0] == 0)
  478. return 0; // blank line
  479. if (text_mode)
  480. text_not_phonemes = true;
  481. if (text_not_phonemes) {
  482. if (word[0] == '_') {
  483. // This is a special word, used by eSpeak. Translate this into phonemes now
  484. strcat(phonetic, " "); // need a space to indicate word-boundary
  485. // PROBLEM vowel reductions are not applied to the translated phonemes
  486. // condition rules are not applied
  487. TranslateWord(translator, phonetic, NULL, NULL);
  488. text_not_phonemes = false;
  489. strncpy0(encoded_ph, word_phonemes, N_WORD_BYTES-4);
  490. if ((word_phonemes[0] == 0) && (error_need_dictionary < 3)) {
  491. // the dictionary was not loaded, we need a second attempt
  492. error_need_dictionary++;
  493. fprintf(f_log, "%5d: Need to compile dictionary again\n", linenum);
  494. }
  495. } else
  496. // this is replacement text, so don't encode as phonemes. Restrict the length of the replacement word
  497. strncpy0(encoded_ph, phonetic, N_WORD_BYTES-4);
  498. } else {
  499. EncodePhonemes(phonetic, encoded_ph, &bad_phoneme);
  500. if (strchr(encoded_ph, phonSWITCH) != 0)
  501. flag_codes[n_flag_codes++] = BITNUM_FLAG_ONLY_S; // don't match on suffixes (except 's') when switching languages
  502. // check for errors in the phonemes codes
  503. if (bad_phoneme != 0) {
  504. // unrecognised phoneme, report error
  505. bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
  506. fprintf(f_log, "%5d: Bad phoneme [%s] (U+%x) in: %s %s\n", linenum, bad_phoneme_str, bad_phoneme, word, phonetic);
  507. error_count++;
  508. }
  509. }
  510. if (text_not_phonemes != translator->langopts.textmode)
  511. flag_codes[n_flag_codes++] = BITNUM_FLAG_TEXTMODE;
  512. if (sscanf(word, "U+%x", &wc) == 1) {
  513. // Character code
  514. ix = utf8_out(wc, word);
  515. word[ix] = 0;
  516. } else if (word[0] != '_') {
  517. // convert to lower case, and note if the word is all-capitals
  518. int c2;
  519. all_upper_case = true;
  520. for (p = word;;) {
  521. // this assumes that the lower case char is the same length as the upper case char
  522. // OK, except for Turkish "I", but use towlower() rather than towlower2()
  523. ix = utf8_in(&c2, p);
  524. if (c2 == 0)
  525. break;
  526. if (iswupper(c2))
  527. utf8_out(towlower2(c2, translator), p);
  528. else
  529. all_upper_case = false;
  530. p += ix;
  531. }
  532. if (all_upper_case)
  533. flag_codes[n_flag_codes++] = BITNUM_FLAG_ALLCAPS;
  534. }
  535. len_word = strlen(word);
  536. if (translator->transpose_min > 0)
  537. len_word = TransposeAlphabet(translator, word);
  538. *hash = HashDictionary(word);
  539. len_phonetic = strlen(encoded_ph);
  540. dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed
  541. len_word &= 0x3f;
  542. memcpy(&dict_line[2], word, len_word);
  543. if (len_phonetic == 0) {
  544. // no phonemes specified. set bit 7
  545. dict_line[1] |= 0x80;
  546. length = len_word + 2;
  547. } else {
  548. length = len_word + len_phonetic + 3;
  549. if (length < n_dict_line) {
  550. strcpy(&dict_line[(len_word)+2], encoded_ph);
  551. } else {
  552. fprintf(f_log, "%5d: Dictionary line length would overflow the data buffer: %d\n", linenum, length);
  553. error_count++;
  554. // no phonemes specified. set bit 7
  555. dict_line[1] |= 0x80;
  556. length = len_word + 2;
  557. }
  558. }
  559. for (ix = 0; ix < n_flag_codes; ix++)
  560. dict_line[ix+length] = flag_codes[ix];
  561. length += n_flag_codes;
  562. if ((multiple_string != NULL) && (multiple_words > 0)) {
  563. if (multiple_words > 10) {
  564. fprintf(f_log, "%5d: Two many parts in a multi-word entry: %d\n", linenum, multiple_words);
  565. error_count++;
  566. } else {
  567. dict_line[length++] = 80 + multiple_words;
  568. ix = multiple_string_end - multiple_string;
  569. if (multiple_numeric_hyphen)
  570. dict_line[length++] = ' '; // ???
  571. memcpy(&dict_line[length], multiple_string, ix);
  572. length += ix;
  573. }
  574. }
  575. *((uint8_t *)dict_line) = (uint8_t)length;
  576. return length;
  577. }
  578. static void compile_dictlist_start(void)
  579. {
  580. // initialise dictionary list
  581. int ix;
  582. char *p;
  583. char *p2;
  584. for (ix = 0; ix < N_HASH_DICT; ix++) {
  585. p = hash_chains[ix];
  586. while (p != NULL) {
  587. memcpy(&p2, p, sizeof(char *));
  588. free(p);
  589. p = p2;
  590. }
  591. hash_chains[ix] = NULL;
  592. }
  593. }
  594. static void compile_dictlist_end(FILE *f_out)
  595. {
  596. // Write out the compiled dictionary list
  597. int hash;
  598. int length;
  599. char *p;
  600. for (hash = 0; hash < N_HASH_DICT; hash++) {
  601. p = hash_chains[hash];
  602. while (p != NULL) {
  603. length = *(uint8_t *)(p+sizeof(char *));
  604. fwrite(p+sizeof(char *), length, 1, f_out);
  605. memcpy(&p, p, sizeof(char *));
  606. }
  607. fputc(0, f_out);
  608. }
  609. }
  610. static int compile_dictlist_file(const char *path, const char *filename)
  611. {
  612. int length;
  613. int hash;
  614. char *p;
  615. int count = 0;
  616. FILE *f_in;
  617. char buf[200];
  618. char fname[sizeof(path_home)+45];
  619. char dict_line[256]; // length is uint8_t, so an entry can't take up more than 256 bytes
  620. text_mode = false;
  621. // try with and without '.txt' extension
  622. sprintf(fname, "%s%s.txt", path, filename);
  623. if ((f_in = fopen(fname, "r")) == NULL) {
  624. sprintf(fname, "%s%s", path, filename);
  625. if ((f_in = fopen(fname, "r")) == NULL)
  626. return -1;
  627. }
  628. if (f_log != NULL)
  629. fprintf(f_log, "Compiling: '%s'\n", fname);
  630. linenum = 0;
  631. while (fgets(buf, sizeof(buf), f_in) != NULL) {
  632. linenum++;
  633. length = compile_line(buf, dict_line, sizeof(dict_line), &hash);
  634. if (length == 0) continue; // blank line
  635. p = (char *)malloc(length+sizeof(char *));
  636. if (p == NULL) {
  637. if (f_log != NULL) {
  638. fprintf(f_log, "Can't allocate memory\n");
  639. error_count++;
  640. }
  641. break;
  642. }
  643. memcpy(p, &hash_chains[hash], sizeof(char *));
  644. hash_chains[hash] = p;
  645. // NOTE: dict_line[0] is the entry length (0-255)
  646. memcpy(p+sizeof(char *), dict_line, length);
  647. count++;
  648. }
  649. if (f_log != NULL)
  650. fprintf(f_log, "\t%d entries\n", count);
  651. fclose(f_in);
  652. return 0;
  653. }
  // Working buffers for the parts of the rule being compiled; copy_rule_string
  // selects one of them by state (0 = condition, 1 = pre, 2 = match, 3 = post, 4 = phonemes).
  654. static char rule_cond[80];
  655. static char rule_pre[80];
  656. static char rule_post[80];
  657. static char rule_match[80];
  658. static char rule_phonemes[80]; // successive phoneme strings are appended here, separated by a space
  // NOTE(review): the next two are not used in the part of the file visible here;
  // presumably the current rule group's name and its three-byte group index.
  659. static char group_name[LEN_GROUP_NAME+1];
  660. static int group3_ix;
  661. #define N_RULES 3000 // max rules for each group
  // Return the numeric value (0-15) of the hexadecimal digit character c,
  // accepting both upper and lower case letters, or -1 if c is not a hex digit.
  static int isHexDigit(int c)
  {
      int digit_value = -1;

      if ('0' <= c && c <= '9')
          digit_value = c - '0';
      else if ('a' <= c && c <= 'f')
          digit_value = 10 + (c - 'a');
      else if ('A' <= c && c <= 'F')
          digit_value = 10 + (c - 'A');

      return digit_value;
  }
  672. static void copy_rule_string(char *string, int *state_out)
  673. {
  674. // state 0: conditional, 1=pre, 2=match, 3=post, 4=phonemes
  675. static char * const outbuf[5] = { rule_cond, rule_pre, rule_match, rule_post, rule_phonemes };
  676. static const int next_state[5] = { 2, 2, 4, 4, 4 };
  677. char *output;
  678. char *p;
  679. int ix;
  680. int len;
  681. char c;
  682. int c2, c3;
  683. int sxflags;
  684. int value;
  685. bool literal;
  686. bool hexdigit_input = false;
  687. int state = *state_out;
  688. const MNEM_TAB *mr;
  689. if (string[0] == 0) return;
  690. output = outbuf[state];
  691. if (state == 4) {
  692. // append to any previous phoneme string, i.e. allow spaces in the phoneme string
  693. len = strlen(rule_phonemes);
  694. if (len > 0)
  695. rule_phonemes[len++] = ' ';
  696. output = &rule_phonemes[len];
  697. }
  698. sxflags = 0x808000; // to ensure non-zero bytes
  699. for (p = string, ix = 0;;) {
  700. literal = false;
  701. c = *p++;
  702. if ((c == '0') && (p[0] == 'x') && (isHexDigit(p[1]) >= 0) && (isHexDigit(p[2]) >= 0)) {
  703. hexdigit_input = true;
  704. c = p[1];
  705. p += 2;
  706. }
  707. if (c == '\\') {
  708. c = *p++; // treat next character literally
  709. if ((c >= '0') && (c <= '3') && (p[0] >= '0') && (p[0] <= '7') && (p[1] >= '0') && (p[1] <= '7')) {
  710. // character code given by 3 digit octal value;
  711. c = (c-'0')*64 + (p[0]-'0')*8 + (p[1]-'0');
  712. p += 2;
  713. }
  714. literal = true;
  715. }
  716. if (hexdigit_input) {
  717. if (((c2 = isHexDigit(c)) >= 0) && ((c3 = isHexDigit(p[0])) >= 0)) {
  718. c = c2 * 16 + c3;
  719. literal = true;
  720. p++;
  721. } else
  722. hexdigit_input = false;
  723. }
  724. if ((state == 1) || (state == 3)) {
  725. // replace special characters (note: 'E' is reserved for a replaced silent 'e')
  726. if (literal == false) {
  727. static const char lettergp_letters[9] = { LETTERGP_A, LETTERGP_B, LETTERGP_C, 0, 0, LETTERGP_F, LETTERGP_G, LETTERGP_H, LETTERGP_Y };
  728. switch (c)
  729. {
  730. case '_':
  731. c = RULE_SPACE;
  732. break;
  733. case 'Y':
  734. c = 'I';
  735. // fallthrough:
  736. case 'A': // vowel
  737. case 'B':
  738. case 'C':
  739. case 'H':
  740. case 'F':
  741. case 'G':
  742. if (state == 1) {
  743. // pre-rule, put the number before the RULE_LETTERGP;
  744. output[ix++] = lettergp_letters[c-'A'] + 'A';
  745. c = RULE_LETTERGP;
  746. } else {
  747. output[ix++] = RULE_LETTERGP;
  748. c = lettergp_letters[c-'A'] + 'A';
  749. }
  750. break;
  751. case 'D':
  752. c = RULE_DIGIT;
  753. break;
  754. case 'K':
  755. c = RULE_NOTVOWEL;
  756. break;
  757. case 'N':
  758. c = RULE_NO_SUFFIX;
  759. break;
  760. case 'V':
  761. c = RULE_IFVERB;
  762. break;
  763. case 'Z':
  764. c = RULE_NONALPHA;
  765. break;
  766. case '+':
  767. c = RULE_INC_SCORE;
  768. break;
  769. case '<': // Can't use - as opposite for + because it is used literally as part of word
  770. c = RULE_DEC_SCORE;
  771. break;
  772. case '@':
  773. c = RULE_SYLLABLE;
  774. break;
  775. case '&':
  776. c = RULE_STRESSED;
  777. break;
  778. case '%':
  779. c = RULE_DOUBLE;
  780. break;
  781. case '#':
  782. c = RULE_DEL_FWD;
  783. break;
  784. case '!':
  785. c = RULE_CAPITAL;
  786. break;
  787. case 'T':
  788. output[ix++] = RULE_DOLLAR;
  789. c = 0x11;
  790. break;
  791. case 'W':
  792. c = RULE_SPELLING;
  793. break;
  794. case 'X':
  795. c = RULE_NOVOWELS;
  796. break;
  797. case 'J':
  798. c = RULE_SKIPCHARS;
  799. break;
  800. case 'L':
  801. // expect two digits
  802. c = *p++ - '0';
  803. value = *p++ - '0';
  804. c = c * 10 + value;
  805. if ((value < 0) || (value > 9)) {
  806. c = 0;
  807. fprintf(f_log, "%5d: Expected 2 digits after 'L'\n", linenum);
  808. error_count++;
  809. } else if ((c <= 0) || (c >= N_LETTER_GROUPS) || (letterGroupsDefined[(int)c] == 0)) {
  810. fprintf(f_log, "%5d: Letter group L%.2d not defined\n", linenum, c);
  811. error_count++;
  812. }
  813. c += 'A';
  814. if (state == 1) {
  815. // pre-rule, put the group number before the RULE_LETTERGP command
  816. output[ix++] = c;
  817. c = RULE_LETTERGP2;
  818. } else
  819. output[ix++] = RULE_LETTERGP2;
  820. break;
  821. case '$':
  822. value = 0;
  823. mr = mnem_rules;
  824. while (mr->mnem != NULL) {
  825. len = strlen(mr->mnem);
  826. if (strncmp(p, mr->mnem, len) == 0) {
  827. value = mr->value;
  828. p += len;
  829. break;
  830. }
  831. mr++;
  832. }
  833. if (state == 1) {
  834. // pre-rule, put the number before the RULE_DOLLAR
  835. output[ix++] = value;
  836. c = RULE_DOLLAR;
  837. } else {
  838. output[ix++] = RULE_DOLLAR;
  839. c = value;
  840. }
  841. if (value == 0) {
  842. fprintf(f_log, "%5d: $ command not recognized\n", linenum);
  843. error_count++;
  844. }
  845. break;
  846. case 'P': // Prefix
  847. sxflags |= SUFX_P;
  848. // fallthrough
  849. case 'S': // Suffix
  850. output[ix++] = RULE_ENDING;
  851. value = 0;
  852. while (!isspace2(c = *p++) && (c != 0)) {
  853. switch (c)
  854. {
  855. case 'e':
  856. sxflags |= SUFX_E;
  857. break;
  858. case 'i':
  859. sxflags |= SUFX_I;
  860. break;
  861. case 'p': // obsolete, replaced by 'P' above
  862. sxflags |= SUFX_P;
  863. break;
  864. case 'v':
  865. sxflags |= SUFX_V;
  866. break;
  867. case 'd':
  868. sxflags |= SUFX_D;
  869. break;
  870. case 'f':
  871. sxflags |= SUFX_F;
  872. break;
  873. case 'q':
  874. sxflags |= SUFX_Q;
  875. break;
  876. case 't':
  877. sxflags |= SUFX_T;
  878. break;
  879. case 'b':
  880. sxflags |= SUFX_B;
  881. break;
  882. case 'a':
  883. sxflags |= SUFX_A;
  884. break;
  885. case 'm':
  886. sxflags |= SUFX_M;
  887. break;
  888. default:
  889. if (IsDigit09(c))
  890. value = (value*10) + (c - '0');
  891. break;
  892. }
  893. }
  894. p--;
  895. output[ix++] = sxflags >> 16;
  896. output[ix++] = sxflags >> 8;
  897. c = value | 0x80;
  898. break;
  899. }
  900. }
  901. }
  902. output[ix++] = c;
  903. if (c == 0) break;
  904. }
  905. *state_out = next_state[state];
  906. }
  907. static char *compile_rule(char *input)
  908. {
  909. int ix;
  910. unsigned char c;
  911. int wc;
  912. char *p;
  913. char *prule;
  914. int len;
  915. int len_name;
  916. int start;
  917. int state = 2;
  918. bool finish = false;
  919. char buf[80];
  920. char output[150];
  921. int bad_phoneme;
  922. char bad_phoneme_str[4];
  923. buf[0] = 0;
  924. rule_cond[0] = 0;
  925. rule_pre[0] = 0;
  926. rule_post[0] = 0;
  927. rule_match[0] = 0;
  928. rule_phonemes[0] = 0;
  929. p = buf;
  930. for (ix = 0; finish == false; ix++) {
  931. switch (c = input[ix])
  932. {
  933. case ')': // end of prefix section
  934. *p = 0;
  935. state = 1;
  936. copy_rule_string(buf, &state);
  937. p = buf;
  938. break;
  939. case '(': // start of suffix section
  940. *p = 0;
  941. state = 2;
  942. copy_rule_string(buf, &state);
  943. state = 3;
  944. p = buf;
  945. if (input[ix+1] == ' ') {
  946. fprintf(f_log, "%5d: Syntax error. Space after (, or negative score for previous rule\n", linenum);
  947. error_count++;
  948. }
  949. break;
  950. case '\n': // end of line
  951. case '\r':
  952. case 0: // end of line
  953. *p = 0;
  954. copy_rule_string(buf, &state);
  955. finish = true;
  956. break;
  957. case '\t': // end of section section
  958. case ' ':
  959. *p = 0;
  960. copy_rule_string(buf, &state);
  961. p = buf;
  962. break;
  963. case '?':
  964. if (state == 2)
  965. state = 0;
  966. else
  967. *p++ = c;
  968. break;
  969. default:
  970. *p++ = c;
  971. break;
  972. }
  973. }
  974. if (strcmp(rule_match, "$group") == 0)
  975. strcpy(rule_match, group_name);
  976. if (rule_match[0] == 0) {
  977. if (rule_post[0] != 0) {
  978. fprintf(f_log, "%5d: Syntax error\n", linenum);
  979. error_count++;
  980. }
  981. return NULL;
  982. }
  983. EncodePhonemes(rule_phonemes, buf, &bad_phoneme);
  984. if (bad_phoneme != 0) {
  985. bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
  986. fprintf(f_log, "%5d: Bad phoneme [%s] (U+%x) in: %s\n", linenum, bad_phoneme_str, bad_phoneme, input);
  987. error_count++;
  988. }
  989. strcpy(output, buf);
  990. len = strlen(buf)+1;
  991. len_name = strlen(group_name);
  992. if ((len_name > 0) && (memcmp(rule_match, group_name, len_name) != 0)) {
  993. utf8_in(&wc, rule_match);
  994. if ((group_name[0] == '9') && IsDigit(wc)) {
  995. // numeric group, rule_match starts with a digit, so OK
  996. } else {
  997. fprintf(f_log, "%5d: Wrong initial letters '%s' for group '%s'\n", linenum, rule_match, group_name);
  998. error_count++;
  999. }
  1000. }
  1001. strcpy(&output[len], rule_match);
  1002. len += strlen(rule_match);
  1003. if (debug_flag) {
  1004. output[len] = RULE_LINENUM;
  1005. output[len+1] = (linenum % 255) + 1;
  1006. output[len+2] = (linenum / 255) + 1;
  1007. len += 3;
  1008. }
  1009. if (rule_cond[0] != 0) {
  1010. if (rule_cond[0] == '!') {
  1011. // allow the rule only if the condition number is NOT set for the voice
  1012. ix = atoi(&rule_cond[1]) + 32;
  1013. } else {
  1014. // allow the rule only if the condition number is set for the voice
  1015. ix = atoi(rule_cond);
  1016. }
  1017. if ((ix > 0) && (ix < 255)) {
  1018. output[len++] = RULE_CONDITION;
  1019. output[len++] = ix;
  1020. } else {
  1021. fprintf(f_log, "%5d: bad condition number ?%d\n", linenum, ix);
  1022. error_count++;
  1023. }
  1024. }
  1025. if (rule_pre[0] != 0) {
  1026. start = 0;
  1027. if (rule_pre[0] == RULE_SPACE) {
  1028. // omit '_' at the beginning of the pre-string and imply it by using RULE_PRE_ATSTART
  1029. c = RULE_PRE_ATSTART;
  1030. start = 1;
  1031. } else
  1032. c = RULE_PRE;
  1033. output[len++] = c;
  1034. // output PRE string in reverse order
  1035. for (ix = strlen(rule_pre)-1; ix >= start; ix--)
  1036. output[len++] = rule_pre[ix];
  1037. }
  1038. if (rule_post[0] != 0) {
  1039. sprintf(&output[len], "%c%s", RULE_POST, rule_post);
  1040. len += (strlen(rule_post)+1);
  1041. }
  1042. output[len++] = 0;
  1043. if ((prule = (char *)malloc(len)) != NULL)
  1044. memcpy(prule, output, len);
  1045. return prule;
  1046. }
  1047. static int __cdecl string_sorter(char **a, char **b)
  1048. {
  1049. char *pa, *pb;
  1050. int ix;
  1051. if ((ix = strcmp(pa = *a, pb = *b)) != 0)
  1052. return ix;
  1053. pa += (strlen(pa)+1);
  1054. pb += (strlen(pb)+1);
  1055. return strcmp(pa, pb);
  1056. }
  1057. static int __cdecl rgroup_sorter(RGROUP *a, RGROUP *b)
  1058. {
  1059. // Sort long names before short names
  1060. int ix;
  1061. ix = strlen(b->name) - strlen(a->name);
  1062. if (ix != 0) return ix;
  1063. ix = strcmp(a->name, b->name);
  1064. if (ix != 0) return ix;
  1065. return a->start-b->start;
  1066. }
  1067. static void output_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
  1068. {
  1069. int ix;
  1070. int len1;
  1071. int len2;
  1072. int len_name;
  1073. char *p;
  1074. char *p2, *p3;
  1075. const char *common;
  1076. short nextchar_count[256];
  1077. memset(nextchar_count, 0, sizeof(nextchar_count));
  1078. len_name = strlen(name);
  1079. // sort the rules in this group by their phoneme string
  1080. common = "";
  1081. qsort((void *)rules, n_rules, sizeof(char *), (int(__cdecl *)(const void *, const void *))string_sorter);
  1082. if (strcmp(name, "9") == 0)
  1083. len_name = 0; // don't remove characters from numeric match strings
  1084. for (ix = 0; ix < n_rules; ix++) {
  1085. p = rules[ix];
  1086. len1 = strlen(p) + 1; // phoneme string
  1087. p3 = &p[len1];
  1088. p2 = p3 + len_name; // remove group name from start of match string
  1089. len2 = strlen(p2);
  1090. nextchar_count[(unsigned char)(p2[0])]++; // the next byte after the group name
  1091. if ((common[0] != 0) && (strcmp(p, common) == 0)) {
  1092. fwrite(p2, len2, 1, f_out);
  1093. fputc(0, f_out); // no phoneme string, it's the same as previous rule
  1094. } else {
  1095. if ((ix < n_rules-1) && (strcmp(p, rules[ix+1]) == 0)) {
  1096. common = rules[ix]; // phoneme string is same as next, set as common
  1097. fputc(RULE_PH_COMMON, f_out);
  1098. }
  1099. fwrite(p2, len2, 1, f_out);
  1100. fputc(RULE_PHONEMES, f_out);
  1101. fwrite(p, len1, 1, f_out);
  1102. }
  1103. }
  1104. }
  1105. static int compile_lettergroup(char *input, FILE *f_out)
  1106. {
  1107. char *p;
  1108. char *p_start;
  1109. int group;
  1110. int ix;
  1111. int n_items;
  1112. int length;
  1113. int max_length = 0;
  1114. #define N_LETTERGP_ITEMS 200
  1115. char *items[N_LETTERGP_ITEMS];
  1116. char item_length[N_LETTERGP_ITEMS];
  1117. p = input;
  1118. if (!IsDigit09(p[0]) || !IsDigit09(p[1])) {
  1119. fprintf(f_log, "%5d: Expected 2 digits after '.L'\n", linenum);
  1120. error_count++;
  1121. return 1;
  1122. }
  1123. group = atoi(&p[0]);
  1124. if (group >= N_LETTER_GROUPS) {
  1125. fprintf(f_log, "%5d: lettergroup out of range (01-%.2d)\n", linenum, N_LETTER_GROUPS-1);
  1126. error_count++;
  1127. return 1;
  1128. }
  1129. while (!isspace2(*p)) p++;
  1130. fputc(RULE_GROUP_START, f_out);
  1131. fputc(RULE_LETTERGP2, f_out);
  1132. fputc(group + 'A', f_out);
  1133. if (letterGroupsDefined[group] != 0) {
  1134. fprintf(f_log, "%5d: lettergroup L%.2d is already defined\n", linenum, group);
  1135. error_count++;
  1136. }
  1137. letterGroupsDefined[group] = 1;
  1138. n_items = 0;
  1139. while (n_items < N_LETTERGP_ITEMS) {
  1140. while (isspace2(*p)) p++;
  1141. if (*p == 0)
  1142. break;
  1143. items[n_items] = p_start = p;
  1144. while ((*p & 0xff) > ' ') {
  1145. if (*p == '_') *p = ' '; // allow '_' for word break
  1146. p++;
  1147. }
  1148. *p++ = 0;
  1149. length = p - p_start;
  1150. if (length > max_length)
  1151. max_length = length;
  1152. item_length[n_items++] = length;
  1153. }
  1154. // write out the items, longest first
  1155. while (max_length > 1) {
  1156. for (ix = 0; ix < n_items; ix++) {
  1157. if (item_length[ix] == max_length)
  1158. fwrite(items[ix], 1, max_length, f_out);
  1159. }
  1160. max_length--;
  1161. }
  1162. fputc(RULE_GROUP_END, f_out);
  1163. return 0;
  1164. }
  /*
   * Release the first n_rules entries of rules[] and reset each freed slot to
   * NULL.  Entries beyond n_rules are left untouched; a count of zero or less
   * does nothing.
   */
  static void free_rules(char **rules, int n_rules)
  {
      int remaining = n_rules;

      while (remaining-- > 0) {
          free(*rules);
          *rules = NULL;
          rules++;
      }
  }
  1172. static espeak_ng_STATUS compile_dictrules(FILE *f_in, FILE *f_out, char *fname_temp, espeak_ng_ERROR_CONTEXT *context)
  1173. {
  1174. char *prule;
  1175. unsigned char *p;
  1176. int ix;
  1177. int c;
  1178. int gp;
  1179. FILE *f_temp;
  1180. int n_rules = 0;
  1181. int count = 0;
  1182. int different;
  1183. int wc;
  1184. int err_n_rules = 0;
  1185. const char *prev_rgroup_name;
  1186. unsigned int char_code;
  1187. int compile_mode = 0;
  1188. char *buf;
  1189. char buf1[500];
  1190. char *rules[N_RULES];
  1191. int n_rgroups = 0;
  1192. int n_groups3 = 0;
  1193. RGROUP rgroup[N_RULE_GROUP2];
  1194. linenum = 0;
  1195. group_name[0] = 0;
  1196. if ((f_temp = fopen(fname_temp, "wb")) == NULL)
  1197. return create_file_error_context(context, errno, fname_temp);
  1198. for (;;) {
  1199. linenum++;
  1200. buf = fgets(buf1, sizeof(buf1), f_in);
  1201. if (buf != NULL) {
  1202. if ((p = (unsigned char *)strstr(buf, "//")) != NULL)
  1203. *p = 0;
  1204. if (buf[0] == '\r') buf++; // ignore extra \r in \r\n
  1205. }
  1206. if ((buf == NULL) || (buf[0] == '.')) {
  1207. // next .group or end of file, write out the previous group
  1208. if (n_rules > 0) {
  1209. strcpy(rgroup[n_rgroups].name, group_name);
  1210. rgroup[n_rgroups].group3_ix = group3_ix;
  1211. rgroup[n_rgroups].start = ftell(f_temp);
  1212. output_rule_group(f_temp, n_rules, rules, group_name);
  1213. rgroup[n_rgroups].length = ftell(f_temp) - rgroup[n_rgroups].start;
  1214. n_rgroups++;
  1215. count += n_rules;
  1216. free_rules(rules, n_rules);
  1217. }
  1218. n_rules = 0;
  1219. err_n_rules = 0;
  1220. if (compile_mode == 2) {
  1221. // end of the character replacements section
  1222. fwrite(&n_rules, 1, 4, f_out); // write a zero word to terminate the replacemenmt list
  1223. fputc(RULE_GROUP_END, f_out);
  1224. compile_mode = 0;
  1225. }
  1226. if (buf == NULL) break; // end of file
  1227. if (memcmp(buf, ".L", 2) == 0) {
  1228. compile_lettergroup(&buf[2], f_out);
  1229. continue;
  1230. }
  1231. if (memcmp(buf, ".replace", 8) == 0) {
  1232. compile_mode = 2;
  1233. fputc(RULE_GROUP_START, f_out);
  1234. fputc(RULE_REPLACEMENTS, f_out);
  1235. // advance to next word boundary
  1236. while ((ftell(f_out) & 3) != 0)
  1237. fputc(0, f_out);
  1238. }
  1239. if (memcmp(buf, ".group", 6) == 0) {
  1240. compile_mode = 1;
  1241. p = (unsigned char *)&buf[6];
  1242. while ((p[0] == ' ') || (p[0] == '\t')) p++; // Note: Windows isspace(0xe1) gives TRUE !
  1243. ix = 0;
  1244. while ((*p > ' ') && (ix < LEN_GROUP_NAME))
  1245. group_name[ix++] = *p++;
  1246. group_name[ix] = 0;
  1247. group3_ix = 0;
  1248. if (sscanf(group_name, "0x%x", &char_code) == 1) {
  1249. // group character is given as a character code (max 16 bits)
  1250. p = (unsigned char *)group_name;
  1251. if (char_code > 0x100)
  1252. *p++ = (char_code >> 8);
  1253. *p++ = char_code;
  1254. *p = 0;
  1255. } else {
  1256. if (translator->letter_bits_offset > 0) {
  1257. utf8_in(&wc, group_name);
  1258. if (((ix = (wc - translator->letter_bits_offset)) >= 0) && (ix < 128))
  1259. group3_ix = ix+1; // not zero
  1260. }
  1261. }
  1262. if ((group3_ix == 0) && (strlen(group_name) > 2)) {
  1263. if (utf8_in(&c, group_name) < 2) {
  1264. fprintf(f_log, "%5d: Group name longer than 2 bytes (UTF8)", linenum);
  1265. error_count++;
  1266. }
  1267. group_name[2] = 0;
  1268. }
  1269. }
  1270. continue;
  1271. }
  1272. switch (compile_mode)
  1273. {
  1274. case 1: // .group
  1275. prule = compile_rule(buf);
  1276. if (prule != NULL) {
  1277. if (n_rules < N_RULES)
  1278. rules[n_rules++] = prule;
  1279. else {
  1280. if (err_n_rules == 0) {
  1281. fprintf(stderr, "\nExceeded limit of rules (%d) in group '%s'\n", N_RULES, group_name);
  1282. error_count++;
  1283. err_n_rules = 1;
  1284. }
  1285. }
  1286. }
  1287. break;
  1288. case 2: // .replace
  1289. p = (unsigned char *)buf;
  1290. while (isspace2(*p)) p++;
  1291. if ((unsigned char)(*p) > 0x20) {
  1292. while ((unsigned char)(*p) > 0x20) { // not space or zero-byte
  1293. fputc(*p, f_out);
  1294. p++;
  1295. }
  1296. fputc(0, f_out);
  1297. while (isspace2(*p)) p++;
  1298. while ((unsigned char)(*p) > 0x20) {
  1299. fputc(*p, f_out);
  1300. p++;
  1301. }
  1302. fputc(0, f_out);
  1303. }
  1304. break;
  1305. }
  1306. }
  1307. fclose(f_temp);
  1308. qsort((void *)rgroup, n_rgroups, sizeof(rgroup[0]), (int(__cdecl *)(const void *, const void *))rgroup_sorter);
  1309. if ((f_temp = fopen(fname_temp, "rb")) == NULL) {
  1310. free_rules(rules, n_rules);
  1311. return create_file_error_context(context, errno, fname_temp);
  1312. }
  1313. prev_rgroup_name = "\n";
  1314. for (gp = 0; gp < n_rgroups; gp++) {
  1315. fseek(f_temp, rgroup[gp].start, SEEK_SET);
  1316. if ((different = strcmp(rgroup[gp].name, prev_rgroup_name)) != 0) {
  1317. // not the same as the previous group
  1318. if (gp > 0)
  1319. fputc(RULE_GROUP_END, f_out);
  1320. fputc(RULE_GROUP_START, f_out);
  1321. if (rgroup[gp].group3_ix != 0) {
  1322. n_groups3++;
  1323. fputc(1, f_out);
  1324. fputc(rgroup[gp].group3_ix, f_out);
  1325. } else
  1326. fprintf(f_out, "%s", prev_rgroup_name = rgroup[gp].name);
  1327. fputc(0, f_out);
  1328. }
  1329. for (ix = rgroup[gp].length; ix > 0; ix--) {
  1330. c = fgetc(f_temp);
  1331. fputc(c, f_out);
  1332. }
  1333. }
  1334. fputc(RULE_GROUP_END, f_out);
  1335. fputc(0, f_out);
  1336. fclose(f_temp);
  1337. remove(fname_temp);
  1338. fprintf(f_log, "\t%d rules, %d groups (%d)\n\n", count, n_rgroups, n_groups3);
  1339. free_rules(rules, n_rules);
  1340. return ENS_OK;
  1341. }
  1342. #pragma GCC visibility push(default)
  1343. ESPEAK_NG_API espeak_ng_STATUS espeak_ng_CompileDictionary(const char *dsource, const char *dict_name, FILE *log, int flags, espeak_ng_ERROR_CONTEXT *context)
  1344. {
  1345. if (!log) log = stderr;
  1346. if (!dict_name) dict_name = dictionary_name;
  1347. // fname: space to write the filename in case of error
  1348. // flags: bit 0: include source line number information, for debug purposes.
  1349. FILE *f_in;
  1350. FILE *f_out;
  1351. int offset_rules = 0;
  1352. int value;
  1353. char fname_in[sizeof(path_home)+45];
  1354. char fname_out[sizeof(path_home)+15];
  1355. char fname_temp[sizeof(path_home)+15];
  1356. char path[sizeof(path_home)+40]; // path_dsource+20
  1357. error_count = 0;
  1358. error_need_dictionary = 0;
  1359. memset(letterGroupsDefined, 0, sizeof(letterGroupsDefined));
  1360. debug_flag = flags & 1;
  1361. if (dsource == NULL)
  1362. dsource = "";
  1363. f_log = log;
  1364. if (f_log == NULL)
  1365. f_log = stderr;
  1366. // try with and without '.txt' extension
  1367. sprintf(path, "%s%s_", dsource, dict_name);
  1368. sprintf(fname_in, "%srules.txt", path);
  1369. if ((f_in = fopen(fname_in, "r")) == NULL) {
  1370. sprintf(fname_in, "%srules", path);
  1371. if ((f_in = fopen(fname_in, "r")) == NULL)
  1372. return create_file_error_context(context, errno, fname_in);
  1373. }
  1374. sprintf(fname_out, "%s%c%s_dict", path_home, PATHSEP, dict_name);
  1375. if ((f_out = fopen(fname_out, "wb+")) == NULL) {
  1376. int error = errno;
  1377. fclose(f_in);
  1378. return create_file_error_context(context, error, fname_out);
  1379. }
  1380. /* Use dictionary-specific temp names to allow parallel compilation
  1381. * of multiple ductionaries. */
  1382. sprintf(fname_temp, "%s%c%stemp", path_home, PATHSEP, dict_name);
  1383. value = N_HASH_DICT;
  1384. Write4Bytes(f_out, value);
  1385. Write4Bytes(f_out, offset_rules);
  1386. compile_dictlist_start();
  1387. fprintf(f_log, "Using phonemetable: '%s'\n", phoneme_tab_list[phoneme_tab_number].name);
  1388. compile_dictlist_file(path, "roots");
  1389. if (translator->langopts.listx) {
  1390. compile_dictlist_file(path, "list");
  1391. compile_dictlist_file(path, "listx");
  1392. } else {
  1393. compile_dictlist_file(path, "listx");
  1394. compile_dictlist_file(path, "list");
  1395. }
  1396. compile_dictlist_file(path, "emoji");
  1397. compile_dictlist_file(path, "extra");
  1398. compile_dictlist_end(f_out);
  1399. offset_rules = ftell(f_out);
  1400. fprintf(f_log, "Compiling: '%s'\n", fname_in);
  1401. espeak_ng_STATUS status = compile_dictrules(f_in, f_out, fname_temp, context);
  1402. fclose(f_in);
  1403. fseek(f_out, 4, SEEK_SET);
  1404. Write4Bytes(f_out, offset_rules);
  1405. fclose(f_out);
  1406. fflush(f_log);
  1407. if (status != ENS_OK)
  1408. return status;
  1409. LoadDictionary(translator, dict_name, 0);
  1410. return error_count > 0 ? ENS_COMPILE_ERROR : ENS_OK;
  1411. }
  1412. #pragma GCC visibility pop