eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tr_languages.cpp 45KB


  1. /***************************************************************************
  2. * Copyright (C) 2005 to 2007 by Jonathan Duddington *
  3. * email: [email protected] *
  4. * *
  5. * This program is free software; you can redistribute it and/or modify *
  6. * it under the terms of the GNU General Public License as published by *
  7. * the Free Software Foundation; either version 3 of the License, or *
  8. * (at your option) any later version. *
  9. * *
  10. * This program is distributed in the hope that it will be useful, *
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  13. * GNU General Public License for more details. *
  14. * *
  15. * You should have received a copy of the GNU General Public License *
  16. * along with this program; if not, see: *
  17. * <http://www.gnu.org/licenses/>. *
  18. ***************************************************************************/
  19. #include "StdAfx.h"
  20. #include <stdio.h>
  21. #include <ctype.h>
  22. #include <wctype.h>
  23. #include <stdlib.h>
  24. #include <string.h>
  25. #include <locale.h>
  26. #include <wctype.h>
  27. #include "speak_lib.h"
  28. #include "speech.h"
  29. #include "phoneme.h"
  30. #include "synthesize.h"
  31. #include "translate.h"
  // Language identifiers that do not fit the two-letter form, packed one
  // ASCII character per byte so they can be compared as plain integers.
  #define L_qa  (('q' << 16) | ('a' << 8))         // 0x716100
  #define L_grc (('g' << 16) | ('r' << 8) | 'c')   // grc  Ancient Greek
  #define L_jbo (('j' << 16) | ('b' << 8) | 'o')   // jbo  Lojban
  #define L_zhy (('z' << 16) | ('h' << 8) | 'y')   // zhy

  // start of unicode pages for character sets
  #define OFFSET_GREEK       0x0380
  #define OFFSET_CYRILLIC    0x0420
  #define OFFSET_ARMENIAN    0x0530
  #define OFFSET_DEVANAGARI  0x0900
  #define OFFSET_BENGALI     0x0980
  #define OFFSET_TAMIL       0x0b80
  #define OFFSET_KANNADA     0x0c80
  #define OFFSET_MALAYALAM   0x0d00
  #define OFFSET_KOREAN      0x1100
  46. static void Translator_Russian(Translator *tr);
  47. static void SetLetterVowel(Translator *tr, int c)
  48. {//==============================================
  49. tr->letter_bits[c] = (tr->letter_bits[c] & 0x40) | 0x81; // keep value for group 6 (front vowels e,i,y)
  50. }
  51. static void ResetLetterBits(Translator *tr, int groups)
  52. {//====================================================
  53. // Clear all the specified groups
  54. unsigned int ix;
  55. unsigned int mask;
  56. mask = ~groups;
  57. for(ix=0; ix<sizeof(tr->letter_bits); ix++)
  58. {
  59. tr->letter_bits[ix] &= mask;
  60. }
  61. }
  62. static void SetLetterBits(Translator *tr, int group, const char *string)
  63. {//=====================================================================
  64. int bits;
  65. unsigned char c;
  66. bits = (1L << group);
  67. while((c = *string++) != 0)
  68. tr->letter_bits[c] |= bits;
  69. }
  70. static void SetLetterBitsRange(Translator *tr, int group, int first, int last)
  71. {//===========================================================================
  72. int bits;
  73. int ix;
  74. bits = (1L << group);
  75. for(ix=first; ix<=last; ix++)
  76. {
  77. tr->letter_bits[ix] |= bits;
  78. }
  79. }
  80. static Translator* NewTranslator(void)
  81. {//===================================
  82. Translator *tr;
  83. int ix;
  84. static const unsigned char stress_amps2[] = {17,17, 20,20, 20,22, 22,20 };
  85. static const short stress_lengths2[8] = {182,140, 220,220, 220,240, 260,280};
  86. static const wchar_t empty_wstring[1] = {0};
  87. static const wchar_t punct_in_word[2] = {'\'', 0}; // allow hyphen within words
  88. tr = (Translator *)Alloc(sizeof(Translator));
  89. if(tr == NULL)
  90. return(NULL);
  91. tr->charset_a0 = charsets[1]; // ISO-8859-1, this is for when the input is not utf8
  92. dictionary_name[0] = 0;
  93. tr->dict_condition=0;
  94. tr->data_dictrules = NULL; // language_1 translation rules file
  95. tr->data_dictlist = NULL; // language_2 dictionary lookup file
  96. tr->transpose_offset = 0;
  97. // only need lower case
  98. tr->letter_bits_offset = 0;
  99. memset(tr->letter_bits,0,sizeof(tr->letter_bits));
  100. memset(tr->letter_groups,0,sizeof(tr->letter_groups));
  101. // 0-5 sets of characters matched by A B C E F G in pronunciation rules
  102. // these may be set differently for different languages
  103. SetLetterBits(tr,0,"aeiou"); // A vowels, except y
  104. SetLetterBits(tr,1,"bcdfgjklmnpqstvxz"); // B hard consonants, excluding h,r,w
  105. SetLetterBits(tr,2,"bcdfghjklmnpqrstvwxz"); // C all consonants
  106. SetLetterBits(tr,3,"hlmnr"); // H 'soft' consonants
  107. SetLetterBits(tr,4,"cfhkpqstx"); // F voiceless consonants
  108. SetLetterBits(tr,5,"bdgjlmnrvwyz"); // G voiced
  109. SetLetterBits(tr,6,"eiy"); // Letter group Y, front vowels
  110. SetLetterBits(tr,7,"aeiouy"); // vowels, including y
  111. tr->char_plus_apostrophe = empty_wstring;
  112. tr->punct_within_word = punct_in_word;
  113. for(ix=0; ix<8; ix++)
  114. {
  115. tr->stress_amps[ix] = stress_amps2[ix];
  116. tr->stress_amps_r[ix] = stress_amps2[ix] - 1;
  117. tr->stress_lengths[ix] = stress_lengths2[ix];
  118. }
  119. memset(&(tr->langopts),0,sizeof(tr->langopts));
  120. tr->langopts.stress_rule = 2;
  121. tr->langopts.unstressed_wd1 = 1;
  122. tr->langopts.unstressed_wd2 = 3;
  123. tr->langopts.param[LOPT_SONORANT_MIN] = 95;
  124. tr->langopts.param[LOPT_MAXAMP_EOC] = 19;
  125. tr->langopts.param[LOPT_UNPRONOUNCABLE] = 's'; // don't count this character at start of word
  126. tr->langopts.max_initial_consonants = 3;
  127. tr->langopts.replace_chars = NULL;
  128. tr->langopts.ascii_language = ""; // Non-Latin alphabet languages, use this language to speak Latin words, default is English
  129. SetLengthMods(tr,201);
  130. // tr->langopts.length_mods = length_mods_en;
  131. // tr->langopts.length_mods0 = length_mods_en0;
  132. tr->langopts.long_stop = 100;
  133. tr->langopts.max_roman = 49;
  134. tr->langopts.thousands_sep = ',';
  135. tr->langopts.decimal_sep = '.';
  136. memcpy(tr->punct_to_tone, punctuation_to_tone, sizeof(tr->punct_to_tone));
  137. return(tr);
  138. }
  // Cyrillic -> Latin replacement table: (Cyrillic code point, Latin value)
  // pairs, ended by a single 0. A Latin value with a second character code in
  // bits 16 and up appears to stand for a two-letter replacement (e.g. 'l','j')
  // -- NOTE(review): confirm against the code that consumes replace_chars.
  static const unsigned int replace_cyrillic_latin[] =
  {
    0x0430, 'a',                    // а
    0x0431, 'b',                    // б
    0x0446, 'c',                    // ц
    0x045b, 0x0107,                 // ћ -> ć
    0x0447, 0x010d,                 // ч -> č
    0x045f, 'd' + (0x017e << 16),   // џ -> dž
    0x0455, 'd' + ('z' << 16),      // ѕ -> dz
    0x0434, 'd',                    // д
    0x0452, 0x0111,                 // ђ -> đ
    0x0435, 'e',                    // е
    0x0444, 'f',                    // ф
    0x0433, 'g',                    // г
    0x0445, 'h',                    // х
    0x0438, 'i',                    // и
    0x0458, 'j',                    // ј
    0x043a, 'k',                    // к
    0x0459, 'l' + ('j' << 16),      // љ -> lj
    0x043b, 'l',                    // л
    0x043c, 'm',                    // м
    0x045a, 'n' + ('j' << 16),      // њ -> nj
    0x043d, 'n',                    // н
    0x043e, 'o',                    // о
    0x043f, 'p',                    // п
    0x0440, 'r',                    // р
    0x0441, 's',                    // с
    0x0448, 0x0161,                 // ш -> š
    0x0442, 't',                    // т
    0x0443, 'u',                    // у
    0x0432, 'v',                    // в
    0x0437, 'z',                    // з
    0x0436, 0x017e,                 // ж -> ž
    0x0453, 0x0111,                 // ѓ -> đ
    0x045c, 0x0107,                 // ќ -> ć
    0                               // end of table
  };
  174. void SetIndicLetters(Translator *tr)
  175. {//=================================
  176. // Set letter types for Indic scripts, Devanagari, Tamill, etc
  177. static const char dev_consonants2[] = {0x02,0x03,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f};
  178. memset(tr->letter_bits,0,sizeof(tr->letter_bits));
  179. SetLetterBitsRange(tr,LETTERGP_A,0x04,0x14); // vowel letters only
  180. SetLetterBitsRange(tr,LETTERGP_B,0x3e,0x4d); // vowel signs, and virama
  181. SetLetterBitsRange(tr,LETTERGP_C,0x15,0x39); // the main consonant range
  182. SetLetterBits(tr,LETTERGP_C,dev_consonants2); // + additional consonants
  183. SetLetterBitsRange(tr,LETTERGP_Y,0x04,0x14); // vowel letters
  184. SetLetterBitsRange(tr,LETTERGP_Y,0x3e,0x4c); // + vowel signs
  185. tr->langopts.param[LOPT_UNPRONOUNCABLE] = 1; // disable check for unpronouncable words
  186. }
  187. void SetupTranslator(Translator *tr, const short *lengths, const unsigned char *amps)
  188. {//==================================================================================
  189. if(lengths != NULL)
  190. memcpy(tr->stress_lengths,lengths,sizeof(tr->stress_lengths));
  191. if(amps != NULL)
  192. memcpy(tr->stress_amps,amps,sizeof(tr->stress_amps));
  193. }
  194. Translator *SelectTranslator(const char *name)
  195. {//===========================================
  196. int name2 = 0;
  197. Translator *tr;
  198. static const unsigned char stress_amps_sk[8] = {17,17, 20,20, 20,22, 22,21 };
  199. static const short stress_lengths_sk[8] = {190,190, 210,210, 0,0, 210,210};
  200. // convert name string into a word of up to 4 characters, for the switch()
  201. while(*name != 0)
  202. name2 = (name2 << 8) + *name++;
  203. tr = NewTranslator();
  204. switch(name2)
  205. {
  206. case L('a','f'):
  207. {
  208. static const short stress_lengths_af[8] = {170,140, 220,220, 0, 0, 250,270};
  209. SetupTranslator(tr,stress_lengths_af,NULL);
  210. tr->langopts.stress_rule = 0;
  211. tr->langopts.vowel_pause = 0x30;
  212. tr->langopts.param[LOPT_DIERESES] = 1;
  213. tr->langopts.param[LOPT_PREFIXES] = 1;
  214. SetLetterVowel(tr,'y'); // add 'y' to vowels
  215. tr->langopts.numbers = 0x8d1 + NUM_ROMAN;
  216. tr->langopts.accents = 1;
  217. }
  218. break;
  219. case L('b','n'): // Bengali
  220. {
  221. static const short stress_lengths_bn[8] = {180, 180, 210, 210, 0, 0, 230, 240};
  222. static const unsigned char stress_amps_bn[8] = {18,18, 18,18, 20,20, 22,22 };
  223. SetupTranslator(tr,stress_lengths_bn,stress_amps_bn);
  224. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  225. tr->langopts.stress_rule = 0;
  226. tr->langopts.stress_flags = 0x10004; // use 'diminished' for unstressed final syllable
  227. tr->letter_bits_offset = OFFSET_BENGALI;
  228. SetIndicLetters(tr); // call this after setting OFFSET_BENGALI
  229. SetLetterBitsRange(tr,LETTERGP_F,0x3e,0x4c); // vowel signs, but not virama
  230. tr->langopts.numbers = 0x1;
  231. tr->langopts.numbers2 = 0x100;
  232. }
  233. break;
  234. case L('c','y'): // Welsh
  235. {
  236. static const short stress_lengths_cy[8] = {170,220, 180,180, 0, 0, 250,270};
  237. static const unsigned char stress_amps_cy[8] = {17,15, 18,18, 0,0, 22,20 }; // 'diminished' is used to mark a quieter, final unstressed syllable
  238. SetupTranslator(tr,stress_lengths_cy,stress_amps_cy);
  239. tr->charset_a0 = charsets[14]; // ISO-8859-14
  240. // tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  241. tr->langopts.stress_rule = 2;
  242. // tr->langopts.intonation_group = 4;
  243. // 'diminished' is an unstressed final syllable
  244. tr->langopts.stress_flags = 0x6 | 0x10;
  245. tr->langopts.unstressed_wd1 = 0;
  246. tr->langopts.unstressed_wd2 = 2;
  247. tr->langopts.param[LOPT_SONORANT_MIN] = 120; // limit the shortening of sonorants before short vowels
  248. tr->langopts.numbers = 0x401;
  249. SetLetterVowel(tr,'w'); // add letter to vowels and remove from consonants
  250. SetLetterVowel(tr,'y');
  251. }
  252. break;
  253. case L('d','a'): // Danish
  254. {
  255. static const short stress_lengths_da[8] = {160,140, 200,200, 0,0, 220,210};
  256. SetupTranslator(tr,stress_lengths_da,NULL);
  257. tr->langopts.stress_rule = 0;
  258. SetLetterVowel(tr,'y');
  259. // tr->langopts.numbers = 0x11849;
  260. }
  261. break;
  262. case L('d','e'):
  263. {
  264. static const short stress_lengths_de[8] = {150,130, 190,190, 0, 0, 260,275};
  265. tr->langopts.stress_rule = 0;
  266. tr->langopts.word_gap = 0x8; // don't use linking phonemes
  267. tr->langopts.vowel_pause = 0x30;
  268. tr->langopts.param[LOPT_PREFIXES] = 1;
  269. memcpy(tr->stress_lengths,stress_lengths_de,sizeof(tr->stress_lengths));
  270. tr->langopts.numbers = 0x11419 + NUM_ROMAN;
  271. SetLetterVowel(tr,'y');
  272. }
  273. break;
  274. case L('e','n'):
  275. {
  276. static const short stress_lengths_en[8] = {182,140, 220,220, 0,0, 248,275};
  277. SetupTranslator(tr,stress_lengths_en,NULL);
  278. tr->langopts.stress_rule = 0;
  279. tr->langopts.numbers = 0x841 + NUM_ROMAN;
  280. tr->langopts.param[LOPT_COMBINE_WORDS] = 2; // allow "mc" to cmbine with the following word
  281. }
  282. break;
  283. case L('e','l'): // Greek
  284. case L_grc: // Ancient Greek
  285. {
  286. static const short stress_lengths_el[8] = {155, 180, 210, 210, 0, 0, 270, 300};
  287. static const unsigned char stress_amps_el[8] = {15,12, 20,20, 20,22, 22,21 }; // 'diminished' is used to mark a quieter, final unstressed syllable
  288. // character codes offset by 0x380
  289. static const char el_vowels[] = {0x10,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x35,0x37,0x39,0x3f,0x45,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0};
  290. static const char el_fvowels[] = {0x2d,0x2e,0x2f,0x35,0x37,0x39,0x45,0x4d,0}; // ε η ι υ έ ή ί ύ
  291. static const char el_voiceless[]= {0x38,0x3a,0x3e,0x40,0x42,0x43,0x44,0x46,0x47,0}; // θ κ ξ π ς σ τ φ χ
  292. static const char el_consonants[]={0x32,0x33,0x34,0x36,0x38,0x3a,0x3b,0x3c,0x3d,0x3e,0x40,0x41,0x42,0x43,0x44,0x46,0x47,0x48,0};
  293. static const wchar_t el_char_apostrophe[] = {0x3c3,0}; // σ
  294. SetupTranslator(tr,stress_lengths_el,stress_amps_el);
  295. tr->charset_a0 = charsets[7]; // ISO-8859-7
  296. tr->char_plus_apostrophe = el_char_apostrophe;
  297. tr->letter_bits_offset = OFFSET_GREEK;
  298. memset(tr->letter_bits,0,sizeof(tr->letter_bits));
  299. SetLetterBits(tr,LETTERGP_A,el_vowels);
  300. SetLetterBits(tr,LETTERGP_B,el_voiceless);
  301. SetLetterBits(tr,LETTERGP_C,el_consonants);
  302. SetLetterBits(tr,LETTERGP_Y,el_fvowels); // front vowels: ε η ι υ
  303. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  304. tr->langopts.stress_rule = 2;
  305. tr->langopts.stress_flags = 0x6; // mark unstressed final syllables as diminished
  306. tr->langopts.unstressed_wd1 = 0;
  307. tr->langopts.unstressed_wd2 = 2;
  308. tr->langopts.param[LOPT_SONORANT_MIN] = 130; // limit the shortening of sonorants before short vowels
  309. tr->langopts.numbers = 0x309;
  310. tr->langopts.numbers2 = 0x2; // variant form of numbers before thousands
  311. if(name2 == L_grc)
  312. {
  313. // ancient greek
  314. tr->langopts.param[LOPT_UNPRONOUNCABLE] = 1;
  315. }
  316. }
  317. break;
  318. case L('e','o'):
  319. {
  320. static const short stress_lengths_eo[8] = {145, 145, 230, 170, 0, 0, 360, 370};
  321. static const unsigned char stress_amps_eo[] = {16,14, 20,20, 20,22, 22,21 };
  322. static const wchar_t eo_char_apostrophe[2] = {'l',0};
  323. SetupTranslator(tr,stress_lengths_eo,stress_amps_eo);
  324. tr->charset_a0 = charsets[3]; // ISO-8859-3
  325. tr->char_plus_apostrophe = eo_char_apostrophe;
  326. tr->langopts.word_gap = 1;
  327. tr->langopts.vowel_pause = 2;
  328. tr->langopts.stress_rule = 2;
  329. tr->langopts.stress_flags = 0x6 | 0x10;
  330. tr->langopts.unstressed_wd1 = 3;
  331. tr->langopts.unstressed_wd2 = 2;
  332. tr->langopts.numbers = 0x1409 + NUM_ROMAN;
  333. }
  334. break;
  335. case L('e','s'): // Spanish
  336. case L('c','a'): // Catalan
  337. {
  338. static const short stress_lengths_es[8] = {180, 210, 190, 190, 0, 0, 230, 260};
  339. // static const short stress_lengths_es[8] = {170, 200, 180, 180, 0, 0, 220, 250};
  340. static const unsigned char stress_amps_es[8] = {16,12, 18,18, 20,20, 20,20 }; // 'diminished' is used to mark a quieter, final unstressed syllable
  341. static const wchar_t ca_punct_within_word[] = {'\'',0xb7,0}; // ca: allow middle-dot within word
  342. SetupTranslator(tr,stress_lengths_es,stress_amps_es);
  343. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  344. tr->langopts.stress_rule = 2;
  345. // stress last syllable if it doesn't end in vowel or "s" or "n"
  346. // 'diminished' is an unstressed final syllable
  347. tr->langopts.stress_flags = 0x200 | 0x6 | 0x10;
  348. tr->langopts.unstressed_wd1 = 0;
  349. tr->langopts.unstressed_wd2 = 2;
  350. tr->langopts.param[LOPT_SONORANT_MIN] = 120; // limit the shortening of sonorants before short vowels
  351. tr->langopts.numbers = 0x529 + NUM_ROMAN + NUM_ROMAN_AFTER;
  352. if(name2 == L('c','a'))
  353. {
  354. tr->punct_within_word = ca_punct_within_word;
  355. tr->langopts.stress_flags = 0x200 | 0x6 | 0x30; // stress last syllable unless word ends with a vowel
  356. }
  357. }
  358. break;
  359. case L('f','i'): // Finnish
  360. {
  361. static const unsigned char stress_amps_fi[8] = {18,16, 22,22, 20,22, 22,22 };
  362. static const short stress_lengths_fi[8] = {150,180, 200,200, 0,0, 210,250};
  363. SetupTranslator(tr,stress_lengths_fi,stress_amps_fi);
  364. tr->langopts.stress_rule = 0;
  365. tr->langopts.stress_flags = 0x56; // move secondary stress from light to a following heavy syllable
  366. tr->langopts.param[LOPT_IT_DOUBLING] = 1;
  367. tr->langopts.long_stop = 130;
  368. tr->langopts.numbers = 0x1809;
  369. SetLetterVowel(tr,'y');
  370. tr->langopts.max_initial_consonants = 2;
  371. tr->langopts.spelling_stress = 1;
  372. tr->langopts.intonation_group = 3; // less intonation, don't raise pitch at comma
  373. }
  374. break;
  375. case L('f','r'): // french
  376. {
  377. static const short stress_lengths_fr[8] = {190, 170, 190, 200, 0, 0, 235, 240};
  378. static const unsigned char stress_amps_fr[8] = {18,16, 20,20, 20,22, 22,21 };
  379. SetupTranslator(tr,stress_lengths_fr,stress_amps_fr);
  380. tr->langopts.stress_rule = 3; // stress on final syllable
  381. tr->langopts.stress_flags = 0x0024; // don't use secondary stress
  382. tr->langopts.param[LOPT_IT_LENGTHEN] = 1; // remove lengthen indicator from unstressed syllables
  383. tr->langopts.numbers = 0x1509 + 0x8000 + NUM_NOPAUSE | NUM_ROMAN;
  384. SetLetterVowel(tr,'y');
  385. }
  386. break;
  387. #ifdef deleted
  388. case L('g','a'): // Irish Gaelic
  389. {
  390. tr->langopts.stress_rule = 1;
  391. }
  392. break;
  393. #endif
  394. case L('h','i'): // Hindi
  395. case L('n','e'): // Nepali
  396. {
  397. static const short stress_lengths_hi[8] = {190, 190, 210, 210, 0, 0, 230, 250};
  398. static const unsigned char stress_amps_hi[8] = {17,14, 20,19, 20,22, 22,21 };
  399. SetupTranslator(tr,stress_lengths_hi,stress_amps_hi);
  400. tr->charset_a0 = charsets[19]; // ISCII
  401. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  402. tr->langopts.stress_rule = 6; // stress on last heaviest syllable, excluding final syllable
  403. tr->langopts.stress_flags = 0x10004; // use 'diminished' for unstressed final syllable
  404. tr->langopts.numbers = 0x011;
  405. tr->langopts.numbers2 = NUM2_100000;
  406. tr->letter_bits_offset = OFFSET_DEVANAGARI;
  407. SetIndicLetters(tr);
  408. }
  409. break;
  410. case L('h','r'): // Croatian
  411. case L('b','s'): // Bosnian
  412. case L('s','r'): // Serbian
  413. {
  414. static const unsigned char stress_amps_hr[8] = {17,17, 20,20, 20,22, 22,21 };
  415. static const short stress_lengths_hr[8] = {180,160, 200,200, 0,0, 220,230};
  416. static const short stress_lengths_sr[8] = {160,150, 200,200, 0,0, 250,260};
  417. if(name2 == L('s','r'))
  418. SetupTranslator(tr,stress_lengths_sr,stress_amps_hr);
  419. else
  420. SetupTranslator(tr,stress_lengths_hr,stress_amps_hr);
  421. tr->charset_a0 = charsets[2]; // ISO-8859-2
  422. tr->langopts.stress_rule = 0;
  423. tr->langopts.stress_flags = 0x10;
  424. tr->langopts.param[LOPT_REGRESSIVE_VOICING] = 0x3;
  425. tr->langopts.max_initial_consonants = 5;
  426. tr->langopts.spelling_stress = 1;
  427. tr->langopts.accents = 1;
  428. tr->langopts.numbers = 0x140d + 0x4000 + NUM_ROMAN_UC;
  429. tr->langopts.numbers2 = 0x4a; // variant numbers before thousands,milliards
  430. tr->langopts.replace_chars = replace_cyrillic_latin;
  431. SetLetterVowel(tr,'y');
  432. SetLetterVowel(tr,'r');
  433. }
  434. break;
  435. case L('h','u'): // Hungarian
  436. {
  437. static const unsigned char stress_amps_hu[8] = {17,17, 19,19, 20,22, 22,21 };
  438. static const short stress_lengths_hu[8] = {185,195, 195,190, 0,0, 210,220};
  439. SetupTranslator(tr,stress_lengths_hu,stress_amps_hu);
  440. tr->charset_a0 = charsets[2]; // ISO-8859-2
  441. tr->langopts.vowel_pause = 0x20;
  442. tr->langopts.stress_rule = 0;
  443. tr->langopts.stress_flags = 0x8036;
  444. tr->langopts.unstressed_wd1 = 2;
  445. // tr->langopts.param[LOPT_REGRESSIVE_VOICING] = 0x4; // don't propagate over word boundaries
  446. tr->langopts.param[LOPT_IT_DOUBLING] = 1;
  447. tr->langopts.param[LOPT_COMBINE_WORDS] = 99; // combine some prepositions with the following word
  448. tr->langopts.numbers = 0x1009 + NUM_ROMAN;
  449. SetLetterVowel(tr,'y');
  450. tr->langopts.spelling_stress = 1;
  451. SetLengthMods(tr,3); // all equal
  452. }
  453. break;
  454. case L('h','y'): // Armenian
  455. {
  456. static const short stress_lengths_hy[8] = {250, 200, 250, 250, 0, 0, 250, 250};
  457. static const char hy_vowels[] = {0x31, 0x35, 0x37, 0x38, 0x3b, 0x48, 0x55, 0};
  458. static const char hy_consonants[] = {0x32,0x33,0x34,0x36,0x39,0x3a,0x3c,0x3d,0x3e,0x3f,
  459. 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x56,0};
  460. SetupTranslator(tr,stress_lengths_hy,NULL);
  461. tr->langopts.stress_rule = 3; // default stress on final syllable
  462. tr->letter_bits_offset = OFFSET_ARMENIAN;
  463. memset(tr->letter_bits,0,sizeof(tr->letter_bits));
  464. SetLetterBits(tr,LETTERGP_A,hy_vowels);
  465. SetLetterBits(tr,LETTERGP_C,hy_consonants);
  466. tr->langopts.max_initial_consonants = 6;
  467. tr->langopts.numbers = 0x409;
  468. // tr->langopts.param[LOPT_UNPRONOUNCABLE] = 1; // disable check for unpronouncable words
  469. }
  470. break;
  471. case L('i','d'): // Indonesian
  472. {
  473. static const short stress_lengths_id[8] = {160, 200, 180, 180, 0, 0, 220, 240};
  474. static const unsigned char stress_amps_id[8] = {16,18, 18,18, 20,22, 22,21 };
  475. SetupTranslator(tr,stress_lengths_id,stress_amps_id);
  476. tr->langopts.stress_rule = 2;
  477. tr->langopts.numbers = 0x1009 + NUM_ROMAN;
  478. tr->langopts.stress_flags = 0x6 | 0x10;
  479. tr->langopts.accents = 2; // "capital" after letter name
  480. }
  481. break;
  482. case L('i','s'): // Icelandic
  483. {
  484. static const short stress_lengths_is[8] = {180,160, 200,200, 0,0, 240,250};
  485. static const wchar_t is_lettergroup_B[] = {'c','f','h','k','p','t','x',0xfe,0}; // voiceless conants, including 'þ' ?? 's'
  486. SetupTranslator(tr,stress_lengths_is,NULL);
  487. tr->langopts.stress_rule = 0;
  488. tr->langopts.stress_flags = 0x10;
  489. tr->langopts.param[LOPT_IT_LENGTHEN] = 0x11; // remove lengthen indicator from unstressed vowels
  490. tr->langopts.param[LOPT_REDUCE] = 2;
  491. ResetLetterBits(tr,0x18);
  492. SetLetterBits(tr,4,"kpst"); // Letter group F
  493. SetLetterBits(tr,3,"jvr"); // Letter group H
  494. tr->letter_groups[1] = is_lettergroup_B;
  495. SetLetterVowel(tr,'y');
  496. tr->langopts.numbers = 0x8e9;
  497. tr->langopts.numbers2 = 0x2;
  498. }
  499. break;
  500. case L('i','t'): // Italian
  501. {
  502. static const short stress_lengths_it[8] = {150, 140, 170, 170, 0, 0, 300, 330};
  503. static const unsigned char stress_amps_it[8] = {15,14, 19,19, 20,22, 22,20 };
  504. SetupTranslator(tr,stress_lengths_it,stress_amps_it);
  505. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  506. tr->langopts.stress_rule = 2;
  507. tr->langopts.vowel_pause = 1;
  508. tr->langopts.unstressed_wd1 = 2;
  509. tr->langopts.unstressed_wd2 = 2;
  510. tr->langopts.param[LOPT_IT_LENGTHEN] = 2; // remove lengthen indicator from unstressed or non-penultimate syllables
  511. tr->langopts.param[LOPT_IT_DOUBLING] = 2; // double the first consonant if the previous word ends in a stressed vowel
  512. tr->langopts.param[LOPT_SONORANT_MIN] = 130; // limit the shortening of sonorants before short vowels
  513. tr->langopts.param[LOPT_REDUCE] = 1; // reduce vowels even if phonemes are specified in it_list
  514. tr->langopts.numbers = 0x2709 + NUM_ROMAN;
  515. tr->langopts.accents = 2; // Say "Capital" after the letter.
  516. }
  517. break;
  518. case L_jbo: // Lojban
  519. {
  520. static const short stress_lengths_jbo[8] = {145,145, 170,160, 0,0, 330,350};
  521. static const wchar_t jbo_punct_within_word[] = {'.',',','\'',0x2c8,0}; // allow period and comma within a word, also stress marker (from LOPT_SYLLABLE_CAPS)
  522. SetupTranslator(tr,stress_lengths_jbo,NULL);
  523. tr->langopts.stress_rule = 2;
  524. tr->langopts.vowel_pause = 0x20c; // pause before a word which starts with a vowel, or after a word which ends in a consonant
  525. // tr->langopts.word_gap = 1;
  526. tr->punct_within_word = jbo_punct_within_word;
  527. tr->langopts.param[LOPT_SYLLABLE_CAPS] = 1; // capitals indicate stressed syllables
  528. SetLetterVowel(tr,'y');
  529. }
  530. break;
  531. case L('k','o'): // Korean, TEST
  532. {
  533. static const char ko_ivowels[] = {0x63,0x64,0x67,0x68,0x6d,0x72,0x74,0x75,0}; // y and i vowels
  534. static const unsigned char ko_voiced[] = {0x02,0x05,0x06,0xab,0xaf,0xb7,0xbc,0}; // voiced consonants, l,m,n,N
  535. tr->letter_bits_offset = OFFSET_KOREAN;
  536. memset(tr->letter_bits,0,sizeof(tr->letter_bits));
  537. SetLetterBitsRange(tr,LETTERGP_A,0x61,0x75);
  538. SetLetterBits(tr,LETTERGP_Y,ko_ivowels);
  539. SetLetterBits(tr,LETTERGP_G,(const char *)ko_voiced);
  540. tr->langopts.stress_rule = 8; // ?? 1st syllable if it is heavy, else 2nd syllable
  541. tr->langopts.param[LOPT_UNPRONOUNCABLE] = 1; // disable check for unpronouncable words
  542. tr->langopts.numbers = 0x0401;
  543. }
  544. break;
  545. case L('k','u'): // Kurdish
  546. {
  547. static const unsigned char stress_amps_ku[8] = {18,18, 20,20, 20,22, 22,21 };
  548. static const short stress_lengths_ku[8] = {180,180, 190,180, 0,0, 230,240};
  549. SetupTranslator(tr,stress_lengths_ku,stress_amps_ku);
  550. tr->charset_a0 = charsets[9]; // ISO-8859-9 - Latin5
  551. tr->langopts.stress_rule = 7; // stress on the last syllable, before any explicitly unstressed syllable
  552. tr->langopts.numbers = 0x100461;
  553. tr->langopts.max_initial_consonants = 2;
  554. }
  555. break;
  556. case L('l','a'): //Latin
  557. {
  558. tr->charset_a0 = charsets[4]; // ISO-8859-4, includes a,e,i,o,u-macron
  559. tr->langopts.stress_rule = 2;
  560. tr->langopts.stress_flags = 0x20;
  561. tr->langopts.unstressed_wd1 = 0;
  562. tr->langopts.unstressed_wd2 = 2;
  563. tr->langopts.param[LOPT_DIERESES] = 1;
  564. tr->langopts.numbers = 0x1 + NUM_ROMAN;
  565. tr->langopts.max_roman = 5000;
  566. }
  567. break;
  568. case L('l','v'): // latvian
  569. {
  570. static const unsigned char stress_amps_lv[8] = {17,13, 20,20, 20,22, 22,21 };
  571. static const short stress_lengths_lv[8] = {180,130, 210,210, 0,0, 210,210};
  572. SetupTranslator(tr,stress_lengths_lv,stress_amps_lv);
  573. tr->langopts.stress_rule = 0;
  574. tr->langopts.spelling_stress = 1;
  575. tr->charset_a0 = charsets[4]; // ISO-8859-4
  576. tr->langopts.numbers = 0x409 + 0x8000 + 0x10000;
  577. tr->langopts.stress_flags = 0x16 + 0x40000;
  578. }
  579. break;
  580. case L('m','k'): // Macedonian
  581. {
  582. static wchar_t vowels_cyrillic[] = {0x440, // also include 'р' [R]
  583. 0x430,0x435,0x438,0x439,0x43e,0x443,0x44b,0x44d,0x44e,0x44f,0x450,0x451,0x456,0x457,0x45d,0x45e,0};
  584. static const unsigned char stress_amps_mk[8] = {17,17, 20,20, 20,22, 22,21 };
  585. static const short stress_lengths_mk[8] = {180,160, 200,200, 0,0, 220,230};
  586. SetupTranslator(tr,stress_lengths_mk,stress_amps_mk);
  587. tr->charset_a0 = charsets[5]; // ISO-8859-5
  588. tr->letter_groups[0] = vowels_cyrillic;
  589. tr->langopts.stress_rule = 4; // antipenultimate
  590. tr->langopts.numbers = 0x0429 + 0x4000;
  591. tr->langopts.numbers2 = 0x8a; // variant numbers before thousands,milliards
  592. }
  593. break;
  594. case L('n','l'): // Dutch
  595. {
  596. static const short stress_lengths_nl[8] = {160,135, 210,210, 0, 0, 260,280};
  597. tr->langopts.stress_rule = 0;
  598. tr->langopts.vowel_pause = 1;
  599. tr->langopts.param[LOPT_DIERESES] = 1;
  600. tr->langopts.param[LOPT_PREFIXES] = 1;
  601. SetLetterVowel(tr,'y');
  602. tr->langopts.numbers = 0x11c19;
  603. memcpy(tr->stress_lengths,stress_lengths_nl,sizeof(tr->stress_lengths));
  604. }
  605. break;
  606. case L('n','o'): // Norwegian
  607. {
  608. static const short stress_lengths_no[8] = {160,140, 200,200, 0,0, 220,210};
  609. SetupTranslator(tr,stress_lengths_no,NULL);
  610. tr->langopts.stress_rule = 0;
  611. SetLetterVowel(tr,'y');
  612. tr->langopts.numbers = 0x11849;
  613. }
  614. break;
  615. case L('o','m'):
  616. {
  617. static const unsigned char stress_amps_om[] = {18,15, 20,20, 20,22, 22,22 };
  618. static const short stress_lengths_om[8] = {200,200, 200,200, 0,0, 200,200};
  619. SetupTranslator(tr,stress_lengths_om,stress_amps_om);
  620. tr->langopts.stress_rule = 2;
  621. tr->langopts.stress_flags = 0x16 + 0x80000;
  622. }
  623. break;
  624. case L('p','l'): // Polish
  625. {
  626. static const short stress_lengths_pl[8] = {160, 190, 175, 175, 0, 0, 200, 210};
  627. static const unsigned char stress_amps_pl[8] = {17,13, 19,19, 20,22, 22,21 }; // 'diminished' is used to mark a quieter, final unstressed syllable
  628. SetupTranslator(tr,stress_lengths_pl,stress_amps_pl);
  629. tr->charset_a0 = charsets[2]; // ISO-8859-2
  630. tr->langopts.stress_rule = 2;
  631. tr->langopts.stress_flags = 0x6; // mark unstressed final syllables as diminished
  632. tr->langopts.param[LOPT_REGRESSIVE_VOICING] = 0x8;
  633. tr->langopts.max_initial_consonants = 7; // for example: wchrzczony :)
  634. tr->langopts.numbers=0x1009 + 0x4000;
  635. tr->langopts.numbers2=0x40;
  636. tr->langopts.param[LOPT_COMBINE_WORDS] = 4 + 0x100; // combine 'nie' (marked with $alt2) with some 1-syllable (and 2-syllable) words (marked with $alt)
  637. SetLetterVowel(tr,'y');
  638. }
  639. break;
  640. case L('p','t'): // Portuguese
  641. {
  642. static const short stress_lengths_pt[8] = {180, 125, 210, 210, 0, 0, 270, 295};
  643. static const unsigned char stress_amps_pt[8] = {16,13, 19,19, 20,22, 22,21 }; // 'diminished' is used to mark a quieter, final unstressed syllable
  644. SetupTranslator(tr,stress_lengths_pt,stress_amps_pt);
  645. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  646. tr->langopts.stress_rule = 3; // stress on final syllable
  647. tr->langopts.stress_flags = 0x6 | 0x10 | 0x20000;
  648. tr->langopts.numbers = 0x269 + 0x4000 + NUM_ROMAN;
  649. SetLetterVowel(tr,'y');
  650. ResetLetterBits(tr,0x2);
  651. SetLetterBits(tr,1,"bcdfgjkmnpqstvxz"); // B hard consonants, excluding h,l,r,w,y
  652. }
  653. break;
  654. case L('r','o'): // Romanian
  655. {
  656. static const short stress_lengths_ro[8] = {170, 170, 180, 180, 0, 0, 240, 260};
  657. static const unsigned char stress_amps_ro[8] = {15,13, 18,18, 20,22, 22,21 };
  658. SetupTranslator(tr,stress_lengths_ro,stress_amps_ro);
  659. tr->langopts.stress_rule = 2;
  660. tr->langopts.stress_flags = 0x100 + 0x6;
  661. tr->charset_a0 = charsets[2]; // ISO-8859-2
  662. tr->langopts.numbers = 0x1029+0x6000 + NUM_ROMAN;
  663. tr->langopts.numbers2 = 0x1e; // variant numbers before all thousandplex
  664. }
  665. break;
  666. case L('r','u'): // Russian
  667. Translator_Russian(tr);
  668. break;
  669. case L('r','w'): // Kiryarwanda
  670. {
  671. tr->langopts.stress_rule = 2;
  672. tr->langopts.stress_flags = 0x16;
  673. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  674. tr->langopts.numbers = 0x61 + 0x100000 + 0x4000;
  675. tr->langopts.numbers2 = 0x200; // say "thousands" before its number
  676. }
  677. break;
  678. case L('s','k'): // Slovak
  679. case L('c','s'): // Czech
  680. {
  681. static const char *sk_voiced = "bdgjlmnrvwzaeiouy";
  682. SetupTranslator(tr,stress_lengths_sk,stress_amps_sk);
  683. tr->charset_a0 = charsets[2]; // ISO-8859-2
  684. tr->langopts.stress_rule = 0;
  685. tr->langopts.stress_flags = 0x16;
  686. tr->langopts.param[LOPT_REGRESSIVE_VOICING] = 0x3;
  687. tr->langopts.max_initial_consonants = 5;
  688. tr->langopts.spelling_stress = 1;
  689. tr->langopts.param[LOPT_COMBINE_WORDS] = 4; // combine some prepositions with the following word
  690. tr->langopts.numbers = 0x0401 + 0x4000 + NUM_ROMAN;
  691. tr->langopts.numbers2 = 0x40;
  692. tr->langopts.thousands_sep = 0; //no thousands separator
  693. tr->langopts.decimal_sep = ',';
  694. if(name2 == L('c','s'))
  695. {
  696. tr->langopts.numbers2 = 0x48; // variant numbers before milliards
  697. }
  698. SetLetterVowel(tr,'y');
  699. SetLetterVowel(tr,'r');
  700. ResetLetterBits(tr,0x20);
  701. SetLetterBits(tr,5,sk_voiced);
  702. }
  703. break;
  704. case L('s','q'): // Albanian
  705. {
  706. static const short stress_lengths_sq[8] = {150, 150, 180, 180, 0, 0, 300, 300};
  707. static const unsigned char stress_amps_sq[8] = {16,12, 16,16, 20,20, 21,19 };
  708. SetupTranslator(tr,stress_lengths_sq,stress_amps_sq);
  709. tr->langopts.stress_rule = 2;
  710. tr->langopts.stress_flags = 0x16 + 0x100;
  711. SetLetterVowel(tr,'y');
  712. tr->langopts.numbers = 0x69 + 0x8000;
  713. }
  714. break;
  715. case L('s','v'): // Swedish
  716. {
  717. static const unsigned char stress_amps_sv[] = {16,16, 20,20, 20,22, 22,21 };
  718. static const short stress_lengths_sv[8] = {160,135, 220,220, 0,0, 250,280};
  719. SetupTranslator(tr,stress_lengths_sv,stress_amps_sv);
  720. tr->langopts.stress_rule = 0;
  721. SetLetterVowel(tr,'y');
  722. tr->langopts.numbers = 0x1909;
  723. tr->langopts.accents = 1;
  724. }
  725. break;
  726. case L('s','w'): // Swahili
  727. {
  728. static const short stress_lengths_sw[8] = {160, 170, 200, 200, 0, 0, 320, 340};
  729. static const unsigned char stress_amps_sw[] = {16,12, 19,19, 20,22, 22,21 };
  730. SetupTranslator(tr,stress_lengths_sw,stress_amps_sw);
  731. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  732. tr->langopts.vowel_pause = 1;
  733. tr->langopts.stress_rule = 2;
  734. tr->langopts.stress_flags = 0x6 | 0x10;
  735. tr->langopts.numbers = 0x4e1;
  736. tr->langopts.numbers2 = NUM2_100000a;
  737. }
  738. break;
  739. case L('t','a'): // Tamil
  740. case L('m','l'): // Malayalam
  741. case L('k','n'): // Kannada
  742. case L('m','r'): // Marathi
  743. {
  744. static const short stress_lengths_ta[8] = {200, 200, 210, 210, 0, 0, 230, 230};
  745. static const unsigned char stress_amps_ta[8] = {18,18, 18,18, 20,20, 22,22 };
  746. SetupTranslator(tr,stress_lengths_ta,stress_amps_ta);
  747. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  748. tr->langopts.stress_rule = 0;
  749. tr->langopts.stress_flags = 0x10004; // use 'diminished' for unstressed final syllable
  750. tr->letter_bits_offset = OFFSET_TAMIL;
  751. if(name2 == L('m','r'))
  752. {
  753. tr->letter_bits_offset = OFFSET_DEVANAGARI;
  754. }
  755. else
  756. if(name2 == L('m','l'))
  757. {
  758. tr->letter_bits_offset = OFFSET_MALAYALAM;
  759. }
  760. else
  761. if(name2 == L('k','n'))
  762. {
  763. tr->letter_bits_offset = OFFSET_KANNADA;
  764. tr->langopts.numbers = 0x1;
  765. tr->langopts.numbers2 = NUM2_100000;
  766. }
  767. tr->langopts.param[LOPT_WORD_MERGE] = 1; // don't break vowels betwen words
  768. SetIndicLetters(tr); // call this after setting OFFSET_
  769. }
  770. break;
  771. #ifdef deleted
  772. case L('t','h'): // Thai
  773. {
  774. static const short stress_lengths_th[8] = {230,150, 230,230, 230,0, 230,250};
  775. static const unsigned char stress_amps_th[] = {22,16, 22,22, 22,22, 22,22 };
  776. SetupTranslator(tr,stress_lengths_th,stress_amps_th);
  777. tr->langopts.stress_rule = 0; // stress on final syllable of a "word"
  778. tr->langopts.stress_flags = 1; // don't automatically set diminished stress (may be set in the intonation module)
  779. tr->langopts.tone_language = 1; // Tone language, use CalcPitches_Tone() rather than CalcPitches()
  780. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  781. // tr->langopts.tone_numbers = 1; // a number after letters indicates a tone number (eg. pinyin or jyutping)
  782. tr->langopts.word_gap = 0x21; // length of a final vowel is less dependent on the next consonant, don't merge consonant with next word
  783. }
  784. break;
  785. #endif
  786. case L('t','r'): // Turkish
  787. {
  788. static const unsigned char stress_amps_tr[8] = {18,18, 20,20, 20,22, 22,21 };
  789. static const short stress_lengths_tr[8] = {190,190, 190,190, 0,0, 250,270};
  790. SetupTranslator(tr,stress_lengths_tr,stress_amps_tr);
  791. tr->charset_a0 = charsets[9]; // ISO-8859-9 - Latin5
  792. tr->langopts.stress_rule = 7; // stress on the last syllable, before any explicitly unstressed syllable
  793. tr->langopts.stress_flags = 0x20; //no automatic secondary stress
  794. tr->langopts.numbers = 0x1509 + 0x4000;
  795. tr->langopts.max_initial_consonants = 2;
  796. }
  797. break;
  798. case L('v','i'): // Vietnamese
  799. {
  800. static const short stress_lengths_vi[8] = {150, 150, 180, 180, 210, 230, 230, 240};
  801. static const unsigned char stress_amps_vi[] = {16,16, 16,16, 22,22, 22,22 };
  802. static wchar_t vowels_vi[] = {
  803. 0x61, 0xe0, 0xe1, 0x1ea3, 0xe3, 0x1ea1, // a
  804. 0x103, 0x1eb1, 0x1eaf, 0x1eb3, 0x1eb5, 0x1eb7, // ă
  805. 0xe2, 0x1ea7, 0x1ea5, 0x1ea9, 0x1eab, 0x1ead, // â
  806. 0x65, 0xe8, 0xe9, 0x1ebb, 0x1ebd, 0x1eb9, // e
  807. 0xea, 0x1ec1, 0x1ebf, 0x1ec3, 0x1ec5, 0x1ec7, // i
  808. 0x69, 0xec, 0xed, 0x1ec9, 0x129, 0x1ecb, // i
  809. 0x6f, 0xf2, 0xf3, 0x1ecf, 0xf5, 0x1ecd, // o
  810. 0xf4, 0x1ed3, 0x1ed1, 0x1ed5, 0x1ed7, 0x1ed9, // ô
  811. 0x1a1, 0x1edd, 0x1edb, 0x1edf, 0x1ee1, 0x1ee3, // ơ
  812. 0x75, 0xf9, 0xfa, 0x1ee7, 0x169, 0x1ee5, // u
  813. 0x1b0, 0x1eeb, 0x1ee9, 0x1eed, 0x1eef, 0x1ef1, // ư
  814. 0x79, 0x1ef3, 0xfd, 0x1ef7, 0x1ef9, 0x1e, 0 }; // y
  815. SetupTranslator(tr,stress_lengths_vi,stress_amps_vi);
  816. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  817. tr->langopts.stress_rule = 0;
  818. tr->langopts.word_gap = 0x21; // length of a final vowel is less dependent on the next consonant, don't merge consonant with next word
  819. // tr->langopts.vowel_pause = 4;
  820. tr->letter_groups[0] = vowels_vi;
  821. tr->langopts.tone_language = 1; // Tone language, use CalcPitches_Tone() rather than CalcPitches()
  822. tr->langopts.unstressed_wd1 = 2;
  823. tr->langopts.numbers = 0x0049 + 0x8000;
  824. }
  825. break;
  826. case L('z','h'):
  827. case L_zhy:
  828. {
  829. static const short stress_lengths_zh[8] = {230,150, 230,230, 230,0, 240,250}; // 1=tone5. end-of-sentence, 6=tone 1&4, 7=tone 2&3
  830. static const unsigned char stress_amps_zh[] = {22,16, 22,22, 22,22, 22,22 };
  831. SetupTranslator(tr,stress_lengths_zh,stress_amps_zh);
  832. tr->langopts.stress_rule = 3; // stress on final syllable of a "word"
  833. tr->langopts.stress_flags = 1; // don't automatically set diminished stress (may be set in the intonation module)
  834. tr->langopts.vowel_pause = 0;
  835. tr->langopts.tone_language = 1; // Tone language, use CalcPitches_Tone() rather than CalcPitches()
  836. tr->langopts.length_mods0 = tr->langopts.length_mods; // don't lengthen vowels in the last syllable
  837. tr->langopts.tone_numbers = 1; // a number after letters indicates a tone number (eg. pinyin or jyutping)
  838. tr->langopts.ideographs = 1;
  839. tr->langopts.word_gap = 0x21; // length of a final vowel is less dependent on the next consonant, don't merge consonant with next word
  840. if(name2 == L('z','h'))
  841. {
  842. tr->langopts.textmode = 1;
  843. tr->langopts.listx = 1; // compile zh_listx after zh_list
  844. }
  845. }
  846. break;
  847. default:
  848. break;
  849. }
  850. tr->translator_name = name2;
  851. if(tr->langopts.numbers & 0x8)
  852. {
  853. // use . and , for thousands and decimal separators
  854. tr->langopts.thousands_sep = '.';
  855. tr->langopts.decimal_sep = ',';
  856. }
  857. if(tr->langopts.numbers & 0x4)
  858. {
  859. tr->langopts.thousands_sep = 0; // don't allow thousands separator, except space
  860. }
  861. return(tr);
  862. } // end of SelectTranslator
  863. //**********************************************************************************************************
  864. static void Translator_Russian(Translator *tr)
  865. {//===========================================
  866. static const unsigned char stress_amps_ru[] = {16,16, 18,18, 20,24, 24,22 };
  867. static const short stress_lengths_ru[8] = {150,140, 220,220, 0,0, 260,280};
  868. // character codes offset by 0x420
  869. static const char ru_vowels[] = {0x10,0x15,0x31,0x18,0x1e,0x23,0x2b,0x2d,0x2e,0x2f,0};
  870. static const char ru_consonants[] = {0x11,0x12,0x13,0x14,0x16,0x17,0x19,0x1a,0x1b,0x1c,0x1d,0x1f,0x20,0x21,0x22,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2c,0};
  871. static const char ru_soft[] = {0x2c,0x19,0x27,0x29,0}; // letter group B [k ts; s;]
  872. static const char ru_hard[] = {0x2a,0x16,0x26,0x28,0}; // letter group H [S Z ts]
  873. static const char ru_nothard[] = {0x11,0x12,0x13,0x14,0x17,0x19,0x1a,0x1b,0x1c,0x1d,0x1f,0x20,0x21,0x22,0x24,0x25,0x27,0x29,0x2c,0};
  874. static const char ru_voiced[] = {0x11,0x12,0x13,0x14,0x16,0x17,0}; // letter group G (voiced obstruents)
  875. static const char ru_ivowels[] = {0x2c,0x15,0x31,0x18,0x2e,0x2f,0}; // letter group Y (iotated vowels & soft-sign)
  876. SetupTranslator(tr,stress_lengths_ru,stress_amps_ru);
  877. tr->charset_a0 = charsets[18]; // KOI8-R
  878. tr->transpose_offset = 0x42f; // convert cyrillic from unicode into range 0x01 to 0x22
  879. tr->transpose_min = 0x430;
  880. tr->transpose_max = 0x451;
  881. tr->letter_bits_offset = OFFSET_CYRILLIC;
  882. memset(tr->letter_bits,0,sizeof(tr->letter_bits));
  883. SetLetterBits(tr,0,ru_vowels);
  884. SetLetterBits(tr,1,ru_soft);
  885. SetLetterBits(tr,2,ru_consonants);
  886. SetLetterBits(tr,3,ru_hard);
  887. SetLetterBits(tr,4,ru_nothard);
  888. SetLetterBits(tr,5,ru_voiced);
  889. SetLetterBits(tr,6,ru_ivowels);
  890. SetLetterBits(tr,7,ru_vowels);
  891. tr->langopts.param[LOPT_UNPRONOUNCABLE] = 0x432; // [v] don't count this character at start of word
  892. tr->langopts.param[LOPT_REGRESSIVE_VOICING] = 1;
  893. tr->langopts.param[LOPT_REDUCE] = 2;
  894. tr->langopts.stress_rule = 5;
  895. tr->langopts.stress_flags = 0x0020; // waas 0x1010
  896. tr->langopts.numbers = 0x0409;
  897. tr->langopts.numbers2 = 0xc2; // variant numbers before thousands
  898. tr->langopts.phoneme_change = 1;
  899. tr->langopts.testing = 2;
  900. } // end of Translator_Russian
  901. /*
  902. typedef struct {
  903. int flags;
  904. unsigned char stress; // stress level of this vowel
  905. unsigned char stress_highest; // the highest stress level of a vowel in this word
  906. unsigned char n_vowels; // number of vowels in the word
  907. unsigned char vowel_this; // syllable number of this vowel (counting from 1)
  908. unsigned char vowel_stressed; // syllable number of the highest stressed vowel
  909. } CHANGEPH;
  910. */
  911. #define RUSSIAN2
  912. #ifdef RUSSIAN2
  913. int ChangePhonemes_ru(Translator *tr, PHONEME_LIST2 *phlist, int n_ph, int index, PHONEME_TAB *ph, CHANGEPH *ch)
  914. {//=============================================================================================================
  915. // Called for each phoneme in the phoneme list, to allow a language to make changes
  916. // ph The current phoneme
  917. int variant;
  918. int vowelix;
  919. PHONEME_TAB *prev, *next;
  920. if(ch->flags & 8)
  921. return(0); // full phoneme translation has already been given
  922. // Russian vowel softening and reduction rules
  923. if(ph->type == phVOWEL)
  924. {
  925. int prestressed = ch->vowel_stressed==ch->vowel_this+1; // the next vowel after this has the main stress
  926. #define N_VOWELS_RU 11
  927. static unsigned int vowels_ru[N_VOWELS_RU] = {'a','V','O','I',PH('I','#'),PH('E','#'),PH('E','2'),
  928. PH('V','#'),PH('I','3'),PH('I','2'),PH('E','3')};
  929. static unsigned int vowel_replace[N_VOWELS_RU][6] = {
  930. // stressed, soft, soft-stressed, j+stressed, j+soft, j+soft-stressed
  931. /*0*/ {'A', 'I', PH('j','a'), 'a', 'a', 'a'}, // a Uses 3,4,5 columns.
  932. /*1*/ {'A', 'V', PH('j','a'), 'a', 'V', 'a'}, // V Uses 3,4,5 columns.
  933. /*2*/ {'o', '8', '8', 'o', '8', '8'}, // O
  934. /*3*/ {'i', 'I', 'i', 'a', 'I', 'a'}, // I Uses 3,4,5 columns.
  935. /*4*/ {'i', PH('I','#'), 'i', 'i', PH('I','#'), 'i'}, // I#
  936. /*5*/ {'E', PH('E','#'), 'E', 'e', PH('E','#'), 'e'}, // E#
  937. /*6*/ {'E', PH('E','2'), 'E', 'e', PH('E','2'), 'e'}, // E2 Uses 3,4,5 columns.
  938. /*7*/ {PH('j','a'), 'V', PH('j','a'), 'A', 'V', 'A'}, // V#
  939. /*8*/ {PH('j','a'), 'I', PH('j','a'), 'e', 'I', 'e'}, // I3 Uses 3,4,5 columns.
  940. /*9*/ {'e', 'I', 'e', 'e', 'I', 'e'}, // I2
  941. /*10*/ {'e', PH('E', '2'), 'e', 'e', PH('E','2'), 'e'} // E3
  942. };
  943. prev = phoneme_tab[phlist[index-1].phcode];
  944. next = phoneme_tab[phlist[index+1].phcode];
  945. // lookup the vowel name to get an index into the vowel_replace[] table
  946. for(vowelix=0; vowelix<N_VOWELS_RU; vowelix++)
  947. {
  948. if(vowels_ru[vowelix] == ph->mnemonic)
  949. break;
  950. }
  951. if(vowelix == N_VOWELS_RU)
  952. return(0);
  953. if(prestressed)
  954. {
  955. if((vowelix==6)&&(prev->mnemonic=='j'))
  956. vowelix=8;
  957. if(vowelix==1)
  958. vowelix=0;
  959. if(vowelix==4)
  960. vowelix=3;
  961. if(vowelix==6)
  962. vowelix=5;
  963. if(vowelix==7)
  964. vowelix=8;
  965. if(vowelix==10)
  966. vowelix=9;
  967. }
  968. // do we need a variant of this vowel, depending on the stress and adjacent phonemes ?
  969. variant = -1;
  970. int stressed = ch->flags & 2;
  971. int soft=prev->phflags & phPALATAL;
  972. if (soft && stressed)
  973. variant = 2; else
  974. if (stressed)
  975. variant = 0; else
  976. if (soft)
  977. variant = 1;
  978. if(variant >= 0)
  979. {
  980. if(prev->mnemonic == 'j')
  981. variant += 3;
  982. phlist[index].phcode = PhonemeCode(vowel_replace[vowelix][variant]);
  983. }
  984. else
  985. {
  986. phlist[index].phcode = PhonemeCode(vowels_ru[vowelix]);
  987. }
  988. }
  989. return(0);
  990. }
  991. #else
  992. int ChangePhonemes_ru(Translator *tr, PHONEME_LIST2 *phlist, int n_ph, int index, PHONEME_TAB *ph, CHANGEPH *ch)
  993. {//=============================================================================================================
  994. // Called for each phoneme in the phoneme list, to allow a language to make changes
  995. // flags: bit 0=1 last phoneme in a word
  996. // bit 1=1 this is the highest stressed vowel in the current word
  997. // bit 2=1 after the highest stressed vowel in the current word
  998. // bit 3=1 the phonemes were specified explicitly, or found from an entry in the xx_list dictionary
  999. // ph The current phoneme
  1000. int variant;
  1001. int vowelix;
  1002. PHONEME_TAB *prev, *next;
  1003. if(ch->flags & 8)
  1004. return(0); // full phoneme translation has already been given
  1005. // Russian vowel softening and reduction rules
  1006. if(ph->type == phVOWEL)
  1007. {
  1008. #define N_VOWELS_RU 7
  1009. static unsigned char vowels_ru[N_VOWELS_RU] = {'a','A','o','E','i','u','y'};
  1010. // each line gives: soft, reduced, soft-reduced, post-tonic
  1011. static unsigned short vowel_replace[N_VOWELS_RU][4] = {
  1012. {'&', 'V', 'I', 'V'}, // a
  1013. {'&', 'V', 'I', 'V'}, // A
  1014. {'8', 'V', 'I', 'V'}, // o
  1015. {'e', 'I', 'I', 'I'}, // E
  1016. {'i', 'I', 'I', 'I'}, // i
  1017. {'u'+('"'<<8), 'U', 'U', 'U'}, // u
  1018. {'y', 'Y', 'Y', 'Y'}}; // y
  1019. prev = phoneme_tab[phlist[index-1].phcode];
  1020. next = phoneme_tab[phlist[index+1].phcode];
  1021. if(prev->mnemonic == 'j')
  1022. return(0);
  1023. // lookup the vowel name to get an index into the vowel_replace[] table
  1024. for(vowelix=0; vowelix<N_VOWELS_RU; vowelix++)
  1025. {
  1026. if(vowels_ru[vowelix] == ph->mnemonic)
  1027. break;
  1028. }
  1029. if(vowelix == N_VOWELS_RU)
  1030. return(0);
  1031. // do we need a variant of this vowel, depending on the stress and adjacent phonemes ?
  1032. variant = -1;
  1033. if(ch->flags & 2)
  1034. {
  1035. // a stressed vowel
  1036. if((prev->phflags & phPALATAL) && ((next->phflags & phPALATAL) || phoneme_tab[phlist[index+2].phcode]->mnemonic == ';'))
  1037. {
  1038. // between two palatal consonants, use the soft variant
  1039. variant = 0;
  1040. }
  1041. }
  1042. else
  1043. {
  1044. // an unstressed vowel
  1045. if(prev->phflags & phPALATAL)
  1046. {
  1047. variant = 2; // unstressed soft
  1048. }
  1049. else
  1050. if((ph->mnemonic == 'o') && ((prev->phflags & phPLACE) == phPLACE_pla))
  1051. {
  1052. variant = 2; // unstressed soft ([o] vowel following: ш ж
  1053. }
  1054. else
  1055. if(ch->flags & 4)
  1056. {
  1057. variant = 3; // post tonic
  1058. }
  1059. else
  1060. {
  1061. variant = 1; // unstressed
  1062. }
  1063. }
  1064. if(variant >= 0)
  1065. {
  1066. phlist[index].phcode = PhonemeCode(vowel_replace[vowelix][variant]);
  1067. }
  1068. }
  1069. return(0);
  1070. }
  1071. #endif