|
|
|
|
|
|
|
|
static int IsLetterGroup(Translator *tr, char *word, int group, int pre) |
|
|
static int IsLetterGroup(Translator *tr, char *word, int group, int pre) |
|
|
{ |
|
|
{ |
|
|
/* Match the word against a list of utf-8 strings. |
|
|
/* Match the word against a list of utf-8 strings. |
|
|
|
|
|
* returns length of matching letter group or -1 |
|
|
* |
|
|
* |
|
|
* How this works: |
|
|
* How this works: |
|
|
* |
|
|
* |
|
|
|
|
|
|
|
|
} else |
|
|
} else |
|
|
w = word; |
|
|
w = word; |
|
|
|
|
|
|
|
|
// If no character is allowed in group |
|
|
|
|
|
// at the start (for pre-rule) or end (post-rule) |
|
|
|
|
|
// of the checked letter in the word, return true. |
|
|
|
|
|
if (*p == '~' && *w == ' ') // word end checked because of comment below |
|
|
|
|
|
return 1; |
|
|
|
|
|
/* TODO: Need to investigate why word end mark _ doesn't work properly |
|
|
|
|
|
* for post rule somewhere in MatchRule() function. or e.g.: |
|
|
|
|
|
* |
|
|
|
|
|
* .L01 ~ b c |
|
|
|
|
|
* .group a |
|
|
|
|
|
* _L01) a i // this works |
|
|
|
|
|
* a (L01_ u // this doesn't work |
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
// If '~' (no character) is allowed in group, return 0. |
|
|
|
|
|
if (*p == '~') |
|
|
|
|
|
return 0; |
|
|
|
|
|
|
|
|
|
|
|
// Check current group |
|
|
while ((*p == *w) && (*w != 0)) { |
|
|
while ((*p == *w) && (*w != 0)) { |
|
|
w++; |
|
|
w++; |
|
|
p++; |
|
|
p++; |
|
|
|
|
|
|
|
|
while (*p++ != 0) |
|
|
while (*p++ != 0) |
|
|
; |
|
|
; |
|
|
} |
|
|
} |
|
|
return 0; |
|
|
|
|
|
|
|
|
// Not found |
|
|
|
|
|
return -1; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
static int IsLetter(Translator *tr, int letter, int group) |
|
|
static int IsLetter(Translator *tr, int letter, int group) |
|
|
|
|
|
|
|
|
break; |
|
|
break; |
|
|
case RULE_LETTERGP2: // match against a list of utf-8 strings |
|
|
case RULE_LETTERGP2: // match against a list of utf-8 strings |
|
|
letter_group = LetterGroupNo(rule++); |
|
|
letter_group = LetterGroupNo(rule++); |
|
|
if ((n_bytes = IsLetterGroup(tr, post_ptr-1, letter_group, 0)) > 0) { |
|
|
|
|
|
|
|
|
if ((n_bytes = IsLetterGroup(tr, post_ptr-1, letter_group, 0)) >= 0) { |
|
|
add_points = (20-distance_right); |
|
|
add_points = (20-distance_right); |
|
|
post_ptr += (n_bytes-1); |
|
|
|
|
|
|
|
|
if (n_bytes > 0) // move pointer, if non-zero length group was found |
|
|
|
|
|
post_ptr += (n_bytes-1); |
|
|
} else |
|
|
} else |
|
|
failed = 1; |
|
|
failed = 1; |
|
|
break; |
|
|
break; |
|
|
|
|
|
|
|
|
case RULE_SKIPCHARS: |
|
|
case RULE_SKIPCHARS: |
|
|
{ |
|
|
{ |
|
|
// '(Jxy' means 'skip characters until xy' |
|
|
// '(Jxy' means 'skip characters until xy' |
|
|
char *p = post_ptr + letter_xbytes; |
|
|
|
|
|
char *p2 = p; // pointer to the previous character in the word |
|
|
|
|
|
int rule_w; // first wide character of skip rule |
|
|
|
|
|
|
|
|
char *p = post_ptr - 1; // to allow empty jump (without letter between), go one back |
|
|
|
|
|
char *p2 = p; // pointer to the previous character in the word |
|
|
|
|
|
int rule_w; // first wide character of skip rule |
|
|
utf8_in(&rule_w, rule); |
|
|
utf8_in(&rule_w, rule); |
|
|
int g_bytes = 0; // bytes of successfully found character group |
|
|
|
|
|
while ((letter_w != rule_w) && (letter_w != RULE_SPACE) && (letter_w != 0) && (g_bytes == 0)) { |
|
|
|
|
|
|
|
|
int g_bytes = -1; // bytes of successfully found character group |
|
|
|
|
|
while ((letter_w != rule_w) && (letter_w != RULE_SPACE) && (letter_w != 0) && (g_bytes == -1)) { |
|
|
|
|
|
if (rule_w == RULE_LETTERGP2) |
|
|
|
|
|
g_bytes = IsLetterGroup(tr, p, LetterGroupNo(rule + 1), 0); |
|
|
p2 = p; |
|
|
p2 = p; |
|
|
p += utf8_in(&letter_w, p); |
|
|
p += utf8_in(&letter_w, p); |
|
|
if (rule_w == RULE_LETTERGP2) |
|
|
|
|
|
g_bytes = IsLetterGroup(tr, p2, LetterGroupNo(rule + 1), 0); |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
if ((letter_w == rule_w) || (g_bytes > 0)) |
|
|
|
|
|
|
|
|
if ((letter_w == rule_w) || (g_bytes >= 0)) |
|
|
post_ptr = p2; |
|
|
post_ptr = p2; |
|
|
} |
|
|
} |
|
|
break; |
|
|
break; |
|
|
|
|
|
|
|
|
break; |
|
|
break; |
|
|
case RULE_LETTERGP2: // match against a list of utf-8 strings |
|
|
case RULE_LETTERGP2: // match against a list of utf-8 strings |
|
|
letter_group = LetterGroupNo(rule++); |
|
|
letter_group = LetterGroupNo(rule++); |
|
|
if ((n_bytes = IsLetterGroup(tr, pre_ptr, letter_group, 1)) > 0) { |
|
|
|
|
|
|
|
|
if ((n_bytes = IsLetterGroup(tr, pre_ptr, letter_group, 1)) >= 0) { |
|
|
add_points = (20-distance_right); |
|
|
add_points = (20-distance_right); |
|
|
pre_ptr -= (n_bytes-1); |
|
|
|
|
|
|
|
|
if (n_bytes > 0) // move pointer, if non-zero length group was found |
|
|
|
|
|
pre_ptr -= (n_bytes-1); |
|
|
} else |
|
|
} else |
|
|
failed = 1; |
|
|
failed = 1; |
|
|
break; |
|
|
break; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
case RULE_SKIPCHARS: { |
|
|
case RULE_SKIPCHARS: { |
|
|
// 'xyJ)' means 'skip characters backwards until xy' |
|
|
// 'xyJ)' means 'skip characters backwards until xy' |
|
|
char *p = pre_ptr; // pointer to current character in word |
|
|
|
|
|
char *p2 = p; // pointer to previous character in word |
|
|
|
|
|
int g_bytes = 0; // bytes of successfully found character group |
|
|
|
|
|
|
|
|
char *p = pre_ptr + 1; // to allow empty jump (without letter between), go one forward |
|
|
|
|
|
char *p2 = p; // pointer to previous character in word |
|
|
|
|
|
int g_bytes = -1; // bytes of successfully found character group |
|
|
|
|
|
|
|
|
while ((*p != *rule) && (*p != RULE_SPACE) && (*p != 0) && (g_bytes == 0)) { |
|
|
|
|
|
|
|
|
while ((*p != *rule) && (*p != RULE_SPACE) && (*p != 0) && (g_bytes == -1)) { |
|
|
p2 = p; |
|
|
p2 = p; |
|
|
p--; |
|
|
p--; |
|
|
if (*rule == RULE_LETTERGP2) |
|
|
if (*rule == RULE_LETTERGP2) |
|
|
|
|
|
|
|
|
// 'xy' part is checked as usual in following cycles of PRE rule characters |
|
|
// 'xy' part is checked as usual in following cycles of PRE rule characters |
|
|
if (*p == *rule) |
|
|
if (*p == *rule) |
|
|
pre_ptr = p2; |
|
|
pre_ptr = p2; |
|
|
if (g_bytes > 0) |
|
|
|
|
|
|
|
|
if (g_bytes >= 0) |
|
|
pre_ptr = p2 + 1; |
|
|
pre_ptr = p2 + 1; |
|
|
|
|
|
|
|
|
} |
|
|
} |