
tokenizer.c: Support general punctuation tokens.

Branch: master
Author: Reece H. Dunn (8 years ago)
Parent commit: 786575c6ed
3 changed files with 90 additions and 5 deletions:

  1. src/include/espeak-ng/tokenizer.h (+1, -0)
  2. src/libespeak-ng/tokenizer.c (+12, -0)
  3. tests/tokenizer.c (+77, -5)

src/include/espeak-ng/tokenizer.h (+1, -0)

     ESPEAKNG_TOKEN_COLON,
     ESPEAKNG_TOKEN_SEMICOLON,
     ESPEAKNG_TOKEN_ELLIPSIS,
+    ESPEAKNG_TOKEN_PUNCTUATION,
 } espeak_ng_TOKEN_TYPE;

 ESPEAK_NG_API espeak_ng_TOKEN_TYPE
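
The header change adds ESPEAKNG_TOKEN_PUNCTUATION to the espeak_ng_TOKEN_TYPE enum, so consumers of the tokenizer can dispatch on it like any other token type. A minimal consumer sketch, assuming the in-tree src/include/espeak-ng/tokenizer.h header; speak_punctuation is a hypothetical callback, not part of this commit:

#include <espeak-ng/tokenizer.h>

extern void speak_punctuation(const char *text); // hypothetical callback

static void handle_token(espeak_ng_TOKENIZER *tokenizer, espeak_ng_TOKEN_TYPE type)
{
    switch (type)
    {
    case ESPEAKNG_TOKEN_PUNCTUATION:
        // The token text holds the punctuation character itself,
        // e.g. "(" or the UTF-8 bytes of U+00AB.
        speak_punctuation(tokenizer_get_token_text(tokenizer));
        break;
    default:
        break;
    }
}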

src/libespeak-ng/tokenizer.c (+12, -0)

     ESPEAKNG_CTYPE_COLON,
     ESPEAKNG_CTYPE_SEMICOLON,
     ESPEAKNG_CTYPE_ELLIPSIS,
+    ESPEAKNG_CTYPE_PUNCTUATION,
 } espeakng_CTYPE;

 #define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFE0000000000C001ull
 {
     case UCD_CATEGORY_Lu: return ESPEAKNG_CTYPE_UPPERCASE;
     case UCD_CATEGORY_Ll: return ESPEAKNG_CTYPE_LOWERCASE;
+    case UCD_CATEGORY_Pc: return ESPEAKNG_CTYPE_PUNCTUATION;
+    case UCD_CATEGORY_Pd: return ESPEAKNG_CTYPE_PUNCTUATION;
+    case UCD_CATEGORY_Pe: return ESPEAKNG_CTYPE_PUNCTUATION;
+    case UCD_CATEGORY_Pf: return ESPEAKNG_CTYPE_PUNCTUATION;
+    case UCD_CATEGORY_Pi: return ESPEAKNG_CTYPE_PUNCTUATION;
+    case UCD_CATEGORY_Po: return ESPEAKNG_CTYPE_PUNCTUATION;
+    case UCD_CATEGORY_Ps: return ESPEAKNG_CTYPE_PUNCTUATION;
     }

     // 5. Classify the remaining codepoints.
         current += utf8_out(c, current);
         *current = '\0';
         return ESPEAKNG_TOKEN_ELLIPSIS;
+    case ESPEAKNG_CTYPE_PUNCTUATION:
+        current += utf8_out(c, current);
+        *current = '\0';
+        return ESPEAKNG_TOKEN_PUNCTUATION;
     default:
         current += utf8_out(c, current);
         *current = '\0';
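
The seven new cases cover the complete set of Unicode General_Category punctuation classes: Pc (connector), Pd (dash), Ps (open), Pe (close), Pi (initial quote), Pf (final quote), and Po (other). The same classification as a standalone sketch, assuming the ucd_lookup_category API from the bundled ucd-tools library (the helper itself is illustrative, not part of this commit):

#include <ucd/ucd.h>

// Returns non-zero when c belongs to any Unicode punctuation category,
// mirroring the switch cases added above.
static int is_general_punctuation(codepoint_t c)
{
    switch (ucd_lookup_category(c))
    {
    case UCD_CATEGORY_Pc: // connector, e.g. '_' U+005F
    case UCD_CATEGORY_Pd: // dash, e.g. '-' U+002D
    case UCD_CATEGORY_Ps: // open, e.g. '(' U+0028
    case UCD_CATEGORY_Pe: // close, e.g. ')' U+0029
    case UCD_CATEGORY_Pi: // initial quote, e.g. U+00AB
    case UCD_CATEGORY_Pf: // final quote, e.g. U+00BB
    case UCD_CATEGORY_Po: // other, e.g. '"' U+0022
        return 1;
    default:
        return 0;
    }
}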

tests/tokenizer.c (+77, -5)

     assert(text_decoder_decode_string(decoder, "\x0C\x0C\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
     assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

-    // U+000C : FORM FEED (FF) -- Used as a page (not paragraph) break.
+    // FORM FEED (FF) -- Used as a page (not paragraph) break.
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

-    // U+0085 : NEXT LINE (NEL) -- Used in EBCDIC systems as a combined CR+LF character.
+    // NEXT LINE (NEL) [U+0085] -- Used in EBCDIC systems as a combined CR+LF character.
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);

-    // General Category: Zl -- LINE SEPARATOR
+    // General Category: Zl -- LINE SEPARATOR [U+2028]
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);
     assert(text_decoder_decode_string(decoder, "\xE2\x80\xA9\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
     assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

-    // General Category: Zp -- PARAGRAPH SEPARATOR
+    // General Category: Zp, PARAGRAPH SEPARATOR [U+2029]
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

-    // General Category: Zs, Property: White_Space, Decomposition: <noBreak>
+    // General Category: Zs, Property: White_Space, Decomposition: <noBreak>, NO-BREAK SPACE [U+00A0]
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

+    // HORIZONTAL ELLIPSIS [U+2026]
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA6") == 0);
     destroy_tokenizer(tokenizer);
 }

+void
+test_Latn_general_punctuation_tokens()
+{
+    printf("testing Latin (Latn) script general punctuation tokens\n");
+
+    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
+    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
+
+    assert(text_decoder_decode_string(decoder, "\" () - _ \xC2\xAB\xC2\xBB", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
+    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
+
+    // General Category: Po
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "\"") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
+    // General Category: Ps
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "(") == 0);
+
+    // General Category: Pe
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), ")") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
+    // General Category: Pd
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "-") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
+    // General Category: Pc
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "_") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
+    // General Category: Pi, LEFT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00AB]
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xAB") == 0);
+
+    // General Category: Pf, RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00BB]
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xBB") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(*tokenizer_get_token_text(tokenizer) == '\0');
+
+    destroy_text_decoder(decoder);
+    destroy_tokenizer(tokenizer);
+}

 void
 run_tests()
 {

     test_Latn_word_tokens();
     test_Latn_punctuation_tokens();
+    test_Latn_general_punctuation_tokens();

     printf("done\n");
 }
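
The tests above double as usage documentation. A minimal driver for the new token type, using only calls exercised by the tests (the #include paths are assumptions based on this repository's layout; error handling is elided, and printing the enum as an int is purely illustrative):

#include <assert.h>
#include <stdio.h>

#include <espeak-ng/encoding.h>
#include <espeak-ng/tokenizer.h>

int main(void)
{
    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    // Tokenize a short string containing Pd, Ps and Pe punctuation.
    assert(text_decoder_decode_string(decoder, "- (hello)", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    espeak_ng_TOKEN_TYPE type;
    while ((type = tokenizer_read_next_token(tokenizer)) != ESPEAKNG_TOKEN_END_OF_BUFFER)
        printf("%d: \"%s\"\n", (int)type, tokenizer_get_token_text(tokenizer));

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
    return 0;
}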
