Browse Source

tokenizer.c: Support full stop tokens.

master
Reece H. Dunn 8 years ago
parent
commit
8f62e18324
3 changed files with 36 additions and 1 deletion
  1. 8
    1
      src/libespeak-ng/tokenizer.c
  2. 1
    0
      src/libespeak-ng/tokenizer.h
  3. 27
    0
      tests/tokenizer.c

+ 8
- 1
src/libespeak-ng/tokenizer.c View File

ESPEAKNG_CTYPE_WHITESPACE, ESPEAKNG_CTYPE_WHITESPACE,
ESPEAKNG_CTYPE_LOWERCASE, ESPEAKNG_CTYPE_LOWERCASE,
ESPEAKNG_CTYPE_UPPERCASE, ESPEAKNG_CTYPE_UPPERCASE,
ESPEAKNG_CTYPE_FULL_STOP,
} espeakng_CTYPE; } espeakng_CTYPE;


#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x000000000000C001ull
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x800000000000C001ull


// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
static espeakng_CTYPE codepoint_type(uint32_t c) static espeakng_CTYPE codepoint_type(uint32_t c)
return ESPEAKNG_CTYPE_LOWERCASE; return ESPEAKNG_CTYPE_LOWERCASE;
case UCD_PROPERTY_OTHER_UPPERCASE: case UCD_PROPERTY_OTHER_UPPERCASE:
return ESPEAKNG_CTYPE_UPPERCASE; return ESPEAKNG_CTYPE_UPPERCASE;
case ESPEAKNG_PROPERTY_FULL_STOP:
return ESPEAKNG_CTYPE_FULL_STOP;
} }


// 4. Classify the remaining codepoints. // 4. Classify the remaining codepoints.
case ESPEAKNG_CTYPE_UPPERCASE: case ESPEAKNG_CTYPE_UPPERCASE:
current += utf8_out(c, current); current += utf8_out(c, current);
return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE); return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
case ESPEAKNG_CTYPE_FULL_STOP:
current += utf8_out(c, current);
*current = '\0';
return ESPEAKNG_TOKEN_FULL_STOP;
default: default:
current += utf8_out(c, current); current += utf8_out(c, current);
*current = '\0'; *current = '\0';

+ 1
- 0
src/libespeak-ng/tokenizer.h View File

ESPEAKNG_TOKEN_WORD_LOWERCASE, ESPEAKNG_TOKEN_WORD_LOWERCASE,
ESPEAKNG_TOKEN_WORD_MIXEDCASE, ESPEAKNG_TOKEN_WORD_MIXEDCASE,
ESPEAKNG_TOKEN_WORD_CAPITALIZED, ESPEAKNG_TOKEN_WORD_CAPITALIZED,
ESPEAKNG_TOKEN_FULL_STOP,
} espeak_ng_TOKEN_TYPE; } espeak_ng_TOKEN_TYPE;


espeak_ng_TOKEN_TYPE espeak_ng_TOKEN_TYPE

+ 27
- 0
tests/tokenizer.c View File

destroy_tokenizer(tokenizer); destroy_tokenizer(tokenizer);
} }


// Test that the single-character input "." is tokenized as one full stop
// token whose text is ".", followed by an end-of-buffer token whose text is
// the empty string.
//
// NOTE(review): every call that drives the decoder/tokenizer is made inside
// assert(). If this file is ever compiled with NDEBUG defined, those calls
// are removed along with the checks, so the test must be built with
// assertions enabled.
void
test_Latn_punctuation_tokens()
{
// Progress banner so the test being run is visible in the output.
printf("testing Latin (Latn) script punctuation tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

// Load "." into the decoder as US-ASCII; a length of -1 presumably means
// "NUL-terminated string" -- TODO confirm against text_decoder_decode_string.
assert(text_decoder_decode_string(decoder, ".", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
// Attach the decoder to the tokenizer in text mode; 1 is the expected result here.
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

// First token: the full stop itself, with its text reproduced verbatim.
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

// Second token: input is exhausted, and the token text is an empty
// (but non-NULL) string.
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

// Release the objects created at the top of the test.
destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void void
run_tests() run_tests()
{ {
test_whitespace_tokens(); test_whitespace_tokens();


test_Latn_word_tokens(); test_Latn_word_tokens();
test_Latn_punctuation_tokens();


printf("done\n"); printf("done\n");
} }
case ESPEAKNG_TOKEN_WORD_CAPITALIZED: case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer)); printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer));
break; break;
case ESPEAKNG_TOKEN_FULL_STOP:
printf("full stop : %s\n", tokenizer_get_token_text(tokenizer));
break;
} }
} }



Loading…
Cancel
Save