Browse Source

Create a basic tokenizer API using a structure that mirrors the TtsTokenizer interface in the tts-dev-studio project.

master
Reece H. Dunn 8 years ago
parent
commit
bce44316bb
3 changed files with 124 additions and 0 deletions
  1. 44
    0
      src/libespeak-ng/tokenizer.c
  2. 53
    0
      src/libespeak-ng/tokenizer.h
  3. 27
    0
      tests/tokenizer.c

+ 44
- 0
src/libespeak-ng/tokenizer.c View File

@@ -28,6 +28,7 @@
#include <ucd/ucd.h>

#include "encoding.h"
#include "tokenizer.h"
#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
@@ -81,3 +82,46 @@ int clause_type_from_codepoint(uint32_t c)

return CLAUSE_NONE;
}

/*
 * Private state behind the espeak_ng_TOKENIZER handle.
 */
struct espeak_ng_TOKENIZER_
{
/* Decoder bound by tokenizer_reset; NULL on a freshly created tokenizer. */
espeak_ng_TEXT_DECODER *decoder;
};

/*
 * Allocate a tokenizer that is not yet bound to any text decoder.
 *
 * Returns the new tokenizer, or NULL when the allocation fails. The result
 * is released with destroy_tokenizer.
 */
espeak_ng_TOKENIZER *
create_tokenizer(void)
{
	espeak_ng_TOKENIZER *self = malloc(sizeof *self);

	if (self != NULL)
		self->decoder = NULL; /* unbound until tokenizer_reset succeeds */

	return self;
}

/*
 * Release a tokenizer obtained from create_tokenizer.
 *
 * Passing NULL is allowed and does nothing. Only the tokenizer object itself
 * is freed; the decoder it points to is left untouched.
 */
void
destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer)
{
	/* free(NULL) is a defined no-op, so no NULL guard is needed. */
	free(tokenizer);
}

/*
 * Bind a text decoder to the tokenizer.
 *
 * Returns 1 when the decoder was stored, or 0 when either argument is NULL.
 * On failure the tokenizer keeps whatever decoder it had before the call.
 */
int
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
                espeak_ng_TEXT_DECODER *decoder)
{
	if (tokenizer != NULL && decoder != NULL) {
		tokenizer->decoder = decoder;
		return 1;
	}

	return 0;
}

/*
 * Advance to the next token and report its type.
 *
 * Placeholder implementation: the tokenizer argument is not consulted and
 * the result is always ESPEAKNG_TOKEN_END_OF_BUFFER.
 */
espeak_ng_TOKEN_TYPE
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
{
	(void)tokenizer; /* not used by the placeholder */

	return ESPEAKNG_TOKEN_END_OF_BUFFER;
}

/*
 * Return the text of the current token.
 *
 * Placeholder implementation: the tokenizer argument is not consulted and
 * the result is always an empty, NUL-terminated string (never NULL).
 */
const char *
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
{
	(void)tokenizer; /* not used by the placeholder */

	return "";
}

+ 53
- 0
src/libespeak-ng/tokenizer.h View File

@@ -0,0 +1,53 @@
/*
* Copyright (C) 2017 Reece H. Dunn
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see: <http://www.gnu.org/licenses/>.
*/
#ifndef ESPEAK_NG_TOKENIZER_H
#define ESPEAK_NG_TOKENIZER_H

#ifdef __cplusplus
extern "C"
{
#endif

/* Opaque tokenizer handle; the structure layout is private to tokenizer.c. */
typedef struct espeak_ng_TOKENIZER_ espeak_ng_TOKENIZER;

/*
 * Allocate a new tokenizer with no text decoder bound to it.
 *
 * Returns the tokenizer, or NULL if memory could not be allocated.
 * Release the result with destroy_tokenizer.
 */
espeak_ng_TOKENIZER *
create_tokenizer(void);

/*
 * Free a tokenizer returned by create_tokenizer. A NULL argument is ignored.
 */
void
destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer);

/*
 * Bind `decoder` to `tokenizer`.
 *
 * Returns 1 on success, or 0 if either argument is NULL (in which case the
 * tokenizer is left unchanged).
 *
 * NOTE(review): espeak_ng_TEXT_DECODER is not declared by this header; the
 * visible includers pull in "encoding.h" before it, presumably for that
 * type -- confirm.
 */
int
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
espeak_ng_TEXT_DECODER *decoder);

/* Token classifications reported by tokenizer_read_next_token. */
typedef enum
{
/* Reported by tokenizer_read_next_token; presumably means no input remains -- confirm. */
ESPEAKNG_TOKEN_END_OF_BUFFER,
/* NOTE(review): presumably a token that matches no known category -- confirm. */
ESPEAKNG_TOKEN_UNKNOWN,
} espeak_ng_TOKEN_TYPE;

/*
 * Read the next token and return its type.
 *
 * The implementation in tokenizer.c is currently a stub that always returns
 * ESPEAKNG_TOKEN_END_OF_BUFFER.
 */
espeak_ng_TOKEN_TYPE
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer);

/*
 * Return the text of the current token as a NUL-terminated string.
 *
 * The implementation in tokenizer.c is currently a stub that always returns
 * an empty string (never NULL).
 */
const char *
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer);

#ifdef __cplusplus
}
#endif

#endif

+ 27
- 0
tests/tokenizer.c View File

@@ -26,6 +26,7 @@
#include <espeak-ng/espeak_ng.h>

#include "encoding.h"
#include "tokenizer.h"
#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
@@ -153,6 +154,30 @@ test_fullwidth()
assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER));
}

/*
 * Exercise a tokenizer that has never been bound to a decoder: it must
 * report end-of-buffer with empty token text, and must reject a NULL decoder.
 */
void
test_unbound_tokenizer()
{
	printf("testing unbound tokenizer\n");

	espeak_ng_TOKENIZER *tok = create_tokenizer();
	assert(tok != NULL);

	/* Freshly created: token text is the empty string. */
	assert(tokenizer_get_token_text(tok) != NULL);
	assert(*tokenizer_get_token_text(tok) == '\0');

	/* Reading without a decoder yields end-of-buffer and no text. */
	assert(tokenizer_read_next_token(tok) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tok) != NULL);
	assert(*tokenizer_get_token_text(tok) == '\0');

	/* Binding a NULL decoder is refused ... */
	assert(tokenizer_reset(tok, NULL) == 0);

	/* ... and leaves the tokenizer in the same unbound state. */
	assert(tokenizer_read_next_token(tok) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tok) != NULL);
	assert(*tokenizer_get_token_text(tok) == '\0');

	destroy_tokenizer(tok);
}

int
main(int argc, char **argv)
{
@@ -168,6 +193,8 @@ main(int argc, char **argv)
test_ideographic();
test_fullwidth();

test_unbound_tokenizer();

printf("done\n");

return EXIT_SUCCESS;

Loading…
Cancel
Save