src/speak-ng.1.html \ | src/speak-ng.1.html \ | ||||
README.html | README.html | ||||
check: tests/encoding.test | |||||
tests/encoding.test | |||||
##### build targets: | ##### build targets: | ||||
espeak_includedir = $(includedir)/espeak | espeak_includedir = $(includedir)/espeak | ||||
src/libespeak-ng/synthdata.c \ | src/libespeak-ng/synthdata.c \ | ||||
src/libespeak-ng/synthesize.c \ | src/libespeak-ng/synthesize.c \ | ||||
src/libespeak-ng/synth_mbrola.c \ | src/libespeak-ng/synth_mbrola.c \ | ||||
src/libespeak-ng/tokenizer.c \ | |||||
src/libespeak-ng/translate.c \ | src/libespeak-ng/translate.c \ | ||||
src/libespeak-ng/tr_languages.c \ | src/libespeak-ng/tr_languages.c \ | ||||
src/libespeak-ng/voices.c \ | src/libespeak-ng/voices.c \ | ||||
src/libespeak-ng/mnemonics.c \ | src/libespeak-ng/mnemonics.c \ | ||||
tests/encoding.c | tests/encoding.c | ||||
noinst_bin_PROGRAMS += tests/tokenizer.test | |||||
tests_tokenizer_test_CFLAGS = \ | |||||
-Isrc/libespeak-ng -Isrc/include -Isrc/include/compat -I src/ucd-tools/src/include \ | |||||
-D _POSIX_C_SOURCE=200112L ${AM_CFLAGS} | |||||
tests_tokenizer_test_SOURCES = \ | |||||
src/libespeak-ng/tokenizer.c \ | |||||
tests/tokenizer.c | |||||
check: tests/encoding.test tests/tokenizer.test | |||||
tests/encoding.test | |||||
tests/tokenizer.test | |||||
##### phoneme data: | ##### phoneme data: | ||||
espeak-ng-data/phondata: phsource/phonemes.stamp | espeak-ng-data/phondata: phsource/phonemes.stamp |
src/libespeak-ng/synthdata.c \ | src/libespeak-ng/synthdata.c \ | ||||
src/libespeak-ng/synthesize.c \ | src/libespeak-ng/synthesize.c \ | ||||
src/libespeak-ng/synth_mbrola.c \ | src/libespeak-ng/synth_mbrola.c \ | ||||
src/libespeak-ng/tokenizer.c \ | |||||
src/libespeak-ng/translate.c \ | src/libespeak-ng/translate.c \ | ||||
src/libespeak-ng/tr_languages.c \ | src/libespeak-ng/tr_languages.c \ | ||||
src/libespeak-ng/voices.c \ | src/libespeak-ng/voices.c \ |
static int sayas_start; | static int sayas_start; | ||||
static int ssml_ignore_l_angle = 0; | static int ssml_ignore_l_angle = 0; | ||||
// Punctuation symbols that can end a clause.
//
// Each entry is a codepoint; the list is terminated by a 0 entry. The
// position of a codepoint in this list selects the matching entry in
// punct_attributes.
static const unsigned short punct_chars[] = {
	',', '.', '?', '!', ':', ';',

	0x00a1, // inverted exclamation
	0x00bf, // inverted question
	0x2013, // en-dash
	0x2014, // em-dash
	0x2026, // ellipsis

	0x037e, // Greek question mark (looks like semicolon)
	0x0387, // Greek semicolon, ano teleia
	0x0964, // Devanagari Danda (fullstop)

	0x0589, // Armenian period
	0x055d, // Armenian comma
	0x055c, // Armenian exclamation
	0x055e, // Armenian question
	0x055b, // Armenian emphasis mark

	0x060c, // Arabic ,
	0x061b, // Arabic ;
	0x061f, // Arabic ?
	0x06d4, // Arabic .

	0x0df4, // Singhalese Kunddaliya
	0x0f0d, // Tibet Shad
	0x0f0e, // Tibet (treated as a paragraph break)

	0x1362, // Ethiopic period
	0x1363, // Ethiopic comma
	0x1364, // Ethiopic semicolon
	0x1365, // Ethiopic colon
	0x1366, // Ethiopic preface colon
	0x1367, // Ethiopic question mark
	0x1368, // Ethiopic paragraph
	0x10fb, // Georgian paragraph

	0x3001, // ideograph comma
	0x3002, // ideograph period

	0xff01, // fullwidth exclamation
	0xff0c, // fullwidth comma
	0xff0e, // fullwidth period
	0xff1a, // fullwidth colon
	0xff1b, // fullwidth semicolon
	0xff1f, // fullwidth question mark

	0 // end of list
};
// indexed by (entry num. in punct_chars) + 1 | |||||
// bits 0-7 pause x 10mS, bits 12-14 intonation type, bit 15 don't need following space or bracket | |||||
static const unsigned int punct_attributes[] = { | |||||
0, | |||||
CLAUSE_COMMA, | |||||
CLAUSE_PERIOD, | |||||
CLAUSE_QUESTION, | |||||
CLAUSE_EXCLAMATION, | |||||
CLAUSE_COLON, | |||||
CLAUSE_SEMICOLON, | |||||
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation | |||||
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question | |||||
CLAUSE_SEMICOLON, // en-dash | |||||
CLAUSE_SEMICOLON, // em-dash | |||||
CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // elipsis | |||||
CLAUSE_QUESTION, // Greek question mark | |||||
CLAUSE_SEMICOLON, // Greek semicolon | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop) | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period | |||||
CLAUSE_COMMA, // Armenian comma | |||||
CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation | |||||
CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question | |||||
CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark | |||||
CLAUSE_COMMA, // Arabic , | |||||
CLAUSE_SEMICOLON, // Arabic ; | |||||
CLAUSE_QUESTION, // Arabic question mark | |||||
CLAUSE_PERIOD, // Arabic full stop | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period | |||||
CLAUSE_PARAGRAPH, | |||||
CLAUSE_PERIOD, // Ethiopic period | |||||
CLAUSE_COMMA, // Ethiopic comma | |||||
CLAUSE_SEMICOLON, // Ethiopic semicolon | |||||
CLAUSE_COLON, // Ethiopic colon | |||||
CLAUSE_COLON, // Ethiopic preface colon | |||||
CLAUSE_QUESTION, // Ethiopic question mark | |||||
CLAUSE_PARAGRAPH, // Ethiopic paragraph | |||||
CLAUSE_PARAGRAPH, // Georgian paragraph | |||||
CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period | |||||
CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth | |||||
CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_SEMICOLON, // spare | |||||
0 | |||||
}; | |||||
// stack for language and voice properties | // stack for language and voice properties | ||||
// frame 0 is for the defaults, before any ssml tags. | // frame 0 is for the defaults, before any ssml tags. | ||||
typedef struct { | typedef struct { | ||||
if (c1 == '-') | if (c1 == '-') | ||||
return CLAUSE_NONE; // no pause | return CLAUSE_NONE; // no pause | ||||
attributes = punct_attributes[lookupwchar(punct_chars, c1)]; | |||||
attributes = clause_type_from_codepoint(c1); | |||||
short_pause = CLAUSE_SHORTFALL; | short_pause = CLAUSE_SHORTFALL; | ||||
if ((attributes & CLAUSE_INTONATION_TYPE) == 0x1000) | if ((attributes & CLAUSE_INTONATION_TYPE) == 0x1000) | ||||
int phoneme_mode = 0; | int phoneme_mode = 0; | ||||
int n_xml_buf; | int n_xml_buf; | ||||
int terminator; | int terminator; | ||||
int punct; | |||||
int found; | int found; | ||||
int any_alnum = 0; | int any_alnum = 0; | ||||
int self_closing; | int self_closing; | ||||
if ((c2 == '\n') && (option_linelength == -1)) { | if ((c2 == '\n') && (option_linelength == -1)) { | ||||
// single-line mode, return immediately on NL | // single-line mode, return immediately on NL | ||||
if ((punct = lookupwchar(punct_chars, c1)) == 0) { | |||||
if ((terminator = clause_type_from_codepoint(c1)) == CLAUSE_NONE) { | |||||
charix[ix] = count_characters - clause_start_char; | charix[ix] = count_characters - clause_start_char; | ||||
*charix_top = ix; | *charix_top = ix; | ||||
ix += utf8_out(c1, &buf[ix]); | ix += utf8_out(c1, &buf[ix]); | ||||
terminator = CLAUSE_PERIOD; // line doesn't end in punctuation, assume period | terminator = CLAUSE_PERIOD; // line doesn't end in punctuation, assume period | ||||
} else | |||||
terminator = punct_attributes[punct]; | |||||
} | |||||
buf[ix] = ' '; | buf[ix] = ' '; | ||||
buf[ix+1] = 0; | buf[ix+1] = 0; | ||||
return terminator; | return terminator; | ||||
} | } | ||||
punct_data = 0; | punct_data = 0; | ||||
if ((punct = lookupwchar(punct_chars, c1)) != 0) { | |||||
punct_data = punct_attributes[punct]; | |||||
if ((punct_data = clause_type_from_codepoint(c1)) != CLAUSE_NONE) { | |||||
if (punct_data & CLAUSE_PUNCTUATION_IN_WORD) { | if (punct_data & CLAUSE_PUNCTUATION_IN_WORD) { | ||||
// Armenian punctuation inside a word | // Armenian punctuation inside a word | ||||
stressed_word = 1; | stressed_word = 1; |
/* Tokenizer APIs. | |||||
* | |||||
* Copyright (C) 2005 to 2015 by Jonathan Duddington | |||||
* email: [email protected] | |||||
* Copyright (C) 2017 Reece H. Dunn | |||||
* | |||||
* This program is free software; you can redistribute it and/or modify | |||||
* it under the terms of the GNU General Public License as published by | |||||
* the Free Software Foundation; either version 3 of the License, or | |||||
* (at your option) any later version. | |||||
* | |||||
* This program is distributed in the hope that it will be useful, | |||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
* GNU General Public License for more details. | |||||
* | |||||
* You should have received a copy of the GNU General Public License | |||||
* along with this program; if not, see: <http://www.gnu.org/licenses/>. | |||||
*/ | |||||
#include "config.h" | |||||
#include <errno.h> | |||||
#include <stdint.h> | |||||
#include <stdio.h> | |||||
#include <stdlib.h> | |||||
#include <string.h> | |||||
#include <espeak-ng/espeak_ng.h> | |||||
#include "encoding.h" | |||||
#include "speech.h" | |||||
#include "phoneme.h" | |||||
#include "synthesize.h" | |||||
#include "translate.h" | |||||
// Punctuation symbols that can end a clause.
//
// Each entry is a codepoint; the list is terminated by a 0 entry. The
// position of a codepoint in this list is the index of its entry in
// punct_attributes.
static const unsigned short punct_chars[] = {
	',', '.', '?', '!', ':', ';',

	0x00a1, // inverted exclamation
	0x00bf, // inverted question
	0x2013, // en-dash
	0x2014, // em-dash
	0x2026, // ellipsis

	0x037e, // Greek question mark (looks like semicolon)
	0x0387, // Greek semicolon, ano teleia
	0x0964, // Devanagari Danda (fullstop)

	0x0589, // Armenian period
	0x055d, // Armenian comma
	0x055c, // Armenian exclamation
	0x055e, // Armenian question
	0x055b, // Armenian emphasis mark

	0x060c, // Arabic ,
	0x061b, // Arabic ;
	0x061f, // Arabic ?
	0x06d4, // Arabic .

	0x0df4, // Singhalese Kunddaliya
	0x0f0d, // Tibet Shad
	0x0f0e, // Tibet (treated as a paragraph break)

	0x1362, // Ethiopic period
	0x1363, // Ethiopic comma
	0x1364, // Ethiopic semicolon
	0x1365, // Ethiopic colon
	0x1366, // Ethiopic preface colon
	0x1367, // Ethiopic question mark
	0x1368, // Ethiopic paragraph
	0x10fb, // Georgian paragraph

	0x3001, // ideograph comma
	0x3002, // ideograph period

	0xff01, // fullwidth exclamation
	0xff0c, // fullwidth comma
	0xff0e, // fullwidth period
	0xff1a, // fullwidth colon
	0xff1b, // fullwidth semicolon
	0xff1f, // fullwidth question mark

	0 // end of list
};
// indexed by entry num. in punct_chars | |||||
static const unsigned int punct_attributes[] = { | |||||
CLAUSE_COMMA, | |||||
CLAUSE_PERIOD, | |||||
CLAUSE_QUESTION, | |||||
CLAUSE_EXCLAMATION, | |||||
CLAUSE_COLON, | |||||
CLAUSE_SEMICOLON, | |||||
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation | |||||
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question | |||||
CLAUSE_SEMICOLON, // en-dash | |||||
CLAUSE_SEMICOLON, // em-dash | |||||
CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // elipsis | |||||
CLAUSE_QUESTION, // Greek question mark | |||||
CLAUSE_SEMICOLON, // Greek semicolon | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop) | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period | |||||
CLAUSE_COMMA, // Armenian comma | |||||
CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation | |||||
CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question | |||||
CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark | |||||
CLAUSE_COMMA, // Arabic , | |||||
CLAUSE_SEMICOLON, // Arabic ; | |||||
CLAUSE_QUESTION, // Arabic question mark | |||||
CLAUSE_PERIOD, // Arabic full stop | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period | |||||
CLAUSE_PARAGRAPH, | |||||
CLAUSE_PERIOD, // Ethiopic period | |||||
CLAUSE_COMMA, // Ethiopic comma | |||||
CLAUSE_SEMICOLON, // Ethiopic semicolon | |||||
CLAUSE_COLON, // Ethiopic colon | |||||
CLAUSE_COLON, // Ethiopic preface colon | |||||
CLAUSE_QUESTION, // Ethiopic question mark | |||||
CLAUSE_PARAGRAPH, // Ethiopic paragraph | |||||
CLAUSE_PARAGRAPH, // Georgian paragraph | |||||
CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period | |||||
CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth | |||||
CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
0 | |||||
}; | |||||
int clause_type_from_codepoint(uint32_t c) | |||||
{ | |||||
for (int ix = 0; punct_chars[ix] != 0; ++ix) { | |||||
if (punct_chars[ix] == c) | |||||
return punct_attributes[ix]; | |||||
} | |||||
return CLAUSE_NONE; | |||||
} |
#define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE) | #define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE) | ||||
#define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE) | #define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE) | ||||
int clause_type_from_codepoint(uint32_t c); | |||||
//@} | //@} | ||||
#define SAYAS_CHARS 0x12 | #define SAYAS_CHARS 0x12 |
<ClCompile Include="..\libespeak-ng\synthdata.c" /> | <ClCompile Include="..\libespeak-ng\synthdata.c" /> | ||||
<ClCompile Include="..\libespeak-ng\synthesize.c" /> | <ClCompile Include="..\libespeak-ng\synthesize.c" /> | ||||
<ClCompile Include="..\libespeak-ng\synth_mbrola.c" /> | <ClCompile Include="..\libespeak-ng\synth_mbrola.c" /> | ||||
<ClCompile Include="..\libespeak-ng\tokenizer.c" /> | |||||
<ClCompile Include="..\libespeak-ng\translate.c" /> | <ClCompile Include="..\libespeak-ng\translate.c" /> | ||||
<ClCompile Include="..\libespeak-ng\tr_languages.c" /> | <ClCompile Include="..\libespeak-ng\tr_languages.c" /> | ||||
<ClCompile Include="..\libespeak-ng\voices.c" /> | <ClCompile Include="..\libespeak-ng\voices.c" /> |
/* | |||||
* Copyright (C) 2017 Reece H. Dunn | |||||
* | |||||
* This program is free software; you can redistribute it and/or modify | |||||
* it under the terms of the GNU General Public License as published by | |||||
* the Free Software Foundation; either version 3 of the License, or | |||||
* (at your option) any later version. | |||||
* | |||||
* This program is distributed in the hope that it will be useful, | |||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
* GNU General Public License for more details. | |||||
* | |||||
* You should have received a copy of the GNU General Public License | |||||
* along with this program; if not, see: | |||||
* <http://www.gnu.org/licenses/>. | |||||
*/ | |||||
#include "config.h" | |||||
#include <assert.h> | |||||
#include <stdint.h> | |||||
#include <stdlib.h> | |||||
#include <stdio.h> | |||||
#include <espeak-ng/espeak_ng.h> | |||||
#include "encoding.h" | |||||
#include "speech.h" | |||||
#include "phoneme.h" | |||||
#include "synthesize.h" | |||||
#include "translate.h" | |||||
void | |||||
test_latin_common() | |||||
{ | |||||
printf("testing Latin/Common (Latn/Zyyy) script classification\n"); | |||||
assert(clause_type_from_codepoint('a') == CLAUSE_NONE); | |||||
assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD); | |||||
assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION); | |||||
assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION); | |||||
assert(clause_type_from_codepoint(',') == CLAUSE_COMMA); | |||||
assert(clause_type_from_codepoint(':') == CLAUSE_COLON); | |||||
assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON); | |||||
assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0x00Bf) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON); | |||||
assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON); | |||||
assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
} | |||||
void | |||||
test_greek() | |||||
{ | |||||
printf("testing Greek (Grek) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION); | |||||
assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON); | |||||
} | |||||
void | |||||
test_armenian() | |||||
{ | |||||
printf("testing Armenian (Armn) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x055B) == (CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD)); | |||||
assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD)); | |||||
assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA); | |||||
assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD)); | |||||
assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
} | |||||
void | |||||
test_arabic() | |||||
{ | |||||
printf("testing Arabic (Arab) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA); | |||||
assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON); | |||||
assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION); | |||||
assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD); | |||||
} | |||||
void | |||||
test_devanagari() | |||||
{ | |||||
printf("testing Devanagari (Deva) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
} | |||||
void | |||||
test_tibetan() | |||||
{ | |||||
printf("testing Tibetan (Tibt) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH); | |||||
} | |||||
void | |||||
test_sinhala() | |||||
{ | |||||
printf("testing Sinhala (Sinh) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
} | |||||
void | |||||
test_georgian() | |||||
{ | |||||
printf("testing Georgian (Geor) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH); | |||||
} | |||||
void | |||||
test_ethiopic() | |||||
{ | |||||
printf("testing Ethiopic (Ethi) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD); | |||||
assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA); | |||||
assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON); | |||||
assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON); | |||||
assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON); | |||||
assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION); | |||||
assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH); | |||||
} | |||||
void | |||||
test_ideographic() | |||||
{ | |||||
printf("testing Ideographic (Hani) script classification\n"); | |||||
assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
} | |||||
void | |||||
test_fullwidth() | |||||
{ | |||||
printf("testing Full Width/Common (Zyyy) script classification\n"); | |||||
assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
} | |||||
// Test driver: runs every per-script test in turn. A failing assert aborts
// the program, so reaching the end means all checks passed.
int
main(int argc, char **argv)
{
	// command-line arguments are not used
	(void)argc;
	(void)argv;

	test_latin_common();
	test_greek();
	test_armenian();
	test_arabic();
	test_devanagari();
	test_tibetan();
	test_sinhala();
	test_georgian();
	test_ethiopic();
	test_ideographic();
	test_fullwidth();

	printf("done\n");

	return EXIT_SUCCESS;
}