| @@ -112,9 +112,6 @@ docs: docs/index.html \ | |||
| src/speak-ng.1.html \ | |||
| README.html | |||
| check: tests/encoding.test | |||
| tests/encoding.test | |||
| ##### build targets: | |||
| espeak_includedir = $(includedir)/espeak | |||
| @@ -163,6 +160,7 @@ src_libespeak_ng_la_SOURCES = \ | |||
| src/libespeak-ng/synthdata.c \ | |||
| src/libespeak-ng/synthesize.c \ | |||
| src/libespeak-ng/synth_mbrola.c \ | |||
| src/libespeak-ng/tokenizer.c \ | |||
| src/libespeak-ng/translate.c \ | |||
| src/libespeak-ng/tr_languages.c \ | |||
| src/libespeak-ng/voices.c \ | |||
| @@ -217,6 +215,19 @@ tests_encoding_test_SOURCES = \ | |||
| src/libespeak-ng/mnemonics.c \ | |||
| tests/encoding.c | |||
# Unit-test program for the tokenizer: compiles tokenizer.c together with its test driver.
| noinst_bin_PROGRAMS += tests/tokenizer.test | |||
| tests_tokenizer_test_CFLAGS = \ | |||
| -Isrc/libespeak-ng -Isrc/include -Isrc/include/compat -I src/ucd-tools/src/include \ | |||
| -D _POSIX_C_SOURCE=200112L ${AM_CFLAGS} | |||
| tests_tokenizer_test_SOURCES = \ | |||
| src/libespeak-ng/tokenizer.c \ | |||
| tests/tokenizer.c | |||
# The check target depends on both test programs and then runs each of them.
| check: tests/encoding.test tests/tokenizer.test | |||
| tests/encoding.test | |||
| tests/tokenizer.test | |||
| ##### phoneme data: | |||
| espeak-ng-data/phondata: phsource/phonemes.stamp | |||
| @@ -33,6 +33,7 @@ ESPEAK_SOURCES := \ | |||
| src/libespeak-ng/synthdata.c \ | |||
| src/libespeak-ng/synthesize.c \ | |||
| src/libespeak-ng/synth_mbrola.c \ | |||
| src/libespeak-ng/tokenizer.c \ | |||
| src/libespeak-ng/translate.c \ | |||
| src/libespeak-ng/tr_languages.c \ | |||
| src/libespeak-ng/voices.c \ | |||
| @@ -63,117 +63,6 @@ static int sayas_mode; | |||
| static int sayas_start; | |||
| static int ssml_ignore_l_angle = 0; | |||
| // punctuation symbols that can end a clause; the list ends with a 0 entry | |||
| static const unsigned short punct_chars[] = { | |||
| ',', '.', '?', '!', ':', ';', | |||
| 0x00a1, // inverted exclamation | |||
| 0x00bf, // inverted question | |||
| 0x2013, // en-dash | |||
| 0x2014, // em-dash | |||
| 0x2026, // ellipsis | |||
| 0x037e, // Greek question mark (looks like semicolon) | |||
| 0x0387, // Greek semicolon, ano teleia | |||
| 0x0964, // Devanagari Danda (fullstop) | |||
| 0x0589, // Armenian period | |||
| 0x055d, // Armenian comma | |||
| 0x055c, // Armenian exclamation | |||
| 0x055e, // Armenian question | |||
| 0x055b, // Armenian emphasis mark | |||
| 0x060c, // Arabic , | |||
| 0x061b, // Arabic ; | |||
| 0x061f, // Arabic ? | |||
| 0x06d4, // Arabic . | |||
| 0x0df4, // Singhalese Kunddaliya | |||
| 0x0f0d, // Tibet Shad | |||
| 0x0f0e, | |||
| 0x1362, // Ethiopic period | |||
| 0x1363, | |||
| 0x1364, | |||
| 0x1365, | |||
| 0x1366, | |||
| 0x1367, | |||
| 0x1368, | |||
| 0x10fb, // Georgian paragraph | |||
| 0x3001, // ideograph comma | |||
| 0x3002, // ideograph period | |||
| 0xff01, // fullwidth exclamation | |||
| 0xff0c, // fullwidth comma | |||
| 0xff0e, // fullwidth period | |||
| 0xff1a, // fullwidth colon | |||
| 0xff1b, // fullwidth semicolon | |||
| 0xff1f, // fullwidth question mark | |||
| 0 | |||
| }; | |||
| // indexed by (entry num. in punct_chars) + 1 | |||
| // bits 0-7 pause x 10mS, bits 12-14 intonation type, bit 15 don't need following space or bracket | |||
| static const unsigned int punct_attributes[] = { | |||
| 0, | |||
| CLAUSE_COMMA, | |||
| CLAUSE_PERIOD, | |||
| CLAUSE_QUESTION, | |||
| CLAUSE_EXCLAMATION, | |||
| CLAUSE_COLON, | |||
| CLAUSE_SEMICOLON, | |||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation | |||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question | |||
| CLAUSE_SEMICOLON, // en-dash | |||
| CLAUSE_SEMICOLON, // em-dash | |||
| CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // ellipsis | |||
| CLAUSE_QUESTION, // Greek question mark | |||
| CLAUSE_SEMICOLON, // Greek semicolon | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop) | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period | |||
| CLAUSE_COMMA, // Armenian comma | |||
| CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation | |||
| CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question | |||
| CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark | |||
| CLAUSE_COMMA, // Arabic , | |||
| CLAUSE_SEMICOLON, // Arabic ; | |||
| CLAUSE_QUESTION, // Arabic question mark | |||
| CLAUSE_PERIOD, // Arabic full stop | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period | |||
| CLAUSE_PARAGRAPH, | |||
| CLAUSE_PERIOD, // Ethiopic period | |||
| CLAUSE_COMMA, // Ethiopic comma | |||
| CLAUSE_SEMICOLON, // Ethiopic semicolon | |||
| CLAUSE_COLON, // Ethiopic colon | |||
| CLAUSE_COLON, // Ethiopic preface colon | |||
| CLAUSE_QUESTION, // Ethiopic question mark | |||
| CLAUSE_PARAGRAPH, // Ethiopic paragraph | |||
| CLAUSE_PARAGRAPH, // Georgian paragraph | |||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period | |||
| CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth | |||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_SEMICOLON, // spare | |||
| 0 | |||
| }; | |||
| // stack for language and voice properties | |||
| // frame 0 is for the defaults, before any ssml tags. | |||
| typedef struct { | |||
| @@ -604,7 +493,7 @@ static int AnnouncePunctuation(Translator *tr, int c1, int *c2_ptr, char *output | |||
| if (c1 == '-') | |||
| return CLAUSE_NONE; // no pause | |||
| attributes = punct_attributes[lookupwchar(punct_chars, c1)]; | |||
| attributes = clause_type_from_codepoint(c1); | |||
| short_pause = CLAUSE_SHORTFALL; | |||
| if ((attributes & CLAUSE_INTONATION_TYPE) == 0x1000) | |||
| @@ -1624,7 +1513,6 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_ | |||
| int phoneme_mode = 0; | |||
| int n_xml_buf; | |||
| int terminator; | |||
| int punct; | |||
| int found; | |||
| int any_alnum = 0; | |||
| int self_closing; | |||
| @@ -1797,13 +1685,12 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_ | |||
| if ((c2 == '\n') && (option_linelength == -1)) { | |||
| // single-line mode, return immediately on NL | |||
| if ((punct = lookupwchar(punct_chars, c1)) == 0) { | |||
| if ((terminator = clause_type_from_codepoint(c1)) == CLAUSE_NONE) { | |||
| charix[ix] = count_characters - clause_start_char; | |||
| *charix_top = ix; | |||
| ix += utf8_out(c1, &buf[ix]); | |||
| terminator = CLAUSE_PERIOD; // line doesn't end in punctuation, assume period | |||
| } else | |||
| terminator = punct_attributes[punct]; | |||
| } | |||
| buf[ix] = ' '; | |||
| buf[ix+1] = 0; | |||
| return terminator; | |||
| @@ -1990,9 +1877,7 @@ int ReadClause(Translator *tr, char *buf, short *charix, int *charix_top, int n_ | |||
| } | |||
| punct_data = 0; | |||
| if ((punct = lookupwchar(punct_chars, c1)) != 0) { | |||
| punct_data = punct_attributes[punct]; | |||
| if ((punct_data = clause_type_from_codepoint(c1)) != CLAUSE_NONE) { | |||
| if (punct_data & CLAUSE_PUNCTUATION_IN_WORD) { | |||
| // Armenian punctuation inside a word | |||
| stressed_word = 1; | |||
| @@ -0,0 +1,151 @@ | |||
| /* Tokenizer APIs. | |||
| * | |||
| * Copyright (C) 2005 to 2015 by Jonathan Duddington | |||
| * email: [email protected] | |||
| * Copyright (C) 2017 Reece H. Dunn | |||
| * | |||
| * This program is free software; you can redistribute it and/or modify | |||
| * it under the terms of the GNU General Public License as published by | |||
| * the Free Software Foundation; either version 3 of the License, or | |||
| * (at your option) any later version. | |||
| * | |||
| * This program is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| * GNU General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU General Public License | |||
| * along with this program; if not, see: <http://www.gnu.org/licenses/>. | |||
| */ | |||
| #include "config.h" | |||
| #include <errno.h> | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #include <string.h> | |||
| #include <espeak-ng/espeak_ng.h> | |||
| #include "encoding.h" | |||
| #include "speech.h" | |||
| #include "phoneme.h" | |||
| #include "synthesize.h" | |||
| #include "translate.h" | |||
| // punctuation symbols that can end a clause; the list ends with a 0 entry | |||
| static const unsigned short punct_chars[] = { | |||
| ',', '.', '?', '!', ':', ';', | |||
| 0x00a1, // inverted exclamation | |||
| 0x00bf, // inverted question | |||
| 0x2013, // en-dash | |||
| 0x2014, // em-dash | |||
| 0x2026, // ellipsis | |||
| 0x037e, // Greek question mark (looks like semicolon) | |||
| 0x0387, // Greek semicolon, ano teleia | |||
| 0x0964, // Devanagari Danda (fullstop) | |||
| 0x0589, // Armenian period | |||
| 0x055d, // Armenian comma | |||
| 0x055c, // Armenian exclamation | |||
| 0x055e, // Armenian question | |||
| 0x055b, // Armenian emphasis mark | |||
| 0x060c, // Arabic , | |||
| 0x061b, // Arabic ; | |||
| 0x061f, // Arabic ? | |||
| 0x06d4, // Arabic . | |||
| 0x0df4, // Singhalese Kunddaliya | |||
| 0x0f0d, // Tibet Shad | |||
| 0x0f0e, | |||
| 0x1362, // Ethiopic period | |||
| 0x1363, | |||
| 0x1364, | |||
| 0x1365, | |||
| 0x1366, | |||
| 0x1367, | |||
| 0x1368, | |||
| 0x10fb, // Georgian paragraph | |||
| 0x3001, // ideograph comma | |||
| 0x3002, // ideograph period | |||
| 0xff01, // fullwidth exclamation | |||
| 0xff0c, // fullwidth comma | |||
| 0xff0e, // fullwidth period | |||
| 0xff1a, // fullwidth colon | |||
| 0xff1b, // fullwidth semicolon | |||
| 0xff1f, // fullwidth question mark | |||
| 0 | |||
| }; | |||
| // clause attributes, indexed by entry number in punct_chars (same order, one value per symbol) | |||
| static const unsigned int punct_attributes[] = { | |||
| CLAUSE_COMMA, | |||
| CLAUSE_PERIOD, | |||
| CLAUSE_QUESTION, | |||
| CLAUSE_EXCLAMATION, | |||
| CLAUSE_COLON, | |||
| CLAUSE_SEMICOLON, | |||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation | |||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question | |||
| CLAUSE_SEMICOLON, // en-dash | |||
| CLAUSE_SEMICOLON, // em-dash | |||
| CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // ellipsis | |||
| CLAUSE_QUESTION, // Greek question mark | |||
| CLAUSE_SEMICOLON, // Greek semicolon | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop) | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period | |||
| CLAUSE_COMMA, // Armenian comma | |||
| CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation | |||
| CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question | |||
| CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark | |||
| CLAUSE_COMMA, // Arabic , | |||
| CLAUSE_SEMICOLON, // Arabic ; | |||
| CLAUSE_QUESTION, // Arabic question mark | |||
| CLAUSE_PERIOD, // Arabic full stop | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period | |||
| CLAUSE_PARAGRAPH, | |||
| CLAUSE_PERIOD, // Ethiopic period | |||
| CLAUSE_COMMA, // Ethiopic comma | |||
| CLAUSE_SEMICOLON, // Ethiopic semicolon | |||
| CLAUSE_COLON, // Ethiopic colon | |||
| CLAUSE_COLON, // Ethiopic preface colon | |||
| CLAUSE_QUESTION, // Ethiopic question mark | |||
| CLAUSE_PARAGRAPH, // Ethiopic paragraph | |||
| CLAUSE_PARAGRAPH, // Georgian paragraph | |||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period | |||
| CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth | |||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER, | |||
| 0 | |||
| }; | |||
| int clause_type_from_codepoint(uint32_t c) | |||
| { | |||
| for (int ix = 0; punct_chars[ix] != 0; ++ix) { | |||
| if (punct_chars[ix] == c) | |||
| return punct_attributes[ix]; | |||
| } | |||
| return CLAUSE_NONE; | |||
| } | |||
| @@ -235,6 +235,8 @@ extern "C" | |||
| #define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE) | |||
| #define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE) | |||
// Returns the clause attribute value for the punctuation codepoint c,
// or CLAUSE_NONE when c is not a clause-ending punctuation character.
| int clause_type_from_codepoint(uint32_t c); | |||
| //@} | |||
| #define SAYAS_CHARS 0x12 | |||
| @@ -186,6 +186,7 @@ | |||
| <ClCompile Include="..\libespeak-ng\synthdata.c" /> | |||
| <ClCompile Include="..\libespeak-ng\synthesize.c" /> | |||
| <ClCompile Include="..\libespeak-ng\synth_mbrola.c" /> | |||
| <ClCompile Include="..\libespeak-ng\tokenizer.c" /> | |||
| <ClCompile Include="..\libespeak-ng\translate.c" /> | |||
| <ClCompile Include="..\libespeak-ng\tr_languages.c" /> | |||
| <ClCompile Include="..\libespeak-ng\voices.c" /> | |||
| @@ -0,0 +1,174 @@ | |||
| /* | |||
| * Copyright (C) 2017 Reece H. Dunn | |||
| * | |||
| * This program is free software; you can redistribute it and/or modify | |||
| * it under the terms of the GNU General Public License as published by | |||
| * the Free Software Foundation; either version 3 of the License, or | |||
| * (at your option) any later version. | |||
| * | |||
| * This program is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| * GNU General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU General Public License | |||
| * along with this program; if not, see: | |||
| * <http://www.gnu.org/licenses/>. | |||
| */ | |||
| #include "config.h" | |||
| #include <assert.h> | |||
| #include <stdint.h> | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| #include <espeak-ng/espeak_ng.h> | |||
| #include "encoding.h" | |||
| #include "speech.h" | |||
| #include "phoneme.h" | |||
| #include "synthesize.h" | |||
| #include "translate.h" | |||
| void | |||
| test_latin_common() | |||
| { | |||
| printf("testing Latin/Common (Latn/Zyyy) script classification\n"); | |||
| assert(clause_type_from_codepoint('a') == CLAUSE_NONE); | |||
| assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD); | |||
| assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION); | |||
| assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION); | |||
| assert(clause_type_from_codepoint(',') == CLAUSE_COMMA); | |||
| assert(clause_type_from_codepoint(':') == CLAUSE_COLON); | |||
| assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON); | |||
| assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0x00Bf) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON); | |||
| assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON); | |||
| assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| } | |||
| void | |||
| test_greek() | |||
| { | |||
| printf("testing Greek (Grek) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION); | |||
| assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON); | |||
| } | |||
| void | |||
| test_armenian() | |||
| { | |||
| printf("testing Armenian (Armn) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x055B) == (CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD)); | |||
| assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD)); | |||
| assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA); | |||
| assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD)); | |||
| assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| } | |||
| void | |||
| test_arabic() | |||
| { | |||
| printf("testing Arabic (Arab) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA); | |||
| assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON); | |||
| assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION); | |||
| assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD); | |||
| } | |||
| void | |||
| test_devanagari() | |||
| { | |||
| printf("testing Devanagari (Deva) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| } | |||
| void | |||
| test_tibetan() | |||
| { | |||
| printf("testing Tibetan (Tibt) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH); | |||
| } | |||
| void | |||
| test_sinhala() | |||
| { | |||
| printf("testing Sinhala (Sinh) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| } | |||
| void | |||
| test_georgian() | |||
| { | |||
| printf("testing Georgian (Geor) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH); | |||
| } | |||
| void | |||
| test_ethiopic() | |||
| { | |||
| printf("testing Ethiopic (Ethi) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD); | |||
| assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA); | |||
| assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON); | |||
| assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON); | |||
| assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON); | |||
| assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION); | |||
| assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH); | |||
| } | |||
| void | |||
| test_ideographic() | |||
| { | |||
| printf("testing Ideographic (Hani) script classification\n"); | |||
| assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| } | |||
| void | |||
| test_fullwidth() | |||
| { | |||
| printf("testing Full Width/Common (Zyyy) script classification\n"); | |||
| assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||
| } | |||
/*
 * Test driver: runs each script-specific test group in turn.
 *
 * The groups report problems through assert(), so (unless NDEBUG is defined)
 * a failing check terminates the program before "done" is printed.
 * Command-line arguments are not used.
 */
int
main(void)
{
	test_latin_common();
	test_greek();
	test_armenian();
	test_arabic();
	test_devanagari();
	test_tibetan();
	test_sinhala();
	test_georgian();
	test_ethiopic();
	test_ideographic();
	test_fullwidth();

	printf("done\n");

	return EXIT_SUCCESS;
}