| src/speak-ng.1.html \ | src/speak-ng.1.html \ | ||||
| README.html | README.html | ||||
| check: tests/encoding.test | |||||
| tests/encoding.test | |||||
| ##### build targets: | ##### build targets: | ||||
| espeak_includedir = $(includedir)/espeak | espeak_includedir = $(includedir)/espeak | ||||
| src/libespeak-ng/synthdata.c \ | src/libespeak-ng/synthdata.c \ | ||||
| src/libespeak-ng/synthesize.c \ | src/libespeak-ng/synthesize.c \ | ||||
| src/libespeak-ng/synth_mbrola.c \ | src/libespeak-ng/synth_mbrola.c \ | ||||
| src/libespeak-ng/tokenizer.c \ | |||||
| src/libespeak-ng/translate.c \ | src/libespeak-ng/translate.c \ | ||||
| src/libespeak-ng/tr_languages.c \ | src/libespeak-ng/tr_languages.c \ | ||||
| src/libespeak-ng/voices.c \ | src/libespeak-ng/voices.c \ | ||||
| src/libespeak-ng/mnemonics.c \ | src/libespeak-ng/mnemonics.c \ | ||||
| tests/encoding.c | tests/encoding.c | ||||
| noinst_bin_PROGRAMS += tests/tokenizer.test | |||||
| tests_tokenizer_test_CFLAGS = \ | |||||
| -Isrc/libespeak-ng -Isrc/include -Isrc/include/compat -I src/ucd-tools/src/include \ | |||||
| -D _POSIX_C_SOURCE=200112L ${AM_CFLAGS} | |||||
| tests_tokenizer_test_SOURCES = \ | |||||
| src/libespeak-ng/tokenizer.c \ | |||||
| tests/tokenizer.c | |||||
| check: tests/encoding.test tests/tokenizer.test | |||||
| tests/encoding.test | |||||
| tests/tokenizer.test | |||||
| ##### phoneme data: | ##### phoneme data: | ||||
| espeak-ng-data/phondata: phsource/phonemes.stamp | espeak-ng-data/phondata: phsource/phonemes.stamp |
| src/libespeak-ng/synthdata.c \ | src/libespeak-ng/synthdata.c \ | ||||
| src/libespeak-ng/synthesize.c \ | src/libespeak-ng/synthesize.c \ | ||||
| src/libespeak-ng/synth_mbrola.c \ | src/libespeak-ng/synth_mbrola.c \ | ||||
| src/libespeak-ng/tokenizer.c \ | |||||
| src/libespeak-ng/translate.c \ | src/libespeak-ng/translate.c \ | ||||
| src/libespeak-ng/tr_languages.c \ | src/libespeak-ng/tr_languages.c \ | ||||
| src/libespeak-ng/voices.c \ | src/libespeak-ng/voices.c \ |
| static int sayas_start; | static int sayas_start; | ||||
| static int ssml_ignore_l_angle = 0; | static int ssml_ignore_l_angle = 0; | ||||
| // punctuations symbols that can end a clause | |||||
| static const unsigned short punct_chars[] = { | |||||
| ',', '.', '?', '!', ':', ';', | |||||
| 0x00a1, // inverted exclamation | |||||
| 0x00bf, // inverted question | |||||
| 0x2013, // en-dash | |||||
| 0x2014, // em-dash | |||||
| 0x2026, // ellipsis | |||||
| 0x037e, // Greek question mark (looks like semicolon) | |||||
| 0x0387, // Greek semicolon, ano teleia | |||||
| 0x0964, // Devanagari Danda (fullstop) | |||||
| 0x0589, // Armenian period | |||||
| 0x055d, // Armenian comma | |||||
| 0x055c, // Armenian exclamation | |||||
| 0x055e, // Armenian question | |||||
| 0x055b, // Armenian emphasis mark | |||||
| 0x060c, // Arabic , | |||||
| 0x061b, // Arabic ; | |||||
| 0x061f, // Arabic ? | |||||
| 0x06d4, // Arabic . | |||||
| 0x0df4, // Singhalese Kunddaliya | |||||
| 0x0f0d, // Tibet Shad | |||||
| 0x0f0e, | |||||
| 0x1362, // Ethiopic period | |||||
| 0x1363, | |||||
| 0x1364, | |||||
| 0x1365, | |||||
| 0x1366, | |||||
| 0x1367, | |||||
| 0x1368, | |||||
| 0x10fb, // Georgian paragraph | |||||
| 0x3001, // ideograph comma | |||||
| 0x3002, // ideograph period | |||||
| 0xff01, // fullwidth exclamation | |||||
| 0xff0c, // fullwidth comma | |||||
| 0xff0e, // fullwidth period | |||||
| 0xff1a, // fullwidth colon | |||||
| 0xff1b, // fullwidth semicolon | |||||
| 0xff1f, // fullwidth question mark | |||||
| 0 | |||||
| }; | |||||
| // indexed by (entry num. in punct_chars) + 1 | |||||
| // bits 0-7 pause x 10mS, bits 12-14 intonation type, bit 15 don't need following space or bracket | |||||
| static const unsigned int punct_attributes[] = { | |||||
| 0, | |||||
| CLAUSE_COMMA, | |||||
| CLAUSE_PERIOD, | |||||
| CLAUSE_QUESTION, | |||||
| CLAUSE_EXCLAMATION, | |||||
| CLAUSE_COLON, | |||||
| CLAUSE_SEMICOLON, | |||||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation | |||||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question | |||||
| CLAUSE_SEMICOLON, // en-dash | |||||
| CLAUSE_SEMICOLON, // em-dash | |||||
| CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // ellipsis | |||||
| CLAUSE_QUESTION, // Greek question mark | |||||
| CLAUSE_SEMICOLON, // Greek semicolon | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop) | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period | |||||
| CLAUSE_COMMA, // Armenian comma | |||||
| CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation | |||||
| CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question | |||||
| CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark | |||||
| CLAUSE_COMMA, // Arabic , | |||||
| CLAUSE_SEMICOLON, // Arabic ; | |||||
| CLAUSE_QUESTION, // Arabic question mark | |||||
| CLAUSE_PERIOD, // Arabic full stop | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period | |||||
| CLAUSE_PARAGRAPH, | |||||
| CLAUSE_PERIOD, // Ethiopic period | |||||
| CLAUSE_COMMA, // Ethiopic comma | |||||
| CLAUSE_SEMICOLON, // Ethiopic semicolon | |||||
| CLAUSE_COLON, // Ethiopic colon | |||||
| CLAUSE_COLON, // Ethiopic preface colon | |||||
| CLAUSE_QUESTION, // Ethiopic question mark | |||||
| CLAUSE_PARAGRAPH, // Ethiopic paragraph | |||||
| CLAUSE_PARAGRAPH, // Georgian paragraph | |||||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period | |||||
| CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth | |||||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_SEMICOLON, // spare | |||||
| 0 | |||||
| }; | |||||
| // stack for language and voice properties | // stack for language and voice properties | ||||
| // frame 0 is for the defaults, before any ssml tags. | // frame 0 is for the defaults, before any ssml tags. | ||||
| typedef struct { | typedef struct { | ||||
| if (c1 == '-') | if (c1 == '-') | ||||
| return CLAUSE_NONE; // no pause | return CLAUSE_NONE; // no pause | ||||
| attributes = punct_attributes[lookupwchar(punct_chars, c1)]; | |||||
| attributes = clause_type_from_codepoint(c1); | |||||
| short_pause = CLAUSE_SHORTFALL; | short_pause = CLAUSE_SHORTFALL; | ||||
| if ((attributes & CLAUSE_INTONATION_TYPE) == 0x1000) | if ((attributes & CLAUSE_INTONATION_TYPE) == 0x1000) | ||||
| int phoneme_mode = 0; | int phoneme_mode = 0; | ||||
| int n_xml_buf; | int n_xml_buf; | ||||
| int terminator; | int terminator; | ||||
| int punct; | |||||
| int found; | int found; | ||||
| int any_alnum = 0; | int any_alnum = 0; | ||||
| int self_closing; | int self_closing; | ||||
| if ((c2 == '\n') && (option_linelength == -1)) { | if ((c2 == '\n') && (option_linelength == -1)) { | ||||
| // single-line mode, return immediately on NL | // single-line mode, return immediately on NL | ||||
| if ((punct = lookupwchar(punct_chars, c1)) == 0) { | |||||
| if ((terminator = clause_type_from_codepoint(c1)) == CLAUSE_NONE) { | |||||
| charix[ix] = count_characters - clause_start_char; | charix[ix] = count_characters - clause_start_char; | ||||
| *charix_top = ix; | *charix_top = ix; | ||||
| ix += utf8_out(c1, &buf[ix]); | ix += utf8_out(c1, &buf[ix]); | ||||
| terminator = CLAUSE_PERIOD; // line doesn't end in punctuation, assume period | terminator = CLAUSE_PERIOD; // line doesn't end in punctuation, assume period | ||||
| } else | |||||
| terminator = punct_attributes[punct]; | |||||
| } | |||||
| buf[ix] = ' '; | buf[ix] = ' '; | ||||
| buf[ix+1] = 0; | buf[ix+1] = 0; | ||||
| return terminator; | return terminator; | ||||
| } | } | ||||
| punct_data = 0; | punct_data = 0; | ||||
| if ((punct = lookupwchar(punct_chars, c1)) != 0) { | |||||
| punct_data = punct_attributes[punct]; | |||||
| if ((punct_data = clause_type_from_codepoint(c1)) != CLAUSE_NONE) { | |||||
| if (punct_data & CLAUSE_PUNCTUATION_IN_WORD) { | if (punct_data & CLAUSE_PUNCTUATION_IN_WORD) { | ||||
| // Armenian punctuation inside a word | // Armenian punctuation inside a word | ||||
| stressed_word = 1; | stressed_word = 1; |
| /* Tokenizer APIs. | |||||
| * | |||||
| * Copyright (C) 2005 to 2015 by Jonathan Duddington | |||||
| * email: [email protected] | |||||
| * Copyright (C) 2017 Reece H. Dunn | |||||
| * | |||||
| * This program is free software; you can redistribute it and/or modify | |||||
| * it under the terms of the GNU General Public License as published by | |||||
| * the Free Software Foundation; either version 3 of the License, or | |||||
| * (at your option) any later version. | |||||
| * | |||||
| * This program is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| * GNU General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU General Public License | |||||
| * along with this program; if not, see: <http://www.gnu.org/licenses/>. | |||||
| */ | |||||
| #include "config.h" | |||||
| #include <errno.h> | |||||
| #include <stdint.h> | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #include <string.h> | |||||
| #include <espeak-ng/espeak_ng.h> | |||||
| #include "encoding.h" | |||||
| #include "speech.h" | |||||
| #include "phoneme.h" | |||||
| #include "synthesize.h" | |||||
| #include "translate.h" | |||||
// Code points that may terminate a clause. The list ends with a 0
// sentinel; the position of each entry is the index of its attributes
// in punct_attributes.
static const unsigned short punct_chars[] = {
	',',
	'.',
	'?',
	'!',
	':',
	';',

	0x00A1, // inverted exclamation mark
	0x00BF, // inverted question mark
	0x2013, // en-dash
	0x2014, // em-dash
	0x2026, // ellipsis

	0x037E, // Greek question mark (looks like a semicolon)
	0x0387, // Greek semicolon, ano teleia

	0x0964, // Devanagari danda (full stop)

	0x0589, // Armenian period
	0x055D, // Armenian comma
	0x055C, // Armenian exclamation
	0x055E, // Armenian question
	0x055B, // Armenian emphasis mark

	0x060C, // Arabic comma
	0x061B, // Arabic semicolon
	0x061F, // Arabic question mark
	0x06D4, // Arabic full stop

	0x0DF4, // Singhalese kunddaliya

	0x0F0D, // Tibet shad
	0x0F0E, // Tibet nyis shad

	0x1362, // Ethiopic period
	0x1363, // Ethiopic comma
	0x1364, // Ethiopic semicolon
	0x1365, // Ethiopic colon
	0x1366, // Ethiopic preface colon
	0x1367, // Ethiopic question mark
	0x1368, // Ethiopic paragraph

	0x10FB, // Georgian paragraph

	0x3001, // ideograph comma
	0x3002, // ideograph period

	0xFF01, // fullwidth exclamation
	0xFF0C, // fullwidth comma
	0xFF0E, // fullwidth period
	0xFF1A, // fullwidth colon
	0xFF1B, // fullwidth semicolon
	0xFF1F, // fullwidth question mark

	0 // end of list
};
| // indexed by entry num. in punct_chars | |||||
| static const unsigned int punct_attributes[] = { | |||||
| CLAUSE_COMMA, | |||||
| CLAUSE_PERIOD, | |||||
| CLAUSE_QUESTION, | |||||
| CLAUSE_EXCLAMATION, | |||||
| CLAUSE_COLON, | |||||
| CLAUSE_SEMICOLON, | |||||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation | |||||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question | |||||
| CLAUSE_SEMICOLON, // en-dash | |||||
| CLAUSE_SEMICOLON, // em-dash | |||||
| CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER, // elipsis | |||||
| CLAUSE_QUESTION, // Greek question mark | |||||
| CLAUSE_SEMICOLON, // Greek semicolon | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop) | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period | |||||
| CLAUSE_COMMA, // Armenian comma | |||||
| CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation | |||||
| CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question | |||||
| CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark | |||||
| CLAUSE_COMMA, // Arabic , | |||||
| CLAUSE_SEMICOLON, // Arabic ; | |||||
| CLAUSE_QUESTION, // Arabic question mark | |||||
| CLAUSE_PERIOD, // Arabic full stop | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period | |||||
| CLAUSE_PARAGRAPH, | |||||
| CLAUSE_PERIOD, // Ethiopic period | |||||
| CLAUSE_COMMA, // Ethiopic comma | |||||
| CLAUSE_SEMICOLON, // Ethiopic semicolon | |||||
| CLAUSE_COLON, // Ethiopic colon | |||||
| CLAUSE_COLON, // Ethiopic preface colon | |||||
| CLAUSE_QUESTION, // Ethiopic question mark | |||||
| CLAUSE_PARAGRAPH, // Ethiopic paragraph | |||||
| CLAUSE_PARAGRAPH, // Georgian paragraph | |||||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period | |||||
| CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth | |||||
| CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER, | |||||
| 0 | |||||
| }; | |||||
| int clause_type_from_codepoint(uint32_t c) | |||||
| { | |||||
| for (int ix = 0; punct_chars[ix] != 0; ++ix) { | |||||
| if (punct_chars[ix] == c) | |||||
| return punct_attributes[ix]; | |||||
| } | |||||
| return CLAUSE_NONE; | |||||
| } |
| #define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE) | #define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE) | ||||
| #define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE) | #define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE) | ||||
| int clause_type_from_codepoint(uint32_t c); | |||||
| //@} | //@} | ||||
| #define SAYAS_CHARS 0x12 | #define SAYAS_CHARS 0x12 |
| <ClCompile Include="..\libespeak-ng\synthdata.c" /> | <ClCompile Include="..\libespeak-ng\synthdata.c" /> | ||||
| <ClCompile Include="..\libespeak-ng\synthesize.c" /> | <ClCompile Include="..\libespeak-ng\synthesize.c" /> | ||||
| <ClCompile Include="..\libespeak-ng\synth_mbrola.c" /> | <ClCompile Include="..\libespeak-ng\synth_mbrola.c" /> | ||||
| <ClCompile Include="..\libespeak-ng\tokenizer.c" /> | |||||
| <ClCompile Include="..\libespeak-ng\translate.c" /> | <ClCompile Include="..\libespeak-ng\translate.c" /> | ||||
| <ClCompile Include="..\libespeak-ng\tr_languages.c" /> | <ClCompile Include="..\libespeak-ng\tr_languages.c" /> | ||||
| <ClCompile Include="..\libespeak-ng\voices.c" /> | <ClCompile Include="..\libespeak-ng\voices.c" /> |
| /* | |||||
| * Copyright (C) 2017 Reece H. Dunn | |||||
| * | |||||
| * This program is free software; you can redistribute it and/or modify | |||||
| * it under the terms of the GNU General Public License as published by | |||||
| * the Free Software Foundation; either version 3 of the License, or | |||||
| * (at your option) any later version. | |||||
| * | |||||
| * This program is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| * GNU General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU General Public License | |||||
| * along with this program; if not, see: | |||||
| * <http://www.gnu.org/licenses/>. | |||||
| */ | |||||
| #include "config.h" | |||||
| #include <assert.h> | |||||
| #include <stdint.h> | |||||
| #include <stdlib.h> | |||||
| #include <stdio.h> | |||||
| #include <espeak-ng/espeak_ng.h> | |||||
| #include "encoding.h" | |||||
| #include "speech.h" | |||||
| #include "phoneme.h" | |||||
| #include "synthesize.h" | |||||
| #include "translate.h" | |||||
| void | |||||
| test_latin_common() | |||||
| { | |||||
| printf("testing Latin/Common (Latn/Zyyy) script classification\n"); | |||||
| assert(clause_type_from_codepoint('a') == CLAUSE_NONE); | |||||
| assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD); | |||||
| assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION); | |||||
| assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION); | |||||
| assert(clause_type_from_codepoint(',') == CLAUSE_COMMA); | |||||
| assert(clause_type_from_codepoint(':') == CLAUSE_COLON); | |||||
| assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON); | |||||
| assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0x00Bf) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON); | |||||
| assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON); | |||||
| assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| } | |||||
| void | |||||
| test_greek() | |||||
| { | |||||
| printf("testing Greek (Grek) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION); | |||||
| assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON); | |||||
| } | |||||
| void | |||||
| test_armenian() | |||||
| { | |||||
| printf("testing Armenian (Armn) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x055B) == (CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD)); | |||||
| assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD)); | |||||
| assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA); | |||||
| assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD)); | |||||
| assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| } | |||||
| void | |||||
| test_arabic() | |||||
| { | |||||
| printf("testing Arabic (Arab) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA); | |||||
| assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON); | |||||
| assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION); | |||||
| assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD); | |||||
| } | |||||
| void | |||||
| test_devanagari() | |||||
| { | |||||
| printf("testing Devanagari (Deva) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| } | |||||
| void | |||||
| test_tibetan() | |||||
| { | |||||
| printf("testing Tibetan (Tibt) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH); | |||||
| } | |||||
| void | |||||
| test_sinhala() | |||||
| { | |||||
| printf("testing Sinhala (Sinh) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| } | |||||
| void | |||||
| test_georgian() | |||||
| { | |||||
| printf("testing Georgian (Geor) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH); | |||||
| } | |||||
| void | |||||
| test_ethiopic() | |||||
| { | |||||
| printf("testing Ethiopic (Ethi) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD); | |||||
| assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA); | |||||
| assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON); | |||||
| assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON); | |||||
| assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON); | |||||
| assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION); | |||||
| assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH); | |||||
| } | |||||
| void | |||||
| test_ideographic() | |||||
| { | |||||
| printf("testing Ideographic (Hani) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| } | |||||
| void | |||||
| test_fullwidth() | |||||
| { | |||||
| printf("testing Full Width/Common (Zyyy) script classification\n"); | |||||
| assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER)); | |||||
| } | |||||
/* Test driver: runs every per-script test in turn, prints "done" and
 * returns EXIT_SUCCESS. The tests signal failure through assert(), so
 * this function only returns when no assertion fired.
 *
 * No command-line arguments are used, hence the (void) signature.
 */
int
main(void)
{
	test_latin_common();
	test_greek();
	test_armenian();
	test_arabic();
	test_devanagari();
	test_tibetan();
	test_sinhala();
	test_georgian();
	test_ethiopic();
	test_ideographic();
	test_fullwidth();

	printf("done\n");

	return EXIT_SUCCESS;
}