Browse Source

ucd-tools: add support for the Optional_Space_After eSpeak NG extended property; use it in clause_type_from_codepoint.

master
Reece H. Dunn 8 years ago
parent
commit
1131d0924b

+ 12
- 32
src/libespeak-ng/tokenizer.c View File

@@ -39,26 +39,10 @@ static const unsigned short punct_chars[] = {
0x00a1, // inverted exclamation
0x00bf, // inverted question

0x0964, // Devanagari Danda (fullstop)

0x0589, // Armenian period
0x055c, // Armenian exclamation
0x055e, // Armenian question
0x055b, // Armenian emphasis mark

0x0df4, // Singhalese Kunddaliya
0x0f0d, // Tibet Shad

0x3001, // ideograph comma
0x3002, // ideograph period

0xff01, // fullwidth exclamation
0xff0c, // fullwidth comma
0xff0e, // fullwidth period
0xff1a, // fullwidth colon
0xff1b, // fullwidth semicolon
0xff1f, // fullwidth question mark

0
};

@@ -67,26 +51,10 @@ static const unsigned int punct_attributes[] = {
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted exclamation
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER, // inverted question

CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Devanagari Danda (fullstop)

CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Armenian period
CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian exclamation
CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD, // Armenian question
CLAUSE_PERIOD | CLAUSE_PUNCTUATION_IN_WORD, // Armenian emphasis mark

CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Singhalese period
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // Tibet period

CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph comma
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER, // ideograph period

CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER, // fullwidth
CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER,
CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER,

0
};

@@ -106,17 +74,29 @@ int clause_type_from_codepoint(uint32_t c)
{
case ESPEAKNG_PROPERTY_FULL_STOP:
return CLAUSE_PERIOD;
case ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
return CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER;
case ESPEAKNG_PROPERTY_QUESTION_MARK:
return CLAUSE_QUESTION;
case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
return CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER;
case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
return CLAUSE_EXCLAMATION;
case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
return CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER;
case ESPEAKNG_PROPERTY_COMMA:
return CLAUSE_COMMA;
case ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
return CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER;
case ESPEAKNG_PROPERTY_COLON:
return CLAUSE_COLON;
case ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
return CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER;
case ESPEAKNG_PROPERTY_SEMI_COLON:
case ESPEAKNG_PROPERTY_EXTENDED_DASH:
return CLAUSE_SEMICOLON;
case ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
return CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER;
case ESPEAKNG_PROPERTY_ELLIPSIS:
return CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER;
case ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR:

+ 18
- 0
src/ucd-tools/data/espeak-ng/PropList.txt View File

@@ -1,5 +1,6 @@
# espeak-ng/PropList-9.0.0.txt
# Date: 2017-04-17, 20:19:00 GMT
# Copyright (C) 2005 to 2015 by Jonathan Duddington
# Copyright (C) 2017 Reece H. Dunn
#
# This is an extension to the Unicode Character Database PropList.txt file,
@@ -188,3 +189,20 @@ FE19 ; Ellipsis # Po PRESENTATION FORM FOR VERTICAL HORIZONTAL EL
FE31..FE32 ; Extended_Dash # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH

# Total code points: 6

# ================================================

00A1 ; Optional_Space_After # Po INVERTED EXCLAMATION MARK
00BF ; Optional_Space_After # Po INVERTED QUESTION MARK
0589 ; Optional_Space_After # Po ARMENIAN FULL STOP
0964 ; Optional_Space_After # Po DEVANAGARI DANDA
0DF4 ; Optional_Space_After # Po SINHALA PUNCTUATION KUNDDALIYA
0F0D ; Optional_Space_After # Po TIBETAN MARK SHAD
3001..3002 ; Optional_Space_After # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
FF01 ; Optional_Space_After # Po FULLWIDTH EXCLAMATION MARK
FF0C ; Optional_Space_After # Po FULLWIDTH COMMA
FF0E ; Optional_Space_After # Po FULLWIDTH FULL STOP
FF1A..FF1B ; Optional_Space_After # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON
FF1F ; Optional_Space_After # Po FULLWIDTH QUESTION MARK

# Total code points: 14

+ 1
- 0
src/ucd-tools/src/include/ucd/ucd.h View File

@@ -363,6 +363,7 @@ typedef uint64_t ucd_property;
#define UCD_PROPERTY_PREPENDED_CONCATENATION_MARK 0x0000000100000000ull /**< @brief Prepended_Concatenation_Mark */

// eSpeak NG extended properties:
//
// These are single-bit flags stored in the same 64-bit ucd_property value as
// the UCD_PROPERTY_* flags above, so they can be OR-ed together with them.
// The flags listed here occupy bits 54 to 57; the bit positions must match
// the corresponding `2 ** N` weights used by tools/printdata.py when the
// property tables are generated.
#define ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER 0x0040000000000000ull /**< @brief Optional_Space_After */
#define ESPEAKNG_PROPERTY_EXTENDED_DASH 0x0080000000000000ull /**< @brief Extended_Dash */
#define ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR 0x0100000000000000ull /**< @brief Paragraph_Separator */
#define ESPEAKNG_PROPERTY_ELLIPSIS 0x0200000000000000ull /**< @brief Ellipsis */

+ 14
- 14
src/ucd-tools/src/proplist.c View File

@@ -1342,9 +1342,9 @@ static ucd_property properties_Po(codepoint_t c)
if (c == 0x003A) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_COLON;
if (c == 0x003B) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_SEMI_COLON;
if (c == 0x003F) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_QUESTION_MARK;
if (c == 0x00A1) return UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_EXCLAMATION_MARK;
if (c == 0x00A1) return UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0x00B7) return UCD_PROPERTY_DIACRITIC | UCD_PROPERTY_EXTENDER | UCD_PROPERTY_OTHER_ID_CONTINUE;
if (c == 0x00BF) return UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_QUESTION_MARK;
if (c == 0x00BF) return UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
return UCD_PROPERTY_PATTERN_SYNTAX;
case 0x0300:
if (c == 0x037E) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_QUESTION_MARK;
@@ -1354,7 +1354,7 @@ static ucd_property properties_Po(codepoint_t c)
if (c >= 0x055B && c <= 0x055C) return ESPEAKNG_PROPERTY_EXCLAMATION_MARK;
if (c == 0x055D) return ESPEAKNG_PROPERTY_COMMA;
if (c == 0x055F) return ESPEAKNG_PROPERTY_QUESTION_MARK;
if (c == 0x0589) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0x0589) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0x05C3) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0600:
@@ -1383,18 +1383,18 @@ static ucd_property properties_Po(codepoint_t c)
if (c == 0x085E) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0900:
if (c == 0x0964) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0x0964) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0x0965) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR;
break;
case 0x0D00:
if (c == 0x0DF4) return ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0x0DF4) return ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
break;
case 0x0E00:
if (c >= 0x0E5A && c <= 0x0E5B) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
break;
case 0x0F00:
if (c == 0x0F08) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0F0D) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0x0F0D) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0x0F0E) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR;
if (c >= 0x0F0E && c <= 0x0F12) return UCD_PROPERTY_TERMINAL_PUNCTUATION;
if (c == 0x0F14) return ESPEAKNG_PROPERTY_COMMA;
@@ -1483,8 +1483,8 @@ static ucd_property properties_Po(codepoint_t c)
if (c == 0x2E41) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_COMMA;
return UCD_PROPERTY_PATTERN_SYNTAX;
case 0x3000:
if (c == 0x3001) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_COMMA;
if (c == 0x3002) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0x3001) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0x3002) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | UCD_PROPERTY_PATTERN_SYNTAX | ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0x3003) return UCD_PROPERTY_PATTERN_SYNTAX;
if (c == 0x30FB) return UCD_PROPERTY_HYPHEN;
break;
@@ -1539,16 +1539,16 @@ static ucd_property properties_Po(codepoint_t c)
if (c == 0xFE68) return UCD_PROPERTY_OTHER_MATH;
break;
case 0xFF00:
if (c == 0xFF01) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_EXCLAMATION_MARK;
if (c == 0xFF01) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0xFF02) return UCD_PROPERTY_QUOTATION_MARK;
if (c == 0xFF07) return UCD_PROPERTY_QUOTATION_MARK;
if (c == 0xFF0C) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_COMMA;
if (c == 0xFF0E) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0xFF0C) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0xFF0E) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0xFF3C) return UCD_PROPERTY_OTHER_MATH;
if (c == 0xFF65) return UCD_PROPERTY_HYPHEN;
if (c == 0xFF1A) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_COLON;
if (c == 0xFF1B) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_SEMI_COLON;
if (c == 0xFF1F) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_QUESTION_MARK;
if (c == 0xFF1A) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0xFF1B) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0xFF1F) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER;
if (c == 0xFF61) return UCD_PROPERTY_TERMINAL_PUNCTUATION | UCD_PROPERTY_SENTENCE_TERMINAL | ESPEAKNG_PROPERTY_FULL_STOP;
if (c == 0xFF64) return UCD_PROPERTY_TERMINAL_PUNCTUATION | ESPEAKNG_PROPERTY_COMMA;
break;

+ 1
- 0
src/ucd-tools/tools/printdata.py View File

@@ -161,6 +161,7 @@ def properties(data):
props += (2 ** 31) * data.get('Pattern_Syntax', 0)
props += (2 ** 32) * data.get('Prepended_Concatenation_Mark', 0)
# eSpeak NG extended properties:
props += (2 ** 54) * data.get('Optional_Space_After', 0)
props += (2 ** 55) * data.get('Extended_Dash', 0)
props += (2 ** 56) * data.get('Paragraph_Separator', 0)
props += (2 ** 57) * data.get('Ellipsis', 0)

Loading…
Cancel
Save