utf8_nbytes() is only used in dictionary.c. Others are moved to common.cmaster
| @@ -57,4 +57,111 @@ void strncpy0(char *to, const char *from, int size) | |||
| strncpy(to, from, size); | |||
| to[size-1] = 0; | |||
| } | |||
| int utf8_in(int *c, const char *buf) | |||
| { | |||
| /* Read a unicode characater from a UTF8 string | |||
| * Returns the number of UTF8 bytes used. | |||
| * buf: position of buffer is moved, if character is read | |||
| * c: holds UTF-16 representation of multibyte character by | |||
| * skipping UTF-8 header bits of bytes in following way: | |||
| * 2-byte character "ā": | |||
| * hex binary | |||
| * c481 1100010010000001 | |||
| * | 11000100 000001 | |||
| * V \ \ | | | |||
| * 0101 0000000100000001 | |||
| * 3-byte character "ꙅ": | |||
| * ea9985 111010101001100110000101 | |||
| * 1010 011001 000101 | |||
| * | + +--.\ \ | | | |||
| * V `--. \`. `.| | | |||
| * A645 1010011001000101 | |||
| * 4-byte character "𠜎": | |||
| * f0a09c8e 11110000101000001001110010001110 | |||
| * V 000 100000 011100 001110 | |||
| * 02070e 000000100000011100001110 | |||
| */ | |||
| return utf8_in2(c, buf, 0); | |||
| } | |||
| #pragma GCC visibility pop | |||
| int utf8_out(unsigned int c, char *buf) | |||
| { | |||
| // write a UTF-16 character into a buffer as UTF-8 | |||
| // returns the number of bytes written | |||
| int n_bytes; | |||
| int j; | |||
| int shift; | |||
| static const char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 }; | |||
| if (c < 0x80) { | |||
| buf[0] = c; | |||
| return 1; | |||
| } | |||
| if (c >= 0x110000) { | |||
| buf[0] = ' '; // out of range character code | |||
| return 1; | |||
| } | |||
| if (c < 0x0800) | |||
| n_bytes = 1; | |||
| else if (c < 0x10000) | |||
| n_bytes = 2; | |||
| else | |||
| n_bytes = 3; | |||
| shift = 6*n_bytes; | |||
| buf[0] = code[n_bytes] | (c >> shift); | |||
| for (j = 0; j < n_bytes; j++) { | |||
| shift -= 6; | |||
| buf[j+1] = 0x80 + ((c >> shift) & 0x3f); | |||
| } | |||
| return n_bytes+1; | |||
| } | |||
| int utf8_in2(int *c, const char *buf, int backwards) | |||
| { | |||
| // Reads a unicode characater from a UTF8 string | |||
| // Returns the number of UTF8 bytes used. | |||
| // c: holds integer representation of multibyte character | |||
| // buf: position of buffer is moved, if character is read | |||
| // backwards: set if we are moving backwards through the UTF8 string | |||
| int c1; | |||
| int n_bytes; | |||
| int ix; | |||
| static const unsigned char mask[4] = { 0xff, 0x1f, 0x0f, 0x07 }; | |||
| // find the start of the next/previous character | |||
| while ((*buf & 0xc0) == 0x80) { | |||
| // skip over non-initial bytes of a multi-byte utf8 character | |||
| if (backwards) | |||
| buf--; | |||
| else | |||
| buf++; | |||
| } | |||
| n_bytes = 0; | |||
| if ((c1 = *buf++) & 0x80) { | |||
| if ((c1 & 0xe0) == 0xc0) | |||
| n_bytes = 1; | |||
| else if ((c1 & 0xf0) == 0xe0) | |||
| n_bytes = 2; | |||
| else if ((c1 & 0xf8) == 0xf0) | |||
| n_bytes = 3; | |||
| c1 &= mask[n_bytes]; | |||
| for (ix = 0; ix < n_bytes; ix++) | |||
| { | |||
| if (!*buf) | |||
| /* Oops, truncated */ | |||
| break; | |||
| c1 = (c1 << 6) + (*buf++ & 0x3f); | |||
| } | |||
| n_bytes = ix; | |||
| } | |||
| *c = c1; | |||
| return n_bytes+1; | |||
| } | |||
| @@ -25,6 +25,10 @@ | |||
| extern ESPEAK_NG_API int GetFileLength(const char *filename); | |||
| extern ESPEAK_NG_API void strncpy0(char *to, const char *from, int size); | |||
| ESPEAK_NG_API int utf8_in(int *c, const char *buf); | |||
| int utf8_in2(int *c, const char *buf, int backwards); | |||
| int utf8_out(unsigned int c, char *buf); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -2856,6 +2856,21 @@ static const char *LookupDict2(Translator *tr, const char *word, const char *wor | |||
| return 0; | |||
| } | |||
| static int utf8_nbytes(const char *buf) | |||
| { | |||
| // Returns the number of bytes for the first UTF-8 character in buf | |||
| unsigned char c = (unsigned char)buf[0]; | |||
| if (c < 0x80) | |||
| return 1; | |||
| if (c < 0xe0) | |||
| return 2; | |||
| if (c < 0xf0) | |||
| return 3; | |||
| return 4; | |||
| } | |||
| /* Lookup a specified word in the word dictionary. | |||
| Returns phonetic data in 'phonetic' and bits in 'flags' | |||
| @@ -32,6 +32,7 @@ | |||
| #include <espeak-ng/encoding.h> | |||
| #include "numbers.h" | |||
| #include "common.h" | |||
| #include "dictionary.h" // for Lookup, TranslateRules, EncodePhonemes, Look... | |||
| #include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonEND_WORD, phonP... | |||
| #include "readclause.h" // for WordToString2, towlower2 | |||
| @@ -31,6 +31,7 @@ | |||
| #include <espeak-ng/speak_lib.h> | |||
| #include <espeak-ng/encoding.h> | |||
| #include "common.h" | |||
| #include "setlengths.h" // for SetLengthMods | |||
| #include "translate.h" // for Translator, LANGUAGE_OPTIONS, L, NUM... | |||
| @@ -33,6 +33,7 @@ | |||
| #include <espeak-ng/encoding.h> | |||
| #include "translate.h" | |||
| #include "common.h" | |||
| #include "dictionary.h" // for TranslateRules, LookupDictList, Cha... | |||
| #include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonPAUSE_... | |||
| #include "phonemelist.h" // for MakePhonemeList | |||
| @@ -237,129 +238,6 @@ int IsBracket(int c) | |||
| return lookupwchar(brackets, c); | |||
| } | |||
| int utf8_nbytes(const char *buf) | |||
| { | |||
| // Returns the number of bytes for the first UTF-8 character in buf | |||
| unsigned char c = (unsigned char)buf[0]; | |||
| if (c < 0x80) | |||
| return 1; | |||
| if (c < 0xe0) | |||
| return 2; | |||
| if (c < 0xf0) | |||
| return 3; | |||
| return 4; | |||
| } | |||
| int utf8_in2(int *c, const char *buf, int backwards) | |||
| { | |||
| // Reads a unicode characater from a UTF8 string | |||
| // Returns the number of UTF8 bytes used. | |||
| // c: holds integer representation of multibyte character | |||
| // buf: position of buffer is moved, if character is read | |||
| // backwards: set if we are moving backwards through the UTF8 string | |||
| int c1; | |||
| int n_bytes; | |||
| int ix; | |||
| static const unsigned char mask[4] = { 0xff, 0x1f, 0x0f, 0x07 }; | |||
| // find the start of the next/previous character | |||
| while ((*buf & 0xc0) == 0x80) { | |||
| // skip over non-initial bytes of a multi-byte utf8 character | |||
| if (backwards) | |||
| buf--; | |||
| else | |||
| buf++; | |||
| } | |||
| n_bytes = 0; | |||
| if ((c1 = *buf++) & 0x80) { | |||
| if ((c1 & 0xe0) == 0xc0) | |||
| n_bytes = 1; | |||
| else if ((c1 & 0xf0) == 0xe0) | |||
| n_bytes = 2; | |||
| else if ((c1 & 0xf8) == 0xf0) | |||
| n_bytes = 3; | |||
| c1 &= mask[n_bytes]; | |||
| for (ix = 0; ix < n_bytes; ix++) | |||
| { | |||
| if (!*buf) | |||
| /* Oops, truncated */ | |||
| break; | |||
| c1 = (c1 << 6) + (*buf++ & 0x3f); | |||
| } | |||
| n_bytes = ix; | |||
| } | |||
| *c = c1; | |||
| return n_bytes+1; | |||
| } | |||
| #pragma GCC visibility push(default) | |||
| int utf8_in(int *c, const char *buf) | |||
| { | |||
| /* Read a unicode characater from a UTF8 string | |||
| * Returns the number of UTF8 bytes used. | |||
| * buf: position of buffer is moved, if character is read | |||
| * c: holds UTF-16 representation of multibyte character by | |||
| * skipping UTF-8 header bits of bytes in following way: | |||
| * 2-byte character "ā": | |||
| * hex binary | |||
| * c481 1100010010000001 | |||
| * | 11000100 000001 | |||
| * V \ \ | | | |||
| * 0101 0000000100000001 | |||
| * 3-byte character "ꙅ": | |||
| * ea9985 111010101001100110000101 | |||
| * 1010 011001 000101 | |||
| * | + +--.\ \ | | | |||
| * V `--. \`. `.| | | |||
| * A645 1010011001000101 | |||
| * 4-byte character "𠜎": | |||
| * f0a09c8e 11110000101000001001110010001110 | |||
| * V 000 100000 011100 001110 | |||
| * 02070e 000000100000011100001110 | |||
| */ | |||
| return utf8_in2(c, buf, 0); | |||
| } | |||
| #pragma GCC visibility pop | |||
| int utf8_out(unsigned int c, char *buf) | |||
| { | |||
| // write a UTF-16 character into a buffer as UTF-8 | |||
| // returns the number of bytes written | |||
| int n_bytes; | |||
| int j; | |||
| int shift; | |||
| static const char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 }; | |||
| if (c < 0x80) { | |||
| buf[0] = c; | |||
| return 1; | |||
| } | |||
| if (c >= 0x110000) { | |||
| buf[0] = ' '; // out of range character code | |||
| return 1; | |||
| } | |||
| if (c < 0x0800) | |||
| n_bytes = 1; | |||
| else if (c < 0x10000) | |||
| n_bytes = 2; | |||
| else | |||
| n_bytes = 3; | |||
| shift = 6*n_bytes; | |||
| buf[0] = code[n_bytes] | (c >> shift); | |||
| for (j = 0; j < n_bytes; j++) { | |||
| shift -= 6; | |||
| buf[j+1] = 0x80 + ((c >> shift) & 0x3f); | |||
| } | |||
| return n_bytes+1; | |||
| } | |||
| char *strchr_w(const char *s, int c) | |||
| { | |||
| // return NULL for any non-ascii character | |||
| @@ -661,11 +661,6 @@ extern int (*phoneme_callback)(const char *); | |||
| #define LEADING_2_BITS 0xC0 // 0b11000000 | |||
| #define UTF8_TAIL_BITS 0x80 // 0b10000000 | |||
| ESPEAK_NG_API int utf8_in(int *c, const char *buf); | |||
| int utf8_in2(int *c, const char *buf, int backwards); | |||
| int utf8_out(unsigned int c, char *buf); | |||
| int utf8_nbytes(const char *buf); | |||
| int lookupwchar(const unsigned short *list, int c); | |||
| char *strchr_w(const char *s, int c); | |||
| int IsBracket(int c); | |||