Browse Source

code cleanup: move utf8_* functions.

utf8_nbytes() is only used in dictionary.c, so it becomes a static function there. The other utf8_* functions are moved to common.c.
master
Juho Hiltunen 2 years ago
parent
commit
ac3b9f6199

+ 107
- 0
src/libespeak-ng/common.c View File

@@ -57,4 +57,111 @@ void strncpy0(char *to, const char *from, int size)
strncpy(to, from, size);
to[size-1] = 0;
}

int utf8_in(int *c, const char *buf)
{
/* Read one Unicode character from a UTF-8 string, scanning forwards.
* Thin wrapper: forwards to utf8_in2() with backwards = 0.
* Returns the number of UTF-8 bytes used, as reported by utf8_in2().
* buf: UTF-8 input. The pointer is passed by value, so the caller's
* pointer is NOT advanced; use the return value to step past the character.
* c: receives the decoded character value. The worked examples below show
* this is the Unicode code point (the 4-byte case yields 0x02070e, not a
* UTF-16 surrogate pair). It is built by
* skipping UTF-8 header bits of bytes in following way:
* 2-byte character "ā":
* hex binary
* c481 1100010010000001
* | 11000100 000001
* V \ \ | |
* 0101 0000000100000001
* 3-byte character "ꙅ":
* ea9985 111010101001100110000101
* 1010 011001 000101
* | + +--.\ \ | |
* V `--. \`. `.| |
* A645 1010011001000101
* 4-byte character "𠜎":
* f0a09c8e 11110000101000001001110010001110
* V 000 100000 011100 001110
* 02070e 000000100000011100001110
*/
return utf8_in2(c, buf, 0);
}
#pragma GCC visibility pop

int utf8_out(unsigned int c, char *buf)
{
	// Encode the character value 'c' into 'buf' as UTF-8.
	// Returns the number of bytes stored (1 to 4). No terminating
	// zero byte is written.
	// A value of 0x110000 or above cannot be encoded; a single space
	// is stored in its place.

	// lead-byte marker bits, indexed by the number of trailing bytes
	static const unsigned char lead_marker[4] = { 0x00, 0xc0, 0xe0, 0xf0 };
	char *out = buf;
	int trailing;
	int bits;

	if (c < 0x80) {
		// 7-bit value: stored unchanged in one byte
		*out = c;
		return 1;
	}
	if (c >= 0x110000) {
		*out = ' '; // out of range character code
		return 1;
	}

	trailing = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;

	// the lead byte carries the bits above the 6-bit trailing groups
	bits = 6 * trailing;
	*out++ = lead_marker[trailing] | (c >> bits);

	// each trailing byte is 10xxxxxx, most significant group first
	while (bits > 0) {
		bits -= 6;
		*out++ = 0x80 + ((c >> bits) & 0x3f);
	}
	return trailing + 1;
}

int utf8_in2(int *c, const char *buf, int backwards)
{
	// Decode one character from a UTF-8 string.
	// Returns 1 + the number of trailing bytes actually consumed.
	// c: receives the decoded character value
	// buf: where to start looking; if it points into the middle of a
	//      multi-byte sequence, the trailing bytes are skipped first.
	//      The pointer is passed by value, so the caller's pointer is
	//      not changed.
	// backwards: non-zero to skip towards the start of the string when
	//      looking for the lead byte, zero to skip towards the end

	// payload bits of the lead byte, indexed by number of trailing bytes
	static const unsigned char lead_mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };
	const int step = backwards ? -1 : 1;
	int value;
	int expected = 0;
	int consumed = 0;

	// move off any 10xxxxxx trailing bytes to reach a lead byte
	while ((*buf & 0xc0) == 0x80)
		buf += step;

	value = *buf++;
	if (value & 0x80) {
		// the lead byte pattern gives the number of trailing bytes;
		// an unrecognised pattern leaves it at zero
		if ((value & 0xe0) == 0xc0)
			expected = 1;
		else if ((value & 0xf0) == 0xe0)
			expected = 2;
		else if ((value & 0xf8) == 0xf0)
			expected = 3;

		value &= lead_mask[expected];

		// append 6 bits per trailing byte; stop early at the end of
		// the string (truncated sequence)
		while (consumed < expected && *buf != 0) {
			value = (value << 6) + (*buf++ & 0x3f);
			consumed++;
		}
	}

	*c = value;
	return consumed + 1;
}

+ 4
- 0
src/libespeak-ng/common.h View File

@@ -25,6 +25,10 @@
extern ESPEAK_NG_API int GetFileLength(const char *filename);
extern ESPEAK_NG_API void strncpy0(char *to, const char *from, int size);

ESPEAK_NG_API int utf8_in(int *c, const char *buf);
int utf8_in2(int *c, const char *buf, int backwards);
int utf8_out(unsigned int c, char *buf);

#ifdef __cplusplus
}
#endif

+ 15
- 0
src/libespeak-ng/dictionary.c View File

@@ -2856,6 +2856,21 @@ static const char *LookupDict2(Translator *tr, const char *word, const char *wor
return 0;
}


static int utf8_nbytes(const char *buf)
{
	// Length in bytes of the UTF-8 character that starts at buf,
	// judged from its first byte only (1 to 4).

	const unsigned char lead = (unsigned char)*buf;
	int length = 1;

	// each threshold the first byte reaches adds one more byte
	if (lead >= 0x80)
		length++;
	if (lead >= 0xe0)
		length++;
	if (lead >= 0xf0)
		length++;
	return length;
}

/* Lookup a specified word in the word dictionary.
Returns phonetic data in 'phonetic' and bits in 'flags'


+ 1
- 0
src/libespeak-ng/numbers.c View File

@@ -32,6 +32,7 @@
#include <espeak-ng/encoding.h>

#include "numbers.h"
#include "common.h"
#include "dictionary.h" // for Lookup, TranslateRules, EncodePhonemes, Look...
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonEND_WORD, phonP...
#include "readclause.h" // for WordToString2, towlower2

+ 1
- 0
src/libespeak-ng/tr_languages.c View File

@@ -31,6 +31,7 @@
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>

#include "common.h"
#include "setlengths.h" // for SetLengthMods
#include "translate.h" // for Translator, LANGUAGE_OPTIONS, L, NUM...


+ 1
- 123
src/libespeak-ng/translate.c View File

@@ -33,6 +33,7 @@
#include <espeak-ng/encoding.h>

#include "translate.h"
#include "common.h"
#include "dictionary.h" // for TranslateRules, LookupDictList, Cha...
#include "phoneme.h" // for phonSWITCH, PHONEME_TAB, phonPAUSE_...
#include "phonemelist.h" // for MakePhonemeList
@@ -237,129 +238,6 @@ int IsBracket(int c)
return lookupwchar(brackets, c);
}

int utf8_nbytes(const char *buf)
{
	// Length in bytes of the UTF-8 character that starts at buf,
	// judged from its first byte only (1 to 4).

	const unsigned char lead = (unsigned char)*buf;
	int length = 1;

	// each threshold the first byte reaches adds one more byte
	if (lead >= 0x80)
		length++;
	if (lead >= 0xe0)
		length++;
	if (lead >= 0xf0)
		length++;
	return length;
}

int utf8_in2(int *c, const char *buf, int backwards)
{
	// Decode one character from a UTF-8 string.
	// Returns 1 + the number of trailing bytes actually consumed.
	// c: receives the decoded character value
	// buf: where to start looking; if it points into the middle of a
	//      multi-byte sequence, the trailing bytes are skipped first.
	//      The pointer is passed by value, so the caller's pointer is
	//      not changed.
	// backwards: non-zero to skip towards the start of the string when
	//      looking for the lead byte, zero to skip towards the end

	// payload bits of the lead byte, indexed by number of trailing bytes
	static const unsigned char lead_mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };
	const int step = backwards ? -1 : 1;
	int value;
	int expected = 0;
	int consumed = 0;

	// move off any 10xxxxxx trailing bytes to reach a lead byte
	while ((*buf & 0xc0) == 0x80)
		buf += step;

	value = *buf++;
	if (value & 0x80) {
		// the lead byte pattern gives the number of trailing bytes;
		// an unrecognised pattern leaves it at zero
		if ((value & 0xe0) == 0xc0)
			expected = 1;
		else if ((value & 0xf0) == 0xe0)
			expected = 2;
		else if ((value & 0xf8) == 0xf0)
			expected = 3;

		value &= lead_mask[expected];

		// append 6 bits per trailing byte; stop early at the end of
		// the string (truncated sequence)
		while (consumed < expected && *buf != 0) {
			value = (value << 6) + (*buf++ & 0x3f);
			consumed++;
		}
	}

	*c = value;
	return consumed + 1;
}

#pragma GCC visibility push(default)
int utf8_in(int *c, const char *buf)
{
/* Read one Unicode character from a UTF-8 string, scanning forwards.
* Thin wrapper: forwards to utf8_in2() with backwards = 0.
* Returns the number of UTF-8 bytes used, as reported by utf8_in2().
* buf: UTF-8 input. The pointer is passed by value, so the caller's
* pointer is NOT advanced; use the return value to step past the character.
* c: receives the decoded character value. The worked examples below show
* this is the Unicode code point (the 4-byte case yields 0x02070e, not a
* UTF-16 surrogate pair). It is built by
* skipping UTF-8 header bits of bytes in following way:
* 2-byte character "ā":
* hex binary
* c481 1100010010000001
* | 11000100 000001
* V \ \ | |
* 0101 0000000100000001
* 3-byte character "ꙅ":
* ea9985 111010101001100110000101
* 1010 011001 000101
* | + +--.\ \ | |
* V `--. \`. `.| |
* A645 1010011001000101
* 4-byte character "𠜎":
* f0a09c8e 11110000101000001001110010001110
* V 000 100000 011100 001110
* 02070e 000000100000011100001110
*/
return utf8_in2(c, buf, 0);
}
#pragma GCC visibility pop

int utf8_out(unsigned int c, char *buf)
{
	// Encode the character value 'c' into 'buf' as UTF-8.
	// Returns the number of bytes stored (1 to 4). No terminating
	// zero byte is written.
	// A value of 0x110000 or above cannot be encoded; a single space
	// is stored in its place.

	// lead-byte marker bits, indexed by the number of trailing bytes
	static const unsigned char lead_marker[4] = { 0x00, 0xc0, 0xe0, 0xf0 };
	char *out = buf;
	int trailing;
	int bits;

	if (c < 0x80) {
		// 7-bit value: stored unchanged in one byte
		*out = c;
		return 1;
	}
	if (c >= 0x110000) {
		*out = ' '; // out of range character code
		return 1;
	}

	trailing = (c < 0x0800) ? 1 : (c < 0x10000) ? 2 : 3;

	// the lead byte carries the bits above the 6-bit trailing groups
	bits = 6 * trailing;
	*out++ = lead_marker[trailing] | (c >> bits);

	// each trailing byte is 10xxxxxx, most significant group first
	while (bits > 0) {
		bits -= 6;
		*out++ = 0x80 + ((c >> bits) & 0x3f);
	}
	return trailing + 1;
}

char *strchr_w(const char *s, int c)
{
// return NULL for any non-ascii character

+ 0
- 5
src/libespeak-ng/translate.h View File

@@ -661,11 +661,6 @@ extern int (*phoneme_callback)(const char *);
#define LEADING_2_BITS 0xC0 // 0b11000000
#define UTF8_TAIL_BITS 0x80 // 0b10000000

ESPEAK_NG_API int utf8_in(int *c, const char *buf);
int utf8_in2(int *c, const char *buf, int backwards);
int utf8_out(unsigned int c, char *buf);
int utf8_nbytes(const char *buf);

int lookupwchar(const unsigned short *list, int c);
char *strchr_w(const char *s, int c);
int IsBracket(int c);

Loading…
Cancel
Save