mahta.fetrat
/
HomoFast-eSpeak-Persian


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
							/*
 * Copyright (C) 2005 to 2013 by Jonathan Duddington
 * email: [email protected]
 * Copyright (C) 2013-2017 Reece H. Dunn
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
 */

#include "config.h"

#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <wctype.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>
#include <ucd/ucd.h>

#include "common.h"
#include "translate.h"

#pragma GCC visibility push(default)

int GetFileLength(const char *filename)
{
	struct stat statbuf;

	if (stat(filename, &statbuf) != 0)
		return -errno;

	if (S_ISDIR(statbuf.st_mode))
		return -EISDIR;

	return statbuf.st_size;
}

void strncpy0(char *to, const char *from, int size)
{
	// strcpy with limit, ensures a zero terminator
	strncpy(to, from, size);
	to[size-1] = 0;
}

int utf8_in(int *c, const char *buf)
{
	/* Read a unicode characater from a UTF8 string
	 * Returns the number of UTF8 bytes used.
	 * buf: position of buffer is moved, if character is read
	 * c: holds UTF-16 representation of multibyte character by
	 * skipping UTF-8 header bits of bytes in following way:
	 * 2-byte character "ā":
	 * hex            binary
	 * c481           1100010010000001
	 *    |           11000100  000001
	 *    V              \    \ |    |
	 * 0101           0000000100000001
	 * 3-byte character "ꙅ":
	 * ea9985 111010101001100110000101
	 *            1010  011001  000101
	 *    |       +  +--.\   \  |    |
	 *    V        `--.  \`.  `.|    |
	 *   A645         1010011001000101
	 * 4-byte character "𠜎":
	 * f0a09c8e 11110000101000001001110010001110
	 *    V          000  100000  011100  001110
	 *   02070e         000000100000011100001110
	 */
	return utf8_in2(c, buf, 0);
}
#pragma GCC visibility pop

int utf8_out(unsigned int c, char *buf)
{
	// write a UTF-16 character into a buffer as UTF-8
	// returns the number of bytes written

	int n_bytes;
	int j;
	int shift;
	static const char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 };

	if (c < 0x80) {
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}
	if (c < 0x0800)
		n_bytes = 1;
	else if (c < 0x10000)
		n_bytes = 2;
	else
		n_bytes = 3;

	shift = 6*n_bytes;
	buf[0] = code[n_bytes] | (c >> shift);
	for (j = 0; j < n_bytes; j++) {
		shift -= 6;
		buf[j+1] = 0x80 + ((c >> shift) & 0x3f);
	}
	return n_bytes+1;
}

int utf8_in2(int *c, const char *buf, int backwards)
{
	// Reads a unicode characater from a UTF8 string
	// Returns the number of UTF8 bytes used.
	// c: holds integer representation of multibyte character
	// buf: position of buffer is moved, if character is read
	// backwards: set if we are moving backwards through the UTF8 string

	int c1;
	int n_bytes;
	static const unsigned char mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };

	// find the start of the next/previous character
	while ((*buf & 0xc0) == 0x80) {
		// skip over non-initial bytes of a multi-byte utf8 character
		if (backwards)
			buf--;
		else
			buf++;
	}

	n_bytes = 0;

	if ((c1 = *buf++) & 0x80) {
		if ((c1 & 0xe0) == 0xc0)
			n_bytes = 1;
		else if ((c1 & 0xf0) == 0xe0)
			n_bytes = 2;
		else if ((c1 & 0xf8) == 0xf0)
			n_bytes = 3;

		c1 &= mask[n_bytes];
		int ix;
		for (ix = 0; ix < n_bytes; ix++)
		{
			if (!*buf)
				/* Oops, truncated */
				break;
			c1 = (c1 << 6) + (*buf++ & 0x3f);
		}
		n_bytes = ix;
	}
	*c = c1;
	return n_bytes+1;
}


int IsAlpha(unsigned int c)
{
	// Replacement for iswalph() which also checks for some in-word symbols

	static const unsigned short extra_indic_alphas[] = {
		0xa70, 0xa71, // Gurmukhi: tippi, addak
		0
	};

	if (iswalpha(c))
		return 1;

	if (c < 0x300)
		return 0;

	if ((c >= 0x901) && (c <= 0xdf7)) {
		// Indic scripts: Devanagari, Tamil, etc
		if ((c & 0x7f) < 0x64)
			return 1;
		if (lookupwchar(extra_indic_alphas, c) != 0)
			return 1;
		if ((c >= 0xd7a) && (c <= 0xd7f))
			return 1; // malaytalam chillu characters

		return 0;
	}

	if ((c >= 0x5b0) && (c <= 0x5c2))
		return 1; // Hebrew vowel marks

	if (c == 0x0605)
		return 1;

	if ((c == 0x670) || ((c >= 0x64b) && (c <= 0x65e)))
		return 1; // arabic vowel marks

	if ((c >= 0x300) && (c <= 0x36f))
		return 1; // combining accents

	if ((c >= 0xf40) && (c <= 0xfbc))
		return 1; // tibetan

	if ((c >= 0x1100) && (c <= 0x11ff))
		return 1; // Korean jamo

	if ((c >= 0x2800) && (c <= 0x28ff))
		return 1; // braille

	if ((c > 0x3040) && (c <= 0xa700))
		return 1; // Chinese/Japanese.  Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure

	return 0;
}

// brackets, also 0x2014 to 0x021f which don't need to be in this list
static const unsigned short brackets[] = {
	'(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`',
	0xab,   0xbb,   // double angle brackets
	0x300a, 0x300b, // double angle brackets (ideograph)
	0xe000+'<',     // private usage area
	0
};

int IsBracket(int c)
{
	if ((c >= 0x2014) && (c <= 0x201f))
		return 1;
	return lookupwchar(brackets, c);
}

int IsDigit09(unsigned int c)
{
	if ((c >= '0') && (c <= '9'))
		return 1;
	return 0;
}

int IsDigit(unsigned int c)
{
	if (iswdigit(c))
		return 1;

	if ((c >= 0x966) && (c <= 0x96f))
		return 1;

	return 0;
}

int IsSpace(unsigned int c)
{
	if (c == 0)
		return 0;
	if ((c >= 0x2500) && (c < 0x25a0))
		return 1; // box drawing characters
	if ((c >= 0xfff9) && (c <= 0xffff))
		return 1; // unicode specials
	return iswspace(c);
}

int isspace2(unsigned int c)
{
	// can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
	if ( ((c & 0xff) == 0) || (c > ' '))
		return 0;
	return 1;
}

int is_str_totally_null(const char* str, int size) {
	// Tests if all bytes of str are null up to size
	// This should never be reimplemented with integers, because
	// this function has to work with unaligned char*
	// (casting to int when unaligned may result in ungaranteed behaviors)
	return (*str == 0 && memcmp(str, str+1, size-1) == 0);
}

int Read4Bytes(FILE *f)
{
	// Read 4 bytes (least significant first) into a word
	int ix;
	int acc = 0;

	for (ix = 0; ix < 4; ix++) {
		unsigned char c;
		c = fgetc(f) & 0xff;
		acc += (c << (ix*8));
	}
	return acc;
}

unsigned int StringToWord(const char *string)
{
	// Pack 4 characters into a word
	int ix;
	unsigned char c;
	unsigned int word;

	if (string == NULL)
		return 0;

	word = 0;
	for (ix = 0; ix < 4; ix++) {
		if (string[ix] == 0) break;
		c = string[ix];
		word |= (c << (ix*8));
	}
	return word;
}

int towlower2(unsigned int c, Translator *translator)
{
	// check for non-standard upper to lower case conversions
	if (c == 'I' && translator->langopts.dotless_i)
		return 0x131; // I -> ı

	return ucd_tolower(c);
}