Browse Source

Add support for Luxembourgish

master
Marco BARNIG 3 years ago
parent
commit
be962b067b

+ 1
- 0
CHANGELOG.md View File

@@ -68,6 +68,7 @@ new languages:
* haw (Hawaiian) -- Valdis Vitolins
* he (Hebrew) -- boracasli98, Valdis Vitolins
* io (Ido) -- José Miguel López, Valdis Vitolins
* lb (Luxembourgish) -- Marco Barnig, Valdis Vitolins
* ltg (Latgalian) -- Valdis Vitolins
* nog (Nogai) -- boracasli98, Valdis Vitolins
* piqd (Klingon) -- Valdis Vitolins

+ 5
- 0
Makefile.am View File

@@ -390,6 +390,7 @@ phsource/phonemes.stamp: \
phsource/ph_langbelta \
phsource/ph_latin \
phsource/ph_latvian \
phsource/ph_luxembourgish \
phsource/ph_lithuanian \
phsource/ph_lojban \
phsource/ph_lule_saami \
@@ -567,6 +568,7 @@ dictionaries: \
espeak-ng-data/ku_dict \
espeak-ng-data/ky_dict \
espeak-ng-data/la_dict \
espeak-ng-data/lb_dict \
espeak-ng-data/lfn_dict \
espeak-ng-data/lt_dict \
espeak-ng-data/lv_dict \
@@ -789,6 +791,9 @@ espeak-ng-data/ky_dict: dictsource/ky_list dictsource/ky_rules dictsource/ky_ext
la: espeak-ng-data/la_dict
espeak-ng-data/la_dict: dictsource/la_list dictsource/la_rules dictsource/la_extra

lb: espeak-ng-data/lb_dict
espeak-ng-data/lb_dict: dictsource/lb_list dictsource/lb_rules dictsource/lb_emoji

lfn: espeak-ng-data/lfn_dict
espeak-ng-data/lfn_dict: dictsource/lfn_list dictsource/lfn_rules dictsource/lfn_extra


+ 37
- 0
dictsource/lb_emoji View File

@@ -0,0 +1,37 @@
// espeak-ng-lb
// lb_emoji
// created by Marco Barnig [email protected]
// first version : 30.9.2021
// Emoji's and other symbol pronunciations for Luxembourgish
// full list of emoji's : https://unicode.org/emoji/charts/full-emoji-list.html
//
// pictographs
⌨ tasta:tur
//
// Emojis
🕰️ TSaIt
🧭 nort
💨 vant
🌞 zon
💪 Sta:rk
🚶‍♂️ vand@EreR
🧥 mant@El
🛤️ ve:
🖕️ eS
👬️ Sa:rel, toma:
👦 asto:r
👧 kapuzin
👩 tessa
🎪 TSirkus
🦍 gorila:
🦒 giraf
🐘 elefant
🦏 rino:zerus
//
// Symbols
+ pluz
- minuz
& an
€ eUro
$ dola:r
£ pont

+ 35670
- 0
dictsource/lb_list
File diff suppressed because it is too large
View File


+ 341
- 0
dictsource/lb_rules View File

@@ -0,0 +1,341 @@
// espeak-ng
// Lëtzebuergesch
// lb_rules
// created by Marco Barnig ([email protected])
// First version 15.9.2018
// Last update : 8.11.2021
//
// References :
// Peter Gilles; Jürgen Trouvain
// International Journal of the IPA
//
// Wikipedia
//
// https://infolux.uni.lu
// https://github.com/PeterGilles/Luxembourgish-language-resources
// https://lod.lu
// https://ortho.lod.lu
// https://en.wikipedia.org/wiki/Luxembourgish
// https://en.wikipedia.org/wiki/Luxembourgish_phonology
// https://en.wikipedia.org/wiki/Help:IPA/Luxembourgish
//
//=========================================================
// Lëtzebuergesch
//=========================================================
//
// Vokaler
// lëtzebuergesch Vokal Buchstawen : a, ä, e, é, ë, i, o, u, y
// zousätzlech deitsch Vokal Buchstawen : ö, ü
// zousätzlech franséisch Vokal Buchstawen : à, â, è, ê, ï, ô, û
// Vokal Phonemen IPA : ɑ, aː, ɛː, e, æ, eː, ə, ɐ, i, iː, o, oː, u, uː, y, yː, ɑ̃ː, ɛ̃ː, õː, œː
// Vokal Phonemen-ID's eSpeak : a, a:, aE, e, E, e:, @E, eR, i, i:, o, o:, u, u:, y, y:, eA, iA, oA, OU

// lëtzebuergesch Vokalkoppel Buchstawen : ai, au, äi, ei, eu, éi, ie, ou, ue
// Vokalkoppel Phonemen IPA : æːɪ, ɑʊ, æːʊ, ɑɪ, ɜɪ, oɪ, iə, əʊ, uə
// Vokalkoppel Phonemen-ID's eSpeak : aI, aU, AU, eI, OI, eU, iE, oU, uE
// Total : 30 phonemes

.group a
a a // a : IPA ɑ
_) an (_ an // an : IPA ɑn
_) af (_ a:f // Af ; IPA aː
C) a (C_ a: // Kap : IPA aː
a (CC a // Kapp, blann : IPA ɑ
a (CA a: // Fabel IPA : aː
age arR@E
aller al@Er
awer a:v@Er

.group aa
aa a: // aacht, naass : IPA aː
aach a:x

.group ai
ai eI // daierlech : IPA ɑɪ
.group au
au aU // raumen, Auto : IPA ɑʊ
.group ä
ä aE // Kären : IPA ɛː
ä (CC e // Männer, Käpp : IPA æ
äi aI // Zäit, wäiss, räich : IPA : æːɪ
är aEeR // Stär :IPA ɛːɐ
äer aEeR // Päerd : IPA ɛːɐ

.group e
e @E // e : IPA ə
C) e (C e: // Meter : IPA eː
er eR // Kanner, Auer : IPA ɐ
ech @EX // ech : IPA əɕ
C) ech EX

.group ee
ee e: // Keess : IPA eː
eens e:ns // eens I IPA eːns
eech e:X

.group ei
ei eI // Leit, Weis, deier : IPA ɑɪ

.group eu
eu eU // Euro : IPA oɪ

.group é
é E // Méck, drécken : IPA e
ée e: // Arrivée : IPA eː
éi OI // schéin, fréi : IPA ɜɪ
.group ë
_) ë oE // ëffentlech : IPA ø
ë @E // hëllefen : IPA ə

.group i
i i // i : IPA i
_) i (_ i: // Buchstaw i
C) i (C i: // riseg, siwen : IPA iː
C) i (CC i // midd, Hiwwel : IPA i
ir i:eR // Stir : IPA iːR
ier ieR // Biergem : IPA iːR
iich i:X
issen is@En
.group ii
ii i: // liicht : IPA iː
.group io
iou ioU // Okkasioun : IPA iəʊ
.group ie
ie iE // Wieder, hien : IPA iə

.group o
o o // o : IPA o
_) o (_ o: // Buchstaw o
o (_ o: // Auto : IPA oː
o (C o: // Schof, droleg : IPA oː
o (CC o // Loft, Sonn : IPA o
.group oo
oo o: // Sprooch : IPA oː
ooch o:x
.group ou
ou oU // Schoul, Okkasioun : IPA əʊ
.group oy
oy eU // boy : IPA oɪ

.group u
u u // u : IPA u
_) u (_ u: // Buchstaw u
u (C u: // Tut, Bur : IPA uː
u (CC u // gutt : IPA u
um um // um : IPA um
uge u:rR@E
.group uu
uu u: // Duuscht, Luucht : IPA uː
uuch u:x
.group ue
ue uE // Buedem, Wued : IPA uə

.group y
y i // Syntheess : IPA i
_) y (_ ipzilon // Buchstaw y
_) y J // Yoga : IPA ʒ
y (_ i: // Baby : IPA iː

.group
ö OU // lösen, blöd, können : IPA œː
ü y: // Süden, für : IPA y:
è e: // IPA eː
à a: // IPA aː
â a: // IPA aː
ê e: // IPA ɛː
ô o: // IPA oː
û u: // IPA uː


// Konsonanten : Total 27 phonemes
// lëtzebuergesch Konsonanten Buchstawen : b, c, d, f, g, h, j, k, l, m, n, p, q, r, s, t, v, w, x, z
// lëtzebuergesch Konsonanten IPA Phonemen : b, c, d, f, g, h, j, k, l, m, n, ŋ, p, R, s, t, v, w, z, ʒ, ʁ, ʑ, ʦ, ʃ, dʒ, X, ɕ
// lëtzebuergesch Konsonanten Phonemen-ID's eSpeak : b, c, d, f, g, h, j, k, l, m, n, N, p, r, s, t, v, w, z, J, rR, Z, TS, S, dZ, x, X
//
//// Nasal Buchstawen : m, n
// Nasal Phonemen : m, n, ŋ

.group m
_) m (_ em // Buchstaw m
m m // Ham : IPA m
mm m // Mamm : IPA m
mat mat
mir mir

.group n
// Rules for the letter n. The nasal assimilates to the velar nasal N (IPA ŋ)
// before g and k. In "ng" the g is absorbed; in "nk" the k is still
// pronounced, so the rule must emit N followed by k.
  _) n (_    en      // name of the letter n
     n       n       // Nues : IPA n
     nen     n@En    // ..nen : IPA nən
     nnen    n@En
     nn      n       // Mann : IPA n
     ng      N       // keng : IPA ŋ
     nk      Nk      // drénken : IPA ŋk

// Plosiv Buchstawen : p, b, t, d, k, g, q, x
// Plosiv Phonemen : p, b, t, d, k, g, ʒ, ʁ, X
.group p
_) p (_ pe: // Buchstaw p
p p // paken : IPA p
pp p // Papp : IPA p
pen pEn

.group b
_) b (_ be: // Buchstaw b
b b // baken : IPA b
bb b // labber : IPA b

.group t
_) t (_ te: // Buchstaw t
t t // Téi : IPA t
tt t // Blutt : IPA t
tel t@El
tten t@En

.group d
// Rules for the letter d. Devoicing to t applies only at the end of a word
// (Nord, Wand); the "(_" post-context keeps a d that follows a consonant
// inside a word voiced.
  _) d (_    de:     // name of the letter d
     d       d       // Däiwel : IPA d
     dd      d       // Brudder : IPA d
  C) d (_    t       // Nord, Wand : IPA t (word-final devoicing)
     den     d@En    // den : IPA dən
     dden    d@En
     der     deR     // der : IPA dɐ
     derer   d@EreR  // Wanderer

.group k
_) k (_ ka: // Buchstaw k
k k // Keess : IPA k
kk k // Okkasioun : IPA k
ks ks

// De Buchstaw g huet siwen verschidden Aussproochen !

.L01 e i y
.group g
_) g (_ ge: // Buchstaw g
A) g (_ k // Drog : IPA k
g (L01 J // Spigel : IPA ʒ
a) g (_ x // Dag : IPA X
u) g (_ x // Dug : IPA X
ge (_ S // Plage : IPA ʃ
u) g (e rR // Kugel, Jugend : IPA ʁ
a) g (e rR // Lager : IPA ʁ
g g // Gees, goen, Drogen, Negatioun : IPA g
ge (C g@E
gin gin

.group q
_) q (_ ku: // Buchstaw q
_) qu kw // Quell : IPA kw
q k

.group x
_) x (_ ikz // Buchstaw x
x ks // Box : IPA ks

// Frikativ Buchstawen : f, v, s, z, h, c
// zousätzlech deitsch Buchstawen : ß
// zousätzlech franséisch Buchstawen : ç
// Frikativ Phonemen : f, v, s, z, ʦ, h, ʃ, dʒ, χ

.group f
_) f (_ ef // Buchstaw f
f f // Fësch : IPA f
ff f // Peffer : IPA f
fir fieR // fir : IPA fiɐ
fen f@En

.group v
_) v (_ faU // Buchstaw v
v v // Vakanz : IPA v
vun fun

.group s
_) s (_ ez // Buchstaw s
s z // Tasen, Summer : IPA z
ss s // Taass : IPA s
sch S // schéin : IPA ʃ
setz zETS
sen s@En
.group st
// Initial "st" is pronounced ʃt. Each rule consumes the letter t as well,
// so the t phoneme has to be part of the output.
  _) st (A   St      // staark : IPA ʃt
     str     Str     // stramm : IPA ʃtʀ

.group ß
_) ß (_ eszet // Buchstaw ß
ß s // IPA s

.group z
_) z (_ zet // Buchstaw z
z TS // Zuch, schwätzen : IPA ʦ
ze TS@E

.group h
_) h (_ ha: // Buchstaw h
h h // hei, haut : IPA h

.group c
_) c (_ ze: // Buchstaw c
c k // Coca Cola : IPA k

.group ç
ç s // François : IPA s

.group ce
ce s // Glace : IPA s

.group ch
i) ch S // liicht : IPA ʃ
a) ch x // aacht : IPA X
u) ch x // Kuch : IPA X

.group ck
ck k // kucken : IPA k
ckelt k@Elt

.group w
_) w (_ ve: // name of the letter w
w v // wëschen, Hiwwel : IPA v
ww v // Hiwwel : IPA v
wwert iv@Ert // NOTE(review): emits a leading i although only "wwert" is matched — presumably written for "iwwert"; confirm

// Approximant Buchstawen : j, l
// Approximant Phonemen : j, ʒ, l

.group j
_) j (_ jot // Buchstaw j
j j // Juli, jäizen : IPA j
i) j J // héijen : IPA ʒ

.group l
_) l (_ el // Buchstaw l
log lo:x
loge lo:rR@E
lag la:x
lage la:rR@E
l l // Loft : IPA l
ll l // kill : IPA l

// Trill Buchstaw : r
// Trill Phonem : r

.group r
_) r (_ er // Buchstaw r
r r // Rou : IPA R
rr r // Arrivée : IPA R

+ 2
- 1
docs/languages.md View File

@@ -1,6 +1,6 @@
# Languages

Development version of eSpeak NG supports 126 languages and accents,
Development version of eSpeak NG supports 127 languages and accents,
which are listed in table below.

To check, what languages are supported by distributed version, run `espeak-ng --voices` command.
@@ -75,6 +75,7 @@ To check, what languages are supported by distributed version, run `espeak-ng --
| `trk` | `kk` | Turkic | Kazakh | |
| `trk` | `ky` | Turkic | Kyrgyz | |
| `itc` | `la` | Italic | Latin | |
| `gmw` | `lb` | West Germanic | Luxembourgish | |
| `bat` | `ltg` | Baltic | Latgalian | |
| `bat` | `lv` | Baltic | Latvian | |
| `art` | `lfn` | Constructed | Lingua Franca Nova<sup>\[1,2\]</sup> | |

+ 374
- 0
docs/languages/gmw/lb.md View File

@@ -0,0 +1,374 @@
## Introduction
The present fork adds [Luxembourgish](https://en.wikipedia.org/wiki/Luxembourgish) as 127th language to the eSpeak-NG text-to-speech (TTS) synthesizer (version.1.50.1).

The main purpose of the project is the creation of a rule based International Phonetic Alphabet ([IPA](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet)) phonemizer for transcriptions of luxembourgish audio utterances.

The phonemized text allows to create and use luxembourgish high-quality synthetic voices, trained with deep machine learning (ML) speech models, based on neural networks.

The speech generated with the associated eSpeak-NG sound engine is based on formant synthesis techniques and is of low quality. The related luxembourgish voice `Luxi` is intelligible, but I did no sound optimization because my focus is put on the phonemization front-end process.

## Luxembourgish customization
Four files are needed to include Luxembourgish as additional language in the eSpeak-NG project :

* [phsource/ph_luxembourgish](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/phsource/ph_letzebuergesch)
* [dictsource/lb_rules](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/dictsource/lb_rules)
* [dictsource/lb_list](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/dictsource/lb_list)
* [dictsource/lb_emoji](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/dictsource/lb_emoji)

The customization process can be split into two steps :

1. convert the characters (graphemes) of text into [phonetic descriptions](https://en.wikipedia.org/wiki/Luxembourgish_phonology) (phoneme-ID's)
2. convert the phoneme-ID's into [IPA phonemes](https://en.wikipedia.org/wiki/Help:IPA/Luxembourgish) and define instructions how to generate sounds for each phoneme.

eSpeak-NG is an evolution of the famous speak-program for the [ACORN/RISC_OS](https://en.wikipedia.org/wiki/RISC_OS) computers developed in 1995 by Jonathan Duddington. In 2007 the program was renamed [eSpeak](https://en.wikipedia.org/wiki/ESpeak). After the disappearance of the original developer early 2015, a new project eSpeak-NG (new generation) was started in December 2015 under the direction of [Reece H. Dunn](https://github.com/rhdunn).

When speak was created the phoneme-ID's were based on the text encoding ASCII, available at this time. This was never changed in the eSpeak-NG fork to avoid too much divergence from the original code. A few years ago, the support of IPA names in the output was added, but this work-around is still laborious, compared to a state-of-art solution where IPA names are used as phoneme-ID's. The current problems with the eSpeak-NG architecture are stated in this [roadmap](https://github.com/espeak-ng/espeak-ng/wiki/eSpeak-NG-roadmap) document.

Let's start to describe the `ph_luxembourgish` file which is the heart of the whole process.

### Phoneme inventory
We distinguish two types of phonemes : [vowels](https://en.wikipedia.org/wiki/Vowel) ([monophthongs](https://en.wikipedia.org/wiki/Monophthong) and [diphthongs](https://en.wikipedia.org/wiki/Diphthong)) and [consonants](https://en.wikipedia.org/wiki/Consonant).

I use the word `Auto` with two vowels `au, o` and one consonant `t` to explain the definition of the related phonemes in the `ph_luxembourgish` file. The code is formed by simple text strings. The minimal code for the word `Auto` has the following format :

```
phoneme aU
vwl starttype #a endtype #u
ipa ɑʊ
length 190
FMT(vdiph/au)
endphoneme

phoneme t
vls alv stp
ipa t
WAV(ustop/t, 90)
endphoneme

phoneme o:
vwl starttype #o endtype #o
ipa oː
length 170
FMT(vowel/o)
endphoneme
```

The minimal phoneme definition includes 5 parts :

* the term `phoneme` followed by the phoneme-ID
* the description of the articulation
* the IPA symbol
* the parameters for the sound generation (length, FMT, WAV, etc)
* the closing term `endphoneme`

To specify the sound generation for a phoneme, eSpeak-NG provides a whole set of specific parameters and conditions for fine-tuning. A typical more complex example is shown hereafter :

```
phoneme t
vls alv stp
voicingswitch d
lengthmod 2
Vowelin f1=0 f2=1700 -300 300 f3=-100 80
Vowelout f1=0 f2=1700 -300 250 f3=-100 80 rms=20

IF nextPh(isPause2) THEN
WAV(ustop/t_)
ELIF nextPh(r) THEN
WAV(ustop/t_)
ELIF nextPh(@-) THEN
WAV(ustop/t_dnt, 50)
ENDIF
WAV(ustop/t, 90)
endphoneme
```
I mentioned in the introduction that my goal is not a perfect sound design. Therefore I specified the strict minimum of parameters for each phoneme. [Interested users](https://github.com/espeak-ng/espeak-ng/issues/1013) can easily modify the code to enhance the speech quality. The official documentation provides the necessary guidance :

* [Phoneme Model](https://github.com/espeak-ng/espeak-ng/blob/master/docs/phoneme_model.md)
* [Phoneme features and IPA](https://github.com/espeak-ng/espeak-ng/blob/master/docs/phonemes.md)
* [Phoneme Tables](https://github.com/espeak-ng/espeak-ng/blob/master/docs/phontab.md)

It is not necessary to specify all the phonemes for a language, they can be inherited from a `master phoneme file` or from another language. Most of the 126 languages supported by eSpeak-NG use this mechanism. Some languages even rely only on inherited phoneme definitions.

To get full control of the phonemization process and to remain independent of code changes in other languages, I prefer however to define all the phonemes used for the Luxembourgish language.

Speech models based on deep ML models are usually trained with about hundred hours of audio records and related transcriptions. To train a deep ML neural network with small datasets, which is the case for Luxembourgish, it is necessary to use a low number of different phonemes to obtain valid results. Based on my earlier experience with ML-TTS models I selected the following sets of phonemes (total : 56) for this first release of my project :

```
Monophthongs: (20)
IPA phoneme symbols : ɑ, aː, ɛː, e, æ, eː, ə, ɐ, i, iː, o, oː, u, uː, y, yː, ɑ̃ː, ɛ̃ː, õː, œː
my eSpeak phoneme-ID's : a, a:, aE, e, E, e:, @E, eR, i, i:, o, o:, u, u:, y, y:, eA, iA, oA, OU

Diphthongs: (9)
IPA phoneme symbols : æːɪ, ɑʊ, æːʊ, ɑɪ, ɜɪ, oɪ, iə, əʊ, uə
my eSpeak phoneme-ID's : aI, aU, AU, eI, OI, eU, iE, oU, uE

Consonants: (27)
IPA phoneme symbols b, c, d, f, g, h, j, k, l, m, n, ŋ, p, ʀ, s, t, v, w, z, ʒ, ʁ, ʑ, ʦ, ʃ, dʒ, X, ɕ
my eSpeak phoneme-ID's : b, c, d, f, g, h, j, k, l, m, n, N, p, r, s, t, v, w, z, J, rR, Z, TS, S, dZ, x, X
```
To add more phonemes, if required in the future, it's easy to modify the `ph_luxembourgish` file and to adapt the related files `lb_rules`, `lb_list` and `lb_emoji`.

In the next chapter I will describe the process to link letters (characters, graphemes) to phoneme-ID's.

### Text to Phoneme translation
#### lb_rules
The main file to define the translation (correspondence) between letters and phoneme-ID's is [lb_rules](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/dictsource/lb_rules). It's a sort of large table with the following format, starting with a `.group` line :

```
.group <one character>

or

.group <two characters>

then

previous letter sequence ) current letter sequence ( next letter sequence phoneme-ID's sequence // eventually comments
```

Let's take again the `Auto` example to explain the text code by considering that the word starts with the diphthong `au`, followed by the consonant `t`. The word ends with the monophthong `o`, preceded by the consonant `t`. The consonant `t` itself is preceded by the diphthong `au` and followed by the monophthong `o`.

```
.group au
_) au (t aU
.group o
t) o (_ o:
.group t
au) t (o t
```
Again, eSpeak-NG provides a large set of specific commands and parameters to define the translation rules in a very detailed and comprehensive manner. A simple example is the capital letter A, which stands for any vowel, and the capital letter C, which stands for any consonant. If we use A and C in the above example, we can easily define the rules to phonemize the words `Aula` and `Auto` with combined code :

```
.group au
_) au (C aU
.group o
C) o (_ o:

.group a
C) a (_ a:
.group t
A) t (A t
.group l
A) l (A l
```
Another way for coding to obtain the same result is :

```
.group au
_) au (C aU
.group to
A) to (_ to:
.group la
A) la (_ la:
```
To find the pronunciation of a word, the rules are searched and any which match the letters at the current position in the word are given a score, depending on how many letters are matched. The pronunciation from the best matching rule is chosen. The position pointer inside the word is then advanced past those consumed letters which have been matched and the process is repeated until all the letters of the word have been processed.
There are numerous possibilities to code the phonemization rules with `prev`, `current` and `post` letter sequences (or placeholders, symbols, options, letter groups with some common features, etc).
There are also some restrictions. The first letter of a rule-group-ID with two letters must be a 7bit-ASCII character (less than 0x80). In the case of a one-letter-group-ID extended ASCII-characters are accepted (as `è ö î` etc).

The guide [Text to Phoneme Translation](https://github.com/espeak-ng/espeak-ng/blob/master/docs/dictionary.md) provides all available options to set up a complete and complex translation table. I opted however for a few simple rules because I have a preference to include a luxembourgish pronunciation lexicon into the file `lb_list` described below. This way I take into account that Luxembourgish contains numerous loanwords from german, french and english, which makes the definition of global rules more difficult.
An interested user can easily extend and adapt the rules in the `lb_rules` file to their own needs.

#### lb_list
Usually a text to synthesize does not only contain common words from a specific language, but also punctuations, numbers, symbols, personal names, abbreviations, loanwords and eventually [Speech Synthesis Markup Language](https://en.wikipedia.org/wiki/Speech_Synthesis_Markup_Language) (SSML) code. These exceptions are handled in the [lb_list](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/dictsource/lb_list). It's a simple text lexicon with the following entries :
```
letter-name phoneme-ID or $accent
word phoneme-ID sequence
word other similar word $text
word (stressed, unstressed, ..) $u // intonation
abbreviation phoneme-ID sequence or $$abbrev
number phoneme-ID sequence
symbol phoneme-ID sequence
```

Here is a small excerpt from the luxembourgish `lb_list` file to show the principles :
```
// letter names
_a a:
_e e:
ä $accent
û $accent
// intonation
de $u // unstress
den $u
// numbers
_0 nul
_1 e:nt
_2 TSve:
_0C honert
_0M1 daUz@End
// punctuations
_. punkt
_, koma
// symbols
+ pluz
& an
€ eUro
CSV $abbrev
OGBL $abbrev
asbl $abbrev
ASCII azki
// personal names
Marco marko:
Barnig barniS
Simone zimon
```
The flag $accent tells the processor to say the letter name, followed by the name of the [diacritics](https://en.wikipedia.org/wiki/Diacritic). The flag $abbrev indicates to say the letter names in sequence. Speaking the names of punctuations works only if the flag `--punc` is specified at inference.

Again, eSpeak-NG offers a lot of options to finetune the list entries. My `lb_list` file for the first version of this project is work in progress. I will update it progressively to comply to my needs. An interested user can do the same.

Note: the supported SSML tags are documented in the document [SSML and HTML Support](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/docs/markup.md).
#### lb_emoji
Today, graphics are gaining more and more importance in text. For this reason eSpeak-NG replaced the former file `language_extra` by a file called `language_emoji`. [Emoji's](https://en.wikipedia.org/wiki/Emoji) are small pictures which can be represented as encoded characters. Originating on Japanese mobile phones in 1997, emoji's became increasingly popular worldwide in the 2010s after being added to several mobile operating systems. To assure an interoperability between browsers, mobiles and messaging systems, the emoji's are standardized since 2014 by the [Unicode Consortium](https://en.wikipedia.org/wiki/Unicode_Consortium). Every year additional emoji's are added to the standard upon public proposals.
The [full list of emoji's](https://unicode.org/emoji/charts/full-emoji-list.html) is available at the home-page of the Unicode Consortium.

Here are two examples how to include emoji's (graphics) in a luxembourgish sentence to submit to the eSpeak-NG phonemizer :

An der &#x1F570; hunn sech den &#x1F9ED;&#x1F4A8; an d’&#x1F31E; gestridden, wie vun hinnen zwee wuel méi &#x1F4AA; wier, wéi e &#x1F6B6;, deen an ee waarme &#x1F9E5; agepak war, iwwert de &#x1F6E4; koum.

Haut sinn &#x261D; mat mengen Enkelkanner &#x1F9D1;&#x200D;&#x1F91D;&#x200D;&#x1F9D1; , &#x1F466; , &#x1F467; , an &#x1F469; an den &#x1F3AA; gaangen. Do hunn mer e &#x1F98D;, eng &#x1F992;, en &#x1F418; an en &#x1F98F; gesinn.
To phonemize the above sentences I added the following entries into the [lb_emoji](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/dictsource/lb_emoji) file :
```
&#x1F570; TSaIt
&#x1F9ED; nort
&#x1F4A8; vant
&#x1F31E; zon
&#x1F4AA; Sta:rk
&#x1F6B6; vand@EreR
&#x1F9E5; mant@El
&#x1F6E4; ve:
&#x261D; eS
&#x1F9D1;&#x200D;&#x1F91D;&#x200D;&#x1F9D1; Sa:rel, toma:
&#x1F466; asto:r
&#x1F467; kapuzin
&#x1F469; tessa
&#x1F3AA; TSirkus
&#x1F98D; gorila:
&#x1F992; giraf
&#x1F418; elefant
&#x1F98F; rino:zerus
```
A more comfortable option is to insert emoji's from the popup menu in the text-editor :

![insert emoji](https://github.com/mbarnig/espeak-ng-lb/blob/l%C3%ABtzebuergesch/_layouts/insert-emoji.png)

Here are the results of the phonemization :

```
ɑn dɐ ‚ʦæ:ɪt / hun zeɕ dən ’noʀtvɑnt ɑn ‚dzon gə’ʃtʀidən / viə fun hinən ‚ʦve: vuəl ‚meɪ ʃta:ʀk viɐ / veɪ eː ‚vɑndəʀɐ / de:n ɑn eː ‚va:ʀmə ‚mɑntəl ‚a:ɡəpa:k va:ʀ / ivɐt də ‚veː kəʊm //

haUt sin eS mat mengen enkelkaneR Sa:rel / toma: / asto:r / kapuzin an tessa an den TSirkus ga:ngen // do: hun meR e gorila: / eN giraf / en elefant an e rino:zerus gesin //
```

For convenience I moved the symbols from the `lb_list` file to the `lb_emoji` file.

## Integration
In the last step the four luxembourgish files are embedded into the source code of the eSpeak-NG project. The following source files are modified :

* [Makefile.am](https://github.com/mbarnig/espeak-ng-lb/blob/master/Makefile.am)
* [phsource/phonemes](https://github.com/mbarnig/espeak-ng-lb/blob/master/phsource/phonemes)
* [docs/languages.md](https://github.com/mbarnig/espeak-ng-lb/blob/master/docs/languages.md)

In the `Makefile.am` file I added the following lines :

in position 392 :
```
phsource/ph_luxembourgish \
```
in position 570 :
```
espeak-ng-data/lb_dict \
```
in position 792 :
```
lb: espeak-ng-data/lb_dict
espeak-ng-data/lb_dict: dictsource/lb_list dictsource/lb_rules dictsource/lb_emoji
```
In the `phsource/phonemes` file I added these lines at position 1763 :
```
phonemetable lb base1
include ph_luxembourgish
```
In the `docs/languages.md` file I added this line after the latin language :

`gmw` | `lb` | West Germanic | Lëtzebuergesch

Two additional files must be created :
* [espeak-ng-data/lang/gmw/lb](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/espeak-ng-data/lang/gmw/lb)
* [espeak-ng-data/voices/!v/Luxi](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/espeak-ng-data/voices/!v/Luxi)

Both files are very simple if we include only the strict minimum.

Here comes the content of the language file `lb` :
```
name Lëtzebuergesch
language lb
```
Here is the voice file `Luxi` :
```
name Luxi
language lb
maintainer mbarnig
```
As usual, eSpeak-NG provides numerous options to customize these files. Please read the guide [Voice and Language files](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/docs/voices.md) to get a detailed documentation about all available features.
When building the project the first time, an additional file `lb_dict` is created inside the folder [espeak-ng-data](https://github.com/mbarnig/espeak-ng-lb/tree/l%C3%ABtzebuergesch/espeak-ng-data), which is a compressed binary combination of the three files `lb_rules`, `lb_list` and `lb_emoji`.

## Build and use the project
Now the forked eSpeak-NG source code is ready for compilation, hopefully without problems. Compilation is easy if you have a personal computer with a well configured development environment and all required tools for C-compilation. On my Ubuntu 20.04 system this is the case and I build and install the project with 4 commands :

```
./autogen.sh
./configure
make
sudo make install
```

![build espeak-ng-lb](https://github.com/mbarnig/espeak-ng-lb/blob/l%C3%ABtzebuergesch/_layouts/espeak-ng-lb.png)

The [building guide](https://github.com/mbarnig/espeak-ng-lb/blob/master/docs/building.md) provides detailed info how to compile and build eSpeak NG from the source on different operating systems. If you need help, please visit the [issues section](https://github.com/espeak-ng/espeak-ng/issues) from the master eSpeak-NG project. Currently there are 333 open and 311 closed issues, so probably another user found already a solution for your problem.

To use the project, I need only one command :

`espeak-ng -v lb -q --ipa -f <text file>`

The content of the text file is phonemized with luxembourgish rules (flag `-v lb`) with IPA symbols (flag `--ipa`), without producing speech (flag `-q`). All the optional configuration flags are explained in the [--help document](https://github.com/mbarnig/espeak-ng-lb/blob/lëtzebuergesch/src/espeak-ng.1.ronn).

A more comfortable option is to use the great tool [espeak-phonemizer](https://github.com/rhasspy/espeak-phonemizer), created by [Michael Hansen](https://www.linkedin.com/in/michael-hansen-9885b2105/) (alias [synesthesiam](https://synesthesiam.com)), which transforms the output from espeak-NG in a format ready for submission as input to train a deep machine learning luxembourgish TTS model, based on neural networks.

My favorite Github projects for my luxembourgish ML-TTS experiments are :

* [Rhasppy/Larynx](https://github.com/rhasspy/larynx)
* [Coqui-TTS](https://github.com/coqui-ai/TTS)
* [Implementations by Keon Lee](https://github.com/keonlee9420)
* [Implementations by Jaehyeon Kim](https://github.com/jaywalnut310)


## References
* [The best of two breeds](https://www.web3.lu/the-best-of-two-breeds/), 2021, mbarnig
* [Synthèse de la parole](https://www.amazon.fr/Synthèse-parole-électrique-électronique-informatique/dp/2322238600/), 2020, Marco Barnig
* [Synthèse vocale](https://www.web3.lu/synthese-vocale/), 2019, mbarnig
* [Emojis et Unicode](https://www.web3.lu/timeline/emojis-et-unicode/), 2018, mbarnig
* [Evolution of character encoding](https://www.web3.lu/evolution-of-character-encoding/), 2016, mbarnig
* [Speech Corpora for TTS](https://www.web3.lu/speech-corpora-tts/), 2015, mbarnig
* [Festival Text-to-Speech Package](https://www.web3.lu/festival-text-speech-package/), 2015, mbarnig
* [eSpeak Formant Synthesizer](https://www.web3.lu/espeak-formant-synthesizer/), 2014, mbarnig
* [Mary TTS (Text To Speech)](https://www.web3.lu/marytts-text-speech/), 2014, mbarnig
* [Language : fr, de, en, lb, eo](https://www.web3.lu/languages/), 2014, mbarnig
* [Spectrograms and speech processing](https://www.web3.lu/spectrogram-speech-processing/), 2014, mbarnig
* [Phonemes, phones, graphemes and visemes](https://www.web3.lu/phonemes-phones-graphemes-visemes/), 2014, mbarnig
* [FreeTTS : a Java speech synthesizer](https://www.web3.lu/freetts-a-java-speech-synthesizer/), 2005, mbarnig

+ 2
- 0
espeak-ng-data/lang/gmw/lb View File

@@ -0,0 +1,2 @@
name Luxembourgish
language lb

+ 412
- 0
phsource/ph_luxembourgish View File

@@ -0,0 +1,412 @@
// ph_luxembourgish
// created by Marco Barnig ([email protected])
// first version : 24.9.2014
// https://www.web3.lu/espeak-formant-synthesizer
// new version : 15.9.2018
// latest update : 10.11.2021
//=========================================================
// Lëtzebuergesch
//=========================================================
// References :
// Peter Gilles; Jürgen Trouvain
// International Journal of the IPA
// https://infolux.uni.lu
// https://github.com/PeterGilles/Luxembourgish-language-resources
// https://lod.lu
// https://ortho.lod.lu
//
// Wikipedia
// https://en.wikipedia.org/wiki/Luxembourgish
// https://en.wikipedia.org/wiki/Luxembourgish_phonology
// https://en.wikipedia.org/wiki/Help:IPA/Luxembourgish
//
// virtual class of vowels : #@, #a, #e, #i, #o, #u
// IPA Vokaler (20) : ɑ, aː, ɛː, e, æ, eː, ə, ɐ, i, iː, o, oː, u, uː, y, y:, ɑ̃ː, ɛ̃ː, õː, œː
// IPA Vokalkoppelen (9) : æːɪ, ɑʊ, æːʊ, ɑɪ, ɜɪ, oɪ, iə, əʊ, uə

phoneme a // K[a]pp ; kurz geschwate Vokal a
vwl starttype #a endtype #a
ipa ɑ
length 120
FMT(vowel/a)
endphoneme

phoneme a: // K[a]p ; laang geschwate Vokal a
vwl starttype #a endtype #a
ipa aː
length 190
FMT(vowel/aa_6)
endphoneme

phoneme aE // St[ä]ren
vwl starttype #a endtype #e
ipa ɛː
length 190
FMT(vdiph/ae_2)
endphoneme

phoneme e // Méck
vwl starttype #e endtype #e
ipa e
length 120
FMT(vowel/e)
endphoneme

phoneme E // h[e]ll ; kurzen oppenen Vokal e
vwl starttype #e endtype #e
ipa æ
length 120
FMT(vowel/e)
endphoneme

phoneme e: // K[ee]ss ; laangen Vokal e
vwl starttype #e endtype #e
ipa eː
length 190
FMT(vowel/e)
endphoneme

phoneme @E // lies[e]n, h[ë]ll[e]f[e]n, (schwa)
vwl starttype #@ endtype #@
ipa ə
length 190
FMT(vowel/@)
endphoneme

phoneme eR // Kann[er] ; open-schwa
vwl starttype #@ endtype #@
ipa ɐ
length 150
FMT(vowel/@)
endphoneme

phoneme i // m[i]dd
vwl starttype #i endtype #i
ipa i
length 120
FMT(vowel/i)
endphoneme

phoneme i: // B[ii]scht
vwl starttype #i endtype #i
ipa iː
length 190
FMT(vowel/i)
endphoneme

phoneme o // Spr[o]ch
vwl starttype #o endtype #o
ipa o
length 120
FMT(vowel/o)
endphoneme

phoneme o: // Spr[oo]ch
vwl starttype #o endtype #o
ipa oː
length 190
FMT(vowel/o)
endphoneme

phoneme u // g[u]tt
vwl starttype #u endtype #u
ipa u
length 120
FMT(vowel/u)
endphoneme

phoneme u: // D[uu]scht
vwl starttype #u endtype #u
ipa uː
length 190
FMT(vowel/u)
endphoneme

phoneme y
vwl starttype #u endtype #e
ipa y
length 120
FMT(vdiph2/uu@)
endphoneme

phoneme y: // S[ü]den
vwl starttype #u endtype #e
ipa yː
length 190
FMT(vdiph2/uu@)
endphoneme

phoneme eA // Restaur[an]t
vwl starttype #a endtype #a
ipa ɑ̃ː
length 190
FMT(vnasal/aa_n4)
endphoneme

phoneme iA // Cous[in], D[in]de
vwl starttype #i endtype #i
ipa ɛ̃ː
length 190
FMT(vnasal/i_n2)
endphoneme

phoneme oA // Sais[on], Pard[on], F[on]d
vwl starttype #o endtype #o
ipa õː
length 190
FMT(vnasal/o_n5)
endphoneme

phoneme OU // interi[eu]r, lösen
vwl starttype #o endtype #e
ipa œː
length 190
FMT(vdiph2/o@)
endphoneme

phoneme aI // Z[äi]t
vwl starttype #e endtype #i
ipa æːɪ
length 190
FMT(vdiph/ai_6)
endphoneme

phoneme aU // [Au]to
vwl starttype #a endtype #u
ipa ɑʊ
length 190
FMT(vdiph/au)
endphoneme

phoneme AU // R[au]m
vwl starttype #a endtype #u
ipa æːʊ
length 190
FMT(vdiph/au_4)
endphoneme

phoneme eI // L[ei]t
vwl starttype #e endtype #i
ipa ɑɪ
length 190
FMT(vdiph/eei_2)
endphoneme

phoneme OI // fr[éi]
vwl starttype #e endtype #i
ipa ɜɪ
length 190
FMT(vdiph/eei_5)
endphoneme

phoneme eU // [Eu]ro
  // The diphthong is oɪ (formant data vdiph/ooi): it starts as an o-type
  // vowel and ends as an i-type vowel. Start/end types are what neighbouring
  // phonemes test with nextPh(#o) etc., so they must describe the sound,
  // not the spelling "eu".
  vwl starttype #o endtype #i
  ipa oɪ
  length 190
  FMT(vdiph/ooi)
endphoneme

phoneme iE // h[ie]n
vwl starttype #i endtype #e
ipa iə
length 190
FMT(vdiph2/ie)
endphoneme

phoneme oU // Sch[ou]l
vwl starttype #o endtype #u
ipa əʊ
length 190
FMT(vdiph/ou_2)
endphoneme

phoneme uE // B[ue]dem
vwl starttype #u endtype #e
ipa uə
length 190
FMT(vdiph2/u@)
endphoneme

// ******************************************

// Konsonanten : Total : 26

// Nasal Phonemen : m, n, ŋ

// Voiced bilabial nasal /m/, synthesised from the m/mj formant file.
phoneme m
vcd blb nas
ipa m
FMT(m/mj)
endphoneme

// Voiced alveolar nasal /n/, synthesised from the n/nj formant file.
phoneme n
vcd alv nas
ipa n
FMT(n/nj)
endphoneme

// Voiced velar nasal /ŋ/ (the "ng" sound listed in the nasal inventory),
// synthesised from the nn/nnj formant file.
// The place feature and IPA string describe a velar nasal, not the
// retroflex ɳ, so that IPA output and feature tests agree with the inventory.
phoneme N
vcd vel nas
ipa ŋ
FMT(nn/nnj)
endphoneme

// Plosiv Phonemen : p, b, t, d, k, g

// Voiceless bilabial stop /p/, played from the recording ustop/p.
phoneme p
vls blb stp
ipa p
WAV(ustop/p)
endphoneme

// Voiced bilabial stop /b/: formant data b/b with the recording x/b
// mixed in via addWav.
phoneme b
vcd blb stp
ipa b
FMT(b/b) addWav(x/b)
endphoneme

// Voiceless alveolar stop /t/, played from the recording ustop/t with the
// second WAV argument set to 90.
phoneme t
vls alv stp
ipa t
WAV(ustop/t, 90)
endphoneme

// Voiced alveolar stop /d/: formant data d/dr with the recording x/d
// mixed in via addWav.
phoneme d
vcd alv stp
ipa d
FMT(d/dr) addWav(x/d)
endphoneme

// Voiceless velar stop /k/, played from the recording ustop/k.
phoneme k
vls vel stp
ipa k
WAV(ustop/k)
endphoneme

// Voiced velar stop /g/: formant data g/g with the recording x/g2
// mixed in via addWav (second argument 150).
phoneme g
vcd vel stp
ipa g
FMT(g/g) addWav(x/g2, 150)
endphoneme

// Affricate Phonemen : ʦ, dʒ

// Voiceless alveolar sibilant affricate /ʦ/ (the "z" of "Zäit").
// Uses the /ts/ recording and the alveolar place feature so that the sound
// matches the declared IPA; the tsh recording is the post-alveolar /tʃ/.
// NOTE: this changes the audio for "TS", so the lb hash in
// tests/language-phonemes.test has to be regenerated.
phoneme TS
vls alv afr sib
ipa ʦ
WAV(ustop/ts)
endphoneme

// Voiced palato-alveolar sibilant affricate /dʒ/: formant data dzh/dzh
// with the recording x/dzh mixed in via addWav.
phoneme dZ
vcd pla afr sib
ipa dʒ
FMT(dzh/dzh) addWav(x/dzh)
endphoneme

// Frikativ Phonemen : f, v, w, s, z, ʃ, ʒ, χ, ɕ, ʁ, ʑ, h

// Voiceless labiodental fricative /f/, played from the recording ufric/f
// with the second WAV argument set to 80.
phoneme f
vls lbd frc
ipa f
WAV(ufric/f, 80)
endphoneme

// Voiced labiodental fricative /v/: formant data voc/v with the recording
// vocw/v mixed in via addWav.
phoneme v
vcd lbd frc
ipa v
FMT(voc/v) addWav(vocw/v)
endphoneme

// /w/, declared as a liquid (not a fricative, despite the section heading)
// and synthesised from the w/w formant file.
phoneme w
liquid
ipa w
FMT(w/w)
endphoneme

// Voiceless alveolar sibilant fricative /s/.
// Declared voiceless and played as a plain recording: building it on the
// voiced voc/z formant data made it almost indistinguishable from "z",
// which differed only in the addWav amplitude.
// NOTE: this changes the audio for "s", so the lb hash in
// tests/language-phonemes.test has to be regenerated.
phoneme s
vls alv frc sib
ipa s
WAV(ufric/s_, 85)
endphoneme

// Voiced alveolar sibilant fricative /z/: voiced formant data voc/z with
// the frication recording ufric/s_ mixed in via addWav (second argument 45).
phoneme z
vcd alv frc sib
ipa z
FMT(voc/z) addWav(ufric/s_, 45)
endphoneme

// Voiceless palato-alveolar sibilant fricative /ʃ/, played from the
// recording ufric/sh with the second WAV argument set to 45.
phoneme S
vls pla frc sib
ipa ʃ
WAV(ufric/sh, 45)
endphoneme

// Phoneme reported as /ʒ/ in IPA output: formant data dzh/dz_pzd with the
// recording x/dzh mixed in via addWav (second argument 45).
// NOTE(review): the features say palatal affricate (pal afr) and both sound
// files are "dzh" affricate resources, while ʒ is listed among the
// fricatives -- confirm whether frc / pla and a fricative sound were intended.
phoneme J
vcd pal afr sib pzd
ipa ʒ
FMT(dzh/dz_pzd) addWav(x/dzh, 45)
endphoneme

// Voiceless uvular fricative /χ/ (the "ch" of "Sprooch"), played from the
// recording ufric/x_hr with the second WAV argument set to 20.
// The IPA string must be the IPA letter χ; a capital "X" is the X-SAMPA
// spelling of that sound and is not valid in IPA output. The place feature
// is uvular to match, like its voiced counterpart "rR".
phoneme x
vls uvl frc
ipa χ
WAV(ufric/x_hr, 20)
endphoneme

// Voiceless alveolo-palatal sibilant fricative /ɕ/, played from the
// recording ufric/sh_pzd.
phoneme X
vls alp sib frc
ipa ɕ
WAV(ufric/sh_pzd)
endphoneme

// Voiced uvular fricative /ʁ/: formant data vwl_fr/r_ with the recording
// r3/rx mixed in via addWav (second argument 20).
phoneme rR
vcd uvl frc
ipa ʁ
FMT(vwl_fr/r_) addWav(r3/rx, 20)
endphoneme

// Voiced alveolo-palatal sibilant fricative /ʑ/: voiced formant data
// voc/z_pzd with the frication recording ufric/sh_pzd mixed in via addWav
// (second argument 80).
phoneme Z
vcd alp sib frc
ipa ʑ
FMT(voc/z_pzd) addWav(ufric/sh_pzd, 80)
endphoneme

// Voiceless glottal /h/.
// The recording is selected by the start type of the following phoneme, so
// the aspiration is coloured like the vowel that follows it.
phoneme h
vls glt apr
ipa h
IF nextPh(#@) THEN
WAV(h/h@)
ELIF nextPh(#a) THEN
WAV(h/ha)
ELIF nextPh(#e) THEN
WAV(h/he)
ELIF nextPh(#i) THEN
WAV(h/hi)
ELIF nextPh(#o) THEN
WAV(h/ho)
ELIF nextPh(#u) THEN
WAV(h/hu)
ENDIF
// Fallback when no vowel follows (pause or consonant): without it the
// phoneme would have no sound at all in that position.
WAV(h/h_, 70)
endphoneme

// Approximant Phonemen : l, j

// /l/, declared as a liquid and synthesised from the l/l formant file.
phoneme l
liquid
ipa l
FMT(l/l)
endphoneme

// /j/, declared as a palatal liquid and synthesised from the j/j_ formant file.
phoneme j
liquid pal
ipa j
FMT(j/j_)
endphoneme

// Trill Phonem : ʀ

// Uvular /ʀ/: formant data r3/r_uvl with a recording mixed in via addWav
// (second argument 70).
// The addWav path spells out the ".wav" extension, unlike every other sound
// spec in this file; presumably this is needed because a formant file with
// the same base name (r3/r_uvl) is used by FMT -- verify before "tidying" it.
// NOTE(review): the features declare a fricative (frc) while the section
// heading and the IPA symbol describe a trill -- confirm which is intended.
phoneme r
vcd uvl frc
ipa ʀ
FMT(r3/r_uvl) addWav(r3/r_uvl.wav, 70)
endphoneme

+ 3
- 0
phsource/phonemes View File

@@ -1761,6 +1761,9 @@ include ph_korean
phonemetable la base1
include ph_latin

phonemetable lb base1
include ph_luxembourgish

phonemetable lt base1
include ph_lithuanian


+ 1
- 0
tests/language-phonemes.test View File

@@ -84,6 +84,7 @@ test_phwav kok 91e9bf35df942daecaa3e260807e24fb93470a55 "ma na n.a n^a Na pa ta
test_phwav ku 4254d35e08fd4a71848ae75cd0aefc76f115a167 "ma na Na pa ba ta da ka ga qa ?a tSa dZa fa va sa za Sa Za xa ca Ja ha ja *a Ra la _:_ mi my mu mI mU me mo mE mE# ma m8 mV meI meU"
test_phwav ky bee9e683218b0c53c29a89709501f8a2486b71b2 "ma na Na pa ba ta da ka ga tsa tSa dZa fa va sa za Sa xa Xa la La ja *a ra _:_ ma me mi mo mu mI my mO ma: me: mi: mo: mu: mI: my: mO:"
test_phwav la 080bd53c20991eae7baec73b8c735eacc8aae076 "ma na Na pa p<h>a ba ta t<h>a da ka k<h>a ga fa sa za ha Ra la ja wa _:_ ma mE mI mO mU ma: me: mi: mo: mu: my my: maU maI meI mEU mOI"
test_phwav lb 423fae731272b6cbcb4bb8669bd2f7da7ccacacd "a a: aE e E e: @E eR i i: o o: u u: y y: eA iA oA OU aI aU AU eI OI eU iE oU uE ma na Na pa ba ta da ka ga TSa dZa fa va wa sa za Sa Ja xa Xa rRa Za ha la ja ra"
test_phwav lfn 044e27a5100528760a185e0773dccaca504b5bd4 "ma na Na pa ba ta da ka ga fa va sa za Sa Za ha la ja R2a **a wa _:_ ma me mi mo mu maI maU meU moI"
test_phwav lt 615e503b996ea5f7b267ebd77b91e77c5b874e18 "ma m;a na n;a pa p;a ta t;a ka k;a ba b;a da d;a ga g;a tsa ts;a tSa tS;a dza dz;a dZa dZ;a fa f;a sa s;a Sa S;a xa x;a va v;a za z;a Za Z;a la l;a ra r;a ja _:_ m@ ma mA ma: me mE me: mee meA mi mI mi: mo mO mo: mu mU mu: mw mW mai mei mau muo moi mui mie maU meU moU maI meI"
test_phwav ltg 9e0ee2a095cd074860c3db7aa89b2011ffb38ee8 "ma m;a na n;a pa p;a ta t;a ka k;a ba b;a da d;a ga g;a tsa ts;a tSa tS;a dza dz;a dZa dZ;a fa f;a sa s;a Sa S;a xa x;a va v;a za z;a Za Z;a la l;a ra r;a ja _:_ m@ ma mA ma: me mE me: mee meA mi mI mi: mo mO mo: mu mU mu: mw mW mai mei mau muo moi mui mie maU meU moU maI meI"

Loading…
Cancel
Save