Added attributes in voice file: voicing, breath, breathw.
Added "en-wisper" voice.
espeakedit: fixed crash when playing sounds using a voice file with a high pitch.
lang=it: use phoneme [u], not [U].
Added directory docs to svn.

git-svn-id: https://espeak.svn.sourceforge.net/svnroot/espeak/trunk@45 d46cf337-b52f-0410-862d-fd96e6ae7743
@@ -164,7 +164,7 @@ t tS ts v z Z z2 | |||
Dictionary it_dict | |||
@- a a/ aI aU e E i | |||
o O oI U u | |||
o O oI u | |||
* : b d dZ f g j | |||
k l l^ m n N n^ p | |||
@@ -285,8 +285,8 @@ Dictionary pt_dict | |||
& &/ &U~ &~ @ @- a A | |||
aI aU e E eI EI eU EU | |||
e~ i i/ iU o O oI OI | |||
o~ U u uI u~ y | |||
i i/ iU i~ o O oI OI | |||
o~ u U uI u~ y | |||
* : ; b C d dZ f | |||
g h j k l l^ m n |
@@ -7,7 +7,7 @@ | |||
_cap k'apital | |||
_?? s'imbolo | |||
_! p'Untoesklamat'ivo | |||
_! p'untoesklamat'ivo | |||
_" viRgolet:e | |||
_# kantSellet:o | |||
_$ dolla:Ro | |||
@@ -15,37 +15,37 @@ _% peRtS'ento | |||
_& _!e: | |||
_' ap'OstRofo | |||
_( ap'eRtapaR'entezi | |||
_) kj,UzapaR'entezi | |||
_) kj,uzapaR'entezi | |||
_* asteRisko | |||
_+ pjU | |||
_+ pju | |||
_, v'iRgola | |||
_- t@-*at:ino | |||
_. pUnto | |||
_. punto | |||
_/ slaS | |||
_: d,Uep'Unte | |||
_; p'Untoev'iRgola | |||
_: d,uep'unte | |||
_; p'untoev'iRgola | |||
_< ap'eRta'angolo | |||
_= Ugwale | |||
_> kj,Uza'angolo | |||
_? p'UntointeRogat'ivo | |||
_= ugwale | |||
_> kj,uza'angolo | |||
_? p'untointeRogat'ivo | |||
_@ ki'otS:ola | |||
_[ ap'eRtakw'ad@-*a | |||
_\ bakslaS | |||
_] kj,Uzakw'ad@-*a | |||
_^ kap:UtS:o | |||
__ 'UndeRskoRe | |||
_] kj,uzakw'ad@-*a | |||
_^ kap:utS:o | |||
__ 'undeRskoRe | |||
_` bakp@-*aIm | |||
_{ ap'eRtag@-*'af:a | |||
_| b'aRa | |||
_} kj,Uzag@-*'af:a | |||
_} kj,uzag@-*'af:a | |||
_~ tilde | |||
// numbers | |||
_0 dz'eRo | |||
_1 'Uno | |||
_2 d'Ue | |||
_1 'uno | |||
_2 d'ue | |||
_3 tR'e | |||
_4 kw'at:Ro | |||
_5 tS'inkwe |
@@ -266,22 +266,22 @@ | |||
tt t: | |||
.group u | |||
_) u(_ U: | |||
u U: | |||
u (CK U | |||
_) u(_ u: | |||
u u: | |||
u (CK u | |||
u (A w2 | |||
Kr) u (A U | |||
Kl) u (A U | |||
Kr) u (A u | |||
Kl) u (A u | |||
@) u (dine_ 'u | |||
@) u (colo_ 'u | |||
.group v | |||
_) v(_ vU: | |||
_) v(_ vu: | |||
v v | |||
.group w | |||
_) w(_ d'op:jav'U | |||
_) w(_ d'op:jav'u | |||
w w2 | |||
.group x | |||
@@ -315,8 +315,8 @@ | |||
ó (CK 'o | |||
ò 'O: | |||
ò (CK 'O | |||
ù 'U: | |||
ù (CK 'U | |||
ù 'u: | |||
ù (CK 'u | |||
% pertS'ento | |||
$ dolla:ro | |||
@@ -325,14 +325,14 @@ | |||
& _!e | |||
@ ki'otS:ola | |||
© k'opiraIt | |||
+ pjU | |||
. pUnto | |||
+ pju | |||
. punto | |||
, (D v'irgola | |||
: d,Uep'Unte | |||
; p'Untoev'irgola | |||
! _p'Untoesklamat'ivo_ | |||
: d,uep'unte | |||
; p'untoev'irgola | |||
! _p'untoesklamat'ivo_ | |||
!) ! | |||
? _p'UntointeRogat'ivo_ | |||
? _p'untointeRogat'ivo_ | |||
?) ? | |||
\\ bakslaS | |||
/ slaS |
@@ -0,0 +1,132 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<HTML> | |||
<HEAD> | |||
<META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8"> | |||
<TITLE></TITLE> | |||
</HEAD> | |||
<BODY LANG="en-GB" DIR="LTR"> | |||
<A href="docindex.html">Back</A> | |||
<HR> | |||
<H2>6. ADDING OR IMPROVING A LANGUAGE</H2> | |||
<HR> | |||
Most of the work doesn't need any programming knowledge. Just an understanding of the language, an | |||
awareness of its features, patience and attention to detail. Wikipedia is a good source of basic phonetic information, eg | |||
<a href="http://en.wikipedia.org/wiki/Vowel">http://en.wikipedia.org/wiki/Vowel</a> | |||
<P> | |||
In many cases it should be fairly easy to add a rough implementation of a new language, hopefully | |||
enough to be intelligible.<br> | |||
After that it's a gradual process of improvement to: | |||
<ul> | |||
<li>Make the spelling-to-phoneme translation rules more accurate, including the position of stressed | |||
syllables within words. Some languages are easier than others. I expect most are easier than English. | |||
<p><li>Improve the sounds of the phonemes. This may consist of making small adjustments to vowel and diphthong quality, | |||
or adjusting the strength of consonants. Bigger changes may be recording new or replacement consonant | |||
sounds, or even writing program code to implement new types of sounds. | |||
<p><li>Mark some common words in the dictionary which should be unstressed (words such as "the", "is"), which should be preceded
by a short pause (such as "and", "but"), or which have other attributes, in order to make the speech flow better.
<p><li>Improve the rhythm of the speech by adjusting the relative lengths of vowels in different contexts, eg. stressed/unstressed syllable, | |||
or depending on the following phonemes. This is important for making the speech sound good for the language. | |||
<p><li>Identify or implement new functions in the program to improve the speech, or to deal with | |||
characteristics of the language which are not currently implemented. For example, a different intonation module. | |||
</ul> | |||
If you are interested in working on a language, please contact me to set up the initial data and to | |||
discuss the features of the language. | |||
<HR> | |||
<H3>6.1 Language Code</H3> | |||
<P>Generally, the language's international ISO 639-1 code is used to | |||
identify the language. It is used in the filenames which
contain the language's data. In the examples below the code "<B>en</B>"
(English) is used as an example. Replace this with the code of your | |||
language.<p> | |||
It is possible to have different variants of a language, for example where the sounds of some phonemes are changed,
or where some of the pronunciation rules differ.
<HR> | |||
<H3>6.2 Phoneme File</H3> | |||
<P>You must first decide on the set of phonemes to be used for the | |||
language. These should be listed and defined in a phonemes file such as | |||
<B>ph_english</B>. A reference to this file is then included at the end of | |||
the <B>phonemes</B> file (the master phoneme file), eg:</P>
<PRE> phonemetable en base | |||
include ph_english</PRE><P> | |||
This example defines a phoneme table "<B>en</B>" which inherits | |||
the contents of phoneme table "<B>base</B>". Its contents are | |||
found in the file <B>ph_english</B>.</P> | |||
<P>The <B>base</B> phoneme table contains definitions of a basic set of | |||
consonants, and also some "control" phonemes such as stress marks and | |||
pauses. The phoneme table for a language will generally inherit this, | |||
or alternatively it may inherit the phoneme table of another language | |||
which in turn inherits the <B>base</B> phoneme table.</P> | |||
<P>The phonemes file for the language defines those additional | |||
phonemes which are not inherited (generally the vowels and diphthongs, plus any additional | |||
consonants), or phonemes whose definitions differ from the | |||
inherited version (eg. the redefinition of a consonant).</P> | |||
<P>Details of the contents of phonemes files are given in | |||
<A href="phontab.html">phontab.html</A>.</P> | |||
The <B>Compile phoneme data</B> function of the <B>espeakedit</B> | |||
program compiles the phonemes files to produce the files | |||
<B>espeak-data/phontab</B>, <B>phonindex</B>, and <B>phondata.</B><P> | |||
For information on how to analyse recorded sounds of the language and to | |||
prepare the corresponding phoneme data, see (not yet written).<p> | |||
For an initial draft a language will often be able to use vowels and | |||
consonants which have already been set up for another language. | |||
<HR> | |||
<H3>6.3 Dictionary Files</H3> | |||
<P STYLE="font-weight: medium">Once the language's phonemes have been | |||
defined, then pronunciation dictionary data can be produced in order | |||
to translate the language's source text into phonemes. This consists | |||
of two source files: <B>en_rules</B> (the spelling to phoneme rules) and | |||
<B>en_list</B> (an exceptions list, and attributes of certain words). The corresponding compiled data | |||
file is <B>espeak-data/en_dict</B> which is produced from <B>en_rules</B> | |||
and <B>en_list</B> sources by the command: <B>speak --compile=en</B>.</P> | |||
<P STYLE="font-weight: medium">Details of the contents of the | |||
dictionary files are given in <A href="dictionary.html">dictionary.html</A>.</P> | |||
<P STYLE="font-weight: medium">The <B>en_list</B> file contains not | |||
only pronunciation exceptions, but also gives attributes to specific | |||
words. The most notable of these are:</P>
<P STYLE="font-weight: medium"><B>$u </B>Some common words should be | |||
marked as "unstressed" in order to make the speech flow better. | |||
These words generally include articles (eg: a, the, this, that), | |||
auxiliary verbs (eg: is, have, will, can, may), pronouns and
possessive adjectives (eg: he, his), some common prepositions (eg:
of, to, in), some common conjunctions (eg. and, or, if), and some
common adverbs and adjectives (eg. any, already).</P>
<P><B>$pause </B>Some words should be marked to have a short pause | |||
before them, in order to produce natural pauses in long sentences.
These include conjunctions (eg. and, or, but, however) and perhaps | |||
some prepositions.</P> | |||
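<P STYLE="font-weight: medium">For example, entries in <B>en_list</B> giving these attributes could look like the following sketch (flags may be given with or without a phoneme string; these lines are only illustrative, not the actual contents of <B>en_list</B> &mdash; see <A href="dictionary.html">dictionary.html</A> for the exact format):</P>
<PRE>   the    $u
   is     $u
   and    $u $pause
   but    $u $pause
</PRE>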
<HR> | |||
<H3>6.4 Voice File</H3> | |||
<P STYLE="font-weight: medium">Each language should have one or more | |||
voice files in <B>espeak-data/voices</B>. The filename of the default voice | |||
for a language should be the same as the language code.</P> | |||
<P STYLE="font-weight: medium">Details of the contants of voice files | |||
are given in <A href="voices.html">voices.html</A>.</P> | |||
<P STYLE="font-weight: medium">The simplest voice file would contain | |||
just a single line to give the language code, eg:</P> | |||
<PRE STYLE="margin-bottom: 0.5cm"> language en</PRE><P STYLE="font-weight: medium"> | |||
This language code specifies the phoneme table (i.e. <b>phonemetable en</b>) and the
dictionary (i.e. <B>espeak-data/en_dict</B>) to be used. If needed, these can be | |||
overridden by <B>phonemes</B> and <B>dictionary</B> attributes in the | |||
voices file.</P> | |||
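<P STYLE="font-weight: medium">A slightly fuller voice file might look like the sketch below. This is only an illustration: <B>name</B> is the name listed by the <B>--voices</B> option, and the <B>phonemes</B> and <B>dictionary</B> lines are only needed when they differ from the language code. See <A href="voices.html">voices.html</A> for the full list of attributes.</P>
<PRE> name english
 language en
 phonemes en
 dictionary en
</PRE>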
<HR> | |||
<H3>6.5 Program Code</H3> | |||
<P STYLE="font-weight: medium">The behaviour of the speak program is | |||
controlled by various options (eg. whether words are stressed on the first, | |||
last, or penultimate syllable). The function <B>SetTranslator()</B> at the | |||
start of the <B>tr_languages.cpp</B> file recognizes the language | |||
code and sets the appropriate set of options.</P> | |||
<P STYLE="font-weight: medium">For a new language, you would add its | |||
language code and the required options in <B>SetTranslator()</B>. However, this | |||
may not be necessary during testing because most of the options can also be | |||
set from the voice file in | |||
<B>espeak-data/voices</B>.</P> | |||
<P STYLE="font-weight: medium">If necessary, you can define a new | |||
translator class for a language, and select this in the | |||
SetTranslator() function. This inherits the standard functions | |||
from the base translator class, but allows you to replace these where | |||
needed by new functions which are written specially for this | |||
language.</P> | |||
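<P STYLE="font-weight: medium">As a rough sketch only (the actual structure, macros, option names, and values are defined in <B>tr_languages.cpp</B> and may differ from this), an entry for a hypothetical language code "xx" might look something like:</P>
<PRE> case L('x','x'):                  // hypothetical language code "xx"
    tr = new Translator();
    tr->langopts.stress_rule = 2;  // eg. stress the penultimate syllable
    break;
</PRE>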
<hr> | |||
</BODY> | |||
</HTML> |
@@ -0,0 +1,69 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title></title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>ANALYSIS</h2> | |||
<hr> | |||
(Further notes are needed) | |||
<p> | |||
Recordings of spoken words and phrases can be analysed to try and make eSpeak match a language more closely. | |||
Unlike most other (larger and better quality) synthesizers, eSpeak's data is not produced directly from recorded sounds. To use an analogy, it's like a drawing or sketch compared with a photograph. Or vector graphics compared with a bitmap image. It's smaller, less accurate, with less subtlety, but it can sometimes show some aspects of the picture more clearly than a more accurate image. | |||
<h4>Recording Sounds</h4> | |||
Recordings should be made while speaking slowly, clearly, and firmly and loudly (but not shouting). Speak about half a metre from the microphone. Try to avoid background noise and hum interference from electrical power cables. | |||
<h4>Praat</h4> | |||
I use a modified version of the praat program (<a href="http://www.praat.org">www.praat.org</a>) to view and analyse both sound recordings and output from eSpeak. The modification adds a new function (<code>Spectrum->To_eSpeak</code>) which analyses a voiced sound and produces a file which can be loaded into espeakedit. Details of the modification are in the <code>"praat-mod"</code> directory in the espeakedit package.
The analysis contains a sequence of frames, one per cycle at the speech's fundamental frequency. Each frame is a short time spectrum, together with praat's estimation of the f1 to f5 formant frequencies at the time of that cycle. | |||
I also use Praat's <code>New->Record_mono_sound</code> function to make sound recordings. | |||
<h3>Vowels and Diphthongs</h3> | |||
<h4>Analysing a Recording</h4> | |||
Make a recording, with a male voice, and trim it in Praat to keep just the required vowel sound. Then use the new <code>Spectrum->To_eSpeak</code> modification (this was named <code>To_Spectrogram2</code> in earlier versions) to analyse the sound. It produces a file named <code>"spectrum.dat"</code>. | |||
Load the <code>"spectrum.dat"</code> file into espeakedit. Espeakedit has two Open functions, <code>File->Open</code> and <code>File->Open2</code>. They are the same, except that they remember different paths. I generally use <code>File->Open2</code> for reading the <code>"spectrum.dat"</code> file. | |||
The data is displayed in espeakedit as a sequence of spectrum frames (see <a href="editor.html">editor.html</a>). | |||
<h4>Tone Quality</h4> | |||
It can be difficult to match the tonal quality of a new vowel to be compatible with existing vowel files. This is determined by the relative heights and widths of the formant peaks. These vary depending on how the recording was made, the microphone, and the strength and tone of the voice. Also the positions of the higher peaks (F3 upwards) can vary depending on the characteristics of the speaker's voice. Formant peaks correspond to resonances within the mouth and throat, and they depend on its size and shape. With a female voice, all the formants (F1 upwards) are generally shifted to higher frequencies. | |||
For these reasons, it's best to use a male voice, and to use its analysed spectra only as guidance. Rather than construct formant-peaks entirely to match the analysed data, instead copy keyframes from a similar existing vowel. Then make small adjustments to match the position of the F1, F2, F3 formant peaks and hopefully produce the required vowel sound. | |||
<h4>Using an Existing Vowel File</h4> | |||
Choose a similar vowel file from <code>phsource/vowel</code> and open it into espeakedit. It may be useful to use <code>phsource/vowel/vowelchart</code> as a map to show how vowel files compare with each other. You can select a keyframe from the vowel file and use CTRL-C and CTRL-V to copy the green formant peaks onto a frame of the new spectrum sequence. Then adjust the peaks to match the new frame. Press F1 to hear the sound of the formant peaks in the selected frame. | |||
The F0 peak is provided in order to set the correct balance of low frequencies below the F1 peak. If the sound is too muffled, or conversely, too "thin", try adjusting the amplitude or position of the F0 peak.
<h4>Length and Amplitude</h4> | |||
Use an existing vowel file as a guide for how to set the amplitude and length of the keyframes. At the right of each keyframe, its length is shown in ms and under that is its relative (RMS) amplitude.
The second keyframe should be marked with a red marker (use CTRL-M to toggle this). This divides the vowel into the front-part (with one frame), and the rest. | |||
Use F2 to play the sound of the new vowel sequence. It will also produce a WAV file (the default name is speech.wav) which you can read into praat to see whether it has a sensible shape. | |||
<h4>Using the New Vowel</h4> | |||
Make a new directory (eg. vwl_xx) in phsource for your new vowels. Save the spectrum sequence with a name which you have chosen for it. | |||
You can then edit the phoneme file for your language (eg. phsource/ph_xxx), and change a phoneme to refer to your new vowel file. Then do <code>Data->Compile_Phoneme_Data</code> from espeakedit's menubar to re-compile the phoneme data. | |||
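As a rough illustration (the exact keywords and attributes of phoneme definitions are described in <a href="phontab.html">phontab.html</a>, and all names here are hypothetical), the definition which refers to the new keyframe file might look something like:
<pre> phoneme e2                        // hypothetical new vowel
   vowel starttype (e) endtype (e)
   length 200
   FMT(vwl_xx/e_new)               // the keyframe file saved above
 endphoneme
</pre>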
</body> | |||
</html> |
@@ -0,0 +1,169 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak Speech Synthesizer</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> | |||
</head> | |||
<body> | |||
<A href="index.html">Back</A> | |||
<hr> | |||
<h2>2.1 INSTALLATION</h2> | |||
<hr> | |||
(This section only applies to Linux and other Posix systems).<br> | |||
There are two versions of the command line program. They both have the same command parameters (see below). | |||
<ol> | |||
<li><strong>espeak</strong> uses the speech engine in the <strong>libespeak</strong> shared library. The libespeak library must first be installed.
<p> | |||
<li><strong>speak</strong> is a stand-alone version which includes its own copy of the speech engine. | |||
</ol> | |||
Place the <strong>espeak</strong> or <strong>speak</strong> executable file in the command path, eg in <strong>/usr/local/bin</strong> | |||
<p> | |||
Place the "<strong>espeak-data</strong>" directory in /usr/share as <strong>/usr/share/espeak-data</strong>.<br> | |||
Alternatively if it is placed in the user's home directory (i.e. <strong>/home/<user>/espeak-data</strong>) | |||
then that will be used instead. | |||
<p> | |||
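For example, assuming the files have been unpacked into the current directory, this could be done with commands such as:
<pre> sudo cp espeak /usr/local/bin/
 sudo cp -r espeak-data /usr/share/
</pre>
<p>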
<h4>Dependencies</h4> | |||
<strong>espeak</strong> uses the PortAudio sound library (version 18), so you will need to have the <strong>libportaudio0</strong> library package installed. It may be already, since it's used by other software, such as OpenOffice.org and the Audacity sound editor.<p> | |||
Some Linux distributions (eg. SuSe 10) have version 19 of PortAudio which has a slightly different API. The speak program can be compiled to use version 19 of PortAudio by copying the file portaudio19.h to portaudio.h before compiling.<p>
The speak program may be compiled without using PortAudio, by removing the line<pre> #define USE_PORTAUDIO | |||
</pre>in the file speech.h. | |||
<p> <hr> | |||
<h2>2.2 COMMAND OPTIONS</h2> | |||
<hr> | |||
<h3>2.2.1 Examples</h3> | |||
To use at the command line, type:<br> | |||
<strong>espeak "This is a test"</strong><br> | |||
or<br> | |||
<strong>espeak -f <text file></strong> | |||
<p> | |||
Or just type<br> | |||
<strong>espeak</strong><br> | |||
followed by text on subsequent lines. Each line is spoken when | |||
RETURN is pressed.<br>Use <strong>espeak -x</strong> to see the corresponding phoneme codes. | |||
<p> <hr> | |||
<h3>2.2.2 Use with KDE Text-to-Speech (KTTS)</h3> | |||
To add to KDE-Text-to-Speech Manager (KTTSMgr), use it as a "Command" talker | |||
with "command for speaking texts" set to:<br> | |||
<strong>cat %f | espeak --stdin -w %w</strong> | |||
<p> | |||
Note: | |||
<ul> | |||
<li>When used by the KTTS system, I noticed a slight background hiss with the speech, which is not present when I use <strong>espeak</strong> directly from the command line. This was because the KDE sound default was set to "8 bits" rather than 16 bits.
<li>KTTSMgr breaks the text into sentences to pass to the speech engine, but it mistakenly assumes sentence breaks when dots follow abbreviations and therefore pauses after the dots in "eg. Mr. John B. Smith etc." Speaking a text file directly with <strong>espeak</strong> gives better results in this respect. | |||
<li>Speaking text from a web page using KTTS often causes headings and image captions to be run together with the following text as a single sentence. Speaking the HTML directly with the <strong>-m</strong> option set (i.e. using <strong>espeak -m -f text.html</strong>), may help if this is a problem. | |||
</ul> | |||
<p> <hr> | |||
<h3>2.2.3 The Command Line Options</h3> | |||
<dl> | |||
<dt> | |||
<strong>espeak [options] ["words"]</strong><br> | |||
<dd>Text input can be taken either from a file, from a string in the command, or from stdin. | |||
<p> | |||
<dt> | |||
<strong>-f <text file></strong><br> | |||
<dd>Speaks a text file. | |||
<p> | |||
<dt> | |||
<strong> --stdin</strong><br> | |||
<dd>Takes the text input from stdin. | |||
<p> | |||
<dt> | |||
If neither -f nor --stdin is given, then the text input is taken from "words" (a text string within double quotes). <br>If that is not present then text is taken from stdin, but each line is treated as a separate sentence. | |||
<p> | |||
<dt> | |||
<strong>-a <integer></strong><br> | |||
<dd>Sets amplitude (volume) in a range of 0 to 200. The default is 100. | |||
<p> | |||
<dt> | |||
<strong>-p <integer></strong><br> | |||
<dd>Adjusts the pitch in a range of 0 to 99. The default is 50. | |||
<p> | |||
<dt> | |||
<strong>-s <integer></strong><br> | |||
<dd>Sets the speed in words-per-minute (approximate values for the default voice, others may | |||
differ slightly). The default value is 170. I generally use a faster speed | |||
of 190. Range 80 to 370. | |||
<p> | |||
<dt> | |||
<strong>-b</strong><br> | |||
<dd>Indicates that the input text is in the 8-bit character set which corresponds to the language (eg. Latin-2 for Polish). Without this option, eSpeak assumes text is UTF-8, but will automatically switch to the 8-bit character set if it finds an illegal UTF-8 sequence. That may give wrong results if some 8-bit character sequences look like valid UTF-8 multibyte characters.
<p> | |||
<dt> | |||
<strong>-l <integer></strong><br> | |||
<dd>Line-break length, default value 0. If set, then lines which are shorter | |||
than this are treated as separate clauses and spoken separately with a | |||
break between them. This can be useful for some text files, but bad for | |||
others. | |||
<p> | |||
<dt> | |||
<strong>-m</strong><br> | |||
<dd>Indicates that the text contains SSML (Speech Synthesis Markup Language) tags or other XML tags. Those SSML tags which are supported are interpreted. Other tags, including HTML, are ignored, except that some HTML tags such as <hr> <h2> and <li> ensure a break in the speech. | |||
<p> | |||
<dt> | |||
<strong>-v <voice filename>[+<variant>]</strong><br> | |||
<dd>Sets a Voice for the speech, usually to select a language. eg: | |||
<pre> espeak -vaf</pre> | |||
to use the Afrikaans voice. A modifier after the voice name can be used to vary the tone of the voice, eg:
<pre> espeak -vaf+3</pre> | |||
The variants are <code> +1 +2 +3 +4 +5 </code> for male voices and <code> +11 +12 +13 +14 </code> which simulate female voices by using higher pitches. | |||
<p> | |||
<voice filename> is a file within the <code>espeak-data/voices</code> directory.<br> | |||
Voice files can specify a language, different pitches, tonal qualities, and prosody for the voice. | |||
See the <a href="voices.html">voices.html</a> file.<p> | |||
Voice names which start with <b>mb-</b> are for use with Mbrola diphone voices, see <a href="mbrola.html">mbrola.html</a><p> | |||
Some languages may need additional dictionary data, see <a href="languages.html">languages.html</a> | |||
<p> | |||
<dt> | |||
<strong>-w <wave file></strong><br> | |||
<dd>Writes the speech output to a file in WAV format, rather than speaking it. | |||
<p> | |||
<dt> | |||
<strong>-x</strong><br> | |||
<dd>The phoneme mnemonics, into which the input text is translated, are | |||
shown on stdout. | |||
<p> | |||
<dt> | |||
<strong>-X</strong><br> | |||
<dd>As -x, but in addition, details are shown of the pronunciation rule and dictionary list lookup. This can be useful to see why a certain pronunciation is being produced. Each matching pronunciation rule is listed, together with its score, the highest scoring rule being used in the translation. "Found:" indicates the word was found in the dictionary lookup list, and "Flags:" means the word was found with only properties and not a pronunciation. You can see when a word has been retranslated after removing a prefix or suffix. | |||
<p> | |||
<dt><strong>-q</strong><br><dd> | |||
Quiet. No sound is generated. This may be useful with the -x option. | |||
<p> | |||
<dt> | |||
<strong>-z</strong><br> | |||
<dd>This option removes the end-of-sentence pause which normally occurs at the end of the text.
<p> | |||
<dt> | |||
<strong>--stdout</strong><br> | |||
<dd>Writes the speech output to stdout rather than speaking it. | |||
<p> | |||
<dt><strong>--compile[=<voice name>]</strong><br> | |||
<dd> | |||
Compile the pronunciation rule and dictionary lookup data from their source files in the current directory. The Voice determines which language's files are compiled. For example, if it's an English voice, then <em>en_rules</em>, <em>en_list</em>, and <em>en_extra</em> (if present), are compiled to replace <em>en_dict</em> in the <em>espeak-data</em> directory. If no Voice is specified then the default Voice is used.
<p> | |||
<dt><strong>--punct[="<characters>"]</strong><br> | |||
<dd> | |||
Speaks the names of punctuation characters when they are encountered in the text. If <characters> are given, then only those listed punctuation characters are spoken, eg. <code> --punct=".,;?"</code> | |||
<p> | |||
<dt> | |||
<strong>--voices[=<language code>]</strong><br> | |||
<dd>Lists the available voices.<br> | |||
If =<language code> is present then only those voices which are suitable for that language are listed.<br> | |||
</dl> | |||
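As an example of combining several of these options (the file names here are arbitrary):
<pre> espeak -v en -s 150 -f article.txt -w article.wav
 espeak -q -x "This is a test"
</pre>
The first command reads a text file with the English voice at 150 words-per-minute and writes the speech to a WAV file instead of speaking it. The second shows the phoneme translation of the text without generating any sound.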
<p> <hr> | |||
<h3>2.2.4 The Input Text</h3> | |||
<dl> | |||
<dt><b>HTML Input</b> | |||
<dd> | |||
If the -m option is used to indicate marked-up text, then HTML can be spoken directly. | |||
<p> | |||
<dt><b>Phoneme Input</b> | |||
<dd> | |||
As well as plain text, phoneme mnemonics can be used in the text input to <strong>espeak</strong>. They are enclosed within double square brackets. Spaces are used to separate words and all stressed syllables must be marked explicitly.<br> | |||
eg: <code> [[D,Is Iz sVm f@n'EtIk t'Ekst 'InpUt]] </code> | |||
</dl> | |||
</body> | |||
</html>
@@ -0,0 +1,566 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak: Pronunciation Dictionaries</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>4. TEXT TO PHONEME TRANSLATION</h2> | |||
<hr> | |||
<h3>4.1 Translation Files</h3> | |||
There is a separate set of pronunciation files for each language, their names starting with the language name. | |||
<p> | |||
There are two separate methods for translating words into phonemes: | |||
<ul> | |||
<li>Pronunciation Rules. These are an attempt to define the pronunciation rules for the language. The source file is:<br> | |||
<strong><em><language>_rules</em></strong> (eg. en_rules)<br> | |||
<p> | |||
<li> | |||
Lookup Dictionary. A list of individual words and their pronunciations and/or various other properties. The source files are:<br> | |||
<strong><em><language>_list</em></strong> (eg. en_list) and optionally <strong><em><language>_extra</em></strong> <br> | |||
</ul> | |||
These two files are compiled into the file | |||
<strong><em><language>_dict</em></strong> in the espeak-data directory (eg. espeak-data/en_dict) | |||
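For example, to compile the English dictionary data, run the following from the directory which contains the source files (eg. <strong>dictsource</strong>); see the <strong>--compile</strong> option in <a href="commands.html">commands.html</a>:
<pre> espeak --compile=en
</pre>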
<p> <hr> | |||
<h3>4.2 Phoneme names</h3> | |||
Each of the language's phonemes is represented by a mnemonic of 1, 2, 3, or 4 characters. Together with a number of utility codes (eg. stress marks and pauses), these are defined in the phoneme data file (see *spec not yet available*). | |||
<p> | |||
The utility 'phonemes' are: | |||
<ul><table> | |||
<tbody align="left"> | |||
<tr> | |||
<td><strong>' </strong></td> | |||
<td>primary stress</td> | |||
</tr> | |||
<tr> | |||
<td><strong>, </strong></td> | |||
<td>secondary stress</td> | |||
</tr> | |||
<tr> | |||
<td><strong>% </strong></td> | |||
<td>unstressed syllable</td> | |||
</tr> | |||
<tr> | |||
<td><strong>= </strong></td> | |||
<td>put the primary stress on the preceding syllable</td> | |||
</tr> | |||
<tr> | |||
<td><strong>_:</strong></td> | |||
<td>short pause</td> | |||
</tr> | |||
<tr> | |||
<td><strong>_</strong></td> | |||
<td>a shorter pause</td> | |||
</tr> | |||
<tr> | |||
<td><strong>|| </strong></td> | |||
<td>indicates a word boundary within a phoneme string</td> | |||
</tr> | |||
<tr> | |||
<td><strong>| </strong></td> | |||
<td>can be used to separate two adjacent characters, to prevent them from being considered as a multi-character phoneme mnemonic</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</ul> | |||
It is not necessary to specify the stress of every syllable. Stress markers are only needed in order to change the effect of the language's default stress rule. | |||
<p> | |||
The phonemes which are used to represent a language's sounds are based on the Kirshenbaum ascii character representation of the International Phonetic Alphabet <a href="http://www.kirshenbaum.net/IPA/ascii-ipa.pdf">www.kirshenbaum.net/IPA/ascii-ipa.pdf</a> | |||
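For example, using the English phoneme mnemonics, the word "morning" could be written as the phoneme string:
<pre>    m'O:nIN        // primary stress on the first syllable
</pre>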
<p> <hr> | |||
<h3>4.3 Pronunciation Rules</h3> | |||
The rules in the <strong><em><language>_rules</em></strong> file specify the phonemes which are used to pronounce each letter, or sequence of letters. Some rules only apply when the letter or letters are preceded by, or followed by, other specified letters. | |||
<p> | |||
To find the pronunciation of a word, the rules are searched and any which match the letters at the current position in the word are given a score depending on how many letters are matched. The pronunciation from the best matching rule is chosen. The pointer into the source word is then advanced past those letters which have been matched and the process is repeated until all the letters of the word have been processed.
<p> | |||
<h4>4.3.1 Rule Groups</h4> | |||
The rules are organized in groups, each starting with a ".group" line: | |||
<ul><dl> | |||
<dt><strong>.group <character></strong><br><dd> | |||
A group for each letter or character. | |||
<p> | |||
<dt><strong>.group <2 characters></strong><br><dd> | |||
Optional groups for some common 2 letter combinations. This is only needed, for efficiency, in cases where there are many rules for a particular letter. They would not be needed for a language which has regular spelling rules. | |||
<p> | |||
<dt><strong>.group</strong><br><dd> | |||
A group for other characters which don't have their own group. | |||
<p> | |||
<dt><strong>.L<nn></strong><br><dd> | |||
Defines a group of letter sequences, any of which can match with <strong>Lnn</strong> in a <strong>post</strong> rule (see below). <strong>nn</strong> is a 2 digit decimal number in the range 01 to 20. eg:<br> | |||
<code>.L01 b bl br pl pr</code> | |||
</dl> | |||
</ul>When matching a word, firstly the 2-letter group for the two letters at the current position in the word (if such a group exists) is searched, and then the single-letter group. The highest scoring rule in either of those two groups is used. | |||
<h4>4.3.2 Rules</h4> | |||
Each rule is on a separate line, and has the syntax:
<ul> | |||
[<pre>)] <match> [(<post>] <phoneme string> | |||
</ul> | |||
eg. | |||
<ul><pre>.group o | |||
o 0 // "o" is pronounced as [0] | |||
oo u: // but "oo" is pronounced as [u:] | |||
b) oo (k U | |||
</pre> | |||
</ul> "oo" is pronounced as [u:], but when also preceded by "b" and followed by "k", it is pronounced [U]. | |||
<p> | |||
In the case of a single-letter group, the first character of <match> must be the group letter. In the case of a 2-letter group, the first two characters of <match> must be the group letters. The second and third rules above may be in either .group o or .group oo
<p> | |||
Alphabetic characters in the <pre>, <match>, and <post> parts must be lower case, and matching is case-insensitive. Some upper case letters are used in <pre> and <post> with special meanings. | |||
<p> | |||
<h4>4.3.3 Special characters in <phoneme string>:</h4> | |||
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td><strong>_^_<language code> </strong></td> | |||
<td>Translate using a different language.</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
If this rule is selected when translating a word, then the translation is aborted and the word is re-translated using the specified different language. <language code> may be upper or lower case. This can be used to recognise certain letter combinations as being foreign words and to use the foreign pronunciation for them. eg: | |||
<pre> | |||
th (_ _^_EN | |||
</pre> | |||
indicates that a word which ends in "th" is translated using the English translation rules and spoken with English phonemes. | |||
</ul> | |||
<h4>4.3.4 Special Characters in both <pre> and <post>:</h4> | |||
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td><strong>_</strong></td> | |||
<td>Beginning or end of a word (or a hyphen).</td> | |||
</tr> | |||
<tr> | |||
<td><strong>-</strong></td> | |||
<td>Hyphen.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>A</strong></td> | |||
<td>Any vowel (the set of vowel characters may be defined for a particular language).</td> | |||
</tr> | |||
<tr> | |||
<td><strong>C</strong></td> | |||
<td>Any consonant.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>B H F G Y </strong></td> | |||
<td>These may indicate other sets of characters (defined for a particular language).</td> | |||
</tr> | |||
<tr> | |||
<td><strong>D</strong></td> | |||
<td>Any digit.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>K</strong></td> | |||
<td>Not a vowel (i.e. a consonant or word boundary or non-alphabetic character).</td> | |||
</tr> | |||
<tr> | |||
<td><strong>X</strong></td> | |||
<td>There is no vowel until the word boundary.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>Z</strong></td> | |||
<td>A non-alphabetic character.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>%</strong></td> | |||
<td>Doubled (placed before a character in <pre> and after it in <post>).</td>
</tr> | |||
<tr> | |||
<td><strong>/</strong></td> | |||
<td>The following character is treated literally.</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</ul> | |||
The sets of letters indicated by A, B, C, E, F, G may be defined differently for each language.
<p> | |||
Examples of rules: | |||
<pre> _) a // "a" at the start of a word | |||
a (CC // "a" followed by two consonants | |||
a (C% // "a" followed by a double consonant (the same letter twice) | |||
a (/% // "a" followed by a percent sign | |||
 %C) a // "a" preceded by a double consonant
</pre> | |||
<h4>4.3.5 Special characters only in <pre>:</h4> | |||
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td><strong>@ </strong></td> | |||
<td>Any syllable.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>&</strong></td> | |||
<td>A syllable which may be stressed (i.e. is not defined as unstressed).</td> | |||
</tr> | |||
<tr> | |||
<td><strong>V</strong></td> | |||
<td>Matches only if a previous word has indicated that a verb form is expected.</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</ul> | |||
eg. | |||
<pre> @@) bi // "bi" preceded by at least two syllables | |||
@@a) bi // "bi" preceded by at least 2 syllables and following 'a' | |||
</pre> | |||
Note that matching characters in the <pre> part do not affect the syllable counting.
<p> | |||
<h4>4.3.6 Special characters only in <post>:</h4> | |||
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td><strong>@</strong></td> | |||
<td>A vowel follows somewhere in the word.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>+</strong></td> | |||
<td>Force an increase in the score in this rule (may be repeated for more effect).</td> | |||
</tr> | |||
<tr> | |||
<td><strong>S<number> </strong></td> | |||
<td>This number of matching characters are a standard suffix, remove them and retranslate the word.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>P<number></strong></td> | |||
<td>This number of matching characters are a standard prefix, remove them and retranslate the word.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>Lnn</strong></td> | |||
<td><strong>nn</strong> is a 2-digit decimal number in the range 01 to 20<br> | |||
Matches with any of the letter sequences which have been defined for letter group <strong>nn</strong></td> | |||
</tr> | |||
<tr> | |||
<td><strong>N</strong></td> | |||
<td>Only use this rule if the word is not a retranslation after removing a suffix.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>T</strong></td> | |||
<td>Only use this rule if the word is found in the *_list file with the <b>$alt1</b> attribute.</td>
</tr> | |||
<tr> | |||
<td><strong>#</strong></td> | |||
<td>(English specific) change the next "e" into a special character "E"</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</ul> | |||
eg. | |||
<pre> @) ly (_$2 lI // "ly", at end of a word with at least one other | |||
// syllable, is a suffix pronounced [lI]. Remove | |||
// it and retranslate the word. | |||
_) un (@P2 ¬Vn // "un" at the start of a word is an unstressed | |||
// prefix pronounced [Vn] | |||
_) un (i ju: // ... except in words starting "uni" | |||
_) un (inP2 ,Vn // ... but it is for words starting "unin" | |||
</pre> | |||
S and P must be at the end of the <post> string. | |||
<p> | |||
S<number> may be followed by additional letters (eg. S2ei ). Some of these are probably specific to English, but similar functions could be used for other languages.
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td><strong>q</strong></td> | |||
<td>query the _list file to find stress position or other attributes for the stem, but don't re-translate the word with the suffix removed.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>t</strong></td> | |||
<td>determine the stress pattern of the word <strong>before</strong> adding the suffix</td> | |||
</tr> | |||
<tr> | |||
<td><strong>d </strong></td> | |||
<td>the previous letter may have been doubled when the suffix was added.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>e</strong></td> | |||
<td>"e" may have been removed.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>i</strong></td> | |||
<td>"y" may have been changed to "i."</td> | |||
</tr> | |||
<tr> | |||
<td><strong>v</strong></td> | |||
<td>the suffix means the verb form of pronunciation should be used.</td> | |||
</tr> | |||
<tr> | |||
<td><strong>f</strong></td> | |||
<td>the suffix means the next word is likely to be a verb.</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</ul> | |||
<p> | |||
P<number> may be followed by additional letters (eg. P3v ).
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td><strong>t </strong></td> | |||
<td>determine the stress pattern of the word <strong>before</strong> adding the prefix</td> | |||
</tr> | |||
<tr> | |||
<td><strong>v</strong></td> | |||
<td>the prefix means the verb form of pronunciation should be used.</td>
</tr> | |||
</tbody> | |||
</table> | |||
</ul> | |||
<p> <hr> | |||
<h3>4.4 Pronunciation Dictionary List</h3> | |||
The <strong><em><language>_list</em></strong> file contains a list of words whose pronunciations are given explicitly, rather than determined by the Pronunciation Rules. | |||
The <strong><em><language>_extra</em></strong> file, if present, is also used and its contents are taken as coming after those in <strong><em><language>_list</em></strong>.
<p> | |||
Also the list can be used to specify the stress pattern, or other properties, of a word. | |||
<p> | |||
If the Pronunciation Rules are applied to a word and indicate a standard prefix or suffix, then the word is again looked up in the Pronunciation Dictionary List after the prefix or suffix has been removed.
<p> | |||
Lines in the dictionary list have the form: | |||
<ul> | |||
<word> [<phoneme string>] [<flags>] | |||
</ul>eg. | |||
<pre> book bUk | |||
</pre> | |||
Rather than a full pronunciation, just the stress may be given, to change where it would be otherwise placed by the Pronunciation Rules: | |||
<pre> berlin $2 // stress on second syllable | |||
absolutely $3 // stress on third syllable | |||
for $u // an unstressed word | |||
</pre> | |||
<h4>4.4.1 Multiple Words</h4> | |||
A pronunciation may also be specified for a group of words, when these appear together. Up to four words may be given, enclosed in brackets. This may be used to change the pronunciation or stress pattern when these words occur together,
<pre> (de jure) deI||dZ'U@rI2 // note || used as a word break in the phoneme string</pre> | |||
or to run them together, pronounced as a single word | |||
<pre> (of a) @v@ | |||
</pre> | |||
or to give them a flag when they occur together | |||
<pre> (such as) sVtS||a2z $pause // precede with a pause | |||
</pre> | |||
<h4>4.4.2 Special characters in <phoneme string>:</h4> | |||
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td><strong>_^_<language code> </strong></td> | |||
<td>Translate using a different language. See explanation in 4.3.3 above.</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</ul> | |||
<h4>4.4.3 Flags</h4> | |||
A word (or group of words) may be given one or more flags, either instead of, or as well as, the phonetic translation. | |||
<ul><table> | |||
<tbody> | |||
<tr> | |||
<td>$u</td> | |||
<td>The word is unstressed. In the case of a multi-syllable word, a slight stress is applied according to the default stress rules.</td> | |||
</tr> | |||
<tr> | |||
<td>$u1</td> | |||
<td>The word is unstressed, with a slight stress on its 1st syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$u2</td> | |||
<td>The word is unstressed, with a slight stress on its 2nd syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$u3</td> | |||
<td>The word is unstressed, with a slight stress on its 3rd syllable.</td> | |||
</tr> | |||
<tr> | |||
<td> </td> | |||
<td> </td> | |||
</tr> | |||
<tr> | |||
<td>$u+ $u1+ $u2+ $u3+</td> | |||
<td>As above, but the word has full stress if it's at the end of a clause.</td> | |||
</tr> | |||
<tr> | |||
<td> </td> | |||
<td> </td> | |||
</tr> | |||
<tr> | |||
<td>$1</td>
<td>Primary stress on the 1st syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$2</td> | |||
<td>Primary stress on the 2nd syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$3</td> | |||
<td>Primary stress on the 3rd syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$4</td> | |||
<td>Primary stress on the 4th syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$5</td> | |||
<td>Primary stress on the 5th syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$6</td> | |||
<td>Primary stress on the 6th syllable.</td> | |||
</tr> | |||
<tr> | |||
<td>$7</td> | |||
<td>Primary stress on the 7th syllable.</td> | |||
</tr> | |||
<tr> | |||
<td> </td> | |||
<td> </td> | |||
</tr> | |||
<tr> | |||
<td>$pause</td> | |||
<td>Ensure a short pause before this word (eg. for conjunctions such as "and", some prepositions, etc).</td> | |||
</tr> | |||
<tr> | |||
<td>$brk</td> | |||
<td>Ensure a very short pause before this word, shorter than $pause (eg. for some prepositions, etc).</td> | |||
</tr> | |||
<tr> | |||
<td>$only</td> | |||
<td>The rule does not apply if a prefix or suffix has already been removed.</td> | |||
</tr> | |||
<tr> | |||
<td>$onlys</td> | |||
<td>As $only, except that a standard plural ending is allowed.</td> | |||
</tr> | |||
<tr> | |||
<td>$stem</td> | |||
<td>The rule only applies if a suffix has already been removed.</td> | |||
</tr> | |||
<tr> | |||
<td>$strend</td> | |||
<td>Word is fully stressed if it's at the end of a clause.</td> | |||
</tr> | |||
<tr> | |||
<td>$strend2</td> | |||
<td>As $strend, but the word is also stressed if followed only by unstressed word(s).</td> | |||
</tr> | |||
<tr> | |||
<td>$unstressend </td> | |||
<td>Word is unstressed if it's at the end of a clause.</td> | |||
</tr> | |||
<tr> | |||
<td>$atend</td> | |||
<td>Use this pronunciation if it's at the end of a clause.</td> | |||
</tr> | |||
<tr> | |||
<td>$double</td> | |||
<td>Cause a doubling of the initial consonant of the following word (used for Italian).</td> | |||
</tr> | |||
<tr> | |||
<td>$capital</td> | |||
<td>Use this pronunciation if word has initial capital letter (eg. polish v Polish).</td> | |||
</tr> | |||
<tr> | |||
<td>$dot</td> | |||
<td>Ignore a . after this word even when followed by a capital letter (eg. Mr. Dr. ).</td> | |||
</tr> | |||
<tr> | |||
<td>$abbrev</td> | |||
<td>This has two meanings.<br> 1. If there is no phoneme string: Speak the word as individual letters, even if it contains a vowel (eg. "abc" should be spoken as "a" "b" "c").<br>2. If there is a phoneme string: Speak this word as the specified phoneme string, not as individual letters, even if it's all capital letters (eg, Roman numerals III, IV ).</td> | |||
</tr> | |||
<tr> | |||
<td> </td> | |||
<td> </td> | |||
</tr> | |||
<tr> | |||
<td>$alt $alt2</td> | |||
<td>These are language specific. Their use should be described in the language's **_list file</td> | |||
</tr> | |||
<tr>
<td> </td> | |||
<td> </td> | |||
</tr> | |||
<tr> | |||
<td>$verb</td> | |||
<td>Use this pronunciation if it's a verb.</td> | |||
</tr>
<tr>
<td>$past</td>
<td>Use this pronunciation if it's past tense.</td> | |||
</tr> | |||
<tr> | |||
<td>$verbf</td> | |||
<td>The following word is probably a verb.</td>
</tr> | |||
<tr> | |||
<td>$verbsf</td> | |||
<td>The following word is probably a verb if it has an "s" suffix.</td>
</tr> | |||
<tr> | |||
<td>$nounf</td> | |||
<td>The following word is probably not a verb.</td> | |||
</tr> | |||
<tr> | |||
<td>$pastf</td> | |||
<td>The following word is probably past tense.</td> | |||
</tr> | |||
<tr> | |||
<td>$verbextend</td> | |||
<td>Extend the influence of $verbf and $verbsf.</td> | |||
</tr> | |||
</tbody> | |||
</table></ul> | |||
The last group are probably English specific, but something similar may be useful in other languages. They are a crude attempt to improve the accuracy of pairs like ob'ject (verb) v 'object (noun) and read (present) v read (past). | |||
<p> | |||
The dictionary list is searched from bottom to top. The first match that satisfies any conditions is used (i.e. the one lowest down the list). So if we have: | |||
<pre> | |||
to t@ // unstressed version | |||
to tu: $atend // stressed version | |||
</pre> | |||
then if "to" is at the end of the clause, we get [tu:], if not then we get [t@]. | |||
<p> <hr> | |||
<h3>4.5 Numbers and Character Names</h3> | |||
<h4>4.5.1 Letter names</h4> | |||
The names of individual letters can be given either in the <b>_rules</b> or <b>_list</b> file. Sometimes an individual letter is also used as a word in the language and its pronunciation as a word differs from its letter name. If so, it should be listed in the <b>_list</b> file, preceded by an underscore, to give the letter name (as distinct from its pronunciation as a word). eg. in English: | |||
<pre> _a eI</pre> | |||
<h4>4.5.2 Numbers</h4> | |||
The operation of the TranslateNumber() function is controlled by the language's <code>langopts.numbers</code> option. This constructs spoken numbers from fragments according to various options which can be set for each language. The number fragments are given in the <b>_list</b> file.
<p> | |||
<ul> | |||
<table><tbody align="left"> | |||
<tr> | |||
<td> | |||
_0 to _9 | |||
<td>The numbers 0 to 9 | |||
</tr> | |||
<tr> | |||
<td>_2X _3X<td>Twenty, thirty, etc., used to make numbers 10 to 99 | |||
</tr> | |||
<tr> | |||
<td>_nn<td>Any two digit numbers with a special pronunciation (eg. _15 "fifteen"). | |||
</tr> | |||
<tr><TD>_0C<td>The word for "hundred"</td> | |||
<tr><TD>_1C _2C<td>Special pronunciation for one hundred, two hundred, etc., if needed.</tr> | |||
<tr><TD>_1C0<td>Special pronunciation (if needed) for 100 exactly</td> | |||
<tr><TD>_0M1<td>The word for "thousand"</tr> | |||
<tr><TD>_0M2<td>The word for "million"</tr> | |||
<tr><TD>_0M3<td>The word for 1000000000</tr> | |||
<tr><TD>_1M1 _2T1<td>Special pronunciation for one thousand, two thousand, etc., if needed</td>
<tr><TD>_0and<td>Word for "and" when speaking numbers (eg. "two hundred and twenty").</tr> | |||
<tr><TD>_dpt<td>Word spoken for the decimal point/comma</tr>
<tr><TD>_dpt2<td>Word spoken (if any) at the end of all the digits after a decimal point.</tr> | |||
</tbody></table> | |||
</ul> | |||
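For example, a few of the corresponding entries in an English <b>_list</b> file might look like this (the phoneme strings here are only illustrative, not the actual contents of <b>en_list</b>):
<pre>    _2      t'u:
    _15     fIft'i:n
    _2X     tw'EntI
    _0C     h'Vndr@d
    _0M1    T'aUz@nd
</pre>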
</body> | |||
</html> |
@@ -0,0 +1,65 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak Speech Synthesizer</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<table border="1" cellpadding="10" background="images/sand-light.jpg" width="100%"> | |||
<tbody> | |||
<tr> | |||
<td width="15%"> | |||
<a href="http://sourceforge.net"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=159649&type=2" width="125" height="37" border="0" alt="SourceForge.net Logo" /></a> | |||
</td> | |||
<td> | |||
<div align="center"><h1>eSpeak - Documents</h1></div> | |||
</td> | |||
</tr> | |||
<tr> | |||
<td valign="top"> | |||
<font size="+1"><strong> | |||
<A href="index.html">Home</A> | |||
<p> | |||
<A href="commands.html">Usage</A> | |||
<p> | |||
<A href="languages.html">Languages</A> | |||
</strong></font> | |||
</td> | |||
<td> | |||
<h3><A href="voices.html">Voice Files</A></h3> | |||
Voice files specify a language and other characteristics of a voice. | |||
<h3><A href="mbrola.html">Mbrola Voices</A></h3> | |||
eSpeak can be used as a front-end for Mbrola diphone voices. | |||
<h3><A href="dictionary.html">Pronunciation Dictionary</A></h3> | |||
<ul> | |||
<li>How to add pronunciation corrections. | |||
<li>How to build up pronunciation rules for a new language. | |||
</ul><p> | |||
<h3><A href="add_language.html">Adding a Language</A></h3> | |||
How to add or improve a language. | |||
<h3><A href="phonemes.html">Phonemes</A></h3> | |||
The list of phoneme mnemonics, for use in the Pronunciation Dictionary. | |||
<h3><A href="phontab.html">Phoneme Tables</A></h3> | |||
The tables of the phonemes used by each language, with their properties and sound production. | |||
<h3><A href="speak_lib.h">eSpeak Libary API</A></h3> | |||
API definition and header file for a shared library version of eSpeak. | |||
<h3><A href="ssml.html">Markup tags</A></h3> | |||
SSML (Speech Synthesis Markup Language) and HTML tags recognized by eSpeak. | |||
<h3><A href="editor.html">The espeakedit program</A></h3> | |||
GUI software to edit vowel files and to compile the phoneme data for use by eSpeak.<br> | |||
<ul> | |||
<li><a href="editor_if.html">espeakedit program GUI details</a> | |||
<li><a href="analyse.html">Analysing sound recordings</a> | |||
<li><a href="makephonemes.html">Adjusting phoneme data</a> (to be written) | |||
</ul> | |||
</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</body> | |||
</html> |
@@ -0,0 +1,37 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>espeak: Downloads</title> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>ESPEAK DOWNLOADS</h2> | |||
<hr> | |||
This page gives direct links to eSpeak downloads on one of the Sourceforge mirrors.<br>To get other versions of eSpeak, or use other mirrors, use the <a href="http://sourceforge.net/project/showfiles.php?group_id=159649">Sourceforge download page</a> | |||
<p> | |||
Source code and dictionary data.<br> | |||
<a href="http://kent.dl.sourceforge.net/sourceforge/espeak/espeak-1.23-source.zip">espeak-1.23-source.zip</a> (0.9 MBytes) | |||
<p> | |||
Compiled for Linux i386 (Debian/Ubuntu based distros)<br> | |||
<a href="http://kent.dl.sourceforge.net/sourceforge/espeak/espeak-1.23-linux.zip">espeak-1.23-linux.zip</a> (0.7 MBytes) | |||
<p> | |||
Compiled for Windows. SAPI5 and command-line versions.<br> | |||
<a href="http://kent.dl.sourceforge.net/sourceforge/espeak/espeak-1.23-win.zip">espeak-1.23-win.zip</a> (0.7 MBytes) | |||
<p> | |||
Compiled for RISC OS<br> | |||
<a href="http://kent.dl.sourceforge.net/sourceforge/espeak/espeak-3.23-riscos.zip">espeak-3.23-riscos.zip</a> (0.7 MBytes) | |||
<hr><p> | |||
<b>espeakedit program and the phoneme data.</b> | |||
<p> | |||
Source code and compiled for Linux.<br> | |||
<a href="http://kent.dl.sourceforge.net/sourceforge/espeak/espeakedit-1.23.zip">espeakedit-1.23.zip</a> (6 MBytes) | |||
<p> | |||
Compiled for Windows.<br> | |||
<a href="http://kent.dl.sourceforge.net/sourceforge/espeak/espeakedit-1.23-win.zip">espeakedit-1.23-win.zip</a> (4 MBytes) | |||
</body> | |||
</html> |
@@ -0,0 +1,75 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>espeakedit</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>ESPEAKEDIT PROGRAM</h2> | |||
<hr> | |||
The <strong>espeakedit</strong> program is used to prepare phoneme data for the eSpeak speech synthesizer.<p> | |||
It has two main functions: | |||
<ul> | |||
<li>Prepare keyframe files for individual vowels and voiced consonants. These each contain a sequence of keyframes which define how formant peaks (peaks in the frequency spectrum) vary during the sound.<p> | |||
<li>Process the master <strong>phonemes</strong> file which, by including the phoneme files for the various languages, defines all their phonemes and references the keyframe files and the sound sample files which they use. <strong>espeakedit</strong> processes these and compiles them into the <strong>phondata</strong>, <strong>phonindex</strong>, and <strong>phontab</strong> files in the <strong>espeak-data</strong> directory which are used by the eSpeak speech synthesizer. | |||
</ul> | |||
<hr> | |||
<h3>Installation</h3> | |||
<strong>espeakedit</strong> needs the following packages:<br> | |||
(The package names mentioned here are those from the Ubuntu "Dapper" Linux distribution). | |||
<ul> | |||
<li><strong>sox</strong> (a universal sound sample translator) | |||
<li><strong>libwxgtk2.6-0</strong> (wxWidgets Cross-platform C++ GUI toolkit) | |||
<li><strong>portaudio0</strong> (Portaudio V18, portable audio I/O) | |||
</ul> | |||
In addition, a modified version of <strong>praat</strong> (<a href="http://www.praat.org">www.praat.org</a>) is used to view and analyse WAV sound files.
This needs the package <strong>libmotif3</strong> to run and <strong>libmotif-dev</strong> to compile. | |||
<hr> | |||
<h3>Quick Guide</h3> | |||
This will quickly illustrate the main features. Details of the interface and key commands are given in <a href="editor_if.html">editor_if.html</a><p> | |||
For more detailed information on analysing sound recordings and preparing phoneme definitions and keyframe data see <a href="analyse.html">analyse.html</a> (to be written). | |||
<h4>Compiling Phoneme Data</h4> | |||
<ol> | |||
<li>Run the <strong>espeakedit</strong> program.<p> | |||
<li>Select <b>Data->Compile phoneme data</b> from the menu bar. Dialog boxes will ask you to locate the directory (<b>phsource</b>) which contains the master phonemes file, and the directory (<b>dictsource</b>) which contains the dictionary files (en_rules, en_list, etc). Once specified, espeakedit will remember their locations, although they can be changed later from <b>Options->Paths</b>.<p>
<li>A message in the status line at the bottom of the espeakedit window will indicate whether there are any errors in the phoneme data, and how many languages' dictionary files have been compiled. The compiled data is placed into the <b>espeak-data</b> directory, ready for use by the speak program. If errors are found in the phoneme data, they are listed in a file <b>error_log</b> in the <b>phsource</b> directory.</li>
<p> | |||
NOTE: espeakedit can be used from the command line to compile the phoneme data, with the command: <b> espeakedit --compile</b> | |||
<li>Select <b>Tools->Make vowels chart->From compiled phoneme data</b>. This will look for the vowels in the compiled phoneme data of each language and produce a vowel chart (.png file) in <b>phsource/vowelcharts</b>. These charts plot the vowels' F1 (formant 1) frequency against their F2 frequency, which corresponds approximately to their open/close and front/back positions. The colour in the circle for each vowel indicates its F3 frequency, red indicates a low F3, through yellow and green to blue and violet for a high F3. In the case of a diphthong, a line is drawn from the circle to the position of the end of the vowel. | |||
</ol> | |||
<h4>Keyframe Sequences</h4> | |||
<ol> | |||
<li>Select <b>File->Open</b> from the menu bar and select a vowel file, <b>phsource/vowel/a</b>. This will open a tab in the espeakedit window which contains a sequence of 4 keyframes. Each keyframe shows a black graph, which is the outline of an original analysed spectrum from a sound recording, and also a green line, which shows the formant peaks which have been added (using the black graph as a guide) and which produce the sound.<p> | |||
<li>Click in the "a" tab window and then press the <b>F2</b> key. This will produce and play the sound of the keyframe sequence. The first time you do this, you'll get a save dialog asking where you want the WAV file to be saved. Once you give a location all future sounds will be stored in that same location, although it can be changed from <b>Options->Paths</b>.<p> | |||
<li>Click on the second of the four frames, the one with the red square. Press <b>F1</b>. That plays the sound of just that frame.<p> | |||
<li>Press the <b>1</b> (number one) key. That selects formant F1 and a red triangle appears under the F1 formant peak to indicate that it's selected. Also an = sign appears next to formant 1 in the formants list in the left panel of the window.<p> | |||
<li>Press the left-arrow key a couple of times to move the F1 peak to the left.  The red triangle and its associated green formant peak move to a lower frequency.  Its numeric value in the formants list in the left panel decreases.<p>
<li>Press the <b>F1</b> key again.  The frame will give a slightly different vowel sound.  As you move the F1 peak slightly up and down and then press <b>F1</b> again, the sound changes.  Similarly, if you press the <b>2</b> key to select the F2 formant, moving that will also change the sound.  If you move the F1 peak down to about 700 Hz (and reduce its height a bit with the down-arrow key) and move F2 up to 1400 Hz, then you'll hear an "er" schwa [@] sound instead of the original [a].<p>
<li>Select <b>File->Open</b> and choose <b>phsource/vowel/aI</b>. This opens a new tab labelled "aI" which contains more frames. This is the [aI] diphthong and if you click in the tab window and press <b>F2</b> you'll hear the English word "eye". If you click on each frame in turn and press <b>F1</b> then you can hear each of the keyframes in turn. They sound different, starting with an [A] sound (as in "palm"), going through something like [@] in "her" and ending with something like [I] in "kit" (or perhaps a French é). Together they make the diphthong [aI]. | |||
</ol> | |||
<h4>Text and Prosody Windows</h4> | |||
<ol> | |||
<li>Click on the <b>Text</b> tab in the left panel. Two text windows appear in the panel with buttons <b>Translate</b> and <b>Speak</b> below them.<p> | |||
<li>Type some text into the top window and click the <b>Translate</b> button. The phonetic translation will appear in the lower window.<p> | |||
<li>Click the <b>Speak</b> button. The text will be spoken and a <b>Prosody</b> tab will open in the main window.<p> | |||
<li>Click on a vowel phoneme which is displayed in the Prosody tab. A red line appears under it to indicate that it has been selected.<p> | |||
<li>Use the <b>up-arrow</b> or <b>down-arrow</b> key to move the vowel's blue pitch contour up or down. Then click the <b>Speak</b> button again to hear the effect of the altered pitch. If the adjacent phoneme also has a pitch contour then you may hear a discontinuity in the sound if it no longer matches with the one which you have moved.<p> | |||
<li>Hold down the <b>Ctrl</b> key while using the <b>up-arrow</b> or <b>down-arrow</b> keys. The gradient of the pitch contour will change.<p> | |||
<li>Click with the right mouse button over a phoneme. A menu allows you to select a different pitch envelope shape. Details of the currently selected phoneme appear in the Status line at the bottom of the window. The <b>Stress</b> number gives the stress level of the phoneme (see voices.html for a list).<p> | |||
<li>Click the <b>Translate</b> button. This re-translates the text and restores the original pitches.<p> | |||
<li>Click on a vowel phoneme in the Prosody window and use the <b><</b> and <b>></b> keys to shorten or lengthen it.<p> | |||
</ol> | |||
The Prosody window can be used to experiment with different phoneme lengths and different intonation.<p> | |||
<hr> | |||
</body> | |||
</html> | |||
@@ -0,0 +1,143 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>Editor - Spectrum</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>USER INTERFACE - FORMANT EDITOR</h2> | |||
<hr> | |||
<h3>Frame Sequence Display</h3> | |||
The eSpeak editor can display a number of frame sequences in tabbed windows.  Each frame can contain a short-time frequency spectrum, covering the period of one cycle at the sound's pitch.  Frames can also show:
<ul> | |||
<LI>Blue vertical lines showing the estimated position of the f1 to f5 formants (if the sequence was produced by praat analysis). These should correspond with the peaks in the spectrum, but may not do so exactly<p> | |||
<li>Numbers at the right side of the frame showing the position from the start of the sequence in mS, and the pitch of the sound.<p> | |||
<li>Up to 9 formant peaks (numbered 0 to 8) added by the user, usually to match the peaks in the spectrum, in order to produce the required sound.  These are shown in green, can be moved by keyboard presses as described below, and may merge if they are close together.  If a frame has formant peaks then it is a Keyframe and is shown with a pale yellow background.<p>
<li>If formant peaks are present, a relative amplitude (r.m.s.) value is shown at the right side of the frame. | |||
</ul> | |||
<h3>Text Tab</h3> | |||
Enter text in the top left text window. Click the <b>Translate</b> button to see the phonetic transcription in the text window below. Then click the <b>Speak</b> button to speak the text and show the results in the <b>Prosody</b> tab, if that is open. | |||
<p> | |||
If changes are made in the <b>Prosody</b> tab, then clicking <b>Speak</b> will speak the modified prosody while <b>Translate</b> will revert to the default prosody settings for the text. | |||
<p> | |||
To enter phonetic symbols (Kirshenbaum encoding) in the top left text window, enclose them within  [[ ]].
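For example:<br>
<code>[[D,Is Iz sVm f@n'EtIk t'Ekst 'InpUt]]</code>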
<h3>Spect Tab</h3> | |||
The "Spect" tab in the left panel of the eSpeak editor shows information about the currently selected frame and sequence. | |||
<ul> | |||
<li>The <strong>Formants</strong> section displays the Frequency, Height, and Width of each formant peak (peaks 0 to 8). Peaks 6, 7, 8 don't have a variable width.<p> | |||
<li><strong>% amp - Frame</strong> can be used to adjust the amplitude of the frame.  If you change this value then the rms amplitude value at the right side of the frame will change.  The formant peaks don't change, just the overall amplitude of the frame.<p>
<li><strong>mS</strong> shows the time in mS until the next keyframe (or end of sequence if there is none).  The spin control initially shows the same value, but this can be changed in order to increase or decrease the effective length of a keyframe.<p>
<li><strong>% amp - Sequence</strong> adjusts the amplitude of the whole sequence.  Changing this value changes the rms amplitudes of all the keyframes in the sequence.<p>
<li><strong>% mS - Sequence</strong> shows the total length of the sequence.<p>
<li><strong>Graph</strong><br> | |||
Yellow vertical lines show the position of keyframes within the sequence.<br> | |||
Black bars on these show the frequencies of formant peaks which have been set at these keyframes.<br> | |||
Thick red lines, if present, show the formants, as detected in the original analysis.<br> | |||
Thin black line, if present, shows the pitch profile measured in the original analysis. | |||
</ul> | |||
<h3>Key Commands</h3> | |||
<ul> | |||
<li><strong>Selection</strong>.<p> | |||
The selected frame(s) are shown with a red border. The selected formant peak is also indicated by an equals ("=") sign next to its number in the "Spect" panel to the right of the window.<p> | |||
The selected formant peak is shown with a red triangle under the peak.<p> | |||
Keyframes are shown with a pale yellow background. A keyframe is any frame with any formant peaks which are not zero height. If all formant peaks become zero height, the frame is no longer a keyframe. If you increase a peak's height the frame becomes a keyframe. | |||
<dl> | |||
<dt><strong>Numbers 0 to 8</strong> | |||
<dd>Select formant peak number 0 to 8. | |||
<dt><strong>Page Up/Down</strong> | |||
<dd>Move to next/previous frame | |||
</dl> | |||
<li><strong>Formant movement</strong>. With the following keys, holding down <b>Shift</b> causes slower movement. | |||
<dl> | |||
<dt>Left
<dd>Moves the selected formant peak to a lower frequency.
<dt>Right
<dd>Moves the selected formant peak to a higher frequency.
<dt>Up | |||
<dd>Increases height of the selected formant peak. | |||
<dt>Down | |||
<dd>Decreases height of the selected formant peak. | |||
<dt><strong><</strong> | |||
<dd>Narrows the selected formant peak. | |||
<dt><strong>></strong> | |||
<dd>Widens the selected formant peak. | |||
<dt><strong>CTRL <</strong> | |||
<dd>Narrows the selected formant peak. | |||
<dt><strong>CTRL ></strong> | |||
<dd>Widens the selected formant peak. | |||
<dt><b>/</b> | |||
<dd>Makes the selected formant peak symmetrical. | |||
</dl> | |||
<li><strong>Frame Cut and Paste</strong> | |||
<dl> | |||
<dt><b>CTRL A</b> | |||
<dd>Select all frames in the sequence. | |||
<dt><b>CTRL C</b> | |||
<dd>Copy selected frames to (internal) clipboard. | |||
<dt><b>CTRL V</b> | |||
<dd>Paste frames from the clipboard to overwrite the contents of the selected frame and the frames which follow it. Only the formant peaks information is pasted. | |||
<dt><b>CTRL SHIFT V</b> | |||
<dd>Paste frames from the clipboard to insert them above the selected frame.
<dt><b>CTRL X</b> | |||
<dd>Delete the selected frames. | |||
</dl> | |||
<li><strong>Frame editing</strong> | |||
<dl> | |||
<dt><b>CTRL D</b> | |||
<dd>Copy the formant peaks down to the selected frame from the next keyframe above. | |||
<dt><b>CTRL SHIFT D</b> | |||
<dd>Copy the formant peaks up to the selected frame from the next key-frame below. | |||
<dt><b>CTRL Z</b> | |||
<dd>Set all formant peaks in the selected frame to zero height. It is no longer a key-frame. | |||
<dt><b>CTRL I</b> | |||
<dd>Set the formant peaks in the selected frame as an interpolation between the next keyframes above and below it. A dialog box allows you to enter a percentage. 50% gives values half-way between the two adjacent key-frames, 0% gives values equal to the one above, and 100% equal to the one below. | |||
</dl> | |||
<li><strong>Display and Sound</strong> | |||
<dl> | |||
<dt><b>CTRL Q</b> | |||
<dd>Shows interpolated formant peaks on non-keyframes. These frames don't become keyframes until any of the peaks are edited to increase their height. | |||
<dt><b>CTRL SHIFT Q</b> | |||
<dd>Removes the interpolated formant peaks display. | |||
<dt><b>CTRL G</b> | |||
<dd>Toggle grid on and off. | |||
<dt><b>F1</b> | |||
<dd>Play sound made from the one selected keyframe. | |||
<dt><b>F2</b> | |||
<dd>Play sound made from all the keyframes in the sequence. | |||
</ul> | |||
<p> | |||
<hr> | |||
<h2>USER INTERFACE - PROSODY EDITOR</h2> | |||
<hr> | |||
<ul><LI> | |||
<dl> | |||
<dt><b>Left</b> | |||
<dd>Move to previous phoneme. | |||
<dt><b>Right</b> | |||
<dd>Move to next phoneme. | |||
<dt><b>Up</b> | |||
<dd>Increase pitch. | |||
<dt><b>Down</b> | |||
<dd>Decrease pitch. | |||
<dt><b>Ctrl Up</b> | |||
<dd>Increase pitch range. | |||
<dt><b>Ctrl Down</b> | |||
<dd>Decrease pitch range. | |||
<dt><b>></b> | |||
<dd>Increase length. | |||
<dt><b><</b> | |||
<dd>Decrease length. | |||
</dd> | |||
</dl> | |||
</LI> | |||
</ul> | |||
</body> | |||
</html> |
@@ -0,0 +1,78 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak: Speech Synthesizer</title> | |||
</head> | |||
<body> | |||
<table border="1" cellpadding="10" background="images/sand-light.jpg"> | |||
<tbody> | |||
<tr> | |||
<td width="15%" valign="top"> | |||
<a href="http://sourceforge.net"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=159649&type=2" width="125" height="37" border="0" alt="SourceForge.net Logo" /></a> | |||
</td> | |||
<td> | |||
<div align="center"><IMG src="images/lips.png" width="193" height="172" border="0"> | |||
<h1>eSpeak text to speech</h1></div> | |||
<div align="center"> | |||
(email) jonsd at users dot sourceforge.net<br> | |||
<a href="download.html"><strong>Download</strong></a> | |||
<a href="http://sourceforge.net/forum/?group_id=159649"><strong>Forum</strong></a> | |||
</div> | |||
</td> | |||
</tr> | |||
<tr> | |||
<td valign="top"> | |||
<font size="+1"><strong> | |||
<A href="commands.html">Usage</a> | |||
<p> | |||
<A href="languages.html">Languages</A> | |||
<p> | |||
<A href="docindex.html">Documents</A> | |||
<p> | |||
<A href="samples.html">Samples</A> | |||
</strong></font> | |||
</td> | |||
<td> | |||
eSpeak is a compact open source software speech synthesizer for English and other languages. | |||
<a href="http://espeak.sourceforge.net/"><strong>http://espeak.sourceforge.net</strong></a> | |||
<p> | |||
eSpeak produces good quality English speech. It uses a different synthesis method from other open source TTS engines, and sounds quite different. It's perhaps not as natural or "smooth", but I find the articulation clearer and easier to listen to for long periods. | |||
<p> | |||
It can run as a command line program to speak text from a file or from stdin. A shared library version is also available. | |||
<ul> | |||
<li>Includes different Voices, whose characteristics can be altered. | |||
<li>Can produce speech output as a WAV file. | |||
<li>SSML (Speech Synthesis Markup Language) is supported (not complete), and also HTML. | |||
<li>Compact size. The program and its data, including several languages, totals about 700 kbytes. | |||
<li>Can translate text to phoneme codes, so it could be adapted as a front end for another speech synthesis engine. | |||
<li>Potential for other languages. Several are included in varying stages of progress. Help from native speakers for these or other languages is welcomed. | |||
<li>Development tools available for producing and tuning phoneme data. | |||
<li>Written in C++. | |||
</ul> | |||
<p> | |||
It works well as a "Talker" with the KDE text to speech system (KTTS), as an alternative to Festival for example. As such, it can speak text which has been selected into the clipboard, or directly from the Konquerer browser or the Kate editor. A Gnome Speech driver is now available. | |||
<p> | |||
I regularly use it to listen to blogs and news sites. I prefer the sound through a domestic stereo system rather than my small computer speakers. | |||
<hr> | |||
<strong>Windows Version</strong>. There is now a Windows SAPI5 version of eSpeak. It can be used with screen readers such as NVDA, JAWS, Supernova, and Window-Eyes.<br> | |||
This is available as a Windows installer package from the eSpeak <a href="download.html"><strong>Download</strong></a> page.<p> | |||
A Windows version of the espeakedit program is also available. | |||
<hr> | |||
<strong>Languages</strong>. The eSpeak speech synthesizer supports several languages, however in most cases these are initial drafts and need more work to improve them. Assistance from native speakers is welcome for these, or other new languages. Please contact me if you want to help.<p> | |||
eSpeak does text to speech synthesis for the following languages, some better than others. Afrikaans, Croatian, Czech, Dutch, English, Esperanto, Finnish, French, German, Greek, Hindi, Hungarian, Italian, Norwegian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swahili, Swedish, Vietnamese, Welsh. See <a href="languages.html">Languages</a>. | |||
<hr> | |||
<strong>espeakedit</strong> is a GUI program used to prepare and compile phoneme data. It is now available for download. Documentation is currently sparse, but if you want to use it to add or improve language support, let me know. | |||
<hr> | |||
<strong>speak</strong> was originally written for Acorn/RISC_OS computers starting in 1995.  This version is an update and re-write, including a relaxation of the original memory and processing power constraints, and with provision for additional languages for anyone interested in adding them.
<p> | |||
The project name <strong>speak</strong> had already been taken by another project on SourceForge (for a Windows TTS front-end) so I added a letter 'e' to the front to make <strong>eSpeak</strong>. For now, the program executable remains <strong>speak</strong> and is referred to as such in the documentation. | |||
</td> | |||
</tr> | |||
</tbody> | |||
</table> | |||
</body> | |||
</html> |
@@ -0,0 +1,220 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak Speech Synthesizer</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> | |||
</head> | |||
<body> | |||
<A href="index.html">Back</A> | |||
<hr> | |||
<h2>3. LANGUAGES</h2> | |||
<hr> | |||
<h4>Help Needed</h4> | |||
Many of these are just experimental attempts at these languages, produced after a quick reading of the corresponding article on wikipedia.org. They will need work or advice from native speakers to improve them. Please contact me if you want to advise or assist with these or other languages.<p> | |||
The sound of some phonemes may be poorly implemented, particularly [r] since I'm English and therefore unable to make a "proper" [r] sound.<p> | |||
A major factor is the rhythm or cadence.  An Italian speaker told me the Italian voice improved from "difficult to understand" to "good" by changing the relative length of stressed syllables.  Identifying unstressed function words in the xx_list file is also important to make the speech flow well.  See <a href="add_language.html">Adding or Improving a Language</a>.
<h4>Character sets</h4> | |||
Languages recognise text either as UTF8 or alternatively in an 8-bit character set which is appropriate for that language. For example, for Polish this is Latin2, for Russian it is KOI8-R. This choice can be overridden by a line in the voices file to specify an ISO 8859 character set, eg. for Russian the line:<br> | |||
<pre> charset 5</pre> | |||
will mean that ISO 8859-5 is used as the 8-bit character set rather than KOI8-R. | |||
<p> | |||
In the case of a language which uses a non-Latin character set (eg. Greek or Russian) if the text contains a word with Latin characters then that particular word will be pronounced using English pronunciation rules and English phonemes. Speaking entirely English text using a Greek or Russian voice will sound OK, but each word is spoken separately so it won't flow properly. | |||
<p> | |||
Sample texts in various languages can be found at <a href="http://meta.wikimedia.org/wiki/List_of_Wikipedias"> http://<language>.wikipedia.org</a> and <a href="http://www.gutenberg.org">www.gutenberg.org</a> | |||
<h3>3.1 Voice Files</h3> | |||
A number of Voice files are provided in the <code>espeak-data/voices</code> directory. | |||
You can select one of these with the <strong>-v <voice filename></strong> parameter to the | |||
speak command, eg: | |||
<pre> espeak -vaf</pre> | |||
to speak using the Afrikaans voice. | |||
<p> | |||
For details of the voice files see <a href="voices.html">Voices</a>. | |||
<h4>Default Voice</h4> | |||
<ul> | |||
<dl> | |||
<dt> | |||
<strong>default</strong><br> | |||
<dd> This voice is used if none is specified in the speak command. Copy your preferred voice to "default" so you can use the speak command without the need to specify a voice.</dd> | |||
</dl> | |||
</ul> | |||
<h3>3.2 English Voices</h3> | |||
<ul><dl> | |||
<dt> | |||
<strong>en</strong><br> | |||
<dd> is the standard default English voice.</dd> | |||
<p> | |||
<dt> | |||
<strong>en-sc</strong><br> | |||
<dd> Scottish English. | |||
<p> | |||
<dt> | |||
<strong>en-r</strong><br> | |||
<dd> Some slight vowel changes, and a "rhotic" accent, where "r" is pronounced even when not followed by a vowel. This may sound less "British" to an American. | |||
<p> | |||
<dt> | |||
<strong>en-n<br> | |||
en-rp<br> | |||
en-wm</strong><br> | |||
<dd> are different English voices. These can be considered caricatures of | |||
various British accents: Northern, Received Pronunciation, West Midlands | |||
respectively.</dd> | |||
<p> | |||
</dl></ul> | |||
<h3>3.3 Voice Variants</h3> | |||
To make alternative voices for a language, you can create additional voice files in espeak-data/voices which contain commands to change various voice and pronunciation attributes.  See <a href="voices.html">voices.html</a>.
<p> | |||
Alternatively there are some preset voice variants which can be applied to any of the language voices, by appending <code>+</code> and a number. Their effects are defined by files in <code>espeak-data/voices/!v</code>. | |||
<p> | |||
The variant numbers are <code> +1 +2 +3 +4 +5 </code> for male voices and <code> +11 +12 +13 +14 </code> for female voices. For example: | |||
<pre> espeak -ven+3</pre> | |||
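or, to speak English with one of the female variants:
<pre>   espeak -ven+12</pre>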
<h3>3.4 Other Languages</h3> | |||
The eSpeak speech synthesizer does text to speech for the following additional languages.
<ul> | |||
<dl> | |||
<p> | |||
<dt> | |||
<strong>af Afrikaans</strong><br> | |||
<dd>This has been worked on by a native speaker and it should be OK.</dd> | |||
<p> | |||
<dt> | |||
<strong>cs Czech</strong><br> | |||
<dd>Usable. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>de German</strong><br> | |||
<dd>This has improved from earlier versions.  Remaining problems are stress placement (which, like English, is irregular), prosody, and compound words, where correct detection of the sub-word boundaries would probably be needed for accurate pronunciation.
</dd> | |||
<p> | |||
<dt> | |||
<strong>el Greek</strong><br> | |||
<dd>Stress position is marked in text and spelling is fairly regular, so it shouldn't be too bad. It uses a different alphabet and switches to English pronunciation for words which contain Latin characters a-z.</dd> | |||
<p> | |||
<dt> | |||
<strong>eo Esperanto</strong><br> | |||
<dd>Esperanto has simple and regular pronunciation rules, so it should be OK, although I'm not
certain how it's supposed to sound, other than what I've read in an introduction.
Text can be either UTF-8, the Latin3 alphabet, or Latin1 text which uses the
convention of two-letter combinations (cx, gx, etc.).</dd>
<p> | |||
<dt> | |||
<strong>es Spanish</strong><br> | |||
<dd>Not much feedback yet, but spelling is regular and stress is explicitly marked when it deviates from the normal rules, so it might be reasonably intelligible. This would be a good opportunity for a native Spanish speaker to assist. Perhaps we could have different variants for Castilian Spanish, Mexican Spanish, etc., in a similar way to how I've done different English accents.</dd> | |||
<p> | |||
<dt> | |||
<strong>fi Finnish</strong><br> | |||
<dd>This has had assistance from native speakers and should be usable. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>fr French</strong><br> | |||
<dd>Needs improvement, both for spelling-to-phoneme rules and the sounds and prosody. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>hr Croatian</strong><br> | |||
<dd>Usable, but I'm unsure whether wrongly stressed syllables are a problem.
</dd> | |||
<p> | |||
<dt> | |||
<strong>hu Hungarian</strong><br> | |||
<dd>This has had assistance from a native speaker and it should be OK. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>it Italian</strong><br> | |||
<dd>This has had some feedback from a native speaker but more work is needed. Spelling is fairly regular, but stress marks and vowel accents are often omitted from text, so for some words the dictionary/exceptions list will need to determine the stress position or whether to use open/close [e] or [E] and [o] or [O].</dd> | |||
<p> | |||
<dt> | |||
<strong>pt Portuguese (Brazil)</strong><br> | |||
<dd>Brazilian Portuguese. This has had assistance from a native speaker and it should be OK. Like Italian there is further work to do about the ambiguity in the spelling between open/close "e" and "o" vowels.<p> | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>pt-pt Portuguese (European)</strong><br> | |||
<dd> | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>ro Romanian</strong><br> | |||
<dd>Probably OK. More work is needed to improve the position of stress within words. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>sk Slovak</strong><br> | |||
<dd>A little initial feedback. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>sv Swedish</strong><br> | |||
<dd>This has now had some work done on the pronunciation rules, so it should be usable.
</dd> | |||
<p> | |||
<dt> | |||
<strong>sw Swahili</strong><br>
<dd>Not much feedback yet, but the spelling and stress rules are fairly regular, so it's probably usable. | |||
</dd> | |||
<p> | |||
</dl></ul> | |||
<h3>3.5 Provisional Languages</h3> | |||
These languages are only initial naive implementations which have had little or no feedback and improvement from native speakers. | |||
<ul> | |||
<dl> | |||
<p> | |||
<dt> | |||
<strong>cy Welsh</strong><br> | |||
<dd>An initial guess, awaiting feedback. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>hi Hindi</strong><br> | |||
<dd>This is interesting because it uses the Devanagari characters. I'm not sure about Hindi stress rules, and I expect the sound of aspirated/unaspirated consonant pairs needs improvement. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>nl Dutch</strong><br> | |||
<dd>Probably needs improvement of the spelling-to-phoneme rules. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>is Icelandic</strong><br> | |||
<dd>An initial guess, awaiting feedback. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>no Norwegian</strong><br> | |||
<dd>An initial guess, awaiting feedback. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>pl Polish</strong><br> | |||
<dd>Some initial feedback, but I'm told it's difficult to understand, so more work is needed. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>ru Russian</strong><br> | |||
<dd>So far it's just an initial attempt with basic pronunciation rules. Work is needed especially on the consonants. Russian has two versions of most consonants, "hard" and "soft" (palatalised) and in most cases eSpeak doesn't yet make a proper distinction.<br> | |||
Russian stress position is unpredictable so a large lookup dictionary is needed of those words where eSpeak doesn't guess correctly. To avoid increasing the size of the basic eSpeak package, this is available separately at: <a href="http://espeak.sourceforge.net/data/">http://espeak.sourceforge.net/data/</a> | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>vi Vietnamese</strong><br> | |||
<dd>This is interesting because it's a tone language. I don't know how it should sound, so it's just a guess and I need feedback. | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>zhy Cantonese Chinese</strong><br> | |||
<dd>Just a naive simple one-to-one translation from single Simplified Chinese characters to phonetic equivalents in Cantonese. No attempt so far at disambiguation, grouping characters into words, or adjusting tones according to their surrounding syllables. This voice needs Chinese character to phonetic translation data, which is available as a separate download at: <a href="http://espeak.sourceforge.net/data/">http://espeak.sourceforge.net/data/</a>.<br>The voice can also read Jyutping romanised text. | |||
</dd> | |||
</dl></ul>
<h3>3.6 Mbrola Voices</h3>
Some additional voices, whose names start with <b>mb-</b> (for example <b>mb-en1</b>) use eSpeak as a front-end to Mbrola diphone voices.  eSpeak does the spelling-to-phoneme translation and intonation.
See <a href="mbrola.html">mbrola.html</a>. | |||
<p> | |||
</body> | |||
</html> |
@@ -0,0 +1,93 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>espeakedit: Mbrola Voices</title> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>MBROLA VOICES</h2> | |||
<hr> | |||
The Mbrola project is a collection of diphone voices for speech synthesis. They do not include any text-to-phoneme translation, so this must be done by another program. The Mbrola voices are cost-free but are not open source. They are available from the Mbrola website at:<br> | |||
<a href="http://www.tcts.fpms.ac.be/synthesis/mbrola/mbrcopybin.html">http://www.tcts.fpms.ac.be/synthesis/mbrola/mbrcopybin.html</a> | |||
<p> | |||
eSpeak can be used as a front-end to Mbrola. It provides the spelling-to-phoneme translation and intonation, which Mbrola then uses to generate speech sound. | |||
<h3>Voice Names</h3> | |||
To use a Mbrola voice, eSpeak needs information to translate from its own phonemes to the equivalent Mbrola phonemes. This has been set up for only some voices so far. | |||
<p> | |||
The eSpeak voices which use Mbrola are named as:<br> | |||
<b>mb-</b>xxx | |||
<p> | |||
where xxx is the name of a Mbrola voice (eg. <b>mb-en1</b> for the Mbrola "<b>en1</b>" English voice). These voice files are in eSpeak's directory <code>espeak-data/voices/mbrola</code>. | |||
<p> | |||
The installation instructions below use the Mbrola voice "en1" as an example. You can use other mbrola voices for which there is an equivalent eSpeak voice in <code>espeak-data/voices/mbrola</code>. | |||
<p> | |||
There are some additional eSpeak Mbrola voices which speak English text using a Mbrola voice for a different language. These contain the name of the Mbrola voice with a suffix <b>-en</b>. For example, the voice <b>mb-de4-en</b> will speak English text with a German accent by using the Mbrola <b>de4</b> voice. | |||
<h3>Windows Installation</h3> | |||
The SAPI5 version of eSpeak uses the mbrola.dll. | |||
<ol> | |||
<li>Install eSpeak. Include the voice <b>mb-en1</b> in the | |||
list of voices during the eSpeak installation. | |||
<p> | |||
<li>Install the PC/Windows version of Mbrola (MbrolaTools35.exe) from: | |||
<a href="http://www.tcts.fpms.ac.be/synthesis/mbrola/bin/pcwin/MbrolaTools35.exe"> http://www.tcts.fpms.ac.be/synthesis/mbrola/bin/pcwin/MbrolaTools35.exe</a>. | |||
<p> | |||
<li>Get the <b>en1</b> voice from: | |||
<a href="http://www.tcts.fpms.ac.be/synthesis/mbrola/mbrcopybin.html"> http://www.tcts.fpms.ac.be/synthesis/mbrola/mbrcopybin.html</a> | |||
unpack the archive, and copy the "<b>en1</b>" data file (not the whole "en1" | |||
directory) into | |||
<code>C:/Program Files/eSpeak/espeak-data/mbrola</code>. | |||
<p> | |||
<li>Use the voice <b>espeak-MB-EN1</b> from the list of SAPI5 voices. | |||
</ol> | |||
<h3>Linux Installation</h3> | |||
I don't think there's a Linux shared library version of Mbrola (equivalent to mbrola.dll), so eSpeak has to pipe phoneme data to the command-line Mbrola. | |||
<ol> | |||
<li>To install the Linux Mbrola binary, download: | |||
<a href="http://www.tcts.fpms.ac.be/synthesis/mbrola/bin/pclinux/mbr301h.zip"> http://www.tcts.fpms.ac.be/synthesis/mbrola/bin/pclinux/mbr301h.zip</a>. | |||
Unpack the archive, and copy and rename the file: <code>mbrola-linux-i386</code> to | |||
<code>mbrola</code> somewhere in your executable path (eg. <code>/usr/bin/mbrola</code> ). | |||
<p> | |||
<li>Get the en1 voice from: | |||
<a href="http://www.tcts.fpms.ac.be/synthesis/mbrola/mbrcopybin.html"> http://www.tcts.fpms.ac.be/synthesis/mbrola/mbrcopybin.html</a>. | |||
Unpack the archive, and copy the "<b>en1</b>" data file (not the whole "en1" | |||
directory) somewhere convenient (eg. <code>/usr/share/mbrola/en1</code> ). | |||
<p> | |||
<li>If you use the eSpeak voice "<b>mb-en1</b>" then eSpeak will generate | |||
Mbrola phoneme data on its stdout. You can pipe this into Mbrola. | |||
<p> | |||
<code>espeak -v mb-en1 -f textfile | mbrola -e /usr/share/mbrola/en1 - | |||
test.wav</code> | |||
<p> | |||
will put the Mbrola speech output into a WAV file. Or you can pipe the output from Mbrola through aplay: | |||
<p> | |||
<code>espeak -v mb-en1 -f textfile | mbrola -e /usr/share/mbrola/en1 - - | aplay -r16000 -fS16</code> | |||
<p> | |||
The -e option prevents Mbrola from stopping if it finds a combination | |||
of phonemes which it doesn't recognise. | |||
</ol> | |||
<h3>Mbrola Voice Files</h3> | |||
eSpeak's voice files for Mbrola voices are in directory <code>espeak-data/voices/mbrola</code>. They contain a line:<br> | |||
<code>mbrola <voice> <translation></code> | |||
<br> | |||
eg.<br> | |||
<code>mbrola en1 en1_phtrans</code> | |||
<ul> | |||
<li><b><voice></b> is the name of the Mbrola voice. | |||
<p> | |||
<li><b><translation></b> is a translation file to convert between eSpeak phonemes and the equivalent Mbrola phonemes. These are kept in: | |||
<code>espeak-data/mbrola_ph</code> | |||
</ul> | |||
They are binary files which are compiled, using espeakedit, from source files in <code>phsource/mbrola</code>. Details to be defined. | |||
</body> | |||
</html> |
@@ -0,0 +1,168 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak: Phonemes</title>
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>PHONEMES</h2> | |||
<hr> | |||
In general a different set of phonemes can be defined for each language. | |||
<p> | |||
In most cases different languages inherit the same basic set of consonants. They can add to these or modify them as needed. | |||
<p> | |||
The phoneme mnemonics are based on the scheme by Kirshenbaum which represents International Phonetic Alphabet symbols using ascii characters. See: <a href="http://www.kirshenbaum.net/IPA/ascii-ipa.pdf">www.kirshenbaum.net/IPA/ascii-ipa.pdf</a>. | |||
<p> | |||
Phoneme mnemonics can be used directly in the text input to <strong>espeak</strong>. They are enclosed within double square brackets. Spaces are used to separate words, and all stressed syllables must be marked explicitly. eg:<br> | |||
<code>[[D,Is Iz sVm f@n'EtIk t'Ekst 'InpUt]]</code> | |||
<h3>English Consonants</h3> | |||
<table> | |||
<tbody valign=top> | |||
<tr> | |||
<td width=25><code>[p]</code><td width=80> | |||
<td width=25><code>[b]</code><td width=80> | |||
<tr> | |||
<td><code>[t]</code><td> | |||
<td><code>[d]</code><td> | |||
<tr> | |||
<td><code>[tS]</code><td><b>ch</b>urch | |||
<td><code>[dZ]</code><td><b>j</b>udge | |||
<tr> | |||
<td><code>[k]</code><td> | |||
<td><code>[g]</code><td> | |||
<tr><td><p> | |||
<tr> | |||
<td><code>[f]</code><td> | |||
<td><code>[v]</code><td> | |||
<tr> | |||
<td><code>[T]</code><td><b>th</b>in | |||
<td><code>[D]</code><td><b>th</b>is | |||
<tr> | |||
<td><code>[s]</code><td> | |||
<td><code>[z]</code><td> | |||
<tr> | |||
<td><code>[S]</code><td><b>sh</b>op | |||
<td><code>[Z]</code><td>plea<b>s</b>ure | |||
<tr> | |||
<td><code>[h]</code><td> | |||
<tr><td><p> | |||
<tr> | |||
<td><code>[m]</code><td> | |||
<td><code>[n]</code><td> | |||
<tr> | |||
<td><code>[N]</code><td>si<b>ng</b> | |||
<tr> | |||
<td><code>[l]</code><td> | |||
<td><code>[r]</code><td><b>r</b>ed (Omitted if not immediately followed by a vowel). | |||
<tr> | |||
<td><code>[j]</code><td><b>y</b>es | |||
<td><code>[w]</code><td> | |||
<tr><td><p> | |||
<tr><td colspan=3><strong>Some Additional Consonants</strong></td> | |||
<p> | |||
<tr> | |||
<td><code>[C]</code><td>German i<b>ch</b> | |||
<td><code>[x]</code><td>German Bu<b>ch</b>
<tr> | |||
<td><code>[l^]</code><td>Italian <b>gl</b>i | |||
<td><code>[n^]</code><td>Spanish <b>ñ</b> | |||
</tbody> | |||
</table> | |||
<h3>English Vowels</h3> | |||
These are the phonemes which are used by the English spelling-to-phoneme translations (en_rules and en_list). In some varieties of English different phonemes may have the same sound, but they are kept separate because they may differ in another variety. | |||
<p> | |||
In rhotic accents, such as General American, the phonemes <code>[3:], [A@], [e@], [i@], [O@], [U@] </code> include the "r" sound. | |||
<p> | |||
<table> | |||
<tbody valign=top> | |||
<tr><td width=25><code>[@]</code> | |||
<td width=60>alph<b>a</b><td width=80>schwa | |||
<tr><td><code>[3]</code> | |||
<td>bett<b>er</b><td>rhotic schwa. In British English this is the same as <code>[@]</code>, but it includes 'r' colouring in American and other rhotic accents. In these cases a separate <code>[r]</code> should not be included unless it is followed immediately by another vowel. | |||
<tr><td><code>[3:]</code><td>n<b>ur</b>se | |||
<tr><td><code>[@L]</code><td>simp<b>le</b> | |||
<tr><td><code>[@2]</code><td>the<td>Used only for "the". | |||
<tr><td><code>[@5]</code><td>to<td>Used only for "to". | |||
<tr><td><p> | |||
<tr><td><code>[a]</code><td>tr<b>a</b>p | |||
<tr><td><code>[aa]</code><td>b<b>a</b>th<td>This is <code>[a]</code> in some accents, <code>[A:]</code> in others. | |||
<tr><td><code>[a2]</code><td><b>a</b>bout<td>This may be <code>[@]</code> or may be a more open schwa. | |||
<tr><td><code>[A:]</code><td>p<b>al</b>m | |||
<tr><td><code>[A@]</code><td>st<b>ar</b>t | |||
<tr><td><p> | |||
<tr><td><code>[E]</code><td>dr<b>e</b>ss | |||
<tr><td><code>[e@]</code><td>squ<b>are</b> | |||
<tr><td><p> | |||
<tr><td><code>[I]</code><td>k<b>i</b>t | |||
<tr><td><code>[I2]</code><td><b>i</b>ntend<td>As <code>[I]</code>, but also indicates an unstressed syllable. | |||
<tr><td><code>[i]</code><td>happ<b>y</b><td>An unstressed "i" sound at the end of a word. | |||
<tr><td><code>[i:]</code><td>fl<b>ee</b>ce | |||
<tr><td><code>[i@]</code><td>n<b>ear</b> | |||
<tr><td><p> | |||
<tr><td><code>[0]</code><td>l<b>o</b>t | |||
<tr><td><p> | |||
<tr><td><code>[V]</code><td>str<b>u</b>t | |||
<tr><td><p> | |||
<tr><td><code>[u:]</code><td>g<b>oo</b>se | |||
<tr><td><code>[U]</code><td>f<b>oo</b>t | |||
<tr><td><code>[U@]</code><td>c<b>ure</b> | |||
<tr><td><p> | |||
<tr><td><code>[O:]</code><td>th<b>ou</b>ght | |||
<tr><td><code>[O@]</code><td>f<b>or</b>ce | |||
<tr><td><p> | |||
<tr><td><code>[aI]</code><td>pr<b>i</b>ce | |||
<tr><td><code>[eI]</code><td>f<b>a</b>ce | |||
<tr><td><code>[OI]</code><td>ch<b>oi</b>ce | |||
<tr><td><code>[aU]</code><td>m<b>ou</b>th | |||
<tr><td><code>[oU]</code><td>g<b>oa</b>t | |||
<tr><td><code>[aI@]</code> | |||
<tr><td><code>[aU@]</code> | |||
</tbody> | |||
</table> | |||
<h3>Some Additional Vowels</h3> | |||
Other languages will have their own vowel definitions, eg: | |||
<table> | |||
<tbody valign=top> | |||
<tr><td width=30><code>[e]</code><td>German <b>eh</b>, French <b>é</b> | |||
<tr><td><code>[o]</code><td>German <b>oo</b>, French <b>o</b> | |||
<tr><td><code>[y]</code><td>German <b>ü</b>, French <b>u</b> | |||
<tr><td><code>[Y]</code><td>German <b>ö</b>, French <b>oe</b> | |||
</tbody> | |||
</table> | |||
<hr> | |||
</body> | |||
</html> |
@@ -0,0 +1,211 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak: Phoneme tables</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<A href="docindex.html">Back</A> | |||
<hr> | |||
<h2>PHONEME TABLES</h2> | |||
<hr> | |||
A phoneme table defines all the phonemes which are used by a language, together with their properties and the data for their production as sounds. | |||
<p> | |||
Generally each language has its own phoneme table, although additional phoneme tables can be used for different voices within the language. These alternatives are referenced from Voices files. | |||
<p> | |||
A phoneme table does not need to define all the phonemes used by a language. Instead it can reference a previously defined phoneme table, whose phonemes it inherits. These can then be used as they are, or overridden by new definitions, or new phonemes added. For example, a phoneme table may redefine (or add) some of the vowels that it uses, but inherit most of its consonants from a standard set. | |||
<p> | |||
<blockquote>Note: This specification is not yet complete and does not include the definitions of the formant sequence specifications. | |||
<br> | |||
The source files for the phoneme data are in the "phsource" directory in the espeakedit download package.
</blockquote> | |||
<p> <hr> | |||
<h3>Phoneme files</h3> | |||
The phoneme tables are defined in a master phoneme file, named <strong>phonemes</strong>. This starts with the <strong>base</strong> phoneme table followed by other phoneme tables for languages and voices which inherit phonemes from the <strong>base</strong> table or from each other. | |||
<p> | |||
In addition to phoneme definitions, the phoneme file can contain the following: | |||
<dl> | |||
<dt><strong>include</strong> <filename> | |||
<dd>Includes the text of the specified file at this point. This allows different phoneme tables to be kept in different text files, for convenience. <filename> is a relative path. The included file can itself contain <strong>include</strong> statements. | |||
<p> | |||
<dt><strong>phonemetable</strong> <name> <parent> | |||
<dd>Starts a new phoneme table, and ends the previous table.<br> | |||
<name> Is the name of this phoneme table. This name is used in Voices files.<br> | |||
<parent> Is the name of a previously defined phoneme table whose phoneme definitions are inherited by this one. The name <strong>base</strong> indicates the first (base) phoneme table. | |||
<p> | |||
<dt><strong>phonemenumber</strong> <integer> | |||
<dd>This statement is used at the start of the master <strong>phonemes</strong> file to define some specific code numbers for various phonemes which are used directly within the <strong>speak</strong> program. | |||
</dl> | |||
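For illustration, a master phonemes file using these statements might be laid out as sketched below.  The table and file names here are only examples, not the actual contents of <strong>phsource/phonemes</strong>.
<pre>
   phonemetable  en  base
   include  ph_english

   phonemetable  de  base
   include  ph_german
</pre>
Each language table inherits the phonemes of <strong>base</strong> and keeps its own definitions in a separate included file.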
<p> <hr> | |||
<h3>Phoneme definitions</h3> | |||
A phoneme table contains a list of phoneme definitions. Each starts with the keyword <strong>phoneme</strong> and the phoneme name (this is the name used in the pronunciation rules), and ends with the keyword <strong>endphoneme</strong>. For example: | |||
<pre> phoneme aI | |||
vowel | |||
length 230 | |||
formants vowels/ai | |||
starttype (a) endtype (I) | |||
endphoneme | |||
phoneme s | |||
vls alv frc sibilant | |||
vowelin f1=0 f2=1700 -300 300 f3=-100 100 | |||
vowelout f1=0 f2=1700 -300 250 f3=-100 100 rms=20 | |||
lengthmod 3 | |||
wave unvoc/s | |||
before _ unvoc/s_ | |||
before p unvoc/s! | |||
before t unvoc/s! | |||
before k unvoc/s! | |||
switchvoicing z | |||
endphoneme | |||
</pre> | |||
<p> | |||
Within the phoneme definition the following lines may occur: ( (V) indicates only for vowels, (C) only for consonants) | |||
<p> | |||
<ul> | |||
<dl><dt>Type. One of these must be present. | |||
<dd><table> | |||
<tr><TD width="100"><b>vowel</b></TD></tr> | |||
<tr><TD><b>liquid</b></TD><td>semi-vowels, such as: <code> r, l, j, w</code></td></tr> | |||
<tr><TD><b>nasal</b></TD><td>nasal eg: <code> m, n, N</code></td></tr> | |||
<tr><TD><b>stop</b></TD><td>stop eg: <code> p, b, t, d, k, g</code></td></tr> | |||
<tr><TD><b>frc</b></TD><td>fricative eg: <code> f, v, T, D, s, z, S, Z, C, x</code></td></tr> | |||
<tr><TD><b>afr</b></TD><td>affricate eg: <code> tS, dZ</code></td></tr> | |||
<tr><TD><b>pause</b></TD><td></td></tr> | |||
<tr><TD><b>stress</b></TD><td>stress symbols, eg: ' , = %</td></tr> | |||
<tr><TD><b>virtual</b></TD><td>Used to represent a class of phonemes. See section ("Phoneme Pairs", below)</td></tr> | |||
</table> | |||
</dl> | |||
<dl><dt>Properties: | |||
<dd><table> | |||
<tr><TD width="100"><b>vls</b></TD><td>(C) voiceless eg. <code> p, t, k, f, s</code></TD></tr> | |||
<tr><TD><b>vcd</b></TD><td>(C) voiced eg. <code> b, d, g, v, z</code></td></tr> | |||
<tr><TD><b>sibilant</b></TD><td>(C) eg: <code> s, z, S, Z, tS, dZ</code></td></tr> | |||
<tr><TD><b>palatal</b></TD><td>(C) A palatal or palatalized consonant.</td></tr> | |||
<tr><TD><b>unstressed</b></TD><td>(V) This vowel is always unstressed, unless explicitly marked otherwise.</td></tr> | |||
<tr><TD><b>nolink</b></TD><td>Prevent any linking from the previous phoneme.</td></tr> | |||
<tr><TD><b>trill</b></TD><td>(C) Apply trill to the voicing.</td></tr> | |||
</table> | |||
</dl> | |||
<dl><dt>Place of Articulation (C): | |||
<dd><table> | |||
<tr><TD><b>blb </b></TD><td width="100">bi-labial</TD> | |||
<TD><b>lbd </b></TD><td width="110">labio-dental</TD>
<TD><b>dnt </b></TD><td>dental</TD></tr> | |||
<tr><TD><b>alv</b></TD><td>alveolar</td> | |||
<TD><b>rfx</b></TD><td>retroflex</TD> | |||
<TD><b>pla</b></TD><td>palato-alveolar</TD></tr> | |||
<tr><TD><b>pal</b></TD><td>palatal</td> | |||
<TD><b>vel</b></TD><td>velar</TD> | |||
<TD><b>lbv</b></TD><td>labio-velar</TD></tr> | |||
<tr><TD><b>uvl</b></TD><td>uvular</td> | |||
<TD><b>phr</b></TD><td>pharyngeal</TD> | |||
<TD><b>glt</b></TD><td>glottal</TD></tr> | |||
</table> | |||
</dl> | |||
<dl> | |||
<dt><strong>length</strong> | |||
<dd>(V) The relative length of the phoneme, typically about 140 for a short vowel and from 200 to 250 for a long vowel or diphthong.  Currently used only for vowels.
<p> | |||
<dt><strong>formants</strong> <sound spec> | |||
<dd><sound spec> is a relative path to a file which defines how to generate the sound (a vowel or voiced consonant) from a sequence of formant values (see "Sound Specifications" below).
<p> | |||
<dt><strong>wave</strong> <wavefile> | |||
<dd>(C) This is an alternative to <strong>formants</strong>. <wavefile> is a relative path to a WAV file (22 kHz, 16 bits) which will be played to produce the sound. This method is used for unvoiced consonants. <wavefile> does not include a .WAV filename extension, although the file to which it refers may or may not have one. | |||
<p> | |||
<dt><strong>before</strong> <phoneme> <sound spec> | |||
<dd>This specifies an alternative realization when the phoneme is followed by another specified phoneme.  <strong>before</strong> may be followed by several <phoneme> <sound spec> pairs.
<p> | |||
<dt><strong>after</strong> <phoneme> <sound spec> | |||
<dd>This specifies an alternative realization when the phoneme follows another specified phoneme. Vowels are considered as two parts, start and end, so both a <strong>before</strong> and an <strong>after</strong> condition may apply to the same vowel. | |||
<p> | |||
<dt><strong>starttype</strong> <phoneme> | |||
<dd>Allocates this phoneme to a category for the purposes of choosing the variant of a phoneme that precedes it. See section "Phoneme Pairs" below. | |||
<p> | |||
<dt><strong>endtype</strong> <phoneme> | |||
<dd>Allocates this phoneme to a category for the purposes of choosing the variant of a phoneme that follows it. See section "Phoneme Pairs" below. | |||
<p> | |||
<dt><strong>reduceto</strong> <phoneme> <level> | |||
<dd>(V) Change to the specified phoneme (such as schwa, @) if this syllable has a stress level less than that specified by <level> | |||
<p> | |||
<dt><strong>linkout</strong> <phoneme> | |||
<dd>If the following phoneme is a vowel then this additional phoneme will be inserted before it. | |||
<p> | |||
<dt><strong>beforevowel</strong> <phoneme> | |||
<dd>The phoneme changes to this one if the next phoneme is a vowel. | |||
<p> | |||
<dt><strong>beforevowelpause</strong> <phoneme> | |||
<dd>Change to this if the next phoneme is a vowel or pause. | |||
<p> | |||
<dt><strong>beforenotvowel</strong> <phoneme> | |||
<dd>Change to this if the next phoneme is <strong>not</strong> a vowel. | |||
<p> | |||
<dt><strong>lengthmod</strong> <integer> | |||
<dd>(C) Determines how this consonant affects the length of the previous vowel. This value is used as index into the <code>length_mods</code> table in the <code>CalcLengths()</code> function in the speak program. | |||
<p> | |||
<dt><strong>vowelin</strong> <vowel transition data> | |||
<dd>(C) Specifies the effects of this consonant on the formants of a following vowel. See "vowel transitions", below. | |||
<p> | |||
<dt><strong>vowelout</strong> <vowel transition data> | |||
<dd>(C) Specifies the effects of this consonant on the formants of a preceding vowel. See "vowel transitions", below. | |||
<p> | |||
</dl> | |||
</ul> | |||
<p> <hr> | |||
<h3>Phoneme Pairs</h3> | |||
The pronunciation of a phoneme can depend on the phonemes before and after it. Some of this modification is done automatically - the program automatically adjusts the beginning and end of a vowel to match its adjacent sounds. You can also specify variant pronunciations in the phoneme table. | |||
<p> | |||
The <strong>before</strong> and <strong>after</strong> statements can specify different sound variants to be used when the phoneme is before or after another specified phoneme.  The adjacent phoneme that's specified in a <strong>before</strong> or <strong>after</strong> statement may match not just that one phoneme, but other phonemes too.  For example:<pre>   before ;  unvoc/s;</pre>means that the sound <code>unvoc/s;</code> is used (rather than <code>unvoc/s</code>) if the following phoneme is <code>[;]</code>.  But this rule also applies if the next phoneme is another type of pause, <code>[_]</code> or <code>[;;]</code>.  This is because these two include a line<pre>   starttype ;</pre>in their phoneme specifications.  This means that they look like a <code>[;]</code> to a preceding phoneme.
<p> | |||
When looking for a matching <strong>before</strong> or <strong>after</strong> rule, if an exact match is not found, then a match is looked for by replacing either or both of the two phonemes by their <strong>starttype</strong> and <strong>endtype</strong> groups as appropriate. | |||
<p> | |||
<strong>virtual</strong> phonemes can be defined for use in <strong>starttype</strong> and <strong>endtype</strong> statements.  For example, a virtual phoneme <code>[ (i) ]</code> is used to represent vowels which start or end with an <code>[i]</code> type sound.  So <code>[i:]</code> and <code>[I]</code> have <code> starttype (i) </code> and those, plus diphthongs such as <code>[aI] [eI] [OI]</code> have <code> endtype (i) </code>.  By convention, names of virtual phonemes include a pair of round brackets.
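As a sketch only (the names, length and formants file below are made up, not taken from phsource), a virtual phoneme and a vowel allocated to its groups might be written:
<pre>
   phoneme (i)
      virtual
   endphoneme

   phoneme I
      vowel
      length 140
      formants vowels/ii
      starttype (i)  endtype (i)
   endphoneme
</pre>
A <strong>before</strong> or <strong>after</strong> rule can then treat <code>[I]</code> and the other phonemes allocated to <code>(i)</code> as a single group, as described above.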
<p> <hr> | |||
<h3>Sound Specifications</h3> | |||
There are three ways to produce sounds: | |||
<ul> | |||
<li>Playing a WAV file. This is used for unvoiced consonants such as <code> [p] [t] [s]</code>. | |||
<li>Generating a wave from a sequence of formant parameters. This is used for vowels and also for sonorants such as <code> [l] [j] [n]</code>. | |||
<li>A mixture of these. A stored WAV file is mixed with a wave generated from formant parameters. This is used for voiced stops and fricatives such as <code> [b] [g] [v] [z]</code>. | |||
</ul> | |||
A <em><sound spec></em> in the phoneme table can refer to a WAV file, a formant sequence, or a mixture of both. It can also include a numeric value to adjust the length of the sound. | |||
<p> <hr> | |||
<h3>Vowel Transitions</h3> | |||
These specify how a consonant affects an adjacent vowel. A consonant may cause a transition in the vowel's formants as the mouth changes shape between the consonant and the vowel. The following attributes may be specified. Note that the maximum rate of change of formant frequencies is limited by the speak program.<p> | |||
<ul><dl> | |||
<dt><strong>len=<integer></strong> | |||
<dd>Nominal length of the transition in mS. If omitted a default value is used. | |||
<dt><strong>rms=<integer></strong> | |||
<dd>Adjusts the amplitude of the vowel at the end of the transition. If omitted a default value is used. | |||
<dt><strong>f1=<integer></strong> | |||
<dd> | |||
0: f1 formant frequency unchanged.<br> | |||
1: f1 formant frequency decreases.<br> | |||
2: f1 formant frequency decreases more. | |||
<dt><strong>f2=<freq> <min> <max></strong> | |||
<dd> | |||
<freq>: The frequency towards which the f2 formant moves (Hz).<br> | |||
<min>: Signed integer (Hz). The minimum f2 frequency change.<br> | |||
<max>: Signed integer (Hz). The maximum f2 frequency change. | |||
<dt><strong>f3=<change> <amplitude></strong> | |||
<dd> | |||
<change>:  Signed integer (Hz).  Frequency change of f3, f4, and f5 formants.<br>
<amplitude>: Amplitude of the f3, f4, and f5 formants at the end of the transition. 100 = no change. | |||
<dt><strong>brk</strong> | |||
<dd>Break. Do not merge the synthesized wave of the consonant into the vowel. This will produce a discontinuity in the formants. | |||
<dt><strong>rate</strong> | |||
<dd>Allow a greater maximum rate of change of formant frequencies. | |||
<dt><strong>glstop</strong> | |||
<dd>Indicates a glottal stop. | |||
</dl></ul> | |||
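As a worked example, the <code>[s]</code> phoneme definition shown earlier includes the line:
<pre>   vowelin  f1=0  f2=1700 -300 300  f3=-100 100</pre>
Read with the descriptions above, this leaves f1 of a following vowel unchanged, moves its f2 formant towards 1700 Hz by between -300 Hz and +300 Hz, and shifts f3 (together with f4 and f5) down by 100 Hz while leaving their amplitude unchanged (100 = no change).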
</body> | |||
</html> |
@@ -0,0 +1,571 @@ | |||
#ifndef SPEAK_LIB_H | |||
#define SPEAK_LIB_H | |||
/*************************************************************************** | |||
* Copyright (C) 2006 by Jonathan Duddington * | |||
* [email protected] * | |||
* * | |||
* This program is free software; you can redistribute it and/or modify * | |||
* it under the terms of the GNU General Public License as published by * | |||
* the Free Software Foundation; either version 2 of the License, or * | |||
* (at your option) any later version. * | |||
* * | |||
* This program is distributed in the hope that it will be useful, * | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of * | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * | |||
* GNU General Public License for more details. * | |||
* * | |||
* You should have received a copy of the GNU General Public License * | |||
* along with this program; if not, write to the * | |||
* Free Software Foundation, Inc., * | |||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * | |||
***************************************************************************/ | |||
/*************************************************************/ | |||
/* This is the header file for the library version of espeak */ | |||
/* */ | |||
/*************************************************************/ | |||
#include <stdio.h> | |||
/********************/ | |||
/* Initialization */ | |||
/********************/ | |||
typedef enum { | |||
espeakEVENT_LIST_TERMINATED = 0, // Retrieval mode: terminates the event list. | |||
espeakEVENT_WORD = 1, // Start of word | |||
espeakEVENT_SENTENCE, // Start of sentence | |||
espeakEVENT_MARK, // Mark | |||
espeakEVENT_PLAY, // Audio element | |||
espeakEVENT_END, // End of sentence | |||
espeakEVENT_MSG_TERMINATED // End of message | |||
} espeak_EVENT_TYPE; | |||
typedef struct { | |||
espeak_EVENT_TYPE type; | |||
unsigned int unique_identifier; // message identifier (or 0 for key or character) | |||
int text_position; // the number of characters from the start of the text | |||
int length; // word length, in characters (for espeakEVENT_WORD) | |||
int audio_position; // the time in mS within the generated speech output data | |||
int sample; // sample id (internal use) | |||
void* user_data; // pointer supplied by the calling program | |||
union { | |||
int number; // used for WORD and SENTENCE events | |||
const char *name; // used for MARK and PLAY events. UTF8 string | |||
} id; | |||
} espeak_EVENT; | |||
/* | |||
When a message is supplied to espeak_Synth, the request is buffered and espeak_Synth returns immediately. When the message is actually processed, the callback function will be called repeatedly.
In RETRIEVAL mode, the callback function supplies the calling program with the audio data and an event list terminated by 0 (LIST_TERMINATED).
In PLAYBACK mode, the callback function is called as soon as an event happens.
For example, suppose that the following message is supplied to espeak_Synth:
"hello, hello." | |||
* Once processed in RETRIEVAL mode, it could lead to 3 calls of the callback function:
** Block 1: | |||
<audio data> + | |||
List of events: SENTENCE + WORD + LIST_TERMINATED | |||
** Block 2: | |||
<audio data> + | |||
List of events: WORD + END + LIST_TERMINATED | |||
** Block 3: | |||
no audio data | |||
List of events: MSG_TERMINATED + LIST_TERMINATED | |||
* Once processed in PLAYBACK mode, it could lead to 5 calls of the callback function: | |||
** SENTENCE | |||
** WORD (called when the sounds are actually played)
** WORD
** END (called when the end of the sentence is actually played)
** MSG_TERMINATED | |||
The MSG_TERMINATED event is the last event. It informs the calling program that it can clear the user data related to the message.
If the synthesis must be stopped, the callback function is called for each pending message with the MSG_TERMINATED event.
A MARK event indicates a <mark> element in the text. | |||
A PLAY event indicates an <audio> element in the text, for which the calling program should play the named sound file. | |||
*/ | |||
typedef enum { | |||
POS_CHARACTER = 1, | |||
POS_WORD, | |||
POS_SENTENCE | |||
} espeak_POSITION_TYPE; | |||
typedef enum { | |||
/* PLAYBACK mode: plays the audio data, supplies events to the calling program*/ | |||
AUDIO_OUTPUT_PLAYBACK, | |||
/* RETRIEVAL mode: supplies audio data and events to the calling program */ | |||
AUDIO_OUTPUT_RETRIEVAL, | |||
/* SYNCHRONOUS mode: as RETRIEVAL but doesn't return until synthesis is completed */ | |||
AUDIO_OUTPUT_SYNCHRONOUS, | |||
/* Synchronous playback */ | |||
AUDIO_OUTPUT_SYNCH_PLAYBACK | |||
} espeak_AUDIO_OUTPUT; | |||
typedef enum { | |||
EE_OK=0, | |||
EE_INTERNAL_ERROR=-1, | |||
EE_BUFFER_FULL=1, | |||
EE_NOT_FOUND=2 | |||
} espeak_ERROR; | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
int espeak_Initialize(espeak_AUDIO_OUTPUT output, int buflength, const char *path); | |||
/* Must be called before any synthesis functions are called. | |||
output: the audio data can either be played by eSpeak or passed back by the SynthCallback function. | |||
buflength: The length in mS of sound buffers passed to the SynthCallback function. | |||
path: The directory which contains the espeak-data directory, or NULL for the default location. | |||
Returns: sample rate in Hz, or -1 (EE_INTERNAL_ERROR). | |||
*/ | |||
typedef int (t_espeak_callback)(short*, int, espeak_EVENT*); | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
void espeak_SetSynthCallback(t_espeak_callback* SynthCallback); | |||
/* Must be called before any synthesis functions are called. | |||
This specifies a function in the calling program which is called when a buffer of | |||
speech sound data has been produced. | |||
The callback function is of the form: | |||
int SynthCallback(short *wav, int numsamples, espeak_EVENT *events); | |||
wav: is the speech sound data which has been produced. | |||
NULL indicates that the synthesis has been completed. | |||
numsamples: is the number of entries in wav. This number may vary, may be less than | |||
the value implied by the buflength parameter given in espeak_Initialize, and may | |||
sometimes be zero (which does NOT indicate end of synthesis). | |||
events: an array of espeak_EVENT items which indicate word and sentence events, and | |||
also the occurrence of <mark> and <audio> elements within the text.
Callback returns: 0=continue synthesis, 1=abort synthesis. | |||
*/ | |||
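/* A minimal sketch of such a callback (illustrative only; the output file
   "f_out" is an assumption of this example, not part of the API):

     static FILE *f_out;   // opened elsewhere by the calling program

     static int SynthCallback(short *wav, int numsamples, espeak_EVENT *events)
     {
        if(wav == NULL)
           return(0);        // synthesis has completed

        if(numsamples > 0)
           fwrite(wav, sizeof(short), numsamples, f_out);  // store the 16-bit samples

        return(0);           // 0=continue synthesis, 1=abort
     }
*/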
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
void espeak_SetUriCallback(int (*UriCallback)(int, const char*, const char*)); | |||
/* This function must be called before synthesis functions are used, in order to deal with | |||
<audio> tags. It specifies a callback function which is called when an <audio> element is | |||
encountered and allows the calling program to indicate whether the sound file which | |||
is specified in the <audio> element is available and is to be played. | |||
The callback function is of the form: | |||
int UriCallback(int type, const char *uri, const char *base); | |||
type: type of callback event. Currently only 1= <audio> element | |||
uri: the "src" attribute from the <audio> element | |||
base: the "xml:base" attribute (if any) from the <speak> element | |||
Return: 1=don't play the sound, but speak the text alternative. | |||
0=place a PLAY event in the event list at the point where the <audio> element | |||
occurs. The calling program can then play the sound at that point. | |||
*/ | |||
/********************/ | |||
/* Synthesis */ | |||
/********************/ | |||
#define espeakCHARS_AUTO 0 | |||
#define espeakCHARS_UTF8 1 | |||
#define espeakCHARS_8BIT 2 | |||
#define espeakCHARS_WCHAR 3 | |||
#define espeakSSML 0x10 | |||
#define espeakPHONEMES 0x100 | |||
#define espeakENDPAUSE 0x1000 | |||
#define espeakKEEP_NAMEDATA 0x2000 | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_Synth(const void *text, | |||
size_t size, | |||
unsigned int position, | |||
espeak_POSITION_TYPE position_type, | |||
unsigned int end_position, | |||
unsigned int flags, | |||
unsigned int* unique_identifier, | |||
void* user_data); | |||
/* Synthesize speech for the specified text. The speech sound data is passed to the calling | |||
program in buffers by means of the callback function specified by espeak_SetSynthCallback(). The command is asynchronous: it is internally buffered and returns as soon as possible. If espeak_Initialize was previously called with AUDIO_OUTPUT_PLAYBACK as argument, the sound data are played by eSpeak. | |||
text: The text to be spoken, terminated by a zero character. It may be either 8-bit characters, | |||
wide characters (wchar_t), or UTF8 encoding. Which of these is determined by the "flags" | |||
parameter. | |||
size: Equal to (or greater than) the size of the text data, in bytes. This is used in order
to allocate internal storage space for the text. This value is not used for | |||
AUDIO_OUTPUT_SYNCHRONOUS mode. | |||
position: The position in the text where speaking starts. Zero indicates speak from the | |||
start of the text. | |||
position_type: Determines whether "position" is a number of characters, words, or sentences. | |||
Values: POS_CHARACTER, POS_WORD, POS_SENTENCE (see espeak_POSITION_TYPE).
end_position: If set, this gives a character position at which speaking will stop. A value | |||
of zero indicates no end position. | |||
flags: These may be OR'd together: | |||
Type of character codes, one of: | |||
espeakCHARS_UTF8 UTF8 encoding | |||
espeakCHARS_8BIT The 8 bit ISO-8859 character set for the particular language. | |||
espeakCHARS_AUTO 8 bit or UTF8 (this is the default) | |||
espeakCHARS_WCHAR Wide characters (wchar_t) | |||
espeakSSML Elements within < > are treated as SSML elements, or if not recognised are ignored. | |||
espeakPHONEMES Text within [[ ]] is treated as phoneme codes (in espeak's Kirshenbaum encoding).
espeakENDPAUSE If set then a sentence pause is added at the end of the text. If not set then | |||
this pause is suppressed. | |||
unique_identifier: message identifier; helpful for identifying later | |||
data supplied to the callback. | |||
user_data: pointer which will be passed to the callback function. | |||
Return: EE_OK: operation achieved | |||
EE_BUFFER_FULL: the command can not be buffered; | |||
you may try after a while to call the function again. | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
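/* A minimal usage sketch (illustrative only; error checking is omitted and the
   500mS buffer length is just an example value):

     #include <string.h>    // for strlen

     const char *text = "Hello world.";
     unsigned int id = 0;

     espeak_Initialize(AUDIO_OUTPUT_PLAYBACK, 500, NULL);  // default espeak-data location
     espeak_SetSynthCallback(SynthCallback);               // e.g. the callback sketched above
     espeak_Synth(text, strlen(text)+1, 0, POS_CHARACTER, 0,
                  espeakCHARS_AUTO, &id, NULL);
     espeak_Synchronize();                                 // wait until speaking has finished
*/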
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_Synth_Mark(const void *text, | |||
size_t size, | |||
const char *index_mark, | |||
unsigned int end_position, | |||
unsigned int flags, | |||
unsigned int* unique_identifier, | |||
void* user_data); | |||
/* Synthesize speech for the specified text. Similar to espeak_Synth() but the start position is | |||
specified by the name of a <mark> element in the text. | |||
index_mark: The "name" attribute of a <mark> element within the text which specifies the
point at which synthesis starts. UTF8 string. | |||
For the other parameters, see espeak_Synth() | |||
Return: EE_OK: operation achieved | |||
EE_BUFFER_FULL: the command can not be buffered; | |||
you may try after a while to call the function again. | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_Key(const char *key_name); | |||
/* Speak the name of a keyboard key. | |||
Currently this just speaks the "key_name" as given | |||
Return: EE_OK: operation achieved | |||
EE_BUFFER_FULL: the command can not be buffered; | |||
you may try after a while to call the function again. | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_Char(wchar_t character); | |||
/* Speak the name of the given character | |||
Return: EE_OK: operation achieved | |||
EE_BUFFER_FULL: the command can not be buffered; | |||
you may try after a while to call the function again. | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
/* Note, there is no function to play a sound icon. This would be done by the calling program */ | |||
/***********************/ | |||
/* Speech Parameters */ | |||
/***********************/ | |||
typedef enum { | |||
espeakSILENCE=0, /* internal use */ | |||
espeakRATE, | |||
espeakVOLUME, | |||
espeakPITCH, | |||
espeakRANGE, | |||
espeakPUNCTUATION, | |||
espeakCAPITALS, | |||
espeakEMPHASIS, /* internal use */ | |||
espeakLINELENGTH, /* internal use */ | |||
espeakVOICETYPE, // internal, 1=mbrola | |||
N_SPEECH_PARAM /* last enum */ | |||
} espeak_PARAMETER; | |||
typedef enum { | |||
espeakPUNCT_NONE=0, | |||
espeakPUNCT_ALL=1, | |||
espeakPUNCT_SOME=2 | |||
} espeak_PUNCT_TYPE; | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_SetParameter(espeak_PARAMETER parameter, int value, int relative); | |||
/* Sets the value of the specified parameter. | |||
relative=0 Sets the absolute value of the parameter. | |||
relative=1 Sets a relative value of the parameter. | |||
parameter: | |||
espeakRATE: speaking speed in words per minute.
espeakVOLUME: volume in range 0-100, 0=silence
espeakPITCH: base pitch, range 0-100. 50=normal
espeakRANGE: pitch range, range 0-100. 0=monotone, 50=normal
espeakPUNCTUATION: which punctuation characters to announce: | |||
value in espeak_PUNCT_TYPE (none, all, some), | |||
see espeak_GetParameter() to specify which characters are announced. | |||
espeakCAPITALS: announce capital letters by: | |||
0=none, | |||
1=sound icon, | |||
2=spelling, | |||
3 or higher, by raising pitch. This value gives the amount in Hz by which the pitch
of a word is raised to indicate it has a capital letter.
Return: EE_OK: operation achieved | |||
EE_BUFFER_FULL: the command can not be buffered; | |||
you may try after a while to call the function again. | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
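/* Example sketch (illustrative values only):

     espeak_SetParameter(espeakRATE, 170, 0);     // absolute value, words per minute
     espeak_SetParameter(espeakPITCH, 60, 0);     // base pitch on the 0-100 scale
     espeak_SetParameter(espeakPUNCTUATION, espeakPUNCT_SOME, 0);
*/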
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
int espeak_GetParameter(espeak_PARAMETER parameter, int current); | |||
/* current=0 Returns the default value of the specified parameter. | |||
current=1 Returns the current value of the specified parameter, as set by SetParameter() | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_SetPunctuationList(const wchar_t *punctlist); | |||
/* Specifies a list of punctuation characters whose names are to be spoken when the
value of the Punctuation parameter is set to "some". | |||
punctlist: A list of character codes, terminated by a zero character. | |||
Return: EE_OK: operation achieved | |||
EE_BUFFER_FULL: the command can not be buffered; | |||
you may try after a while to call the function again. | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
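/* Example sketch (illustrative only; a wide string literal provides the
   terminating zero character):

     static const wchar_t punctlist[] = L".,;:?";
     espeak_SetPunctuationList(punctlist);    // used when espeakPUNCTUATION is set to "some"
*/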
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
void espeak_SetPhonemeTrace(int value, FILE *stream); | |||
/* Controls the output of phoneme symbols for the text | |||
value=0 No phoneme output (default) | |||
value=1 Output the translated phoneme symbols for the text | |||
value=2 as (1), but also output a trace of how the translation was done (matching rules and list entries) | |||
stream output stream for the phoneme symbols (and trace). If stream=NULL then it uses stdout. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
void espeak_CompileDictionary(const char *path, FILE *log); | |||
/* Compile pronunciation dictionary for a language which corresponds to the currently | |||
selected voice. The required voice should be selected before calling this function. | |||
path: The directory which contains the language's '_rules' and '_list' files. | |||
'path' should end with a path separator character ('/'). | |||
log: Stream for error reports and statistics information. If log=NULL then stderr will be used. | |||
*/ | |||
/***********************/ | |||
/* Voice Selection */ | |||
/***********************/ | |||
// voice table | |||
typedef struct { | |||
char *name; // a given name for this voice. UTF8 string. | |||
char *languages; // list of pairs of (byte) priority + (string) language (and dialect qualifier) | |||
char *identifier; // the filename for this voice within espeak-data/voices | |||
unsigned char gender; // 0=none 1=male, 2=female, | |||
unsigned char age; // 0=not specified, or age in years | |||
unsigned char variant; // only used when passed as a parameter to espeak_SetVoiceByProperties | |||
unsigned char xx1; // for internal use | |||
int score; // for internal use | |||
void *spare; // for internal use | |||
} espeak_VOICE; | |||
/* Note: The espeak_VOICE structure is used for two purposes: | |||
1. To return the details of the available voices. | |||
2. As a parameter to espeak_SetVoiceByProperties() in order to specify selection criteria. | |||
In (1), the "languages" field consists of a list of (UTF8) language names for which this voice | |||
may be used. Each language name in the list is terminated by a zero byte and is also preceded by
a single byte which gives a "priority" number. The list of languages is terminated by an | |||
additional zero byte. | |||
A language name consists of a language code, optionally followed by one or more qualifier (dialect) | |||
names separated by hyphens (eg. "en-uk"). A voice might, for example, have languages "en-uk" and | |||
"en". Even without "en" listed, voice would still be selected for the "en" language (because | |||
"en-uk" is related) but at a lower priority. | |||
The priority byte indicates how the voice is preferred for the language. A low number indicates a | |||
more preferred voice, a higher number indicates a less preferred voice. | |||
In (2), the "languages" field consists simply of a single (UTF8) language name, with no preceding | |||
priority byte. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
const espeak_VOICE **espeak_ListVoices(espeak_VOICE *voice_spec); | |||
/* Reads the voice files from espeak-data/voices and creates an array of espeak_VOICE pointers. | |||
The list is terminated by a NULL pointer | |||
If voice_spec is NULL then all voices are listed. | |||
If voice_spec is given, then only the voices which are compatible with the voice_spec
are listed, and they are listed in preference order. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_SetVoiceByName(const char *name); | |||
/* Searches for a voice with a matching "name" field. Language is not considered. | |||
"name" is a UTF8 string. | |||
Return: EE_OK: operation achieved | |||
EE_BUFFER_FULL: the command can not be buffered; | |||
you may try after a while to call the function again. | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_SetVoiceByProperties(espeak_VOICE *voice_spec); | |||
/* An espeak_VOICE structure is used to pass criteria to select a voice. Any of the following | |||
fields may be set: | |||
name NULL, or a voice name | |||
languages NULL, or a single language string (with optional dialect), eg. "en-uk", or "en" | |||
gender 0=not specified, 1=male, 2=female | |||
age 0=not specified, or an age in years | |||
variant After a list of candidates is produced, scored and sorted, "variant" is used to index | |||
that list and choose a voice. | |||
variant=0 takes the top voice (i.e. best match). variant=1 takes the next voice, etc | |||
*/ | |||
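/* Example sketch (illustrative only; memset() is declared in string.h):

     espeak_VOICE spec;
     memset(&spec, 0, sizeof(spec));      // leave unused criteria as "not specified"
     spec.languages = (char *)"en";       // any English voice
     spec.gender = 2;                     // prefer a female voice
     espeak_SetVoiceByProperties(&spec);
*/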
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_VOICE *espeak_GetCurrentVoice(void); | |||
/* Returns the espeak_VOICE data for the currently selected voice. | |||
This is not affected by temporary voice changes caused by SSML elements such as <voice> and <s> | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_Cancel(void); | |||
/* Immediately stops synthesis and audio output of the current text. When this
function returns, the audio output is fully stopped and the synthesizer is ready to | |||
synthesize a new message. | |||
Return: EE_OK: operation achieved | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
int espeak_IsPlaying(void); | |||
/* Returns 1 if audio is currently being played, 0 otherwise.
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_Synchronize(void); | |||
/* This function returns when all data have been spoken. | |||
Return: EE_OK: operation achieved | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
espeak_ERROR espeak_Terminate(void); | |||
/* The last function to be called.
Return: EE_OK: operation achieved | |||
EE_INTERNAL_ERROR. | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" | |||
#endif | |||
const char *espeak_Info(void* ptr); | |||
/* Returns the version number string. | |||
The parameter is for future use, and should be set to NULL | |||
*/ | |||
#endif |
@@ -0,0 +1,89 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | |||
</head> | |||
<body> | |||
<hr> | |||
<h2>TEXT MARKUP</h2> | |||
<hr> | |||
<h3>SSML: Speech Synthesis Markup Language</h3> | |||
The following markup tags and attributes are recognised:<p> | |||
<dl></dl> | |||
<p><b><speak></b> | |||
<ul> | |||
<li>xml:base (the value is just passed back as a parameter with the UriCallback() function) | |||
<li>xml:lang | |||
</ul> | |||
<p><b><voice></b> | |||
<ul> | |||
<li>xml:lang | |||
<li>name | |||
<li>age | |||
<li>variant | |||
<li>gender | |||
</ul> | |||
<p><b><prosody></b> | |||
<ul> | |||
<li>rate | |||
<li>volume | |||
<li>pitch | |||
<li>range | |||
</ul> | |||
<p><b><say-as></b> | |||
<ul> | |||
<li>interpret-as="characters" | |||
<li>interpret-as="characters" format="glyphs" | |||
<li>interpret-as="tts:key" | |||
<li>interpret-as="tts:char" | |||
<li>interpret-as="tts:digits" | |||
</ul> | |||
<p><b><mark></b> name | |||
<p><b><s></b> | |||
<ul> | |||
<li>xml:lang | |||
</ul> | |||
<p><b><p></b> | |||
<ul> | |||
<li>xml:lang | |||
</ul> | |||
<p><b><sub></b> alias | |||
<p><b><tts:style></b> | |||
<ul> | |||
<li>field="punctuation" mode=none,all,some | |||
<li>field="capital_letters" mode=no,spelling,icon,pitch | |||
</ul> | |||
<p><b><audio></b> src | |||
<p><b><emphasis></b> | |||
<ul> | |||
<li>level | |||
</ul> | |||
<p><b><break></b> | |||
<ul> | |||
<li>strength | |||
<li>time | |||
</ul> | |||
</dl> | |||
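<p>As an illustration, the following fragment uses only tags from the list above (an example sketch; the attribute values follow the SSML specification and are not taken from the eSpeak sources):
<pre>
&lt;speak xml:lang="en"&gt;
  &lt;s&gt;This is the &lt;emphasis level="strong"&gt;first&lt;/emphasis&gt; sentence.&lt;/s&gt;
  &lt;break time="500ms"/&gt;
  &lt;prosody rate="slow" pitch="low"&gt;This sentence is spoken more slowly.&lt;/prosody&gt;
  &lt;mark name="here"/&gt;
  &lt;say-as interpret-as="characters"&gt;SSML&lt;/say-as&gt;
&lt;/speak&gt;
</pre>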
<hr> | |||
<h3>HTML</h3> | |||
eSpeak can speak HTML text directly, or text containing both SSML and HTML markup.<br> | |||
Any unrecognised tags are ignored.<p> | |||
The following tags cause a sentence break.<br>
<b><br> | |||
<li> | |||
<img> | |||
<td> | |||
</b><p> | |||
The following tags cause a paragraph break.<br>
<b><h1> | |||
<h2> | |||
<h3> | |||
<h4> | |||
<hr> | |||
</b><p> | |||
</body> | |||
</html> |
@@ -0,0 +1,243 @@ | |||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |||
<html> | |||
<head> | |||
<title>eSpeak: Voice Files</title> | |||
<meta name="GENERATOR" content="Quanta Plus"> | |||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> | |||
</head> | |||
<body> | |||
<A href="index.html">Back</A> | |||
<hr> | |||
<h2>5. VOICES</h2> | |||
<hr> | |||
<h3>5.1 Voice Files</h3> | |||
A Voice file specifies a language (and possibly a language variant or dialect) together with various attributes that affect the characteristics of the voice quality and how the language is spoken.<p> | |||
Voice files are placed in the <code>espeak-data/voices</code> directory, or within subdirectories in there.<p> | |||
The available voice files can be listed by:<pre> | |||
espeak --voices | |||
or | |||
espeak --voices=<language></pre> | |||
<hr> | |||
<h3>5.2 Contents of Voice Files</h3> | |||
The <strong>language</strong> attribute is mandatory. All the other attributes are optional. | |||
<p> | |||
<h4>Identification Attributes</h4> | |||
<ul> | |||
<dl> | |||
<dt> | |||
<strong>name <name></strong><br> | |||
<dd>A name given to this voice. | |||
<p> | |||
<dt> | |||
<strong>language <language code> [<priority>]</strong><br> | |||
<dd>This attribute should appear before the other attributes which are listed below.<p> | |||
It selects the default behaviour and characteristics for the language, and sets default values for
"phonemes", "dictionary" and other attributes. The <language code> should be a two-letter ISO 639-1 language code. One or more language variant codes may be appended, separated by hyphens. (eg. en-uk-north).<p> | |||
The optional <priority> value gives the preference of this voice compared with others for the specified language. A low value indicates a more preferred voice. The default value is 5.<p> | |||
More than one <strong>language</strong> line may be present. A voice may be selected for other related languages (variants which have the same initial 2 letter language code as the specified language), but it will be less preferred for these. Different language variants may be specified by additional <strong>language</strong> lines in order to indicate that this is a preferred voice for them also. Eg.<pre> | |||
language en-uk-north | |||
language en</pre> | |||
indicates that this voice is for the "en-uk-north" dialect, but it is also a main choice when a general "en" language is specified. Without the second <strong>language</strong> line, it would be disfavoured for "en" because it is a more specialised voice.
<p> | |||
<dt> | |||
<strong>gender <gender> [<age>]</strong><br> | |||
<dd><gender> may be male, female, or unknown.<br>
<age> is optional and gives an age in years. | |||
</dl> | |||
</ul> | |||
<h4>Voice Attributes</h4> | |||
<ul> | |||
<dl> | |||
<dt> | |||
<strong>pitch <base> <range></strong><br> | |||
<dd> Two integer values.
The first gives a base pitch to the voice (value in Hz).
The second controls the range of pitches used by the voice. Setting
it equal to the base pitch will give a monotone. The default values are 82 118.
(A short example voice file combining several of these attributes is shown after this list.)
<p> | |||
<dt> | |||
<strong>formant <number> <frequency> <strength> <width></strong><br> | |||
<dd> Systematically adjusts the frequency, strength, and width of the | |||
resonance peaks of the voice. Values are percentages of the | |||
default values. Changing these affects the tone/quality of the voice. | |||
<ul> | |||
<li>Formants 1,2,3 are the standard three formants which define vowels.</li> | |||
<li>Formant 0 is used to give a low frequency component to the sounds, of | |||
frequency lower than F1.</li> | |||
<li>Formants 4,5 are higher than F3. They affect the quality of the voice.</li> | |||
<li>Formants 6,7,8 are weak, high frequency, additions to vowels to give | |||
a clearer sound.</li> | |||
</ul> | |||
<p> | |||
<dt> | |||
<strong>echo <delay> <amplitude></strong><br> | |||
<dd> Parameter 1 gives the delay in mS (0 to 250mS).<br> | |||
Parameter 2 gives the echo amplitude (0 to 100).<br> | |||
Adding some echo can give a clearer or more interesting sound, | |||
especially when listening through a domestic stereo sound system, | |||
rather than small computer speakers. | |||
<dt> | |||
<strong>tone</strong><br> | |||
<dd> Controls the tone of the sound.<br> | |||
<strong>tone</strong> is followed by up to 4 pairs of <frequency> <amplitude> which define a frequency response graph. Frequency is | |||
in Hz and amplitude is in the range 0 to 255. The default is:<p> | |||
<code> tone 600 170 1200 135 2000 110</code><p> | |||
This means that from frequency 0Hz to 600Hz the amplitude is 170. From | |||
600Hz to 1200Hz the amplitude decreases from 170 to 135, then decreases to 110 at 2000Hz | |||
and remains at 110 at higher frequencies. This adjustment applies only to voiced sounds such as | |||
vowels and sonorant consonants (such as [n] and [l]). Unvoiced sounds such | |||
as [s] are unaffected.<p> | |||
This <strong>tone</strong> statement can also appear in <code>espeak-data/config</code>, in which case it applies to all voices which | |||
don't have their own <strong>tone</strong> statement. | |||
<p> | |||
<dt> | |||
<strong>flutter <value></strong><br> | |||
<dd> Default value: 2.<br> | |||
Adds pitch fluctuations to give a wavering or older-sounding voice. | |||
A large value (eg. 20) makes the voice sound "croaky". | |||
<p> | |||
<dt> | |||
<strong>roughness <value></strong><br> | |||
<dd> Default value: 2. Range 0 - 7<br> | |||
Reduces the amplitude of alternate waveform cycles in order to make the voice sound creaky. | |||
<p> | |||
<dt> | |||
<strong>voicing <value></strong><br> | |||
<dd> Default value: 100.<br> | |||
Adjusts the strength of formant-synthesized sounds (vowels and sonorant consonants). | |||
<p> | |||
<dt> | |||
<strong>breath <up to 8 integer values></strong><br> | |||
<dd> Default values: 0.<br> | |||
Adds noise which corresponds to the formant frequency peaks. The values give the strength | |||
of noise for each formant peak (formants 1 to 8). | |||
<p> | |||
Use together with a low or zero value of the <strong>voicing</strong> attribute to make a "whisper".
For example:<br> | |||
<code>breath 75 75 60 40 15 10<br> | |||
breathw 150 150 200 200 400 400<br> | |||
voicing 18<br> | |||
flutter 20<br> | |||
formant 0 100 0 100 // remove formant 0 | |||
</code> | |||
<p> | |||
<dt> | |||
<strong>breathw <up to 8 integer values></strong><br> | |||
<dd> | |||
These values give bandwidths of the noise peaks of the <strong>breath</strong> attribute. If <strong>breathw</strong> values are not given, then suitable default values will be used. | |||
<p> | |||
</dl> | |||
</ul> | |||
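<p>Putting several of these attributes together, a simple voice file might look like this (an illustrative sketch only; the values shown are examples, not recommended settings):
<pre>
name deep-test
language en
gender male

pitch 70 110
formant 0 100 90 110
echo 130 15
flutter 6
</pre>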
<h4>Language Attributes</h4> | |||
<ul> | |||
<dl> | |||
<p> | |||
<dt> | |||
<strong>phonemes <name></strong><br> | |||
<dd>Specifies which set of phonemes to use from those contained in the | |||
phontab, phonindex, and phondata data files. | |||
This is a <strong>phonemetable</strong> name as given in the "phoneme" source file. | |||
<p> | |||
This parameter is usually not needed as it is set by default to the first two letters of the "language" parameter. | |||
However, different voices of the same language can use different phoneme sets, to give different accents. | |||
</dd> | |||
<dt> | |||
<strong>dictionary <name></strong><br> | |||
<dd> Specifies which pair of dictionary files to use. eg. "english"
indicates that <em>espeak-data/en_dict</em> should
be used to translate from words to phonemes. This parameter is usually
not needed as it is set by default to the first two letters of the "language" parameter.</dd>
<p> | |||
<dt> | |||
<strong>dictrules <list of rule numbers></strong><br> | |||
<dd> | |||
Gives a list of conditional dictionary rules which are applied for this voice. Rule numbers are in the range 0 to 31 and are specific to a language. They can apply to rules in the language's <b>_rules</b> dictionary file and also its <b>_list</b> exceptions list.
See <a href="dictionary.html">dictionary.html</a>. | |||
</dd> | |||
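<p>For example (an illustrative sketch; the rule numbers shown are arbitrary and their meaning depends on the language):
<pre>
dictrules 1 4
</pre>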
<p> | |||
<dt> | |||
<strong>replace <flags> <phoneme> <replacement phoneme></strong><br> | |||
<dd> Replace a phoneme by another whenever it occurs.<p> | |||
<replacement phoneme> may be NULL.<p> | |||
Flags: bit 0: replacement only occurs on the final phoneme of a word.<br> | |||
Flags: bit 1: replacement doesn't occur in stressed syllables.<br> | |||
eg. | |||
<pre> | |||
replace 0 h NULL // drops h's | |||
replace 0 V U // replaces vowel in 'strut' by that in 'foot' | |||
// as occurs in northern British English | |||
replace 3 N n // change 'fishing' to 'fishin' etc. | |||
// (only the last phoneme of a word, only in unstressed syllables) | |||
</pre> | |||
The phoneme mnemonics can be defined for each language, but some are listed in <A href="phonemes.html">phonemes.html</A> | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>stressLength <8 integer values></strong><br> | |||
<dd> Eight integer parameters. These control the relative lengths of the vowels in | |||
stressed and unstressed syllables. | |||
<ul> | |||
<li> 0 unstressed | |||
</li><li> 1 diminished. Its use depends on the language. In English it's used for unstressed syllables within multisyllabic words. In Spanish it's used for unstressed final syllables. | |||
</li><li> 2 secondary stress | |||
</li><li> 3 words marked as "unstressed" in the dictionary | |||
</li><li> 4 not currently used | |||
</li><li> 5 not currently used | |||
</li><li> 6 stressed syllable (the main syllable in stressed words) | |||
</li><li> 7 tonic syllable (by default, the last stressed syllable in the clause) | |||
</li></ul> | |||
</dd> | |||
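<p>For example (illustrative values only):
<pre>
stressLength 180 180 200 200 200 200 230 240
</pre>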
<p> | |||
<dt> | |||
<strong>stressAdd <8 integer values></strong><br> | |||
<dd> Eight integer parameters. These are added to the voice's corresponding stressLength values. They are used in the voice variant files in <code>espeak-data/voices/!v</code> to give some variety. Negative values may be used.</dd> | |||
<p> | |||
<dt> | |||
<strong>stressAmp <8 integer values></strong><br> | |||
<dd> Eight integer parameters. These control the relative amplitudes of the vowels in | |||
stressed and unstressed syllables (see stressLength above). | |||
The general default values are: 16, 16, 20, 20, 20, 24, 24, 22, although these defaults may be different for particular languages.</dd> | |||
<p> | |||
<dt> | |||
<strong>intonation <param1> <param2></strong><br> | |||
<dd> (for further development)<br> | |||
</dd> | |||
<p> | |||
<dt> | |||
<strong>charset <param1></strong><br> | |||
<dd> | |||
The ISO 8859 character set number. (not all are implemented). | |||
</dd> | |||
<p> | |||
Additional attributes are available to set various internal options which control how language is processed. These would normally be set in the program code rather than in a voice file. | |||
<p> | |||
<dt> | |||
<strong>stressrule <param1> <param2> <param3> <param4></strong><br> | |||
<dd> | |||
Controls how different stress levels are applied to the syllables of a word. | |||
</dd> | |||
</ul> | |||
<hr> | |||
<h3>5.3 Voice Files Provided</h3> | |||
A number of Voice files are provided in the <code>espeak-data/voices</code> directory. | |||
You can select one of these with the <strong>-v <voice filename></strong> parameter to the | |||
speak command. | |||
<p> | |||
<dl> | |||
<dt> | |||
<strong>default</strong><br> | |||
<dd> This voice is used if none is specified in the speak command. Copy your preferred voice to "default" so you can use the speak command without the need to specify a voice.</dd> | |||
</dl> | |||
For a list of voices provided for English and other languages see <a href="languages.html">Languages</a>. | |||
</body> | |||
</html> |
@@ -1,6 +1,7 @@ | |||
34 phoneme tables | |||
35 phoneme tables | |||
new total | |||
base 96 96 | |||
base2 24 114 | |||
en 53 144 | |||
en_n 29 144 | |||
en_us 37 144 | |||
@@ -22,12 +23,12 @@ | |||
cs 5 124 | |||
hr 25 133 | |||
ru 36 124 | |||
it 25 115 | |||
es 6 115 | |||
pt 28 132 | |||
pt_pt 20 132 | |||
ro 36 139 | |||
el 8 115 | |||
it 12 114 | |||
es 6 114 | |||
pt 28 131 | |||
pt_pt 20 131 | |||
ro 36 138 | |||
el 8 114 | |||
sv 25 118 | |||
no 28 122 | |||
is 32 121 | |||
@@ -45,16 +46,16 @@ | |||
2 b/bo base hi | |||
2 b/bu base hi | |||
2 b/xb base hi | |||
15 d/d base fi fr hi hu pl hr ru it ro el sw | |||
16 d/d_ base fi fr hi hu pl hr ru it ro el sw | |||
15 d/d base base2 fi fr hi hu pl hr ru ro el sw | |||
16 d/d_ base base2 fi fr hi hu pl hr ru ro el sw | |||
1 d/d_dnt hi | |||
17 d/dr base fi fr hi hu pl hr ru it ro el sw | |||
17 d/dr base base2 fi fr hi hu pl hr ru ro el sw | |||
1 d/tap ro | |||
2 d/tap1 base | |||
2 d/tap2 base | |||
2 d/tap3 base | |||
1 d/tap_i ro | |||
13 d/xd base fi fr hi hu pl hr ru it ro el sw | |||
13 d/xd base base2 fi fr hi hu pl hr ru ro el sw | |||
1 d/xd_dnt hi | |||
2 d/xd_pzd pl ru | |||
1 d/x_tap base | |||
@@ -240,25 +241,25 @@ | |||
1 ufric/x_hr hr | |||
1 ustop/c base | |||
5 ustop/k base en fr hi sw | |||
9 ustop/k_ base en fi fr hi hu it el sw | |||
12 ustop/ki base en af fi fr hi hu it is sw | |||
10 ustop/kl base en fi fr hi hu it el sw | |||
11 ustop/kr base en fi fr hi hu it el sw | |||
9 ustop/k_unasp base fi hi hu it el | |||
9 ustop/k_ base base2 en fi fr hi hu el sw | |||
12 ustop/ki base base2 en af fi fr hi hu is sw | |||
10 ustop/kl base base2 en fi fr hi hu el sw | |||
11 ustop/kr base base2 en fi fr hi hu el sw | |||
9 ustop/k_unasp base base2 fi hi hu el | |||
2 ustop/p base hi | |||
2 ustop/p_ base hi | |||
1 ustop/percus10 base | |||
8 ustop/pl base fi hi hu hr it ro | |||
8 ustop/pr base fi hi hu it ro | |||
7 ustop/p_unasp base fi hi hu hr it ro | |||
6 ustop/p_unasp_ fi hi hu hr it ro | |||
8 ustop/pl base base2 fi hi hu hr ro | |||
8 ustop/pr base base2 fi hi hu ro | |||
7 ustop/p_unasp base base2 fi hi hu hr ro | |||
6 ustop/p_unasp_ base2 fi hi hu hr ro | |||
3 ustop/t base en hi | |||
6 ustop/t_ base en hi sk el | |||
27 ustop/t_dnt base en fi fr hi hu pl hr ru it ro vi ++ | |||
27 ustop/t_dnt base base2 en fi fr hi hu pl hr ru ro vi ++ | |||
2 ustop/t_dnt2 hi vi | |||
4 ustop/t_pzd pl ru | |||
7 ustop/tr base en hi ru | |||
8 ustop/ts de eo hu pl ru it ro zh_yue | |||
8 ustop/ts base2 de eo hu pl ru ro zh_yue | |||
2 ustop/tsh base zh_yue | |||
2 ustop/tsh_ base zh_yue | |||
3 ustop/t_short hi sk el | |||
@@ -276,7 +277,7 @@ | |||
1 vdiph2/ii@ en | |||
1 vdiph2/ii@_2 en_wm | |||
1 vdiph2/ii@_3 vi | |||
3 vdiph2/iu it vi zh_yue | |||
3 vdiph2/iu base2 vi zh_yue | |||
1 vdiph2/iu_2 fi | |||
1 vdiph2/iu_3 af | |||
2 vdiph2/iu_4 cy | |||
@@ -301,7 +302,7 @@ | |||
1 vdiph/aau_4 vi | |||
1 vdiph/ae fr_ca | |||
1 vdiph/ae_2 en_n | |||
5 vdiph/ai eo hr it pt vi | |||
6 vdiph/ai base2 eo hr it pt vi | |||
3 vdiph/ai_2 en_us cy | |||
1 vdiph/ai_3 no | |||
1 vdiph/ai_4 af | |||
@@ -313,7 +314,7 @@ | |||
1 vdiph/au# en_sc | |||
2 vdiph/au_2 en_us zh_yue | |||
1 vdiph/au_3 en_rp | |||
5 vdiph/au_4 cy eo sk it is | |||
6 vdiph/au_4 base2 cy eo sk it is | |||
1 vdiph/ee-e hi | |||
3 vdiph/eei en pt vi | |||
2 vdiph/eei_2 eo fi | |||
@@ -323,9 +324,9 @@ | |||
1 vdiph/eeu_2 pt_pt | |||
2 vdiph/eeu_3 en_n en_wm | |||
1 vdiph/eey fi | |||
6 vdiph/ei nl it pt is vi zh_yue | |||
6 vdiph/ei base2 nl pt is vi zh_yue | |||
1 vdiph/ei_2 hr | |||
8 vdiph/eu en cy eo nl sk it pt vi | |||
8 vdiph/eu base2 en cy eo nl sk pt vi | |||
1 vdiph/eu_2 fi | |||
2 vdiph/&i fi hi | |||
3 vdiph/@i_2 af cy | |||
@@ -335,7 +336,7 @@ | |||
1 vdiph/@i_4 vi | |||
2 vdiph/ii hr ro | |||
1 vdiph/i#i ro | |||
4 vdiph/oi en_wm de it vi | |||
5 vdiph/oi base2 en_wm de it vi | |||
1 vdiph/oi_2 af | |||
10 vdiph/ooi en en_n en_us en_rp cy eo fi no zh_yue | |||
1 vdiph/ooi_2 af | |||
@@ -347,7 +348,7 @@ | |||
1 vdiph/@u_2 en_rp | |||
1 vdiph/@u_3 ro | |||
3 vdiph/@u_en en vi | |||
6 vdiph/ui en eo fi it vi zh_yue | |||
6 vdiph/ui base2 en eo fi vi zh_yue | |||
1 vdiph/u-i vi | |||
1 vdiph/ui_2 af | |||
2 vdiph/ui_3 cy | |||
@@ -408,7 +409,7 @@ | |||
14 vowel/@ base en en_us en_rp cy hi hr | |||
1 vowel/@- base | |||
4 vowel/& en_rp fi hi sv | |||
4 vowel/0 en hi it pt | |||
4 vowel/0 base2 en hi pt | |||
4 vowel/0_2 en_n en_wm pt_pt sw | |||
5 vowel/0_3 en_us en_sc en_rp hu | |||
1 vowel/@_2 fr | |||
@@ -420,8 +421,8 @@ | |||
2 vowel/8_2 en_us sv | |||
1 vowel/8_3 zh_yue | |||
11 vowel/a en_n cy de hu nl pl sk hr | |||
4 vowel/a# en_sc it pt | |||
6 vowel/a_2 eo it pt pt_pt ro vi | |||
5 vowel/a# base2 en_sc it pt | |||
7 vowel/a_2 base2 eo it pt pt_pt ro vi | |||
4 vowel/a#_2 hr sv is sw | |||
6 vowel/a_3 en_sc cs is | |||
12 vowel/a#_3 en en_n en_us en_wm de hi ru pt_pt vi zh_yue | |||
@@ -437,27 +438,27 @@ | |||
1 vowel/aa_7 nl | |||
4 vowel/a_en en fr | |||
1 vowel/@_bck hi | |||
13 vowel/e en en_n af cy eo fr hu hr it pt pt_pt vi | |||
14 vowel/e base2 en en_n af cy eo fr hu hr it pt pt_pt ++ | |||
2 vowel/e# en_sc | |||
6 vowel/e_2 en_sc de hi sv no | |||
1 vowel/e_3 hu | |||
1 vowel/e_5 en_sc | |||
3 vowel/ee fr pl sv | |||
2 vowel/e_e en_sc is | |||
7 vowel/ee_1 en en_n pl it sv no zh_yue | |||
7 vowel/ee_1 base2 en en_n pl sv no zh_yue | |||
3 vowel/ee_2 en cy nl | |||
1 vowel/ee#_2 sv | |||
3 vowel/ee_3 af pt vi | |||
3 vowel/ee_6 en_n sk sv | |||
12 vowel/e_mid en_rp en_wm fr_ca hi hu sk cs hr es pt_pt no is | |||
13 vowel/e_mid en_rp en_wm fr_ca hi hu sk cs hr it es pt_pt no ++ | |||
10 vowel/e_mid2 af de fi nl sk ro el sw | |||
1 vowel/@_fnt hr | |||
1 vowel/@_hgh no | |||
18 vowel/i en_n en_rp en_wm cy eo fr hu pl it pt pt_pt ro ++ | |||
19 vowel/i base2 en_n en_rp en_wm cy eo fr hu pl it pt pt_pt ++ | |||
1 vowel/i# cy | |||
3 vowel/i_2 de nl sv | |||
2 vowel/i_3 af sk | |||
4 vowel/i_4 fi hu it is | |||
3 vowel/i_4 fi hu is | |||
5 vowel/i_5 en_sc | |||
3 vowel/i#_5 pt_pt ro | |||
1 vowel/i_6 hr | |||
@@ -469,7 +470,7 @@ | |||
2 vowel/ii_4 en_rp | |||
6 vowel/ii_en en en_n | |||
5 vowel/@_low hi ro no | |||
9 vowel/o en en_wm de fr hi it pt_pt sv | |||
10 vowel/o base2 en en_wm de fr hi it pt_pt sv | |||
4 vowel/o_2 cy hi hu no | |||
2 vowel/o-_2 en_n en_wm | |||
2 vowel/o_3 en_sc | |||
@@ -505,9 +506,9 @@ | |||
1 vowel/u_5 sw | |||
3 vowel/u_6 en_rp pt_pt | |||
1 vowel/u_7 vi | |||
15 vowel/u_bck cy fi fr_ca hi hu nl pl sk hr it pt sv ++ | |||
16 vowel/u_bck base2 cy fi fr_ca hi hu nl pl sk hr it pt ++ | |||
2 vowel/uu en en_wm | |||
2 vowel/uu_2 de it | |||
2 vowel/uu_2 base2 de | |||
1 vowel/uu_3 af | |||
2 vowel/uu_4 fi sv | |||
7 vowel/uu_bck fr_ca hi hu pt no zh_yue | |||
@@ -601,7 +602,7 @@ | |||
1 w/_w base | |||
1 w/w_ base | |||
1 w/w@ base | |||
8 w/w2 pl sk it pt_pt | |||
8 w/w2 base2 pl sk pt_pt | |||
1 w/wa base | |||
1 w/we base | |||
3 w/wi base vi zh_yue | |||
@@ -612,8 +613,8 @@ | |||
20 x/b base hi ro is | |||
1 x/b_ base | |||
10 x/d base en_us hi hr el is | |||
14 x/d_ base fi fr hi pl hr ru it ro el is sw | |||
23 x/d_dnt base fi fr hi hu pl ru it ro sw | |||
14 x/d_ base base2 fi fr hi pl hr ru ro el is sw | |||
23 x/d_dnt base base2 fi fr hi hu pl ru ro sw | |||
6 x/d_pzd pl ru | |||
4 x/dzh base hi | |||
5 x/dzh_ base hi ru |
@@ -0,0 +1,209 @@ | |||
//==================================================== | |||
// Italian | |||
//==================================================== | |||
phoneme : // Lengthen previous vowel by "length" | |||
virtual | |||
length 70 | |||
endphoneme | |||
phoneme a | |||
vowel starttype (a) endtype (a) | |||
length 180 | |||
formants vowel/a_2 | |||
reduceto a/ 4 | |||
endphoneme | |||
phoneme a/ | |||
vowel starttype (@) endtype (@) | |||
length 180 | |||
formants vowel/a# | |||
endphoneme | |||
phoneme e | |||
vowel starttype (e) endtype (e) | |||
length 170 | |||
formants vowel/e | |||
endphoneme | |||
phoneme E | |||
vowel starttype (e) endtype (e) | |||
length 170 | |||
formants vowel/ee_1 | |||
reduceto e 4 // [E] only in stressed syllables | |||
endphoneme | |||
phoneme i | |||
vowel starttype (i) endtype (i) | |||
length 150 | |||
formants vowel/i | |||
linkout ; | |||
endphoneme | |||
phoneme o | |||
vowel starttype (o) endtype (o) | |||
length 170 | |||
formants vowel/o | |||
endphoneme | |||
phoneme O | |||
vowel starttype (o) endtype (o) | |||
length 170 | |||
formants vowel/0 | |||
reduceto o 4 // [O] only in stressed syllables | |||
endphoneme | |||
phoneme u | |||
vowel starttype (u) endtype (u) | |||
length 160 | |||
formants vowel/u_bck | |||
endphoneme | |||
phoneme U | |||
vowel starttype (u) endtype (u) | |||
length 160 | |||
formants vowel/uu_2 | |||
endphoneme | |||
phoneme aU | |||
vowel starttype (a) endtype (u) | |||
length 270 | |||
formants vdiph/au_4 | |||
endphoneme | |||
phoneme eU | |||
vowel starttype (e) endtype (u) | |||
length 260 | |||
formants vdiph/eu | |||
endphoneme | |||
phoneme iU | |||
vowel starttype (i) endtype (u) | |||
length 240 | |||
formants vdiph2/iu | |||
endphoneme | |||
phoneme aI | |||
vowel starttype (a) endtype (i) | |||
length 250 | |||
formants vdiph/ai | |||
endphoneme | |||
phoneme eI | |||
vowel starttype (e) endtype (i) | |||
length 250 | |||
formants vdiph/ei | |||
endphoneme | |||
phoneme oI | |||
vowel starttype (o) endtype (i) | |||
length 240 | |||
formants vdiph/oi | |||
endphoneme | |||
phoneme uI | |||
vowel starttype (u) endtype (i) | |||
length 240 | |||
formants vdiph/ui | |||
endphoneme | |||
// CONSONANTS | |||
//=========== | |||
phoneme w2 | |||
starttype w endtype w | |||
liquid | |||
length 100 | |||
beforenotvowel w/ | |||
lengthmod 7 | |||
formants w/w2 | |||
after _ w/w2 | |||
endphoneme | |||
phoneme p | |||
vls blb stop | |||
vowelin f1=0 f2=1000 -50 -100 f3=-200 80 | |||
vowelout f1=0 f2=1000 -500 -350 f3=-300 80 rms=30 | |||
lengthmod 2 | |||
wave ustop/p_unasp | |||
before _ ustop/p_unasp_%80 | |||
before r ustop/pr | |||
before l ustop/pl | |||
switchvoicing b | |||
endphoneme | |||
phoneme ts | |||
vls alv afr sibilant | |||
vowelin f1=0 f2=1700 -300 300 f3=-100 80 | |||
vowelout f1=0 f2=1700 -300 250 f3=-100 80 rms=20 | |||
lengthmod 2 | |||
wave ustop/ts | |||
endphoneme | |||
phoneme t // dental variant of /t/ | |||
vls dnt stop | |||
vowelin f1=0 f2=1600 -300 300 f3=-100 80 | |||
vowelout f1=0 f2=1600 -300 250 f3=-100 80 rms=20 | |||
lengthmod 2 | |||
wave ustop/t_dnt%50 | |||
before _ ustop/t_dnt%35 | |||
switchvoicing d | |||
endphoneme | |||
phoneme d // dental variant of /d/ | |||
vcd dnt stop | |||
vowelin f1=2 f2=1500 -300 300 f3=-150 80 | |||
vowelout f1=2 f2=1500 -300 300 f3=-150 80 | |||
formants d/d+x/d_dnt%110 | |||
before _ d/d_+x/d_ | |||
before r d/dr+x/d_dnt%110 | |||
after _ NULL | |||
after @ d/xd | |||
lengthmod 5 | |||
switchvoicing t | |||
endphoneme | |||
phoneme k | |||
vls vel stop | |||
vowelin f1=0 f2=2300 200 400 f3=-100 80 | |||
vowelout f1=0 f2=2300 300 400 f3=-100 80 rms=20 | |||
lengthmod 2 | |||
wave ustop/k_unasp%70 // weaker | |||
before _ ustop/k_ | |||
before r ustop/kr | |||
before l ustop/kl | |||
before (i) ustop/ki | |||
switchvoicing g | |||
endphoneme | |||
@@ -3,13 +3,6 @@ | |||
// Italian | |||
//==================================================== | |||
phoneme : // Lengthen previous vowel by "length" | |||
virtual | |||
length 70 | |||
endphoneme | |||
phoneme a | |||
vowel starttype (a) endtype (a) | |||
@@ -36,7 +29,7 @@ endphoneme | |||
phoneme E | |||
vowel starttype (e) endtype (e) | |||
length 170 | |||
formants vowel/ee_1 | |||
formants vowel/e_mid | |||
reduceto e 4 // [E] only in stressed syllables | |||
endphoneme | |||
@@ -48,13 +41,6 @@ phoneme i | |||
linkout ; | |||
endphoneme | |||
phoneme i2 | |||
vowel starttype (i) endtype (i) | |||
length 150 | |||
formants vowel/i_4 | |||
linkout ; | |||
endphoneme | |||
phoneme o | |||
vowel starttype (o) endtype (o) | |||
@@ -66,18 +52,12 @@ endphoneme | |||
phoneme O | |||
vowel starttype (o) endtype (o) | |||
length 170 | |||
formants vowel/0 | |||
// formants vowel/oo_4 | |||
// formants vowel/0 | |||
reduceto o 4 // [O] only in stressed syllables | |||
endphoneme | |||
phoneme U | |||
vowel starttype (u) endtype (u) | |||
length 160 | |||
formants vowel/uu_2 | |||
endphoneme | |||
phoneme u | |||
vowel starttype (u) endtype (u) | |||
length 160 | |||
@@ -92,19 +72,6 @@ phoneme aU | |||
endphoneme | |||
phoneme eU | |||
vowel starttype (e) endtype (u) | |||
length 260 | |||
formants vdiph/eu | |||
endphoneme | |||
phoneme iU | |||
vowel starttype (i) endtype (u) | |||
length 240 | |||
formants vdiph2/iu | |||
endphoneme | |||
phoneme aI | |||
vowel starttype (a) endtype (i) | |||
@@ -113,13 +80,6 @@ phoneme aI | |||
endphoneme | |||
phoneme eI | |||
vowel starttype (e) endtype (i) | |||
length 250 | |||
formants vdiph/ei | |||
endphoneme | |||
phoneme oI | |||
vowel starttype (o) endtype (i) | |||
length 230 | |||
@@ -127,90 +87,7 @@ phoneme oI | |||
endphoneme | |||
phoneme uI | |||
vowel starttype (u) endtype (i) | |||
length 240 | |||
formants vdiph/ui | |||
endphoneme | |||
// CONSONANTS | |||
//=========== | |||
phoneme w2 | |||
starttype w endtype w | |||
liquid | |||
length 100 | |||
beforenotvowel w/ | |||
lengthmod 7 | |||
formants w/w2 | |||
after _ w/w2 | |||
endphoneme | |||
phoneme p | |||
vls blb stop | |||
vowelin f1=0 f2=1000 -50 -100 f3=-200 80 | |||
vowelout f1=0 f2=1000 -500 -350 f3=-300 80 rms=30 | |||
lengthmod 2 | |||
wave ustop/p_unasp | |||
before _ ustop/p_unasp_%80 | |||
before r ustop/pr | |||
before l ustop/pl | |||
switchvoicing b | |||
endphoneme | |||
phoneme ts | |||
vls alv afr sibilant | |||
vowelin f1=0 f2=1700 -300 300 f3=-100 80 | |||
vowelout f1=0 f2=1700 -300 250 f3=-100 80 rms=20 | |||
lengthmod 2 | |||
wave ustop/ts | |||
endphoneme | |||
phoneme t // dental variant of /t/ | |||
vls dnt stop | |||
vowelin f1=0 f2=1600 -300 300 f3=-100 80 | |||
vowelout f1=0 f2=1600 -300 250 f3=-100 80 rms=20 | |||
lengthmod 2 | |||
wave ustop/t_dnt%50 | |||
before _ ustop/t_dnt%35 | |||
switchvoicing d | |||
endphoneme | |||
phoneme d // dental variant of /d/ | |||
vcd dnt stop | |||
vowelin f1=2 f2=1500 -300 300 f3=-150 80 | |||
vowelout f1=2 f2=1500 -300 300 f3=-150 80 | |||
formants d/d+x/d_dnt%110 | |||
before _ d/d_+x/d_ | |||
before r d/dr+x/d_dnt%110 | |||
after _ NULL | |||
after @ d/xd | |||
lengthmod 5 | |||
switchvoicing t | |||
endphoneme | |||
phoneme k | |||
vls vel stop | |||
vowelin f1=0 f2=2300 200 400 f3=-100 80 | |||
vowelout f1=0 f2=2300 300 400 f3=-100 80 rms=20 | |||
lengthmod 2 | |||
wave ustop/k_unasp%70 // weaker | |||
before _ ustop/k_ | |||
before r ustop/kr | |||
before l ustop/kl | |||
before (i) ustop/ki | |||
switchvoicing g | |||
endphoneme | |||
@@ -1125,6 +1125,8 @@ endphoneme | |||
// ADDITIONAL PHONEME TABLES | |||
//******************************************************************* | |||
phonemetable base2 base | |||
include ph_base2 | |||
phonemetable en base | |||
include ph_english | |||
@@ -1190,22 +1192,22 @@ phonemetable ru base | |||
include ph_russian | |||
phonemetable it base | |||
phonemetable it base2 | |||
include ph_italian | |||
phonemetable es it | |||
phonemetable es base2 | |||
include ph_spanish | |||
phonemetable pt it | |||
phonemetable pt base2 | |||
include ph_pt_brazil | |||
phonemetable pt_pt pt | |||
include ph_portugual | |||
phonemetable ro it | |||
phonemetable ro base2 | |||
include ph_romanian | |||
phonemetable el it | |||
phonemetable el base2 | |||
include ph_greek | |||
phonemetable sv base |
@@ -42,10 +42,11 @@ | |||
FILE *f_wavtest = NULL; | |||
FILE *f_events = NULL; | |||
int OpenWaveFile3(const char *path, int rate) | |||
/******************************************/ | |||
FILE *OpenWaveFile3(const char *path) | |||
/***********************************/ | |||
{ | |||
int *p; | |||
FILE *f; | |||
static unsigned char wave_hdr[44] = { | |||
'R','I','F','F',0,0,0,0,'W','A','V','E','f','m','t',' ', | |||
@@ -54,56 +55,54 @@ int OpenWaveFile3(const char *path, int rate) | |||
if(path == NULL) | |||
return(2); | |||
return(NULL); | |||
// set the sample rate in the header | |||
p = (int *)(&wave_hdr[24]); | |||
p[0] = rate; | |||
p[1] = rate * 2; | |||
p[0] = samplerate; | |||
p[1] = samplerate * 2; | |||
f_wavtest = fopen(path,"wb"); | |||
f = fopen(path,"wb"); | |||
if(f_wavtest != NULL) | |||
if(f != NULL) | |||
{ | |||
fwrite(wave_hdr,1,sizeof(wave_hdr),f_wavtest); | |||
return(0); | |||
fwrite(wave_hdr,1,sizeof(wave_hdr),f); | |||
} | |||
return(1); | |||
return(f); | |||
} // end of OpenWaveFile | |||
void CloseWaveFile3(int rate) | |||
/******************/ | |||
void CloseWaveFile3(FILE *f) | |||
/*************************/ | |||
{ | |||
unsigned int pos; | |||
static int value; | |||
fflush(f_wavtest); | |||
pos = ftell(f_wavtest); | |||
fflush(f); | |||
pos = ftell(f); | |||
value = pos - 8; | |||
fseek(f_wavtest,4,SEEK_SET); | |||
fwrite(&value,4,1,f_wavtest); | |||
fseek(f,4,SEEK_SET); | |||
fwrite(&value,4,1,f); | |||
value = rate; | |||
fseek(f_wavtest,24,SEEK_SET); | |||
fwrite(&value,4,1,f_wavtest); | |||
value = samplerate; | |||
fseek(f,24,SEEK_SET); | |||
fwrite(&value,4,1,f); | |||
value = rate*2; | |||
fseek(f_wavtest,28,SEEK_SET); | |||
fwrite(&value,4,1,f_wavtest); | |||
value = samplerate*2; | |||
fseek(f,28,SEEK_SET); | |||
fwrite(&value,4,1,f); | |||
value = pos - 44; | |||
fseek(f_wavtest,40,SEEK_SET); | |||
fwrite(&value,4,1,f_wavtest); | |||
fseek(f,40,SEEK_SET); | |||
fwrite(&value,4,1,f); | |||
fclose(f_wavtest); | |||
f_wavtest = NULL; | |||
fclose(f); | |||
} // end of CloseWaveFile2 | |||
} // end of CloseWaveFile3 | |||
int TestUriCallback(int type, const char *uri, const char *base) | |||
@@ -126,7 +125,8 @@ if(f_wavtest == NULL) return(0); | |||
if(wav == NULL) | |||
{ | |||
fprintf(f_events,"Finished\n"); | |||
CloseWaveFile3(samplerate); | |||
CloseWaveFile3(f_wavtest); | |||
f_wavtest = NULL; | |||
fclose(f_events); | |||
return(0); | |||
} | |||
@@ -1183,8 +1183,7 @@ void TestTest(int control) | |||
//CharsetToUnicode("ISO-8859-4"); | |||
//CharsetToUnicode("ISCII"); | |||
//return; | |||
return; | |||
if(control==2) | |||
{ | |||
@@ -1206,7 +1205,7 @@ if(control==2) | |||
textbuf[ix] = 0; | |||
fclose(f); | |||
OpenWaveFile3("/home/jsd1/speechdata/text/test.wav",samplerate); | |||
f_wavtest = OpenWaveFile3("/home/jsd1/speechdata/text/test.wav"); | |||
f_events = fopen("/home/jsd1/speechdata/text/events","w"); | |||
fprintf(f_events,"Audio Text Length Type Id\n"); | |||
@@ -962,7 +962,8 @@ void SpectSeq::MakeWave(int start, int end, PitchEnvelope &pitch) | |||
// } | |||
len_samples = int(((total_length * lfactor + 50) * samplerate) / 1000); | |||
SetPitch(len_samples,pitch.env,pitch.pitch1-pbase,pitch.pitch2-pbase); | |||
// SetPitch(len_samples,pitch.env,pitch.pitch1-pbase,pitch.pitch2-pbase); | |||
SetPitch(len_samples,pitch.env,9,44); | |||
fname_speech = WavFileName(); | |||
OpenWaveFile2(fname_speech); | |||
@@ -1038,7 +1039,8 @@ void SpectFrame::MakeWave(int control, PitchEnvelope &pitche, int amplitude, int | |||
len_samples = (length * samplerate) / 1000; | |||
pbase = voice->pitch_base >> 12; | |||
SetPitch(len_samples + 50,pitche.env,pitche.pitch1-pbase,pitche.pitch2-pbase); | |||
// SetPitch(len_samples + 50,pitche.env,pitche.pitch1-pbase,pitche.pitch2-pbase); | |||
SetPitch(len_samples + 50,pitche.env,9,44); | |||
fname_speech = WavFileName(); | |||
if(OpenWaveFile2(fname_speech) != 0) |
@@ -35,7 +35,7 @@ | |||
#include "translate.h" | |||
#include "wave.h" | |||
const char *version_string = "1.26.03 12.Jun.07"; | |||
const char *version_string = "1.26.04 13.Jun.07"; | |||
const int version_phdata = 0x012601; | |||
int option_device_number = -1; |
@@ -81,6 +81,15 @@ typedef struct { | |||
DOUBLEX right_inc; | |||
} wavegen_peaks_t; | |||
typedef struct { | |||
double a; | |||
double b; | |||
double c; | |||
double x1; | |||
double x2; | |||
} RESONATOR; | |||
typedef struct { | |||
short length; | |||
unsigned char n_frames; | |||
@@ -240,3 +249,5 @@ int DoSample(PHONEME_TAB *ph1, PHONEME_TAB *ph2, int which, int length_mod, int | |||
int DoSpect(PHONEME_TAB *this_ph, PHONEME_TAB *prev_ph, PHONEME_TAB *next_ph, | |||
int which, PHONEME_LIST *plist, int modulation); | |||
int PauseLength(int pause); | |||
void InitBreath(void); |
@@ -1187,8 +1187,8 @@ int Translator::TranslateWord2(char *word, WORD_TAB *wtab, int pre_pause, int ne | |||
if(sylimit & 0x100) | |||
{ | |||
// only if the second word has $alt attribute | |||
flags2 = translator->TranslateWord(p2+1, 0, wtab+1); | |||
strcpy(ph_buf,word_phonemes); | |||
flags2 = translator->TranslateWord(p2+1, 0, wtab+1); | |||
if((flags2 & FLAG_ALT_TRANS) == 0) | |||
{ | |||
ok = 0; |
@@ -36,6 +36,7 @@ typedef struct { | |||
int echo_amp; | |||
int n_harmonic_peaks; // highest formant which is formed from adding harmonics | |||
int peak_shape; // alternative shape for formant peaks (0=standard 1=squarer) | |||
int voicing; // 100% = 64, level of formant-synthesized sound | |||
// parameters used by Wavegen | |||
int freq[N_PEAKS]; // 100% = 256 | |||
@@ -47,6 +48,9 @@ typedef struct { | |||
int height2[N_PEAKS]; // 100% = 256 | |||
int width2[N_PEAKS]; // 100% = 256 | |||
int breath[N_PEAKS]; // amount of breath for each formant. breath[0] indicates whether any are set. | |||
int breathw[N_PEAKS]; // width of each breath formant | |||
// This table provides the opportunity for tone control. | |||
// Adjustment of harmonic amplitudes, steps of 8Hz | |||
// value of 128 means no change |
@@ -84,23 +84,26 @@ char voice_name[40]; | |||
#define V_ROUGHNESS 11 | |||
#define V_CLARITY 12 | |||
#define V_TONE 13 | |||
#define V_VOICING 14 | |||
#define V_BREATH 15 | |||
#define V_BREATHW 16 | |||
// these override defaults set by the translator | |||
#define V_WORDGAP 15 | |||
#define V_INTONATION 16 | |||
#define V_STRESSLENGTH 17 | |||
#define V_STRESSAMP 18 | |||
#define V_STRESSADD 19 | |||
#define V_DICTRULES 20 | |||
#define V_STRESSRULE 21 | |||
#define V_CHARSET 22 | |||
#define V_NUMBERS 23 | |||
#define V_OPTION 24 | |||
#define V_MBROLA 25 | |||
#define V_WORDGAP 17 | |||
#define V_INTONATION 18 | |||
#define V_STRESSLENGTH 19 | |||
#define V_STRESSAMP 20 | |||
#define V_STRESSADD 21 | |||
#define V_DICTRULES 22 | |||
#define V_STRESSRULE 23 | |||
#define V_CHARSET 24 | |||
#define V_NUMBERS 25 | |||
#define V_OPTION 26 | |||
#define V_MBROLA 27 | |||
// these need a phoneme table to have been specified | |||
#define V_REPLACE 26 | |||
#define V_REPLACE 28 | |||
@@ -133,6 +136,9 @@ static keywtab_t keyword_tab[] = { | |||
{"roughness", V_ROUGHNESS}, | |||
{"clarity", V_CLARITY}, | |||
{"tone", V_TONE}, | |||
{"voicing", V_VOICING}, | |||
{"breath", V_BREATH}, | |||
{"breathw", V_BREATHW}, | |||
{"numbers", V_NUMBERS}, | |||
{"option", V_OPTION}, | |||
{"mbrola", V_MBROLA}, | |||
@@ -352,7 +358,9 @@ static espeak_VOICE *ReadVoiceFile(FILE *f_in, const char *fname, const char*lea | |||
void VoiceReset(int tone_only) | |||
{//=========================== | |||
// Set voice to the default values | |||
int pk; | |||
static int breath_widths[N_PEAKS] = {0,200,200,400,400,400,600,600,600}; | |||
// default is: pitch 82,118 | |||
voice->pitch_base = 0x49000; // default, 73 << 12; | |||
@@ -363,17 +371,21 @@ void VoiceReset(int tone_only) | |||
voice->flutter = 64; | |||
voice->n_harmonic_peaks = 5; | |||
voice->peak_shape = 1; | |||
voice->voicing = 64; | |||
#ifdef PLATFORM_RISCOS | |||
voice->roughness = 1; | |||
#else | |||
voice->roughness = 2; | |||
#endif | |||
InitBreath(); | |||
for(pk=0; pk<N_PEAKS; pk++) | |||
{ | |||
voice->freq[pk] = 256; | |||
voice->height[pk] = 256; | |||
voice->width[pk] = 256; | |||
voice->breath[pk] = 0; | |||
voice->breathw[pk] = breath_widths[pk];  // default breath formant widths
// adjust formant smoothing depending on sample rate | |||
formant_rate[pk] = (formant_rate_22050[pk] * 22050)/samplerate; | |||
@@ -769,6 +781,19 @@ voice_t *LoadVoice(char *vname, int control)
		}
		break;

	case V_VOICING:
		if(sscanf(p,"%d",&value)==1)
			voice->voicing = (value * 64)/100;
		break;

	case V_BREATH:
		voice->breath[0] = Read8Numbers(p,&voice->breath[1]);
		break;

	case V_BREATHW:
		voice->breathw[0] = Read8Numbers(p,&voice->breathw[1]);
		break;

	case V_MBROLA:
		{
		char name[40];
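In LoadVoice() the three new voice-file attributes are read straight from the text line: voicing takes a single percentage (converted to the internal 64 == 100% units), while breath and breathw each take up to eight numbers, one per formant peak above peak 0, read by Read8Numbers into elements 1..8. A rough, self-contained sketch of that style of parsing; read_numbers below is a hypothetical stand-in, and the exact behaviour and return value of the real Read8Numbers are assumptions, not shown in this patch:

	#include <stdio.h>

	#define N_PEAKS 9

	// Hypothetical stand-in for Read8Numbers: reads up to 8 integers from a
	// text line into out[0..7] and returns how many were found (assumption:
	// the real return value only matters as a non-zero "any set" flag).
	static int read_numbers(const char *p, int *out)
	{
		int n = 0;
		int value;
		int consumed;

		while(n < 8 && sscanf(p, "%d%n", &value, &consumed) == 1)
		{
			out[n++] = value;
			p += consumed;
		}
		return n;
	}

	int main(void)
	{
		int breath[N_PEAKS] = {0};
		const char *line = "0 3 4 4 4 4";   // e.g. the value part of a "breath" voice-file line

		breath[0] = read_numbers(line, &breath[1]);
		printf("numbers read: %d, breath[2]=%d\n", breath[0], breath[2]);   // 6, 3
		return 0;
	}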
@@ -49,7 +49,7 @@
#include "sintab.h"
#define PI 3.1415927
#define PI2 6.283185307
#define STEPSIZE 64    // 2.9 ms at 22 kHz sample rate
#define N_WAV_BUF 10
@@ -82,6 +82,9 @@ static int echo_tail;
static int echo_amp = 0;
static short echo_buf[N_ECHO_BUF];

static int voicing;
RESONATOR rbreath[N_PEAKS];

static int harm_sqrt_n = 0;
@@ -124,6 +127,9 @@ static int cycle_samples;  // number of samples in a cycle at current pitch
static int cbytes;
static int hf_factor;

static double minus_pi_t;
static double two_pi_t;

unsigned char *out_ptr;
unsigned char *out_start;
@@ -770,6 +776,7 @@ static void WavegenSetEcho(void)
	int delay;
	int amp;

	voicing = wvoice->voicing;
	delay = wvoice->echo_delay;
	amp = wvoice->echo_amp;
@@ -824,6 +831,8 @@ int PeaksToHarmspect(wavegen_peaks_t *peaks, int pitch, int *htab, int control)
	if(wvoice == NULL)
		return(1);

	hmax = (peaks[wvoice->n_harmonic_peaks].freq + peaks[wvoice->n_harmonic_peaks].right)/pitch;
	if(hmax >= MAX_HARMONIC)
		hmax = MAX_HARMONIC-1;

	// restrict highest harmonic to half the samplerate
	hmax_samplerate = (((samplerate * 19)/40) << 16)/pitch;   // only 95% of Nyquist freq
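The added clamp in PeaksToHarmspect() first caps the harmonic count at MAX_HARMONIC, then keeps the highest synthesized harmonic safely below aliasing: (samplerate * 19) / 40 is 47.5% of the sample rate, i.e. 95% of the Nyquist frequency, and the << 16 shift followed by the division by pitch turns that frequency ceiling into a limit on the harmonic index for the current pitch. A quick standalone check of the arithmetic at the default 22050 Hz rate (illustration only, not eSpeak code):

	#include <stdio.h>

	int main(void)
	{
		int samplerate = 22050;
		int ceiling_hz = (samplerate * 19) / 40;   // 10473 Hz, ~95% of the 11025 Hz Nyquist limit

		printf("harmonic ceiling: %d Hz (Nyquist is %d Hz)\n", ceiling_hz, samplerate / 2);
		return 0;
	}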
@@ -961,6 +970,110 @@ static void AdvanceParameters()
static double resonator(RESONATOR *r, double input)
{//================================================
	double x;

	x = r->a * input + r->b * r->x1 + r->c * r->x2;
	r->x2 = r->x1;
	r->x1 = x;

	return x;
}


static void setresonator(RESONATOR *rp, int freq, int bwidth, int init)
{//====================================================================
// freq    Frequency of resonator in Hz
// bwidth  Bandwidth of resonator in Hz
// init    Initialize internal data

	double x;
	double arg;

	if(init)
	{
		rp->x1 = 0;
		rp->x2 = 0;
	}

	// x = exp(-pi * bwidth * t)
	arg = minus_pi_t * bwidth;
	x = exp(arg);

	// c = -(x*x)
	rp->c = -(x * x);

	// b = x * 2*cos(2 pi * freq * t)
	arg = two_pi_t * freq;
	rp->b = x * cos(arg) * 2.0;

	// a = 1.0 - b - c
	rp->a = 1.0 - rp->b - rp->c;
}  // end of setresonator


void InitBreath(void)
{//==================
	int ix;

	minus_pi_t = -PI / samplerate;
	two_pi_t = -2.0 * minus_pi_t;

	for(ix=0; ix<N_PEAKS; ix++)
	{
		setresonator(&rbreath[ix],2000,200,1);
	}
}  // end of InitBreath
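setresonator() sets up a standard two-pole digital resonator of the Klatt type: resonator() computes y[n] = a*x[n] + b*y[n-1] + c*y[n-2], with c = -exp(-2*pi*B*T), b = 2*exp(-pi*B*T)*cos(2*pi*F*T) and a = 1 - b - c, which makes the gain at 0 Hz equal to 1. A small standalone check of the coefficients for the default breath resonator that InitBreath() installs (2000 Hz centre, 200 Hz bandwidth, 22050 Hz sample rate); this is an illustration, not part of the patch:

	#include <math.h>
	#include <stdio.h>

	#define PI 3.1415927

	int main(void)
	{
		// Same coefficient formulas as setresonator(), for the default breath resonator.
		double samplerate = 22050.0;
		double freq = 2000.0;
		double bwidth = 200.0;

		double minus_pi_t = -PI / samplerate;
		double two_pi_t = -2.0 * minus_pi_t;

		double x = exp(minus_pi_t * bwidth);
		double c = -(x * x);
		double b = x * cos(two_pi_t * freq) * 2.0;
		double a = 1.0 - b - c;

		printf("a=%f b=%f c=%f\n", a, b, c);   // roughly a=0.31  b=1.64  c=-0.94
		return 0;
	}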
void SetBreath()
{//=============
	int pk;

	if(wvoice->breath[0] == 0)
		return;

	for(pk=1; pk<N_PEAKS; pk++)
	{
		if(wvoice->breath[pk] != 0)
		{
			// breath[0] indicates that some breath formants are needed
			// set the freq from the current synthesis formant and the width from the voice data
			setresonator(&rbreath[pk], peaks[pk].freq >> 16, wvoice->breathw[pk],0);
		}
	}
}  // end of SetBreath
#define getrandom(min,max) ((rand()%(int)(((max)+1)-(min)))+(min))

int ApplyBreath(void)
{//==================
	int noise;
	int ix;
	int amp;
	int value = 0;

	noise = getrandom(-4095,4095);

	for(ix=1; ix < N_PEAKS; ix++)
	{
		if((amp = wvoice->breath[ix]) > 0)
		{
			amp *= (peaks[ix].height >> 13);
			value += int(resonator(&rbreath[ix],noise) * amp);
		}
	}
	return (value);
}
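ApplyBreath() takes a single random noise sample and runs it through each enabled breath resonator, scaling by the voice's breath[] value and by the current height of the matching formant peak (peaks[ix].height >> 13), so the breath noise tracks the spectral envelope of the voiced sound. The getrandom macro keeps the noise within -4095..4095: rand() % 8191 gives 0..8190, and adding the minimum shifts that into the signed range. A quick standalone check of that range (illustration only):

	#include <stdio.h>
	#include <stdlib.h>

	#define getrandom(min,max) ((rand()%(int)(((max)+1)-(min)))+(min))

	int main(void)
	{
		// Sanity check: getrandom(-4095,4095) stays within +/-4095.
		int lo = 0, hi = 0;
		for(int i = 0; i < 100000; i++)
		{
			int n = getrandom(-4095, 4095);
			if(n < lo) lo = n;
			if(n > hi) hi = n;
		}
		printf("min=%d max=%d\n", lo, hi);   // expect values close to -4095 and 4095
		return 0;
	}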
static int Wavegen()
{//=================
	unsigned short waveph;
@@ -1015,6 +1128,7 @@ static int Wavegen()
			hswitch ^= 1;
			maxh2 = PeaksToHarmspect(peaks,pitch<<4,hspect[hswitch],1);

			SetBreath();
		}
		else
		if((samplecount & 0x07) == 0)
@@ -1150,6 +1264,17 @@ static int Wavegen()
			h++;
		}
#endif

		if(voicing != 64)
		{
			total = (total >> 6) * voicing;
		}

		if(wvoice->breath[0])
		{
			total += ApplyBreath();
		}

		// mix with sampled wave if required
		z2 = 0;
		if(mix_wavefile_ix < n_mix_wavefile)
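The two added blocks run just before the synthesized harmonics are mixed with any sampled wave: when voicing is anything other than 64 (100%), the summed harmonic total is rescaled by voicing/64 using a shift, and if the voice has any breath set, the filtered noise from ApplyBreath() is added on top. A tiny illustration of the (total >> 6) * voicing scaling, assuming a voice file containing "voicing 50" (stored internally as 32):

	#include <stdio.h>

	int main(void)
	{
		// The "100% == 64" scaling applied to the summed harmonics in Wavegen():
		// total = (total >> 6) * voicing  is roughly total * voicing / 64.
		int total = 100000;
		int voicing = 32;   // e.g. "voicing 50" in a voice file

		int scaled = (total >> 6) * voicing;
		printf("%d -> %d (about half)\n", total, scaled);   // 100000 -> 49984
		return 0;
	}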