A dataset of informal Persian audio and text chunks, along with a fully open processing pipeline, suitable for ASR and TTS tasks. Created from crawled content on virgool.io.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

VirgoolInformal_Dataset_Processing.ipynb 98KB

  1. {
  2. "nbformat": 4,
  3. "nbformat_minor": 0,
  4. "metadata": {
  5. "colab": {
  6. "provenance": [],
  7. "collapsed_sections": [
  8. "3AcYoP960ry-"
  9. ]
  10. },
  11. "kernelspec": {
  12. "name": "python3",
  13. "display_name": "Python 3"
  14. },
  15. "language_info": {
  16. "name": "python"
  17. }
  18. },
  19. "cells": [
  20. {
  21. "cell_type": "markdown",
  22. "source": [
  23. "# Environment Setup"
  24. ],
  25. "metadata": {
  26. "id": "qm2b1llYZMlY"
  27. }
  28. },
  29. {
  30. "cell_type": "code",
  31. "source": [
  32. "!pip install hazm # Requires Restart"
  33. ],
  34. "metadata": {
  35. "colab": {
  36. "base_uri": "https://localhost:8080/",
  37. "height": 879
  38. },
  39. "id": "sUOKF5SYZhgW",
  40. "outputId": "12400790-d49e-40a6-c797-6492aad77f2a"
  41. },
  42. "execution_count": 1,
  43. "outputs": [
  44. {
  45. "output_type": "stream",
  46. "name": "stdout",
  47. "text": [
  48. "Collecting hazm\n",
  49. " Downloading hazm-0.10.0-py3-none-any.whl (892 kB)\n",
  50. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m892.6/892.6 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  51. "\u001b[?25hCollecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)\n",
  52. " Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)\n",
  53. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  54. "\u001b[?25hCollecting flashtext<3.0,>=2.7 (from hazm)\n",
  55. " Downloading flashtext-2.7.tar.gz (14 kB)\n",
  56. " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  57. "Requirement already satisfied: gensim<5.0.0,>=4.3.1 in /usr/local/lib/python3.10/dist-packages (from hazm) (4.3.2)\n",
  58. "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from hazm) (3.8.1)\n",
  59. "Collecting numpy==1.24.3 (from hazm)\n",
  60. " Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
  61. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  62. "\u001b[?25hCollecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)\n",
  63. " Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
  64. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m26.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  65. "\u001b[?25hRequirement already satisfied: scikit-learn<2.0.0,>=1.2.2 in /usr/local/lib/python3.10/dist-packages (from hazm) (1.2.2)\n",
  66. "Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)\n",
  67. " Downloading pybind11-2.12.0-py3-none-any.whl (234 kB)\n",
  68. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.0/235.0 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  69. "\u001b[?25hRequirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext-wheel<0.10.0,>=0.9.2->hazm) (67.7.2)\n",
  70. "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim<5.0.0,>=4.3.1->hazm) (1.11.4)\n",
  71. "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim<5.0.0,>=4.3.1->hazm) (6.4.0)\n",
  72. "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (8.1.7)\n",
  73. "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (1.4.2)\n",
  74. "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (2024.5.15)\n",
  75. "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->hazm) (4.66.4)\n",
  76. "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn<2.0.0,>=1.2.2->hazm) (3.5.0)\n",
  77. "Building wheels for collected packages: flashtext\n",
  78. " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  79. " Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9296 sha256=1c7ed8269d7ca3c0838a069be159d8aee981e9ed58934caebf5c66db1193f70d\n",
  80. " Stored in directory: /root/.cache/pip/wheels/bc/be/39/c37ad168eb2ff644c9685f52554440372129450f0b8ed203dd\n",
  81. "Successfully built flashtext\n",
  82. "Installing collected packages: python-crfsuite, flashtext, pybind11, numpy, fasttext-wheel, hazm\n",
  83. " Attempting uninstall: numpy\n",
  84. " Found existing installation: numpy 1.25.2\n",
  85. " Uninstalling numpy-1.25.2:\n",
  86. " Successfully uninstalled numpy-1.25.2\n",
  87. "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  88. "pandas-stubs 2.0.3.230814 requires numpy>=1.25.0; python_version >= \"3.9\", but you have numpy 1.24.3 which is incompatible.\u001b[0m\u001b[31m\n",
  89. "\u001b[0mSuccessfully installed fasttext-wheel-0.9.2 flashtext-2.7 hazm-0.10.0 numpy-1.24.3 pybind11-2.12.0 python-crfsuite-0.9.10\n"
  90. ]
  91. },
  92. {
  93. "output_type": "display_data",
  94. "data": {
  95. "application/vnd.colab-display-data+json": {
  96. "pip_warning": {
  97. "packages": [
  98. "numpy"
  99. ]
  100. },
  101. "id": "3a39f6c901ec42aa8fcdf3a3439f33ec"
  102. }
  103. },
  104. "metadata": {}
  105. }
  106. ]
  107. },
  108. {
  109. "cell_type": "code",
  110. "source": [
  111. "! pip install pydub"
  112. ],
  113. "metadata": {
  114. "colab": {
  115. "base_uri": "https://localhost:8080/"
  116. },
  117. "id": "cFSxcaOjW1ij",
  118. "outputId": "be7f68a5-f4d1-4511-e750-842bb4fdfa69"
  119. },
  120. "execution_count": 1,
  121. "outputs": [
  122. {
  123. "output_type": "stream",
  124. "name": "stdout",
  125. "text": [
  126. "Collecting pydub\n",
  127. " Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
  128. "Installing collected packages: pydub\n",
  129. "Successfully installed pydub-0.25.1\n"
  130. ]
  131. }
  132. ]
  133. },
  134. {
  135. "cell_type": "markdown",
  136. "source": [
  137. "## Setup parsi io"
  138. ],
  139. "metadata": {
  140. "id": "NVlJhS9_cHaJ"
  141. }
  142. },
  143. {
  144. "cell_type": "code",
  145. "source": [
  146. "! git clone https://github.com/language-ml/parsi.io.git"
  147. ],
  148. "metadata": {
  149. "colab": {
  150. "base_uri": "https://localhost:8080/"
  151. },
  152. "id": "lrPGOeS6Zz-b",
  153. "outputId": "6017ed27-4953-4155-a9c8-be5451d8ae70"
  154. },
  155. "execution_count": 2,
  156. "outputs": [
  157. {
  158. "output_type": "stream",
  159. "name": "stdout",
  160. "text": [
  161. "fatal: destination path 'parsi.io' already exists and is not an empty directory.\n"
  162. ]
  163. }
  164. ]
  165. },
  166. {
  167. "cell_type": "code",
  168. "source": [
  169. "mv parsi.io parsi_io"
  170. ],
  171. "metadata": {
  172. "id": "eEROzSSgaAUK"
  173. },
  174. "execution_count": 3,
  175. "outputs": []
  176. },
  177. {
  178. "cell_type": "markdown",
  179. "metadata": {
  180. "id": "ILGxoNVgwDPy"
  181. },
  182. "source": [
  183. "## Setup Perpos POS Tagger"
  184. ]
  185. },
  186. {
  187. "cell_type": "code",
  188. "execution_count": 4,
  189. "metadata": {
  190. "colab": {
  191. "base_uri": "https://localhost:8080/"
  192. },
  193. "id": "4d2fX0Nkx-c_",
  194. "outputId": "939fbbc7-4994-4818-d80b-e533673c811e"
  195. },
  196. "outputs": [
  197. {
  198. "output_type": "stream",
  199. "name": "stdout",
  200. "text": [
  201. "Collecting sklearn-crfsuite\n",
  202. " Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)\n",
  203. "Requirement already satisfied: python-crfsuite>=0.8.3 in /usr/local/lib/python3.10/dist-packages (from sklearn-crfsuite) (0.9.10)\n",
  204. "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from sklearn-crfsuite) (1.16.0)\n",
  205. "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from sklearn-crfsuite) (0.9.0)\n",
  206. "Requirement already satisfied: tqdm>=2.0 in /usr/local/lib/python3.10/dist-packages (from sklearn-crfsuite) (4.66.4)\n",
  207. "Installing collected packages: sklearn-crfsuite\n",
  208. "Successfully installed sklearn-crfsuite-0.3.6\n"
  209. ]
  210. }
  211. ],
  212. "source": [
  213. "! pip install sklearn-crfsuite"
  214. ]
  215. },
  216. {
  217. "cell_type": "code",
  218. "execution_count": 5,
  219. "metadata": {
  220. "colab": {
  221. "base_uri": "https://localhost:8080/"
  222. },
  223. "id": "3FC4zlmoxOD2",
  224. "outputId": "e2094ec6-522d-4932-ffb9-228a7665dab1"
  225. },
  226. "outputs": [
  227. {
  228. "output_type": "stream",
  229. "name": "stdout",
  230. "text": [
  231. "Cloning into 'perpos'...\n",
  232. "remote: Enumerating objects: 41, done.\u001b[K\n",
  233. "remote: Total 41 (delta 0), reused 0 (delta 0), pack-reused 41\u001b[K\n",
  234. "Receiving objects: 100% (41/41), 17.33 MiB | 8.56 MiB/s, done.\n",
  235. "Resolving deltas: 100% (11/11), done.\n"
  236. ]
  237. }
  238. ],
  239. "source": [
  240. "! git clone https://github.com/mhbashari/perpos.git"
  241. ]
  242. },
  243. {
  244. "cell_type": "code",
  245. "execution_count": 6,
  246. "metadata": {
  247. "id": "1wT7lcQLxTkd"
  248. },
  249. "outputs": [],
  250. "source": [
  251. "import string\n",
  252. "\n",
  253. "from nltk import tree2conlltags\n",
  254. "\n",
  255. "\n",
  256. "def read_conll(path, col=2):\n",
  257. " with open(path, \"r\", encoding=\"utf-8\") as conll:\n",
  258. " out = []\n",
  259. " for sent in conll.readlines():\n",
  260. " split = sent.strip(\"\\r\\n\").split()\n",
  261. " if len(split) > 1:\n",
  262. " none_token_count = col - 1\n",
  263. " new_elem = split[-1:]\n",
  264. " new_elem = split[:none_token_count] + new_elem\n",
  265. " out.append(new_elem)\n",
  266. "\n",
  267. " else:\n",
  268. " yield out\n",
  269. " out = []\n",
  270. "\n",
  271. "\n",
  272. "def template(word):\n",
  273. " return \"\".join([(lambda item: \"x\" if not item in \"آایو\" else \"a\")(char) for char in word])\n",
  274. "\n",
  275. "\n",
  276. "def isdigit(word):\n",
  277. " return all(map(lambda char: char in \"۱۲۳۴۵۶۷۸۹۰1234567890.\", word))\n",
  278. "\n",
  279. "\n",
  280. "def ngram(word, leng=2):\n",
  281. " for i in range(len(word) - 1):\n",
  282. " yield 'word[' + str(i) + \":\" + str(i + leng) + \"]\", word[i:i + leng]\n",
  283. "\n",
  284. "\n",
  285. "def tree2brackets(tree):\n",
  286. " str, tag = '', ''\n",
  287. " for item in tree2conlltags(tree):\n",
  288. " if item[2][0] in {'B', 'O'} and tag:\n",
  289. " str += tag + '] '\n",
  290. " tag = ''\n",
  291. "\n",
  292. " if item[2][0] == 'B':\n",
  293. " tag = item[2].split('-')[1]\n",
  294. " str += '['\n",
  295. " str += item[0] + ' '\n",
  296. "\n",
  297. " if tag:\n",
  298. " str += tag + '] '\n",
  299. "\n",
  300. " return str.strip()\n",
  301. "\n",
  302. "def word2features(sent, i):\n",
  303. " W = sent[i]\n",
  304. " features = {\n",
  305. " 'B': 1.0,\n",
  306. " 'W': W,\n",
  307. " 'P': W in string.punctuation,\n",
  308. " 'T': template(W),\n",
  309. " 'D(W)': isdigit(W),\n",
  310. " }\n",
  311. " for leng in range(max(4 + 1, len(W)) + 1):\n",
  312. " for k, v in ngram(W, leng=leng):\n",
  313. " features[k] = v\n",
  314. " if i > 0:\n",
  315. " W = sent[i - 1][0]\n",
  316. " features.update({\n",
  317. " '-1W[-3': W[-3:],\n",
  318. " '-1W[-2': W[-2:],\n",
  319. " '-1W[-1': W[-1:],\n",
  320. " '-1W': W,\n",
  321. " '-1W0W': W + sent[i],\n",
  322. " '-1P': W in string.punctuation,\n",
  323. " '-1T': template(W)\n",
  324. " })\n",
  325. " else:\n",
  326. " features['BOS'] = True\n",
  327. " if i > 1:\n",
  328. " W = sent[i - 2][0]\n",
  329. " features.update({\n",
  330. " '-2W[-3': W[-3:],\n",
  331. " '-2W[-2': W[-2:],\n",
  332. " '-2W[-1': W[-1:],\n",
  333. " '-2P': W in string.punctuation,\n",
  334. " '-2T': template(W)\n",
  335. " })\n",
  336. "\n",
  337. " if i < len(sent) - 2:\n",
  338. " W = sent[i + 2][0]\n",
  339. " features.update({\n",
  340. " '+2W[-1': W[-1:],\n",
  341. " '+2W[-2': W[-2:],\n",
  342. " '+2W': W,\n",
  343. " '+2P': W in string.punctuation,\n",
  344. " '+2T': template(W)\n",
  345. " })\n",
  346. " if i < len(sent) - 1:\n",
  347. " W = sent[i + 1][0]\n",
  348. " features.update({\n",
  349. " '+1W[-1': W[-1:],\n",
  350. " '+1W': W,\n",
  351. " '+1W0W': W + sent[i],\n",
  352. " '+1W[-2': W[-2:],\n",
  353. " '+1:P': W in string.punctuation,\n",
  354. " '+1:T': template(W)\n",
  355. " })\n",
  356. " else:\n",
  357. " features['EOS'] = True\n",
  358. " if 0 < i < len(sent) - 1:\n",
  359. " features['-1W/+1W'] = sent[i + 1][0] + \"/\" + sent[i - 1][0]\n",
  360. " return features\n",
  361. "\n",
  362. "\n",
  363. "def token2features(token_list):\n",
  364. " return [word2features(token_list, i) for i in range(len(token_list))]\n",
  365. "\n",
  366. "\n",
  367. "def sent2labels(sent):\n",
  368. " return [postag for token, postag in sent]\n",
  369. "\n",
  370. "\n",
  371. "def sent2tokens(sent):\n",
  372. " return [token for token, postag in sent]\n",
  373. "\n",
  374. "\n",
  375. "import pickle\n",
  376. "\n",
  377. "class POSTagger:\n",
  378. " def __init__(self, model_path):\n",
  379. " self.model_path = model_path\n",
  380. " self.crf = pickle.load(open(model_path, \"rb\"))\n",
  381. "\n",
  382. " def parse(self, token_stream):\n",
  383. " return self.parse_sentences([token_stream])[0]\n",
  384. "\n",
  385. " def parse_sentences(self, list_of_token_stream):\n",
  386. " X_test = [token2features(s) for s in list_of_token_stream]\n",
  387. " y_pred = self.crf.predict(X_test)\n",
  388. " out = []\n",
  389. " for x_sent, y_pred in zip(list_of_token_stream, y_pred):\n",
  390. " out.append(list(zip(x_sent, y_pred)))\n",
  391. " return out\n"
  392. ]
  393. },
  394. {
  395. "cell_type": "code",
  396. "execution_count": 7,
  397. "metadata": {
  398. "id": "HhkXRbAoxcoe"
  399. },
  400. "outputs": [],
  401. "source": [
  402. "pos_tagger = POSTagger(\"perpos/model/perpos.model\")"
  403. ]
  404. },
  405. {
  406. "cell_type": "markdown",
  407. "metadata": {
  408. "id": "3AcYoP960ry-"
  409. },
  410. "source": [
  411. "## Setup Aeneas"
  412. ]
  413. },
  414. {
  415. "cell_type": "code",
  416. "execution_count": 8,
  417. "metadata": {
  418. "colab": {
  419. "base_uri": "https://localhost:8080/"
  420. },
  421. "id": "wUyP7wCo1J59",
  422. "outputId": "62623d7a-1e9d-4324-a0e0-2bd60cae8d5b"
  423. },
  424. "outputs": [
  425. {
  426. "output_type": "stream",
  427. "name": "stdout",
  428. "text": [
  429. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.8/16.8 MB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  430. "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
  431. "chex 0.1.86 requires numpy>=1.24.1, but you have numpy 1.22.4 which is incompatible.\n",
  432. "cudf-cu12 24.4.1 requires numpy<2.0a0,>=1.23, but you have numpy 1.22.4 which is incompatible.\n",
  433. "hazm 0.10.0 requires numpy==1.24.3, but you have numpy 1.22.4 which is incompatible.\n",
  434. "pandas-stubs 2.0.3.230814 requires numpy>=1.25.0; python_version >= \"3.9\", but you have numpy 1.22.4 which is incompatible.\n",
  435. "plotnine 0.12.4 requires numpy>=1.23.0, but you have numpy 1.22.4 which is incompatible.\n",
  436. "rmm-cu12 24.4.0 requires numpy<2.0a0,>=1.23, but you have numpy 1.22.4 which is incompatible.\n",
  437. "tensorflow 2.15.0 requires numpy<2.0.0,>=1.23.5, but you have numpy 1.22.4 which is incompatible.\u001b[0m\u001b[31m\n",
  438. "Reading package lists... Done\n",
  439. "Building dependency tree... Done\n",
  440. "Reading state information... Done\n",
  441. "ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).\n",
  442. "0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.\n",
  443. "Reading package lists... Done\n",
  444. "Building dependency tree... Done\n",
  445. "Reading state information... Done\n",
  446. "The following additional packages will be installed:\n",
  447. " espeak-data libespeak1 libportaudio2 libsonic0\n",
  448. "The following NEW packages will be installed:\n",
  449. " espeak espeak-data libespeak1 libportaudio2 libsonic0\n",
  450. "0 upgraded, 5 newly installed, 0 to remove and 45 not upgraded.\n",
  451. "Need to get 1,382 kB of archives.\n",
  452. "After this operation, 3,178 kB of additional disk space will be used.\n",
  453. "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudio2 amd64 19.6.0-1.1 [65.3 kB]\n",
  454. "Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]\n",
  455. "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 espeak-data amd64 1.48.15+dfsg-3 [1,085 kB]\n",
  456. "Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libespeak1 amd64 1.48.15+dfsg-3 [156 kB]\n",
  457. "Get:5 http://archive.ubuntu.com/ubuntu jammy/universe amd64 espeak amd64 1.48.15+dfsg-3 [64.2 kB]\n",
  458. "Fetched 1,382 kB in 1s (1,302 kB/s)\n",
  459. "Selecting previously unselected package libportaudio2:amd64.\n",
  460. "(Reading database ... 121918 files and directories currently installed.)\n",
  461. "Preparing to unpack .../libportaudio2_19.6.0-1.1_amd64.deb ...\n",
  462. "Unpacking libportaudio2:amd64 (19.6.0-1.1) ...\n",
  463. "Selecting previously unselected package libsonic0:amd64.\n",
  464. "Preparing to unpack .../libsonic0_0.2.0-11build1_amd64.deb ...\n",
  465. "Unpacking libsonic0:amd64 (0.2.0-11build1) ...\n",
  466. "Selecting previously unselected package espeak-data:amd64.\n",
  467. "Preparing to unpack .../espeak-data_1.48.15+dfsg-3_amd64.deb ...\n",
  468. "Unpacking espeak-data:amd64 (1.48.15+dfsg-3) ...\n",
  469. "Selecting previously unselected package libespeak1:amd64.\n",
  470. "Preparing to unpack .../libespeak1_1.48.15+dfsg-3_amd64.deb ...\n",
  471. "Unpacking libespeak1:amd64 (1.48.15+dfsg-3) ...\n",
  472. "Selecting previously unselected package espeak.\n",
  473. "Preparing to unpack .../espeak_1.48.15+dfsg-3_amd64.deb ...\n",
  474. "Unpacking espeak (1.48.15+dfsg-3) ...\n",
  475. "Setting up libportaudio2:amd64 (19.6.0-1.1) ...\n",
  476. "Setting up libsonic0:amd64 (0.2.0-11build1) ...\n",
  477. "Setting up espeak-data:amd64 (1.48.15+dfsg-3) ...\n",
  478. "Setting up libespeak1:amd64 (1.48.15+dfsg-3) ...\n",
  479. "Setting up espeak (1.48.15+dfsg-3) ...\n",
  480. "Processing triggers for man-db (2.10.2-1) ...\n",
  481. "Processing triggers for libc-bin (2.35-0ubuntu3.4) ...\n",
  482. "/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
  483. "\n",
  484. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
  485. "\n",
  486. "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
  487. "\n",
  488. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
  489. "\n",
  490. "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
  491. "\n",
  492. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
  493. "\n",
  494. "Reading package lists... Done\n",
  495. "Building dependency tree... Done\n",
  496. "Reading state information... Done\n",
  497. "libgdal-dev is already the newest version (3.6.4+dfsg-1~jammy0).\n",
  498. "0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.\n"
  499. ]
  500. }
  501. ],
  502. "source": [
  503. "!pip install -q numpy==1.22.4\n",
  504. "!apt-get install ffmpeg\n",
  505. "!apt-get install espeak\n",
  506. "!pip install -q beautifulsoup4\n",
  507. "!pip install -q lxml\n",
  508. "!apt-get install libgdal-dev"
  509. ]
  510. },
  511. {
  512. "cell_type": "code",
  513. "execution_count": 9,
  514. "metadata": {
  515. "colab": {
  516. "base_uri": "https://localhost:8080/"
  517. },
  518. "id": "ibxBKfbQ1MAv",
  519. "outputId": "adbe9c70-6bef-4b7f-b421-e4aecd0c5182"
  520. },
  521. "outputs": [
  522. {
  523. "output_type": "stream",
  524. "name": "stdout",
  525. "text": [
  526. "--2024-06-03 15:43:07-- https://raw.githubusercontent.com/readbeyond/aeneas/master/install_dependencies.sh\n",
  527. "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
  528. "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
  529. "HTTP request sent, awaiting response... 200 OK\n",
  530. "Length: 2759 (2.7K) [text/plain]\n",
  531. "Saving to: ‘install_dependencies.sh’\n",
  532. "\n",
  533. "install_dependencie 100%[===================>] 2.69K --.-KB/s in 0s \n",
  534. "\n",
  535. "2024-06-03 15:43:07 (32.3 MB/s) - ‘install_dependencies.sh’ saved [2759/2759]\n",
  536. "\n",
  537. "[INFO] A.1 Adding deb-multimedia to apt sources...\n",
  538. "[INFO] A.1 Adding deb-multimedia to apt sources... done\n",
  539. "[INFO] A.2 Updating apt...\n",
  540. "Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
  541. "Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
  542. "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]\n",
  543. "Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
  544. "Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n",
  545. "Ign:6 http://www.deb-multimedia.org jessie InRelease\n",
  546. "Err:7 http://www.deb-multimedia.org jessie Release\n",
  547. " 404 Not Found [IP: 188.165.241.192 80]\n",
  548. "Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease\n",
  549. "Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
  550. "Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n",
  551. "Get:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]\n",
  552. "Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,858 kB]\n",
  553. "Hit:13 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
  554. "Get:14 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,084 kB]\n",
  555. "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,377 kB]\n",
  556. "Get:16 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy/main amd64 Packages [47.6 kB]\n",
  557. "Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,129 kB]\n",
  558. "Reading package lists... Done\n",
  559. "E: The repository 'http://www.deb-multimedia.org jessie Release' does not have a Release file.\n",
  560. "N: Updating from such a repository can't be done securely, and is therefore disabled by default.\n",
  561. "N: See apt-secure(8) manpage for repository creation and user configuration details.\n",
  562. "[INFO] A.2 Updating apt... done\n",
  563. "[INFO] A.3 Downloading and installing deb-multimedia keyring...\n",
  564. "Reading package lists... Done\n",
  565. "Building dependency tree... Done\n",
  566. "Reading state information... Done\n",
  567. "W: --force-yes is deprecated, use one of the options starting with --allow instead.\n",
  568. "E: Unable to locate package deb-multimedia-keyring\n",
  569. "[INFO] A.3 Downloading and installing deb-multimedia keyring... done\n",
  570. "[INFO] A.4 Updating apt...\n",
  571. "Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease\n",
  572. "Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
  573. "Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease\n",
  574. "Ign:4 http://www.deb-multimedia.org jessie InRelease\n",
  575. "Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
  576. "Err:6 http://www.deb-multimedia.org jessie Release\n",
  577. " 404 Not Found [IP: 188.165.241.192 80]\n",
  578. "Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease\n",
  579. "Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease\n",
  580. "Hit:9 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease\n",
  581. "Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
  582. "Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
  583. "Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
  584. "Reading package lists... Done\n",
  585. "E: The repository 'http://www.deb-multimedia.org jessie Release' does not have a Release file.\n",
  586. "N: Updating from such a repository can't be done securely, and is therefore disabled by default.\n",
  587. "N: See apt-secure(8) manpage for repository creation and user configuration details.\n",
  588. "[INFO] A.4 Updating apt... done\n",
  589. "[INFO] B.1 Installing ffmpeg (from deb-multimedia)...\n",
  590. "Reading package lists... Done\n",
  591. "Building dependency tree... Done\n",
  592. "Reading state information... Done\n",
  593. "ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).\n",
  594. "0 upgraded, 0 newly installed, 0 to remove and 51 not upgraded.\n",
  595. "W: --force-yes is deprecated, use one of the options starting with --allow instead.\n",
  596. "[INFO] B.1 Installing ffmpeg (from deb-multimedia)... done\n",
  597. "[INFO] B.2 Installing espeak...\n",
  598. "Reading package lists... Done\n",
  599. "Building dependency tree... Done\n",
  600. "Reading state information... Done\n",
  601. "espeak is already the newest version (1.48.15+dfsg-3).\n",
  602. "espeak-data is already the newest version (1.48.15+dfsg-3).\n",
  603. "espeak-data set to manually installed.\n",
  604. "libespeak1 is already the newest version (1.48.15+dfsg-3).\n",
  605. "libespeak1 set to manually installed.\n",
  606. "The following NEW packages will be installed:\n",
  607. " libespeak-dev\n",
  608. "0 upgraded, 1 newly installed, 0 to remove and 51 not upgraded.\n",
  609. "Need to get 193 kB of archives.\n",
  610. "After this operation, 624 kB of additional disk space will be used.\n",
  611. "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libespeak-dev amd64 1.48.15+dfsg-3 [193 kB]\n",
  612. "Fetched 193 kB in 1s (324 kB/s)\n",
  613. "debconf: unable to initialize frontend: Dialog\n",
  614. "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)\n",
  615. "debconf: falling back to frontend: Readline\n",
  616. "debconf: unable to initialize frontend: Readline\n",
  617. "debconf: (This frontend requires a controlling tty.)\n",
  618. "debconf: falling back to frontend: Teletype\n",
  619. "dpkg-preconfigure: unable to re-open stdin: \n",
  620. "Selecting previously unselected package libespeak-dev:amd64.\n",
  621. "(Reading database ... 122248 files and directories currently installed.)\n",
  622. "Preparing to unpack .../libespeak-dev_1.48.15+dfsg-3_amd64.deb ...\n",
  623. "Unpacking libespeak-dev:amd64 (1.48.15+dfsg-3) ...\n",
  624. "Setting up libespeak-dev:amd64 (1.48.15+dfsg-3) ...\n",
  625. "[INFO] B.2 Installing espeak... done\n",
  626. "[INFO] B.3 Installing festival...\n",
  627. "Reading package lists... Done\n",
  628. "Building dependency tree... Done\n",
  629. "Reading state information... Done\n",
  630. "Note, selecting 'festival-voice' for glob 'festival*'\n",
  631. "Note, selecting 'festival-catalan-voice' for glob 'festival*'\n",
  632. "Note, selecting 'festival-czech' for glob 'festival*'\n",
  633. "Note, selecting 'festival-ca' for glob 'festival*'\n",
  634. "Note, selecting 'festival-hi' for glob 'festival*'\n",
  635. "Note, selecting 'festival-mr' for glob 'festival*'\n",
  636. "Note, selecting 'festival-te' for glob 'festival*'\n",
  637. "Note, selecting 'festival-freebsoft-utils' for glob 'festival*'\n",
  638. "Note, selecting 'festival-dev' for glob 'festival*'\n",
  639. "Note, selecting 'festival-doc' for glob 'festival*'\n",
  640. "Note, selecting 'festival' for glob 'festival*'\n",
  641. "Note, selecting 'festvox-ca-ona-hts' instead of 'festival-catalan-voice'\n",
  642. "The following additional packages will be installed:\n",
  643. " alsa-utils doc-base festlex-cmu festlex-poslex festvox-czech-ph\n",
  644. " festvox-hi-nsk festvox-kallpc16k festvox-mr-nsk festvox-te-nsk libatopology2\n",
  645. " libestools-dev libestools2.5 libfftw3-single3 libopencore-amrnb0\n",
  646. " libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libuuid-perl\n",
  647. " libwavpack1 libyaml-tiny-perl sgml-base sox\n",
  648. "Suggested packages:\n",
  649. " dialog dhelp | dwww | dochelp | doc-central | yelp | khelpcenter\n",
  650. " pidgin-festival speech-tools-doc libfftw3-bin libfftw3-dev libsox-fmt-all\n",
  651. " sgml-base-doc\n",
  652. "The following NEW packages will be installed:\n",
  653. " alsa-utils doc-base festival festival-ca festival-czech festival-dev\n",
  654. " festival-doc festival-freebsoft-utils festival-hi festival-mr festival-te\n",
  655. " festlex-cmu festlex-poslex festvox-ca-ona-hts festvox-czech-ph\n",
  656. " festvox-hi-nsk festvox-kallpc16k festvox-mr-nsk festvox-te-nsk libatopology2\n",
  657. " libestools-dev libestools2.5 libfftw3-single3 libopencore-amrnb0\n",
  658. " libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libuuid-perl\n",
  659. " libwavpack1 libyaml-tiny-perl sgml-base sox\n",
  660. "0 upgraded, 33 newly installed, 0 to remove and 51 not upgraded.\n",
  661. "Need to get 50.5 MB of archives.\n",
  662. "After this operation, 139 MB of additional disk space will be used.\n",
  663. "Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 sgml-base all 1.30 [12.5 kB]\n",
  664. "Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libatopology2 amd64 1.2.6.1-1ubuntu1 [51.3 kB]\n",
  665. "Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libfftw3-single3 amd64 3.3.8-2ubuntu8 [800 kB]\n",
  666. "Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 alsa-utils amd64 1.2.6-1ubuntu1 [1,177 kB]\n",
  667. "Get:5 http://archive.ubuntu.com/ubuntu jammy/main amd64 libuuid-perl amd64 0.28-1build4 [15.9 kB]\n",
  668. "Get:6 http://archive.ubuntu.com/ubuntu jammy/main amd64 libyaml-tiny-perl all 1.73-1 [25.2 kB]\n",
  669. "Get:7 http://archive.ubuntu.com/ubuntu jammy/main amd64 doc-base all 0.11.1 [79.4 kB]\n",
  670. "Get:8 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libestools2.5 amd64 1:2.5.0-12 [1,038 kB]\n",
  671. "Get:9 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival amd64 1:2.5.0-8 [751 kB]\n",
  672. "Get:10 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-ca all 3.0.6-2 [2,695 kB]\n",
  673. "Get:11 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-czech all 0.3-5 [31.5 kB]\n",
  674. "Get:12 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-doc all 1:2.5.0-8 [648 kB]\n",
  675. "Get:13 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-freebsoft-utils all 0.10-9 [48.0 kB]\n",
  676. "Get:14 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festvox-hi-nsk all 0.1-10 [7,538 kB]\n",
  677. "Get:15 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-hi all 0.1-10 [16.8 kB]\n",
  678. "Get:16 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festvox-mr-nsk all 0.1-10 [7,543 kB]\n",
  679. "Get:17 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-mr all 0.1-10 [17.3 kB]\n",
  680. "Get:18 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festvox-te-nsk all 0.3.3-5 [7,547 kB]\n",
  681. "Get:19 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-te all 0.3.3-5 [13.3 kB]\n",
  682. "Get:20 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festlex-cmu all 2.4-2 [895 kB]\n",
  683. "Get:21 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festlex-poslex all 2.4-1 [186 kB]\n",
  684. "Get:22 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festvox-ca-ona-hts all 1.3-3 [3,583 kB]\n",
  685. "Get:23 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festvox-czech-ph all 0.1-6 [9,573 kB]\n",
  686. "Get:24 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libestools-dev amd64 1:2.5.0-12 [1,408 kB]\n",
  687. "Get:25 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopencore-amrnb0 amd64 0.1.5-1 [94.8 kB]\n",
  688. "Get:26 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopencore-amrwb0 amd64 0.1.5-1 [49.1 kB]\n",
  689. "Get:27 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libsox3 amd64 14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1 [240 kB]\n",
  690. "Get:28 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libsox-fmt-alsa amd64 14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1 [11.2 kB]\n",
  691. "Get:29 http://archive.ubuntu.com/ubuntu jammy/main amd64 libwavpack1 amd64 5.4.0-1build2 [83.7 kB]\n",
  692. "Get:30 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libsox-fmt-base amd64 14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1 [33.7 kB]\n",
  693. "Get:31 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 sox amd64 14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1 [104 kB]\n",
  694. "Get:32 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festival-dev amd64 1:2.5.0-8 [606 kB]\n",
  695. "Get:33 http://archive.ubuntu.com/ubuntu jammy/universe amd64 festvox-kallpc16k all 2.4-1 [3,614 kB]\n",
  696. "Fetched 50.5 MB in 1s (43.4 MB/s)\n",
  697. "debconf: unable to initialize frontend: Dialog\n",
  698. "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 33.)\n",
  699. "debconf: falling back to frontend: Readline\n",
  700. "debconf: unable to initialize frontend: Readline\n",
  701. "debconf: (This frontend requires a controlling tty.)\n",
  702. "debconf: falling back to frontend: Teletype\n",
  703. "dpkg-preconfigure: unable to re-open stdin: \n",
  704. "Selecting previously unselected package sgml-base.\n",
  705. "(Reading database ... 122255 files and directories currently installed.)\n",
  706. "Preparing to unpack .../00-sgml-base_1.30_all.deb ...\n",
  707. "Unpacking sgml-base (1.30) ...\n",
  708. "Selecting previously unselected package libatopology2:amd64.\n",
  709. "Preparing to unpack .../01-libatopology2_1.2.6.1-1ubuntu1_amd64.deb ...\n",
  710. "Unpacking libatopology2:amd64 (1.2.6.1-1ubuntu1) ...\n",
  711. "Selecting previously unselected package libfftw3-single3:amd64.\n",
  712. "Preparing to unpack .../02-libfftw3-single3_3.3.8-2ubuntu8_amd64.deb ...\n",
  713. "Unpacking libfftw3-single3:amd64 (3.3.8-2ubuntu8) ...\n",
  714. "Selecting previously unselected package alsa-utils.\n",
  715. "Preparing to unpack .../03-alsa-utils_1.2.6-1ubuntu1_amd64.deb ...\n",
  716. "Unpacking alsa-utils (1.2.6-1ubuntu1) ...\n",
  717. "Selecting previously unselected package libuuid-perl.\n",
  718. "Preparing to unpack .../04-libuuid-perl_0.28-1build4_amd64.deb ...\n",
  719. "Unpacking libuuid-perl (0.28-1build4) ...\n",
  720. "Selecting previously unselected package libyaml-tiny-perl.\n",
  721. "Preparing to unpack .../05-libyaml-tiny-perl_1.73-1_all.deb ...\n",
  722. "Unpacking libyaml-tiny-perl (1.73-1) ...\n",
  723. "Selecting previously unselected package doc-base.\n",
  724. "Preparing to unpack .../06-doc-base_0.11.1_all.deb ...\n",
  725. "Unpacking doc-base (0.11.1) ...\n",
  726. "Selecting previously unselected package libestools2.5:amd64.\n",
  727. "Preparing to unpack .../07-libestools2.5_1%3a2.5.0-12_amd64.deb ...\n",
  728. "Unpacking libestools2.5:amd64 (1:2.5.0-12) ...\n",
  729. "Selecting previously unselected package festival.\n",
  730. "Preparing to unpack .../08-festival_1%3a2.5.0-8_amd64.deb ...\n",
  731. "Unpacking festival (1:2.5.0-8) ...\n",
  732. "Selecting previously unselected package festival-ca.\n",
  733. "Preparing to unpack .../09-festival-ca_3.0.6-2_all.deb ...\n",
  734. "Unpacking festival-ca (3.0.6-2) ...\n",
  735. "Selecting previously unselected package festival-czech.\n",
  736. "Preparing to unpack .../10-festival-czech_0.3-5_all.deb ...\n",
  737. "Unpacking festival-czech (0.3-5) ...\n",
  738. "Selecting previously unselected package festival-doc.\n",
  739. "Preparing to unpack .../11-festival-doc_1%3a2.5.0-8_all.deb ...\n",
  740. "Unpacking festival-doc (1:2.5.0-8) ...\n",
  741. "Selecting previously unselected package festival-freebsoft-utils.\n",
  742. "Preparing to unpack .../12-festival-freebsoft-utils_0.10-9_all.deb ...\n",
  743. "Unpacking festival-freebsoft-utils (0.10-9) ...\n",
  744. "Selecting previously unselected package festvox-hi-nsk.\n",
  745. "Preparing to unpack .../13-festvox-hi-nsk_0.1-10_all.deb ...\n",
  746. "Unpacking festvox-hi-nsk (0.1-10) ...\n",
  747. "Selecting previously unselected package festival-hi.\n",
  748. "Preparing to unpack .../14-festival-hi_0.1-10_all.deb ...\n",
  749. "Unpacking festival-hi (0.1-10) ...\n",
  750. "Selecting previously unselected package festvox-mr-nsk.\n",
  751. "Preparing to unpack .../15-festvox-mr-nsk_0.1-10_all.deb ...\n",
  752. "Unpacking festvox-mr-nsk (0.1-10) ...\n",
  753. "Selecting previously unselected package festival-mr.\n",
  754. "Preparing to unpack .../16-festival-mr_0.1-10_all.deb ...\n",
  755. "Unpacking festival-mr (0.1-10) ...\n",
  756. "Selecting previously unselected package festvox-te-nsk.\n",
  757. "Preparing to unpack .../17-festvox-te-nsk_0.3.3-5_all.deb ...\n",
  758. "Unpacking festvox-te-nsk (0.3.3-5) ...\n",
  759. "Selecting previously unselected package festival-te.\n",
  760. "Preparing to unpack .../18-festival-te_0.3.3-5_all.deb ...\n",
  761. "Unpacking festival-te (0.3.3-5) ...\n",
  762. "Selecting previously unselected package festlex-cmu.\n",
  763. "Preparing to unpack .../19-festlex-cmu_2.4-2_all.deb ...\n",
  764. "Unpacking festlex-cmu (2.4-2) ...\n",
  765. "Selecting previously unselected package festlex-poslex.\n",
  766. "Preparing to unpack .../20-festlex-poslex_2.4-1_all.deb ...\n",
  767. "Unpacking festlex-poslex (2.4-1) ...\n",
  768. "Selecting previously unselected package festvox-ca-ona-hts.\n",
  769. "Preparing to unpack .../21-festvox-ca-ona-hts_1.3-3_all.deb ...\n",
  770. "Unpacking festvox-ca-ona-hts (1.3-3) ...\n",
  771. "Selecting previously unselected package festvox-czech-ph.\n",
  772. "Preparing to unpack .../22-festvox-czech-ph_0.1-6_all.deb ...\n",
  773. "Unpacking festvox-czech-ph (0.1-6) ...\n",
  774. "Selecting previously unselected package libestools-dev.\n",
  775. "Preparing to unpack .../23-libestools-dev_1%3a2.5.0-12_amd64.deb ...\n",
  776. "Unpacking libestools-dev (1:2.5.0-12) ...\n",
  777. "Selecting previously unselected package libopencore-amrnb0:amd64.\n",
  778. "Preparing to unpack .../24-libopencore-amrnb0_0.1.5-1_amd64.deb ...\n",
  779. "Unpacking libopencore-amrnb0:amd64 (0.1.5-1) ...\n",
  780. "Selecting previously unselected package libopencore-amrwb0:amd64.\n",
  781. "Preparing to unpack .../25-libopencore-amrwb0_0.1.5-1_amd64.deb ...\n",
  782. "Unpacking libopencore-amrwb0:amd64 (0.1.5-1) ...\n",
  783. "Selecting previously unselected package libsox3:amd64.\n",
  784. "Preparing to unpack .../26-libsox3_14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1_amd64.deb ...\n",
  785. "Unpacking libsox3:amd64 (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  786. "Selecting previously unselected package libsox-fmt-alsa:amd64.\n",
  787. "Preparing to unpack .../27-libsox-fmt-alsa_14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1_amd64.deb ...\n",
  788. "Unpacking libsox-fmt-alsa:amd64 (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  789. "Selecting previously unselected package libwavpack1:amd64.\n",
  790. "Preparing to unpack .../28-libwavpack1_5.4.0-1build2_amd64.deb ...\n",
  791. "Unpacking libwavpack1:amd64 (5.4.0-1build2) ...\n",
  792. "Selecting previously unselected package libsox-fmt-base:amd64.\n",
  793. "Preparing to unpack .../29-libsox-fmt-base_14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1_amd64.deb ...\n",
  794. "Unpacking libsox-fmt-base:amd64 (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  795. "Selecting previously unselected package sox.\n",
  796. "Preparing to unpack .../30-sox_14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1_amd64.deb ...\n",
  797. "Unpacking sox (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  798. "Selecting previously unselected package festival-dev.\n",
  799. "Preparing to unpack .../31-festival-dev_1%3a2.5.0-8_amd64.deb ...\n",
  800. "Unpacking festival-dev (1:2.5.0-8) ...\n",
  801. "Selecting previously unselected package festvox-kallpc16k.\n",
  802. "Preparing to unpack .../32-festvox-kallpc16k_2.4-1_all.deb ...\n",
  803. "Unpacking festvox-kallpc16k (2.4-1) ...\n",
  804. "Setting up libfftw3-single3:amd64 (3.3.8-2ubuntu8) ...\n",
  805. "Setting up libuuid-perl (0.28-1build4) ...\n",
  806. "Setting up libsox3:amd64 (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  807. "Setting up libestools2.5:amd64 (1:2.5.0-12) ...\n",
  808. "Setting up libyaml-tiny-perl (1.73-1) ...\n",
  809. "Setting up festvox-hi-nsk (0.1-10) ...\n",
  810. "Setting up libopencore-amrwb0:amd64 (0.1.5-1) ...\n",
  811. "Setting up festvox-mr-nsk (0.1-10) ...\n",
  812. "Setting up libsox-fmt-alsa:amd64 (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  813. "Setting up libwavpack1:amd64 (5.4.0-1build2) ...\n",
  814. "Setting up festvox-te-nsk (0.3.3-5) ...\n",
  815. "Setting up libopencore-amrnb0:amd64 (0.1.5-1) ...\n",
  816. "Setting up libestools-dev (1:2.5.0-12) ...\n",
  817. "Setting up sgml-base (1.30) ...\n",
  818. "Setting up libsox-fmt-base:amd64 (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  819. "Setting up festival-dev (1:2.5.0-8) ...\n",
  820. "Setting up libatopology2:amd64 (1.2.6.1-1ubuntu1) ...\n",
  821. "Setting up doc-base (0.11.1) ...\n",
  822. "Registering 22 doc-base files...\n",
  823. "Error in `/usr/share/doc-base/base-passwd.users-and-groups', line 13: all `Format' sections are invalid.\n",
  824. "Note: `install-docs --verbose --check file_name' may give more details about the above error.\n",
  825. "Setting up festival-doc (1:2.5.0-8) ...\n",
  826. "Setting up alsa-utils (1.2.6-1ubuntu1) ...\n",
  827. "Setting up festival (1:2.5.0-8) ...\n",
  828. "Setting up sox (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1) ...\n",
  829. "Processing triggers for libc-bin (2.35-0ubuntu3.4) ...\n",
  830. "/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
  831. "\n",
  832. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
  833. "\n",
  834. "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
  835. "\n",
  836. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
  837. "\n",
  838. "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
  839. "\n",
  840. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
  841. "\n",
  842. "Processing triggers for man-db (2.10.2-1) ...\n",
  843. "Processing triggers for sgml-base (1.30) ...\n",
  844. "Setting up festlex-poslex (2.4-1) ...\n",
  845. "Setting up festival-te (0.3.3-5) ...\n",
  846. "Setting up festival-hi (0.1-10) ...\n",
  847. "Setting up festival-mr (0.1-10) ...\n",
  848. "Setting up festlex-cmu (2.4-2) ...\n",
  849. "Setting up festival-czech (0.3-5) ...\n",
  850. "Setting up festvox-czech-ph (0.1-6) ...\n",
  851. "Setting up festival-ca (3.0.6-2) ...\n",
  852. "Setting up festival-freebsoft-utils (0.10-9) ...\n",
  853. "Setting up festvox-kallpc16k (2.4-1) ...\n",
  854. "Setting up festvox-ca-ona-hts (1.3-3) ...\n",
  855. "[INFO] B.3 Installing festival... done\n",
  856. "[INFO] B.4 Installing common libs using apt-get...\n",
  857. "Reading package lists... Done\n",
  858. "Building dependency tree... Done\n",
  859. "Reading state information... Done\n",
  860. "build-essential is already the newest version (12.9ubuntu3).\n",
  861. "0 upgraded, 0 newly installed, 0 to remove and 51 not upgraded.\n",
  862. "Reading package lists... Done\n",
  863. "Building dependency tree... Done\n",
  864. "Reading state information... Done\n",
  865. "libasound2-dev is already the newest version (1.2.6.1-1ubuntu1).\n",
  866. "libsndfile1-dev is already the newest version (1.0.31-2ubuntu0.1).\n",
  867. "Suggested packages:\n",
  868. " libaudio2 libsndio6.1\n",
  869. "The following NEW packages will be installed:\n",
  870. " flac libao-common libao4 vorbis-tools\n",
  871. "0 upgraded, 4 newly installed, 0 to remove and 51 not upgraded.\n",
  872. "Need to get 367 kB of archives.\n",
  873. "After this operation, 1,377 kB of additional disk space will be used.\n",
  874. "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 flac amd64 1.3.3-2ubuntu0.2 [130 kB]\n",
  875. "Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libao-common all 1.2.2+20180113-1.1ubuntu3 [6,568 B]\n",
  876. "Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libao4 amd64 1.2.2+20180113-1.1ubuntu3 [35.2 kB]\n",
  877. "Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 vorbis-tools amd64 1.4.2-1 [195 kB]\n",
  878. "Fetched 367 kB in 0s (1,239 kB/s)\n",
  879. "debconf: unable to initialize frontend: Dialog\n",
  880. "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 4.)\n",
  881. "debconf: falling back to frontend: Readline\n",
  882. "debconf: unable to initialize frontend: Readline\n",
  883. "debconf: (This frontend requires a controlling tty.)\n",
  884. "debconf: falling back to frontend: Teletype\n",
  885. "dpkg-preconfigure: unable to re-open stdin: \n",
  886. "Selecting previously unselected package flac.\n",
  887. "(Reading database ... 123430 files and directories currently installed.)\n",
  888. "Preparing to unpack .../flac_1.3.3-2ubuntu0.2_amd64.deb ...\n",
  889. "Unpacking flac (1.3.3-2ubuntu0.2) ...\n",
  890. "Selecting previously unselected package libao-common.\n",
  891. "Preparing to unpack .../libao-common_1.2.2+20180113-1.1ubuntu3_all.deb ...\n",
  892. "Unpacking libao-common (1.2.2+20180113-1.1ubuntu3) ...\n",
  893. "Selecting previously unselected package libao4:amd64.\n",
  894. "Preparing to unpack .../libao4_1.2.2+20180113-1.1ubuntu3_amd64.deb ...\n",
  895. "Unpacking libao4:amd64 (1.2.2+20180113-1.1ubuntu3) ...\n",
  896. "Selecting previously unselected package vorbis-tools.\n",
  897. "Preparing to unpack .../vorbis-tools_1.4.2-1_amd64.deb ...\n",
  898. "Unpacking vorbis-tools (1.4.2-1) ...\n",
  899. "Setting up libao-common (1.2.2+20180113-1.1ubuntu3) ...\n",
  900. "Setting up flac (1.3.3-2ubuntu0.2) ...\n",
  901. "Setting up libao4:amd64 (1.2.2+20180113-1.1ubuntu3) ...\n",
  902. "Setting up vorbis-tools (1.4.2-1) ...\n",
  903. "Processing triggers for man-db (2.10.2-1) ...\n",
  904. "Processing triggers for libc-bin (2.35-0ubuntu3.4) ...\n",
  905. "/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
  906. "\n",
  907. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
  908. "\n",
  909. "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
  910. "\n",
  911. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
  912. "\n",
  913. "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
  914. "\n",
  915. "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
  916. "\n",
  917. "Reading package lists... Done\n",
  918. "Building dependency tree... Done\n",
  919. "Reading state information... Done\n",
  920. "Note, selecting 'libxslt1-dev' instead of 'libxslt-dev'\n",
  921. "libxml2-dev is already the newest version (2.9.13+dfsg-1ubuntu0.4).\n",
  922. "zlib1g-dev is already the newest version (1:1.2.11.dfsg-2ubuntu9.2).\n",
  923. "zlib1g-dev set to manually installed.\n",
  924. "The following NEW packages will be installed:\n",
  925. " libxslt1-dev\n",
  926. "0 upgraded, 1 newly installed, 0 to remove and 51 not upgraded.\n",
  927. "Need to get 219 kB of archives.\n",
  928. "After this operation, 2,058 kB of additional disk space will be used.\n",
  929. "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libxslt1-dev amd64 1.1.34-4ubuntu0.22.04.1 [219 kB]\n",
  930. "Fetched 219 kB in 0s (1,358 kB/s)\n",
  931. "debconf: unable to initialize frontend: Dialog\n",
  932. "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)\n",
  933. "debconf: falling back to frontend: Readline\n",
  934. "debconf: unable to initialize frontend: Readline\n",
  935. "debconf: (This frontend requires a controlling tty.)\n",
  936. "debconf: falling back to frontend: Teletype\n",
  937. "dpkg-preconfigure: unable to re-open stdin: \n",
  938. "Selecting previously unselected package libxslt1-dev:amd64.\n",
  939. "(Reading database ... 123496 files and directories currently installed.)\n",
  940. "Preparing to unpack .../libxslt1-dev_1.1.34-4ubuntu0.22.04.1_amd64.deb ...\n",
  941. "Unpacking libxslt1-dev:amd64 (1.1.34-4ubuntu0.22.04.1) ...\n",
  942. "Setting up libxslt1-dev:amd64 (1.1.34-4ubuntu0.22.04.1) ...\n",
  943. "Processing triggers for man-db (2.10.2-1) ...\n",
  944. "Processing triggers for doc-base (0.11.1) ...\n",
  945. "Processing 1 added doc-base file...\n",
  946. "Reading package lists... Done\n",
  947. "Building dependency tree... Done\n",
  948. "Reading state information... Done\n",
  949. "Package python-dev is not available, but is referred to by another package.\n",
  950. "This may mean that the package is missing, has been obsoleted, or\n",
  951. "is only available from another source\n",
  952. "However the following packages replace it:\n",
  953. " python2-dev python2 python-dev-is-python3\n",
  954. "\n",
  955. "E: Package 'python-dev' has no installation candidate\n",
  956. "[INFO] B.4 Installing common libs using apt-get... done\n",
  957. "[INFO] Congratulations, now you can use aeneas!\n"
  958. ]
  959. }
  960. ],
  961. "source": [
  962. "!wget https://raw.githubusercontent.com/readbeyond/aeneas/master/install_dependencies.sh\n",
  963. "!bash install_dependencies.sh"
  964. ]
  965. },
  966. {
  967. "cell_type": "code",
  968. "execution_count": 10,
  969. "metadata": {
  970. "colab": {
  971. "base_uri": "https://localhost:8080/"
  972. },
  973. "id": "SpjTiZH71Pix",
  974. "outputId": "0c449734-a794-4a80-cca9-9670252a5551"
  975. },
  976. "outputs": [
  977. {
  978. "output_type": "stream",
  979. "name": "stdout",
  980. "text": [
  981. "Cloning into 'aeneas'...\n",
  982. "remote: Enumerating objects: 5636, done.\u001b[K\n",
  983. "remote: Counting objects: 100% (19/19), done.\u001b[K\n",
  984. "remote: Compressing objects: 100% (18/18), done.\u001b[K\n",
  985. "remote: Total 5636 (delta 0), reused 10 (delta 0), pack-reused 5617\u001b[K\n",
  986. "Receiving objects: 100% (5636/5636), 29.86 MiB | 22.90 MiB/s, done.\n",
  987. "Resolving deltas: 100% (4272/4272), done.\n",
  988. "Requirement already satisfied: BeautifulSoup4>=4.5.1 in /usr/local/lib/python3.10/dist-packages (from -r /content/aeneas/requirements.txt (line 1)) (4.12.3)\n",
  989. "Requirement already satisfied: lxml>=3.6.0 in /usr/local/lib/python3.10/dist-packages (from -r /content/aeneas/requirements.txt (line 2)) (4.9.4)\n",
  990. "Requirement already satisfied: numpy>=1.9 in /usr/local/lib/python3.10/dist-packages (from -r /content/aeneas/requirements.txt (line 3)) (1.22.4)\n",
  991. "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from BeautifulSoup4>=4.5.1->-r /content/aeneas/requirements.txt (line 1)) (2.5)\n",
  992. "\u001b[39mrunning build_ext\u001b[0m\n",
  993. "\u001b[39mbuilding 'aeneas.cdtw.cdtw' extension\u001b[0m\n",
  994. "\u001b[39mWarning: Can't read registry to find the necessary compiler setting\n",
  995. "Make sure that Python modules winreg, win32api or win32con are installed.\u001b[0m\n",
  996. "\u001b[39mINFO: C compiler: x86_64-linux-gnu-gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC\n",
  997. "\u001b[0m\n",
  998. "\u001b[39mcreating build\u001b[0m\n",
  999. "\u001b[39mcreating build/temp.linux-x86_64-cpython-310\u001b[0m\n",
  1000. "\u001b[39mcreating build/temp.linux-x86_64-cpython-310/aeneas\u001b[0m\n",
  1001. "\u001b[39mcreating build/temp.linux-x86_64-cpython-310/aeneas/cdtw\u001b[0m\n",
  1002. "\u001b[39mcreating build/temp.linux-x86_64-cpython-310/aeneas/cint\u001b[0m\n",
  1003. "\u001b[39mINFO: compile options: '-I/usr/local/lib/python3.10/dist-packages/numpy/core/include -I['/usr/local/lib/python3.10/dist-packages/numpy/core/include'] -I/usr/include/python3.10 -c'\u001b[0m\n",
  1004. "\u001b[39mINFO: x86_64-linux-gnu-gcc: aeneas/cdtw/cdtw_func.c\u001b[0m\n",
  1005. "\u001b[39mINFO: x86_64-linux-gnu-gcc: aeneas/cdtw/cdtw_py.c\u001b[0m\n",
  1006. "\u001b[01m\u001b[Kcc1:\u001b[m\u001b[K \u001b[01;31m\u001b[Kfatal error: \u001b[m\u001b[Kaeneas/cdtw/cdtw_py.c: No such file or directory\n",
  1007. "compilation terminated.\n",
  1008. "\u001b[01m\u001b[Kcc1:\u001b[m\u001b[K \u001b[01;31m\u001b[Kfatal error: \u001b[m\u001b[Kaeneas/cdtw/cdtw_func.c: No such file or directory\n",
  1009. "compilation terminated.\n",
  1010. "\u001b[39mINFO: x86_64-linux-gnu-gcc: aeneas/cint/cint.c\u001b[0m\n",
  1011. "\u001b[01m\u001b[Kcc1:\u001b[m\u001b[K \u001b[01;31m\u001b[Kfatal error: \u001b[m\u001b[Kaeneas/cint/cint.c: No such file or directory\n",
  1012. "compilation terminated.\n",
  1013. "error: Command \"x86_64-linux-gnu-gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/usr/local/lib/python3.10/dist-packages/numpy/core/include -I['/usr/local/lib/python3.10/dist-packages/numpy/core/include'] -I/usr/include/python3.10 -c aeneas/cdtw/cdtw_func.c -o build/temp.linux-x86_64-cpython-310/aeneas/cdtw/cdtw_func.o\" failed with exit status 1\n",
  1014. "\u001b[92m[INFO] aeneas OK\u001b[0m\n",
  1015. "\u001b[92m[INFO] ffprobe OK\u001b[0m\n",
  1016. "\u001b[92m[INFO] ffmpeg OK\u001b[0m\n",
  1017. "\u001b[92m[INFO] espeak OK\u001b[0m\n",
  1018. "\u001b[92m[INFO] aeneas.tools OK\u001b[0m\n",
  1019. "\u001b[92m[INFO] shell encoding OK\u001b[0m\n",
  1020. "\u001b[93m[WARN] aeneas.cdtw NOT AVAILABLE\u001b[0m\n",
  1021. "[INFO] You can still run aeneas but it will be significantly slower\n",
  1022. "[INFO] Please refer to the installation documentation for details\n",
  1023. "\u001b[93m[WARN] aeneas.cmfcc NOT AVAILABLE\u001b[0m\n",
  1024. "[INFO] You can still run aeneas but it will be significantly slower\n",
  1025. "[INFO] Please refer to the installation documentation for details\n",
  1026. "\u001b[93m[WARN] aeneas.cew NOT AVAILABLE\u001b[0m\n",
  1027. "[INFO] You can still run aeneas but it will be a bit slower\n",
  1028. "[INFO] Please refer to the installation documentation for details\n",
  1029. "\u001b[93m[WARN] All required dependencies are met but at least one Python C extension is not available\u001b[0m\n",
  1030. "\u001b[93m[WARN] You can still run aeneas but it will be slower\u001b[0m\n",
  1031. "\u001b[93m[WARN] Enjoy running aeneas!\u001b[0m\n"
  1032. ]
  1033. }
  1034. ],
  1035. "source": [
  1036. "!git clone https://github.com/ReadBeyond/aeneas.git\n",
  1037. "!cd /content/aeneas\n",
  1038. "!sudo pip install -r /content/aeneas/requirements.txt\n",
  1039. "!python /content/aeneas/setup.py build_ext --inplace\n",
  1040. "!python /content/aeneas/aeneas_check_setup.py"
  1041. ]
  1042. },
  1043. {
  1044. "cell_type": "code",
  1045. "execution_count": 11,
  1046. "metadata": {
  1047. "colab": {
  1048. "base_uri": "https://localhost:8080/"
  1049. },
  1050. "id": "roqTZsHw1Sxf",
  1051. "outputId": "5c24d091-1d60-4d76-9f8e-19ddc839cfd2"
  1052. },
  1053. "outputs": [
  1054. {
  1055. "output_type": "stream",
  1056. "name": "stdout",
  1057. "text": [
  1058. "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
  1059. "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
  1060. " Building wheel for aeneas (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
  1061. ]
  1062. }
  1063. ],
  1064. "source": [
  1065. "!pip install -q aeneas"
  1066. ]
  1067. },
  1068. {
  1069. "cell_type": "code",
  1070. "execution_count": 12,
  1071. "metadata": {
  1072. "colab": {
  1073. "base_uri": "https://localhost:8080/"
  1074. },
  1075. "id": "9mZwov_K1ULt",
  1076. "outputId": "ba39ff2b-2681-4573-9bce-a9e47c6c7a9a"
  1077. },
  1078. "outputs": [
  1079. {
  1080. "output_type": "stream",
  1081. "name": "stdout",
  1082. "text": [
  1083. "\u001b[92m[INFO] ffprobe OK\u001b[0m\n",
  1084. "\u001b[92m[INFO] ffmpeg OK\u001b[0m\n",
  1085. "\u001b[92m[INFO] espeak OK\u001b[0m\n",
  1086. "\u001b[92m[INFO] aeneas.tools OK\u001b[0m\n",
  1087. "\u001b[93m[WARN] shell encoding WARNING\u001b[0m\n",
  1088. "\u001b[93m[WARN] The default input encoding of your shell is not UTF-8\u001b[0m\n",
  1089. "\u001b[93m[WARN] The default output encoding of your shell is not UTF-8\u001b[0m\n",
  1090. "[INFO] If you plan to use aeneas on the command line,\n",
  1091. "[INFO] you might want to 'export PYTHONIOENCODING=UTF-8' in your shell\n",
  1092. "\u001b[92m[INFO] aeneas.cdtw AVAILABLE\u001b[0m\n",
  1093. "\u001b[92m[INFO] aeneas.cmfcc AVAILABLE\u001b[0m\n",
  1094. "\u001b[92m[INFO] aeneas.cew AVAILABLE\u001b[0m\n",
  1095. "\u001b[92m[INFO] All required dependencies are met and all available Python C extensions are working\u001b[0m\n"
  1096. ]
  1097. }
  1098. ],
  1099. "source": [
  1100. "!python -m aeneas.diagnostics"
  1101. ]
  1102. },
  1103. {
  1104. "cell_type": "markdown",
  1105. "source": [
  1106. "## Import Libraries"
  1107. ],
  1108. "metadata": {
  1109. "id": "h65OI0SnxDxd"
  1110. }
  1111. },
  1112. {
  1113. "cell_type": "code",
  1114. "source": [
  1115. "import os\n",
  1116. "import re\n",
  1117. "import subprocess\n",
  1118. "import shutil\n",
  1119. "import json\n",
  1120. "from functools import reduce\n",
  1121. "from pydub import AudioSegment\n",
  1122. "from hazm import Normalizer, sent_tokenize, word_tokenize\n",
  1123. "from parsi_io.parsi_io.modules.number_extractor import NumberExtractor\n",
  1124. "from parsi_io.parsi_io.modules.convert_number_to_text import ConvertNumberToText"
  1125. ],
  1126. "metadata": {
  1127. "id": "oSUSv3-kxGHt"
  1128. },
  1129. "execution_count": 31,
  1130. "outputs": []
  1131. },
  1132. {
  1133. "cell_type": "markdown",
  1134. "source": [
  1135. "# Convert Audio to Mono Mp3"
  1136. ],
  1137. "metadata": {
  1138. "id": "3LrvXJuQXMz-"
  1139. }
  1140. },
  1141. {
  1142. "cell_type": "code",
  1143. "source": [
  1144. "raw_data_dir = \"raw-data/\"\n",
  1145. "\n",
  1146. "processed_data_dir = \"processed-data/\"\n",
  1147. "os.makedirs(processed_data_dir, exist_ok=True)"
  1148. ],
  1149. "metadata": {
  1150. "id": "I0v6BaL8WH2z"
  1151. },
  1152. "execution_count": null,
  1153. "outputs": []
  1154. },
  1155. {
  1156. "cell_type": "code",
  1157. "source": [
  1158. "def convert_audio_to_mono_mp3(input_file_path, output_file_path):\n",
  1159. " input_file_name = input_file_path.split('/')[-1].split('.')[0]\n",
  1160. "\n",
  1161. " # Load the audio file\n",
  1162. " sound = AudioSegment.from_file(input_file_path)\n",
  1163. "\n",
  1164. " # Convert stereo to mono\n",
  1165. " sound = sound.set_channels(1)\n",
  1166. "\n",
  1167. " # Export the audio in MP3 format\n",
  1168. " sound.export(output_file_path, format=\"mp3\")\n",
  1169. "\n",
  1170. " return output_file_path\n"
  1171. ],
  1172. "metadata": {
  1173. "id": "AQ3qtfH7X6w2"
  1174. },
  1175. "execution_count": null,
  1176. "outputs": []
  1177. },
  1178. {
  1179. "cell_type": "code",
  1180. "source": [
  1181. "# Iterate over all .m4a files in the source directory\n",
  1182. "for filename in os.listdir(raw_data_dir):\n",
  1183. " if filename.endswith('.m4a'):\n",
  1184. " # Construct full file path\n",
  1185. " source_file_path = os.path.join(raw_data_dir, filename)\n",
  1186. " # Construct destination file path\n",
  1187. " destination_file_path = os.path.join(processed_data_dir, filename.replace('.m4a', '.mp3'))\n",
  1188. "\n",
  1189. " convert_audio_to_mono_mp3(source_file_path, destination_file_path)\n",
  1190. " print(f'Converted {filename} to MP3.')\n",
  1191. "\n",
  1192. "print('All .m4a files have been converted to .mp3.')\n"
  1193. ],
  1194. "metadata": {
  1195. "colab": {
  1196. "base_uri": "https://localhost:8080/"
  1197. },
  1198. "id": "BxqH91ogWWvx",
  1199. "outputId": "bb5917a5-e1d9-4d26-fe0c-fda66c28286b"
  1200. },
  1201. "execution_count": null,
  1202. "outputs": [
  1203. {
  1204. "output_type": "stream",
  1205. "name": "stdout",
  1206. "text": [
  1207. "Converted 2.m4a to MP3.\n",
  1208. "Converted 1.m4a to MP3.\n",
  1209. "Converted 3.m4a to MP3.\n",
  1210. "Converted 4.m4a to MP3.\n",
  1211. "Converted 9.m4a to MP3.\n",
  1212. "Converted 12.m4a to MP3.\n",
  1213. "Converted 14.m4a to MP3.\n",
  1214. "Converted 16.m4a to MP3.\n",
  1215. "Converted 20.m4a to MP3.\n",
  1216. "Converted 22.m4a to MP3.\n",
  1217. "Converted 24.m4a to MP3.\n",
  1218. "Converted 26.m4a to MP3.\n",
  1219. "Converted 30.m4a to MP3.\n",
  1220. "Converted 40.m4a to MP3.\n",
  1221. "Converted 50.m4a to MP3.\n",
  1222. "Converted 52.m4a to MP3.\n",
  1223. "Converted 54.m4a to MP3.\n",
  1224. "Converted 56.m4a to MP3.\n",
  1225. "Converted 64.m4a to MP3.\n",
  1226. "Converted 72.m4a to MP3.\n",
  1227. "Converted 76.m4a to MP3.\n",
  1228. "Converted 101.m4a to MP3.\n",
  1229. "Converted 103.m4a to MP3.\n",
  1230. "Converted 107.m4a to MP3.\n",
  1231. "Converted 111.m4a to MP3.\n",
  1232. "All.m4a files have been converted to.mp3.\n"
  1233. ]
  1234. }
  1235. ]
  1236. },
  1237. {
  1238. "cell_type": "markdown",
  1239. "source": [
  1240. "# Process Text"
  1241. ],
  1242. "metadata": {
  1243. "id": "_eFddpRKYetZ"
  1244. }
  1245. },
  1246. {
  1247. "cell_type": "markdown",
  1248. "source": [
  1249. "## Normalization"
  1250. ],
  1251. "metadata": {
  1252. "id": "EjNPpBz1yhoi"
  1253. }
  1254. },
  1255. {
  1256. "cell_type": "code",
  1257. "source": [
  1258. "normalizer = Normalizer()\n",
  1259. "\n",
  1260. "def normalize_text(text):\n",
  1261. " return normalizer.normalize(text)"
  1262. ],
  1263. "metadata": {
  1264. "id": "MupcUcpbytQq"
  1265. },
  1266. "execution_count": 16,
  1267. "outputs": []
  1268. },
  1269. {
  1270. "cell_type": "markdown",
  1271. "source": [
  1272. "## Symbol Substitution\n",
  1273. "This step is designed to unify various forms of symbols into their more commonly used counterparts."
  1274. ],
  1275. "metadata": {
  1276. "id": "ard6ph0Uy0dP"
  1277. }
  1278. },
  1279. {
  1280. "cell_type": "code",
  1281. "source": [
  1282. "substitution_dict = {'ﯽ': 'ی', '—': '–', '\\u200f': '\\u200c', '\\xad': '\\u200c', '\\u200e': '\\u200c', '\\u200d': '\\u200c'}\n",
  1283. "\n",
  1284. "def substitute_symbols(text):\n",
  1285. " translation_table = str.maketrans(substitution_dict)\n",
  1286. " substituted_text = text.translate(translation_table)\n",
  1287. " return substituted_text"
  1288. ],
  1289. "metadata": {
  1290. "id": "fNfrmnVgzQV3"
  1291. },
  1292. "execution_count": 17,
  1293. "outputs": []
  1294. },
  1295. {
  1296. "cell_type": "markdown",
  1297. "source": [
  1298. "## Remove In-text References\n",
  1299. "This step is designed to remove the references that come inside the text but are not read aloud. For example:\n",
  1300. "> They have introduced a new tool [1] which ..."
  1301. ],
  1302. "metadata": {
  1303. "id": "rfTD3b0nz4Lr"
  1304. }
  1305. },
  1306. {
  1307. "cell_type": "code",
  1308. "source": [
  1309. "def remove_inline_references(text):\n",
  1310. " # Define pattern to match references like \"[NUM]\"\n",
  1311. " pattern_fa = r\"\\[\\d+\\]|\\[[۰-۹]+\\]\"\n",
  1312. " pattern_en = r\"\\[\\d+\\]|\\[[0-9]+\\]\"\n",
  1313. "\n",
  1314. " # Use regular expression to remove references\n",
  1315. " text_without_refs_fa = re.sub(pattern_fa, \" \", text)\n",
  1316. " text_without_refs_en = re.sub(pattern_en, \" \", text_without_refs_fa)\n",
  1317. "\n",
  1318. " return text_without_refs_en"
  1319. ],
  1320. "metadata": {
  1321. "id": "Ftu0Vajm0NQZ"
  1322. },
  1323. "execution_count": 18,
  1324. "outputs": []
  1325. },
  1326. {
  1327. "cell_type": "markdown",
  1328. "source": [
  1329. "## Remove Reference Lines\n",
  1330. "This step is designed to remove the references that come at the end of the text but are not read aloud. For example:\n",
  1331. "> [1] Roshan-AI. Hazm. https://www.roshan-ai.ir/hazm/docs/index.html. Accessed:\n",
  1332. "May 3, 2024.\n",
  1333. ">\n",
  1334. "> [2] ...\n",
  1335. "\n"
  1336. ],
  1337. "metadata": {
  1338. "id": "01MnXk8g0Qbc"
  1339. }
  1340. },
  1341. {
  1342. "cell_type": "code",
  1343. "source": [
  1344. "def remove_references_lines(text):\n",
  1345. " # Define pattern to match references like \"[NUM] \"\n",
  1346. " pattern_fa = r\"^\\s*(?:\\[\\d+\\]|\\[[۰-۹]+\\])\"\n",
  1347. " pattern_en = r\"^\\s*(?:\\[\\d+\\]|\\[[0-9]+\\])\"\n",
  1348. "\n",
  1349. " # Split text into lines\n",
  1350. " lines = text.split('\\n')\n",
  1351. "\n",
  1352. " # Remove lines starting with references\n",
  1353. " cleaned_lines = [line for line in lines if not re.match(pattern_fa, line.strip()) and not re.match(pattern_en, line.strip())]\n",
  1354. "\n",
  1355. " # Join cleaned lines back into text\n",
  1356. " cleaned_text = '\\n'.join(cleaned_lines)\n",
  1357. "\n",
  1358. " return cleaned_text"
  1359. ],
  1360. "metadata": {
  1361. "id": "L7yk3Ube0nU_"
  1362. },
  1363. "execution_count": 19,
  1364. "outputs": []
  1365. },
  1366. {
  1367. "cell_type": "markdown",
  1368. "source": [
  1369. "## Remove Link Lines\n",
  1370. "This step is designed to remove the links and urls that come at the end of the text but are not read aloud. For example:\n",
  1371. "> Resources:\n",
  1372. ">\n",
  1373. "> https://www.roshan-ai.ir/hazm/docs/index.html\n",
  1374. ">\n",
  1375. "> https://virgool.io/\n",
  1376. ">\n",
  1377. "> ..."
  1378. ],
  1379. "metadata": {
  1380. "id": "9-jcYsWn0t9n"
  1381. }
  1382. },
  1383. {
  1384. "cell_type": "code",
  1385. "source": [
  1386. "def remove_link_lines(text):\n",
  1387. " # Define the pattern to match lines starting with http or www\n",
  1388. " pattern = r\"^\\s*(?:http|www)\"\n",
  1389. "\n",
  1390. " # Split text into lines\n",
  1391. " lines = text.split('\\n')\n",
  1392. "\n",
  1393. " # Remove lines starting with link\n",
  1394. " cleaned_lines = [line for line in lines if not re.match(pattern, line.strip())]\n",
  1395. "\n",
  1396. " # Join cleaned lines back into text\n",
  1397. " cleaned_text = '\\n'.join(cleaned_lines)\n",
  1398. "\n",
  1399. " return cleaned_text"
  1400. ],
  1401. "metadata": {
  1402. "id": "U41eap2C0oWn"
  1403. },
  1404. "execution_count": 20,
  1405. "outputs": []
  1406. },
  1407. {
  1408. "cell_type": "markdown",
  1409. "source": [
  1410. "## Convert Numbers to Text\n",
  1411. "This step is designed to convert the numbers in digit format into their spoken version. For example:\n",
  1412. "\n",
  1413. "> 22 → twenty two"
  1414. ],
  1415. "metadata": {
  1416. "id": "08-yP9Hb1OV9"
  1417. }
  1418. },
  1419. {
  1420. "cell_type": "code",
  1421. "source": [
  1422. "persian_digits_pattern = re.compile(r'[۰۱۲۳۴۵۶۷۸۹0123456789]')\n",
  1423. "num2text = ConvertNumberToText()\n",
  1424. "extractor = NumberExtractor()"
  1425. ],
  1426. "metadata": {
  1427. "id": "ZcVnmGRx1oBS"
  1428. },
  1429. "execution_count": 21,
  1430. "outputs": []
  1431. },
  1432. {
  1433. "cell_type": "code",
  1434. "source": [
  1435. "def replace_numbers_with_text(text):\n",
  1436. " # Find all number spans in the text\n",
  1437. " number_spans = extractor.run(text)\n",
  1438. "\n",
  1439. " # Filter out spans that contain digits\n",
  1440. " filtered_spans = [span for span in number_spans if persian_digits_pattern.search(span['phrase'])]\n",
  1441. "\n",
  1442. " # Convert the filtered numbers to text and replace them in the text\n",
  1443. " offset = 0 # Track the offset due to previous replacements\n",
  1444. " for span in filtered_spans:\n",
  1445. " start, end = span['span']\n",
  1446. " start -= offset # Adjust start position based on previous replacements\n",
  1447. " end -= offset # Adjust end position based on previous replacements\n",
  1448. " number_text = span['phrase']\n",
  1449. " number_value = span['value']\n",
  1450. "\n",
  1451. " # Convert the number to text\n",
  1452. " text_value = num2text.run(str(number_value))\n",
  1453. "\n",
  1454. " # Replace the number in the text with its textual equivalent\n",
  1455. " text = text[:start] + text_value + text[end:]\n",
  1456. "\n",
  1457. " # Update the offset\n",
  1458. " offset += len(number_text) - len(text_value)\n",
  1459. "\n",
  1460. " return text"
  1461. ],
  1462. "metadata": {
  1463. "id": "HylkKIWp1saj"
  1464. },
  1465. "execution_count": 22,
  1466. "outputs": []
  1467. },
  1468. {
  1469. "cell_type": "markdown",
  1470. "source": [
  1471. "## Remove Symbols\n",
  1472. "This step is designed to remove some of the symbols that are not very common or do not affect the TTS-ASR models' outputs. This helps simplify the input to the models."
  1473. ],
  1474. "metadata": {
  1475. "id": "Zyghd-a62tid"
  1476. }
  1477. },
  1478. {
  1479. "cell_type": "code",
  1480. "source": [
  1481. "symbols_to_remove = \"«»*[]\\\"'^&<>{}|٫《》•\\x9d\\u200b\\x7f\"\n",
  1482. "\n",
  1483. "def remove_symbols(text):\n",
  1484. " pattern = \"[\" + re.escape(symbols_to_remove) + \"]\"\n",
  1485. " return re.sub(pattern, ' ', text)"
  1486. ],
  1487. "metadata": {
  1488. "id": "VlA-vsW-29Vq"
  1489. },
  1490. "execution_count": 23,
  1491. "outputs": []
  1492. },
  1493. {
  1494. "cell_type": "markdown",
  1495. "source": [
  1496. "## Remove Extra White Spaces\n",
  1497. "This step is designed to remove extra white spaces including multiple consecutive white spaces and new lines."
  1498. ],
  1499. "metadata": {
  1500. "id": "AGc1r6vx3EdV"
  1501. }
  1502. },
  1503. {
  1504. "cell_type": "code",
  1505. "source": [
  1506. "def remove_extra_white_spaces(text):\n",
  1507. " cleaned_text = re.sub(r'\\s+', ' ', text)\n",
  1508. " return cleaned_text.strip()"
  1509. ],
  1510. "metadata": {
  1511. "id": "DnJGXdap3Zc0"
  1512. },
  1513. "execution_count": 24,
  1514. "outputs": []
  1515. },
  1516. {
  1517. "cell_type": "markdown",
  1518. "source": [
  1519. "## Full Pipeline\n",
  1520. "Here we define the complete text processing pipeline and the processing code."
  1521. ],
  1522. "metadata": {
  1523. "id": "XwjdlP2o3Zya"
  1524. }
  1525. },
  1526. {
  1527. "cell_type": "code",
  1528. "source": [
  1529. "pipeline = [\n",
  1530. " normalize_text,\n",
  1531. " substitute_symbols,\n",
  1532. " remove_inline_references,\n",
  1533. " remove_references_lines,\n",
  1534. " remove_link_lines,\n",
  1535. " replace_numbers_with_text,\n",
  1536. " remove_symbols,\n",
  1537. " remove_extra_white_spaces\n",
  1538. " ]"
  1539. ],
  1540. "metadata": {
  1541. "id": "t9JzXgxM3hQO"
  1542. },
  1543. "execution_count": 25,
  1544. "outputs": []
  1545. },
  1546. {
  1547. "cell_type": "code",
  1548. "source": [
  1549. "def process_text(input_file_path, output_dir_path):\n",
  1550. " input_file_name = input_file_path.split('/')[-1].split('.')[0]\n",
  1551. " output_file_path = os.path.join(output_dir_path, input_file_name + '.txt')\n",
  1552. "\n",
  1553. " # Check if the output file already exists\n",
  1554. " if os.path.exists(output_file_path):\n",
  1555. " print(f\"Skipping file {input_file_name}. Processed text file already exists.\")\n",
  1556. " return output_file_path\n",
  1557. "\n",
  1558. " # Apply the text processing pipeline\n",
  1559. " with open(input_file_path, 'r') as f:\n",
  1560. " text = reduce(lambda txt, func: func(txt), pipeline, f.read())\n",
  1561. "\n",
  1562. " # Export the processed text\n",
  1563. " with open(output_file_path, 'w') as f:\n",
  1564. " f.write(text)\n",
  1565. "\n",
  1566. " return output_file_path"
  1567. ],
  1568. "metadata": {
  1569. "id": "2HM4ETq_3kom"
  1570. },
  1571. "execution_count": 26,
  1572. "outputs": []
  1573. },
  1574. {
  1575. "cell_type": "markdown",
  1576. "source": [
  1577. "## Run the text processing pipeline"
  1578. ],
  1579. "metadata": {
  1580. "id": "QswV18q63pZP"
  1581. }
  1582. },
  1583. {
  1584. "cell_type": "code",
  1585. "source": [
  1586. "def process_all_text_files(directory_path, output_dir_path):\n",
  1587. " # Ensure the output directory exists\n",
  1588. " if not os.path.exists(output_dir_path):\n",
  1589. " os.makedirs(output_dir_path)\n",
  1590. "\n",
  1591. " # Iterate over all files in the directory\n",
  1592. " for filename in os.listdir(directory_path):\n",
  1593. " if filename.endswith('.txt'): # Check if the file is a text file\n",
  1594. " input_file_path = os.path.join(directory_path, filename)\n",
  1595. " output_file_path = os.path.join(output_dir_path, filename)\n",
  1596. "\n",
  1597. " # Call the process_text function for each text file\n",
  1598. " process_text(input_file_path, output_dir_path)"
  1599. ],
  1600. "metadata": {
  1601. "id": "sB-usl1P3vY6"
  1602. },
  1603. "execution_count": 27,
  1604. "outputs": []
  1605. },
  1606. {
  1607. "cell_type": "code",
  1608. "source": [
  1609. "process_all_text_files(raw_data_dir, processed_data_dir)"
  1610. ],
  1611. "metadata": {
  1612. "id": "goiie6IqaY4t",
  1613. "colab": {
  1614. "base_uri": "https://localhost:8080/"
  1615. },
  1616. "outputId": "58660ebf-8206-4504-c456-edeb466e00a3"
  1617. },
  1618. "execution_count": 28,
  1619. "outputs": [
  1620. {
  1621. "output_type": "stream",
  1622. "name": "stdout",
  1623. "text": [
  1624. "Skipping file 1. Processed text file already exists.\n",
  1625. "Skipping file 2. Processed text file already exists.\n",
  1626. "Skipping file 3. Processed text file already exists.\n",
  1627. "Skipping file 4. Processed text file already exists.\n",
  1628. "Skipping file 9. Processed text file already exists.\n",
  1629. "Skipping file 12. Processed text file already exists.\n",
  1630. "Skipping file 14. Processed text file already exists.\n",
  1631. "Skipping file 16. Processed text file already exists.\n",
  1632. "Skipping file 20. Processed text file already exists.\n",
  1633. "Skipping file 22. Processed text file already exists.\n",
  1634. "Skipping file 24. Processed text file already exists.\n",
  1635. "Skipping file 26. Processed text file already exists.\n",
  1636. "Skipping file 30. Processed text file already exists.\n",
  1637. "Skipping file 40. Processed text file already exists.\n",
  1638. "Skipping file 50. Processed text file already exists.\n",
  1639. "Skipping file 52. Processed text file already exists.\n",
  1640. "Skipping file 54. Processed text file already exists.\n",
  1641. "Skipping file 56. Processed text file already exists.\n",
  1642. "Skipping file 64. Processed text file already exists.\n",
  1643. "Skipping file 72. Processed text file already exists.\n",
  1644. "Skipping file 76. Processed text file already exists.\n",
  1645. "Skipping file 101. Processed text file already exists.\n",
  1646. "Skipping file 103. Processed text file already exists.\n",
  1647. "Skipping file 107. Processed text file already exists.\n",
  1648. "Skipping file 111. Processed text file already exists.\n"
  1649. ]
  1650. }
  1651. ]
  1652. },
  1653. {
  1654. "cell_type": "markdown",
  1655. "source": [
  1656. "# Forced Alignment\n",
  1657. "Forced alignment is the task of chunking the audio and text files into smaller parts of a few seconds and a few words. The resulting audio-text chunks contain the same content. I.e. the text files are the transcript of the audio files.\n",
  1658. "\n",
  1659. "Here we use the [Aeneas](https://github.com/readbeyond/aeneas) forced alignment tool which is a good choice for audio and text files that are an exact match.\n",
  1660. "\n",
  1661. "This tool requires the text be tokenized to sentences. We use the [Hazm](https://www.roshan-ai.ir/hazm/docs/index.html) sentence tokenizer and [Perpos](https://github.com/mhbashari/perpos) POS Tagger and develop a new sentence tokenization tool that keeps the sentences in a predefined length range."
  1662. ],
  1663. "metadata": {
  1664. "id": "xLwXCW7hYbDE"
  1665. }
  1666. },
  1667. {
  1668. "cell_type": "markdown",
  1669. "metadata": {
  1670. "id": "zNBFLb7j8F_P"
  1671. },
  1672. "source": [
  1673. "## Sentence Tokenize"
  1674. ]
  1675. },
  1676. {
  1677. "cell_type": "code",
  1678. "execution_count": null,
  1679. "metadata": {
  1680. "id": "cMT1eKD5v-9t"
  1681. },
  1682. "outputs": [],
  1683. "source": [
  1684. "def contains_word_letter(input_string):\n",
  1685. " regex_pattern = re.compile(r'\\w+')\n",
  1686. " return bool(regex_pattern.search(input_string))"
  1687. ]
  1688. },
  1689. {
  1690. "cell_type": "markdown",
  1691. "source": [
  1692. "The implemented sentence tokenizer first uses the Hazm sentence tokenizer that mostly tokenizes based on punctuation. It then uses the Perpos POS Tagger to identify VERBS and EZAFE tags. It uses the position of VERB tags to further split the sentences into smaller chunks. This tokenizer considers several criteria during tokenization to output meaningful and well-formed splits:\n",
  1693. "\n",
  1694. "* It appends all the symbols to the verb of the last sentence. This is because the symbols can affect the pronunciation of the verb (consider how a `?` mark changes intonation).\n",
  1695. "* It appends the conjunction `و` to the verb of the last sentence. Because the pronunciation of this word is usually integrated with the previous word by a vowel sound `\\o\\` and should not be interrupted.\n",
  1696. "* It keeps sentences under a maximum length\n",
  1697. "* It merges small chunks together to keep sentences longer than a minimum length\n",
  1698. "* It avoids splitting the sentences at the EZAFE tags. This is because these words are pronounced connected to the previous words by a vowel `\\e\\` and should not be interrupted."
  1699. ],
  1700. "metadata": {
  1701. "id": "GByUfNUv-gvd"
  1702. }
  1703. },
  1704. {
  1705. "cell_type": "code",
  1706. "execution_count": null,
  1707. "metadata": {
  1708. "id": "Y6LB2k28ykrL"
  1709. },
  1710. "outputs": [],
  1711. "source": [
  1712. "def get_sub_sentences(tagged_words, min_split_len=5, max_split_len=12):\n",
  1713. " i = 0 # Index of next word in the original sentence\n",
  1714. " current_words = []\n",
  1715. " sub_sentences = []\n",
  1716. "\n",
  1717. " while i < len(tagged_words):\n",
  1718. " word, pos = tagged_words[i]\n",
  1719. " current_words.append(word)\n",
  1720. " i += 1\n",
  1721. "\n",
  1722. " while i < len(tagged_words) and not contains_word_letter(tagged_words[i][0]): # Append symbols to previous sentence\n",
  1723. " current_words.append(tagged_words[i][0])\n",
  1724. " i += 1\n",
  1725. "\n",
  1726. " if pos == \"V\" and i < len(tagged_words) and tagged_words[i][0] == \"و\": # Append \"و\" to previous verb\n",
  1727. " current_words.append(\"و\")\n",
  1728. " i += 1\n",
  1729. "\n",
  1730. " while i < len(tagged_words) and pos.endswith(\"e\"): # Append all EZAFE POSes to previous sentence\n",
  1731. " word, pos = tagged_words[i]\n",
  1732. " current_words.append(word)\n",
  1733. " i += 1\n",
  1734. "\n",
  1735. " if i >= len(tagged_words) or len(current_words) >= max_split_len or (len(current_words) > min_split_len and pos == \"V\"):\n",
  1736. " sub_sentences.append(' '.join(current_words))\n",
  1737. " current_words = []\n",
  1738. "\n",
  1739. " return sub_sentences"
  1740. ]
  1741. },
  1742. {
  1743. "cell_type": "code",
  1744. "execution_count": null,
  1745. "metadata": {
  1746. "id": "IrYxESJYynSs"
  1747. },
  1748. "outputs": [],
  1749. "source": [
  1750. "def split_sentences(text, min_split_len=5, max_split_len=12):\n",
  1751. " print(\"Tokenizing to sentences...\")\n",
  1752. "\n",
  1753. " hazm_sentences = sent_tokenize(text)\n",
  1754. " splitted_sentences = []\n",
  1755. "\n",
  1756. " for sent in hazm_sentences:\n",
  1757. " words = word_tokenize(sent)\n",
  1758. " tagged_words = pos_tagger.parse(words)\n",
  1759. " sub_sentences = get_sub_sentences(tagged_words, min_split_len, max_split_len)\n",
  1760. " splitted_sentences.extend(sub_sentences)\n",
  1761. "\n",
  1762. " return splitted_sentences"
  1763. ]
  1764. },
  1765. {
  1766. "cell_type": "code",
  1767. "execution_count": null,
  1768. "metadata": {
  1769. "id": "4MDSzSoG540k"
  1770. },
  1771. "outputs": [],
  1772. "source": [
  1773. "def write_splitted_text(input_file_path, output_file_path='temp_splitted_text.txt'):\n",
  1774. " text = \"\"\n",
  1775. " with open(input_file_path, 'r') as f: text = f.read()\n",
  1776. " splitted_sentences = split_sentences(text)\n",
  1777. " splitted_text = '\\n'.join(splitted_sentences)\n",
  1778. "\n",
  1779. " # Remove _ artifact from hazm word tokenizer\n",
  1780. " splitted_text = splitted_text.replace('_', '\\u200c')\n",
  1781. "\n",
  1782. " with open(output_file_path, 'w') as f: f.write(splitted_text)"
  1783. ]
  1784. },
  1785. {
  1786. "cell_type": "markdown",
  1787. "metadata": {
  1788. "id": "j1EM85Dv87io"
  1789. },
  1790. "source": [
  1791. "## Forced Alignment"
  1792. ]
  1793. },
  1794. {
  1795. "cell_type": "code",
  1796. "execution_count": null,
  1797. "metadata": {
  1798. "id": "GfIvyQ0V42Tg"
  1799. },
  1800. "outputs": [],
  1801. "source": [
  1802. "def write_forced_alignment_map(audio_file, text_file, output_json='temp_alignment_map.json'):\n",
  1803. " print(\"Executing forced alignment...\")\n",
  1804. " command = [\n",
  1805. " \"python\", \"-m\", \"aeneas.tools.execute_task\",\n",
  1806. " audio_file,\n",
  1807. " text_file,\n",
  1808. " \"task_language=fas|os_task_file_format=json|is_text_type=plain\",\n",
  1809. " output_json\n",
  1810. " ]\n",
  1811. " try:\n",
  1812. " subprocess.run(command, check=True)\n",
  1813. " print(\"Forced alignment completed successfully.\")\n",
  1814. " except subprocess.CalledProcessError as e:\n",
  1815. " print(\"Error:\", e)"
  1816. ]
  1817. },
  1818. {
  1819. "cell_type": "code",
  1820. "execution_count": null,
  1821. "metadata": {
  1822. "id": "bVuEetVr5WOa"
  1823. },
  1824. "outputs": [],
  1825. "source": [
  1826. "def read_json(json_file):\n",
  1827. " with open(json_file, 'r') as file:\n",
  1828. " return json.load(file)"
  1829. ]
  1830. },
  1831. {
  1832. "cell_type": "code",
  1833. "execution_count": null,
  1834. "metadata": {
  1835. "id": "EmAqthxF9n1o"
  1836. },
  1837. "outputs": [],
  1838. "source": [
  1839. "def get_aligned_audio_text(audio_file_path, text_file_path, temp_text_file='temp_splitted_text.txt'):\n",
  1840. " # Step 1: Split text into sentences and write to temp text file\n",
  1841. " write_splitted_text(text_file_path, temp_text_file)\n",
  1842. "\n",
  1843. " # Step 2: Perform forced alignment and write result to temp JSON file\n",
  1844. " temp_json_file = 'temp_alignment_map.json'\n",
  1845. " write_forced_alignment_map(audio_file_path, temp_text_file, temp_json_file)\n",
  1846. "\n",
  1847. " # Step 3: Read JSON file and split audio based on start and end spans\n",
  1848. " alignment_dict = read_json(temp_json_file)\n",
  1849. " audio_segments = []\n",
  1850. " for fragment in alignment_dict['fragments']:\n",
  1851. " start = float(fragment['begin'])\n",
  1852. " end = float(fragment['end'])\n",
  1853. " audio_segments.append((start, end))\n",
  1854. "\n",
  1855. " # Step 4: Clean up temporary files\n",
  1856. " os.remove(temp_json_file)\n",
  1857. "\n",
  1858. " return audio_segments"
  1859. ]
  1860. },
  1861. {
  1862. "cell_type": "markdown",
  1863. "metadata": {
  1864. "id": "imjNpL3_AqnR"
  1865. },
  1866. "source": [
  1867. "## Run Forced Alignment"
  1868. ]
  1869. },
  1870. {
  1871. "cell_type": "code",
  1872. "execution_count": null,
  1873. "metadata": {
  1874. "id": "C6EvypcPAxgJ"
  1875. },
  1876. "outputs": [],
  1877. "source": [
  1878. "def truncate_audio(audio_file, start, end):\n",
  1879. " audio = AudioSegment.from_mp3(audio_file)\n",
  1880. " truncated_audio = audio[float(start) * 1000:float(end) * 1000]\n",
  1881. " truncated_audio.export(audio_file, format=\"mp3\")"
  1882. ]
  1883. },
  1884. {
  1885. "cell_type": "code",
  1886. "execution_count": null,
  1887. "metadata": {
  1888. "id": "hl_-lbMMAtYU"
  1889. },
  1890. "outputs": [],
  1891. "source": [
  1892. "def process_audio_files(source_dir, destination_dir, processed_files_log):\n",
  1893. " processed_filenames = set()\n",
  1894. " with open(processed_files_log, 'r') as f:\n",
  1895. " processed_filenames = set([fname.strip() for fname in f.readlines()])\n",
  1896. "\n",
  1897. " # Get total number of audio files in the source directory\n",
  1898. " audio_files = [audio_file for audio_file in os.listdir(source_dir) if audio_file.endswith('.mp3')]\n",
  1899. " total_files = len(audio_files)\n",
  1900. "\n",
  1901. " # Iterate through all audio files in the source directory\n",
  1902. " for idx, audio_file in enumerate(audio_files, start=1):\n",
  1903. " print(f\"({idx}/{total_files}): Processing file {audio_file}\")\n",
  1904. " audio_file_path = os.path.join(source_dir, audio_file)\n",
  1905. " text_file_path = os.path.join(source_dir, audio_file.replace('.mp3', '.txt'))\n",
  1906. "\n",
  1907. " if not os.path.exists(text_file_path):\n",
  1908. " print(\"\\x1b[31m\\\"Processed text file does not exist!\\\"\\x1b[0m\")\n",
  1909. " continue\n",
  1910. "\n",
  1911. " # Create directory with audio file name in destination directory\n",
  1912. " audio_name = os.path.splitext(audio_file)[0]\n",
  1913. " if audio_name in processed_filenames:\n",
  1914. " print(f\"Skipping audio file {audio_name}... already forced aligned!\")\n",
  1915. " continue\n",
  1916. "\n",
  1917. " alignment_dir = os.path.join(destination_dir, audio_name)\n",
  1918. "\n",
  1919. " # Process audio and text file\n",
  1920. " temp_text_file = 'temp_splitted_text.txt'\n",
  1921. " audio_segments = get_aligned_audio_text(audio_file_path, text_file_path, temp_text_file)\n",
  1922. "\n",
  1923. " print(\"Writing output pairs...\")\n",
  1924. " os.makedirs(alignment_dir, exist_ok=True)\n",
  1925. " # Write each line in temp text output file to corresponding txt file\n",
  1926. " with open(temp_text_file, 'r', encoding='utf-8') as f:\n",
  1927. " lines = f.readlines()\n",
  1928. " for idx, line in enumerate(lines):\n",
  1929. " segment_text_name = f\"{audio_name}-{idx}.txt\"\n",
  1930. " segment_text_path = os.path.join(alignment_dir, segment_text_name)\n",
  1931. " with open(segment_text_path, 'w', encoding='utf-8') as segment_file:\n",
  1932. " segment_file.write(line.strip())\n",
  1933. "\n",
  1934. " # Write audio segments to individual files\n",
  1935. " for idx, segment in enumerate(audio_segments):\n",
  1936. " start, end = segment\n",
  1937. " segment_audio_name = f\"{audio_name}-{idx}.mp3\"\n",
  1938. " segment_audio_path = os.path.join(alignment_dir, segment_audio_name)\n",
  1939. " # Copy segment from original audio to segment file\n",
  1940. " shutil.copy(audio_file_path, segment_audio_path)\n",
  1941. " # Truncate audio segment\n",
  1942. " truncate_audio(segment_audio_path, start, end)\n",
  1943. "\n",
  1944. " with open(processed_files_log, 'a') as f: f.write(f'{audio_name}\\n')\n",
  1945. "\n",
  1946. " os.remove(temp_text_file)\n"
  1947. ]
  1948. },
  1949. {
  1950. "cell_type": "code",
  1951. "source": [
  1952. "force_aligned_audio_text_files_dir = \"forced-aligned-data/\"\n",
  1953. "os.makedirs(force_aligned_audio_text_files_dir, exist_ok=True)"
  1954. ],
  1955. "metadata": {
  1956. "id": "PL8SlKlr--AO"
  1957. },
  1958. "execution_count": null,
  1959. "outputs": []
  1960. },
  1961. {
  1962. "cell_type": "code",
  1963. "source": [
  1964. "# This file keeps track of the files already forced aligned not to repeat the processing\n",
  1965. "forced_aligned_files_log = force_aligned_audio_text_files_dir + 'forced-aligned-files-log.txt'\n",
  1966. "with open(forced_aligned_files_log, 'a') as file: pass"
  1967. ],
  1968. "metadata": {
  1969. "id": "82FWRJVn_RbE"
  1970. },
  1971. "execution_count": null,
  1972. "outputs": []
  1973. },
  1974. {
  1975. "cell_type": "code",
  1976. "execution_count": null,
  1977. "metadata": {
  1978. "colab": {
  1979. "base_uri": "https://localhost:8080/"
  1980. },
  1981. "id": "HGrFFHQfEp4m",
  1982. "outputId": "64e84707-5139-477f-e4fc-572f76b44073"
  1983. },
  1984. "outputs": [
  1985. {
  1986. "metadata": {
  1987. "tags": null
  1988. },
  1989. "name": "stdout",
  1990. "output_type": "stream",
  1991. "text": [
  1992. "(1/25): Processing file 2.mp3\n",
  1993. "Skippin audio file 2... already forced aligned!\n",
  1994. "(2/25): Processing file 1.mp3\n",
  1995. "Skippin audio file 1... already forced aligned!\n",
  1996. "(3/25): Processing file 3.mp3\n",
  1997. "Skippin audio file 3... already forced aligned!\n",
  1998. "(4/25): Processing file 4.mp3\n",
  1999. "Skippin audio file 4... already forced aligned!\n",
  2000. "(5/25): Processing file 9.mp3\n",
  2001. "Skippin audio file 9... already forced aligned!\n",
  2002. "(6/25): Processing file 12.mp3\n",
  2003. "Skippin audio file 12... already forced aligned!\n",
  2004. "(7/25): Processing file 14.mp3\n",
  2005. "Skippin audio file 14... already forced aligned!\n",
  2006. "(8/25): Processing file 16.mp3\n",
  2007. "Skippin audio file 16... already forced aligned!\n",
  2008. "(9/25): Processing file 20.mp3\n",
  2009. "Skippin audio file 20... already forced aligned!\n",
  2010. "(10/25): Processing file 22.mp3\n",
  2011. "Skippin audio file 22... already forced aligned!\n",
  2012. "(11/25): Processing file 24.mp3\n",
  2013. "Skippin audio file 24... already forced aligned!\n",
  2014. "(12/25): Processing file 26.mp3\n",
  2015. "Skippin audio file 26... already forced aligned!\n",
  2016. "(13/25): Processing file 30.mp3\n",
  2017. "Skippin audio file 30... already forced aligned!\n",
  2018. "(14/25): Processing file 40.mp3\n",
  2019. "Skippin audio file 40... already forced aligned!\n",
  2020. "(15/25): Processing file 50.mp3\n",
  2021. "Skippin audio file 50... already forced aligned!\n",
  2022. "(16/25): Processing file 52.mp3\n",
  2023. "Skippin audio file 52... already forced aligned!\n",
  2024. "(17/25): Processing file 54.mp3\n",
  2025. "Skippin audio file 54... already forced aligned!\n",
  2026. "(18/25): Processing file 56.mp3\n",
  2027. "Skippin audio file 56... already forced aligned!\n",
  2028. "(19/25): Processing file 64.mp3\n",
  2029. "Skippin audio file 64... already forced aligned!\n",
  2030. "(20/25): Processing file 72.mp3\n",
  2031. "Tokenizing to sentences...\n",
  2032. "Executing forced alignment...\n",
  2033. "Forced alignment completed successfully.\n",
  2034. "Writing output pairs...\n",
  2035. "(21/25): Processing file 76.mp3\n",
  2036. "Tokenizing to sentences...\n",
  2037. "Executing forced alignment...\n",
  2038. "Forced alignment completed successfully.\n",
  2039. "Writing output pairs...\n",
  2040. "(22/25): Processing file 101.mp3\n",
  2041. "Tokenizing to sentences...\n",
  2042. "Executing forced alignment...\n",
  2043. "Forced alignment completed successfully.\n",
  2044. "Writing output pairs...\n",
  2045. "(23/25): Processing file 103.mp3\n",
  2046. "Tokenizing to sentences...\n",
  2047. "Executing forced alignment...\n",
  2048. "Forced alignment completed successfully.\n",
  2049. "Writing output pairs...\n",
  2050. "(24/25): Processing file 107.mp3\n",
  2051. "Tokenizing to sentences...\n",
  2052. "Executing forced alignment...\n",
  2053. "Forced alignment completed successfully.\n",
  2054. "Writing output pairs...\n",
  2055. "(25/25): Processing file 111.mp3\n",
  2056. "Tokenizing to sentences...\n",
  2057. "Executing forced alignment...\n",
  2058. "Forced alignment completed successfully.\n",
  2059. "Writing output pairs...\n"
  2060. ]
  2061. }
  2062. ],
  2063. "source": [
  2064. "process_audio_files(processed_data_dir, force_aligned_audio_text_files_dir, forced_aligned_files_log)"
  2065. ]
  2066. }
  2067. ]
  2068. }