diff --git a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb index 84a5db7e8768..fd156c030d26 100644 --- a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb +++ b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb @@ -1,39 +1,12 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "CTC_Segmentation_Tutorial_update.ipynb", - "private_outputs": true, - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, "cells": [ { "cell_type": "code", + "execution_count": null, "metadata": { "id": "d4KCUoxSpdoZ" }, + "outputs": [], "source": [ "BRANCH = 'r2.1.0'\n", "\n", @@ -46,15 +19,15 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "\"\"\"" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "JDk9zxC6pdod" }, + "outputs": [], "source": [ "import os\n", "# either provide a path to local NeMo repository with NeMo already installed or git clone\n", @@ -67,15 +40,15 @@ " ! git clone -b $BRANCH https://github.com/NVIDIA/NeMo\n", " ! cd NeMo\n", " ! python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "CH7yR7cSwPKr" }, + "outputs": [], "source": [ "import json\n", "import os\n", @@ -88,9 +61,7 @@ "! pip install pandas\n", "! pip install plotly\n", "from plotly import graph_objects as go" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -108,17 +79,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "8FAZKakrIyGI" }, + "outputs": [], "source": [ "requirements = f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/tools/ctc_segmentation/requirements.txt'\n", "wget.download(requirements)\n", "! pip install -r requirements.txt\n", - "! apt-get install -y ffmpeg" - ], - "execution_count": null, - "outputs": [] + "! apt-get install -y ffmpeg\n", + "! apt-get install -y libsox-fmt-mp3 " + ] }, { "cell_type": "markdown", @@ -131,9 +103,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "1C9DdMfvRFM-" }, + "outputs": [], "source": [ "if 'google.colab' in str(get_ipython()):\n", " NEMO_DIR_PATH = \"/content/NeMo\"\n", @@ -143,9 +117,7 @@ "TOOLS_DIR = f'{NEMO_DIR_PATH}/tools/ctc_segmentation/scripts'\n", "print(TOOLS_DIR)\n", "! ls -l $TOOLS_DIR" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -159,9 +131,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bkeKX2I_tIgV" }, + "outputs": [], "source": [ "## create data directory and download an audio file\n", "WORK_DIR = 'WORK_DIR'\n", @@ -174,9 +148,7 @@ "! rm $DATA_DIR/audio_samples.zip\n", "\n", "DATA_DIR = os.path.join(DATA_DIR, \"audio_samples\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -189,14 +161,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Y6VYVk9mpdol" }, + "outputs": [], "source": [ "! ls $DATA_DIR" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -209,14 +181,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "IGhijb-Bpdol" }, + "outputs": [], "source": [ "! ls $DATA_DIR/es/audio/ $DATA_DIR/es/text/" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -231,15 +203,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ulkPrqwipdom" }, + "outputs": [], "source": [ "base_name_es = \"el19demarzoyel2demayo_03_perezgaldos\"\n", "Audio(f\"{DATA_DIR}/es/audio/{base_name_es}.wav\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -252,15 +224,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "9Qfp10Xnpdom" }, + "outputs": [], "source": [ "text = f\"{DATA_DIR}/es/text/{base_name_es}.txt\"\n", "! cat $text" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -296,9 +268,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "u4zjeVVv-UXR" }, + "outputs": [], "source": [ "MODEL = \"stt_es_citrinet_512\" \n", "OUTPUT_DIR = WORK_DIR + \"/es_output\"\n", @@ -311,9 +285,7 @@ "--language='en' \\\n", "--model=$MODEL \\\n", "--audio_dir=$DATA_DIR/es/audio" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -331,14 +303,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6R7OKAsYH9p0" }, + "outputs": [], "source": [ "! ls $OUTPUT_DIR/processed" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -351,14 +323,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "74GLpMgoICmk" }, + "outputs": [], "source": [ "! head $OUTPUT_DIR/processed/el19demarzoyel2demayo_03_perezgaldos.txt" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -376,9 +348,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "xyKtaqAd-Tvk" }, + "outputs": [], "source": [ "WINDOW = 8000\n", "\n", @@ -387,9 +361,7 @@ "--data=$OUTPUT_DIR/processed \\\n", "--model=$MODEL \\\n", "--window_len=$WINDOW " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -404,15 +376,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ktBAsfJRVCwI" }, + "outputs": [], "source": [ "alignment_file = f\"{WINDOW}_{base_name_es}_segments.txt\"\n", "! head -n 3 $OUTPUT_DIR/segments/$alignment_file" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -451,9 +423,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6YM64RPlitPL" }, + "outputs": [], "source": [ "OFFSET = 0\n", "THRESHOLD = -2\n", @@ -463,9 +437,7 @@ "--alignment=$OUTPUT_DIR/segments/ \\\n", "--threshold=$THRESHOLD \\\n", "--offset=$OFFSET" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -480,9 +452,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "1UaSIflBZwaV" }, + "outputs": [], "source": [ "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/asr/transcribe_speech.py')\n", "\n", @@ -490,15 +464,15 @@ "pretrained_name=$MODEL \\\n", "dataset_manifest=$OUTPUT_DIR/manifests/manifest.json \\\n", "output_filename=$OUTPUT_DIR/manifests/manifest_transcribed.json" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "F-nPT8z_IVD-" }, + "outputs": [], "source": [ "def plot_signal(signal, sample_rate):\n", " \"\"\" Plot the signal in time domain \"\"\"\n", @@ -530,9 +504,7 @@ " display('ASR transcript: ' + sample['pred_text'])\n", " print(f\"Score: {sample['score']}\")\n", " print('\\n' + '-' * 110)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -549,17 +521,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Q45uBtsHIaAD" }, + "outputs": [], "source": [ "# let's examine only a few first samples\n", "! head -n 2 $OUTPUT_DIR/manifests/manifest_transcribed.json > $OUTPUT_DIR/manifests/samples.json\n", "\n", "display_samples(f\"{OUTPUT_DIR}/manifests/samples.json\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -576,14 +548,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "KRc9yMjPXPgj" }, + "outputs": [], "source": [ "! ls $DATA_DIR/en/audio $DATA_DIR/en/text" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -626,9 +598,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "hRFAl0gO92bp" }, + "outputs": [], "source": [ "MODEL = \"QuartzNet15x5Base-En\" # \"stt_en_citrinet_512_gamma_0_25\" \n", "OUTPUT_DIR_2 = WORK_DIR + \"/en_output\"\n", @@ -642,9 +616,7 @@ "--SCRIPTS_DIR=$TOOLS_DIR \\\n", "--MIN_SCORE=$THRESHOLD \\\n", "--USE_NEMO_NORMALIZATION=False" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -664,32 +636,32 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "xsm89hYlpdor" }, + "outputs": [], "source": [ "! bash $TOOLS_DIR/../run_filter.sh \\\n", "--SCRIPTS_DIR=$TOOLS_DIR \\\n", "--MODEL_NAME_OR_PATH=stt_en_conformer_ctc_large \\\n", "--MANIFEST=$OUTPUT_DIR_2/manifests/manifest.json \\\n", "--INPUT_AUDIO_DIR=$DATA_DIR/en/audio/" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "nacE_iQ2_85L" }, + "outputs": [], "source": [ "# let's examine only a few first samples\n", "! head -n 2 $OUTPUT_DIR_2/manifests/manifest_transcribed_metrics_filtered.json > $OUTPUT_DIR_2/manifests/samples.json\n", "\n", "display_samples(f\"{OUTPUT_DIR_2}/manifests/samples.json\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -713,5 +685,34 @@ "Kürzinger, Ludwig, et al. [\"CTC-Segmentation of Large Corpora for German End-to-End Speech Recognition.\"](https://arxiv.org/abs/2007.09127) International Conference on Speech and Computer. Springer, Cham, 2020." ] } - ] + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "CTC_Segmentation_Tutorial_update.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 }