Transformers_speech_recognition.ipynb

Transformers - Example of automatic speech recognition

Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models.
Credits: Hugging Face documentation and examples

In [1]:
from transformers import pipeline
In [2]:
# Create the transcriber pipeline with GPU
transcriber = pipeline(task="automatic-speech-recognition", model="openai/whisper-small", device=0)  # device=0 selects the first GPU
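Hard-coding `device=0` raises an error on a machine without a visible GPU. A small guard (a sketch, assuming the PyTorch backend is installed) picks the CPU fallback that `pipeline` accepts:

```python
import torch

# pipeline() accepts device=0 for the first GPU and device=-1 for the CPU.
device = 0 if torch.cuda.is_available() else -1

# Then create the pipeline with the resolved device, e.g.:
# transcriber = pipeline(task="automatic-speech-recognition",
#                        model="openai/whisper-small", device=device)
print(device)
```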
In [3]:
speech_url="https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac"
In [4]:
# Play the audio file
import IPython
IPython.display.Audio(url=speech_url)
Out[4]:
In [ ]:
# Install ffmpeg
In [ ]:
%%bash
wget -O ffmpeg.tar.xz https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz
tar xvf ffmpeg.tar.xz
In [5]:
import os
os.environ['PATH'] += ":./ffmpeg-git-20230313-amd64-static"  # adjust to the extracted folder name
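Before transcribing, it is worth confirming that the pipeline will actually find ffmpeg. A minimal check with the standard library (the folder name below is the one from the download above; adjust it to match your extracted build), which also guards against appending the same folder to PATH on every rerun:

```python
import os
import shutil

ffmpeg_dir = "./ffmpeg-git-20230313-amd64-static"  # adjust to the extracted folder name

# Appending the same folder on every run grows PATH; guard against duplicates.
if ffmpeg_dir not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + ffmpeg_dir

# shutil.which resolves a command against PATH: it returns the full path to
# the binary if ffmpeg is discoverable, otherwise None.
print(shutil.which("ffmpeg"))
```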
In [6]:
# transcribe from speech_url
transcriber(speech_url)
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev4cuda/Mon/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/transformers/generation/utils.py:1313: UserWarning: Using `max_length`'s default (448) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.
  warnings.warn(
Out[6]:
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
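Whisper transcribes audio in 30-second windows, so longer recordings are handled by the pipeline's `chunk_length_s` / `stride_length_s` parameters, which split the waveform into overlapping chunks. A pure-Python sketch of the splitting idea (toy sizes, not the pipeline's actual implementation):

```python
def chunk_samples(samples, chunk_len, stride):
    """Split a sample sequence into overlapping chunks.

    chunk_len: samples per chunk; stride: overlap on each side of a chunk.
    Mirrors the idea behind the pipeline's chunk_length_s/stride_length_s.
    """
    step = chunk_len - 2 * stride  # fresh samples contributed by each chunk
    chunks = []
    start = 0
    while start < len(samples):
        chunks.append(samples[start:start + chunk_len])
        start += step
    return chunks

# 40 fake samples, 8-sample chunks with a 2-sample stride on each side
audio = list(range(40))
chunks = chunk_samples(audio, chunk_len=8, stride=2)
print(len(chunks), chunks[0], chunks[-1])
```

With the real pipeline this corresponds to a call such as `transcriber(speech_url, chunk_length_s=30, stride_length_s=5)`. The deprecation warning above can also be addressed by passing a generation limit, e.g. `transcriber(speech_url, generate_kwargs={"max_new_tokens": 256})`.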