From 848e20f9f52931ec2be249d2ff17eed8680e24bd Mon Sep 17 00:00:00 2001
From: Swaminathan Kannan <65940566+SwamiKannan@users.noreply.github.com>
Date: Mon, 1 Jul 2024 18:38:05 +0530
Subject: [PATCH] Update __init__.py

Added recognizers for faster-whisper and Distil-Whisper, adapted from:
https://github.com/Uberi/speech_recognition/issues/730
---
 speech_recognition/__init__.py | 84 +++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 022cd7d5..36ce21e7 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -43,6 +43,8 @@
 __version__ = "3.10.4"
 __license__ = "BSD"
 
+MODEL_PATH = 'N:\\models\\voice\\model\\'  # local cache directory for model weights
+TOKENIZER_PATH = 'N:\\models\\voice\\tokenizer\\'  # local cache directory for tokenizer files
 
 class AudioSource(object):
     def __init__(self):
@@ -324,7 +326,7 @@ def read(self, size=-1):
 
 
 class Recognizer(AudioSource):
-    def __init__(self):
+    def __init__(self, model=None):
         """
         Creates a new ``Recognizer`` instance, which represents a collection of speech recognition functionality.
         """
@@ -337,6 +339,7 @@ def __init__(self):
         self.phrase_threshold = 0.3  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
         self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording
+        self.model = model  # optional pre-built ASR pipeline, used by recognize_distilwhisper
 
     def record(self, source, duration=None, offset=None):
         """
@@ -1385,7 +1388,86 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac
         for node_id in top_k:
             human_string = self.tflabels[node_id]
         return human_string
+
+    def recognize_fasterwhisper(self, audio_data, model="small", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
+        """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using faster-whisper."""
+        assert isinstance(audio_data, AudioData), "Data must be audio data"
+        import numpy as np
+        import soundfile as sf
+        import torch
+        from faster_whisper import WhisperModel
+
+        # lazily load one WhisperModel per model size and cache it on the recognizer
+        if load_options or not hasattr(self, "whisper_model") or self.whisper_model.get(model) is None:
+            self.whisper_model = getattr(self, "whisper_model", {})
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.whisper_model[model] = WhisperModel(model, device=device, compute_type="auto", **(load_options or {}))
+
+        # faster-whisper expects 16 kHz mono float32 samples
+        wav_bytes = audio_data.get_wav_data(convert_rate=16000)
+        wav_stream = io.BytesIO(wav_bytes)
+        audio_array, sampling_rate = sf.read(wav_stream)
+        audio_array = audio_array.astype(np.float32)
+
+        segments, info = self.whisper_model[model].transcribe(
+            audio_array, beam_size=5,
+            task="translate" if translate else "transcribe",
+            language=language, **transcribe_options)
+        segments = list(segments)  # transcribe() returns a generator; materialize it before reuse
+        text = " ".join(segment.text.strip() for segment in segments)
+
+        if show_dict:
+            return {"text": text, "segments": segments, "language": info.language}
+        return text.lower()
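+
+    # Example (an illustrative sketch, not exercised by this patch; the
+    # WAV file name is an assumption):
+    #
+    #     r = Recognizer()
+    #     with AudioFile("speech.wav") as source:
+    #         audio = r.record(source)
+    #     print(r.recognize_fasterwhisper(audio, model="small"))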
+
+    def recognize_distilwhisper(self, audio_data, model="distil-whisper/distil-medium.en", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
+        """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Distil-Whisper via ``transformers``."""
+        assert isinstance(audio_data, AudioData), "Data must be audio data"
+        import numpy as np
+        import soundfile as sf
+        import torch
+        from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        model_id = model
+        model_name = model_id.split('/')[1]
+        model_cache_path = MODEL_PATH + model_name
+        tokenizer_cache_path = TOKENIZER_PATH + model_name
+
+        if not self.model:
+            # no pipeline was passed to Recognizer(model=...); build one and cache the weights locally
+            hf_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True,
+                use_safetensors=True, cache_dir=model_cache_path)
+            hf_model.to(device)
+            processor = AutoProcessor.from_pretrained(model_id, cache_dir=tokenizer_cache_path)
+
+            whisper = pipeline(
+                "automatic-speech-recognition", model=hf_model, tokenizer=processor.tokenizer,
+                feature_extractor=processor.feature_extractor, max_new_tokens=128,
+                torch_dtype=torch_dtype, device=device)
+            if not os.path.exists(tokenizer_cache_path):
+                processor.save_pretrained(tokenizer_cache_path)
+            if not os.path.exists(model_cache_path):
+                hf_model.save_pretrained(model_cache_path)
+
+        # the pipeline expects 16 kHz mono float32 samples
+        wav_bytes = audio_data.get_wav_data(convert_rate=16000)
+        wav_stream = io.BytesIO(wav_bytes)
+        audio_array, sampling_rate = sf.read(wav_stream)
+        audio_array = audio_array.astype(np.float32)
+
+        if not self.model:
+            text = whisper(audio_array, chunk_length_s=50, stride_length_s=10, batch_size=8)
+        else:
+            # reuse the pipeline the caller passed to Recognizer(model=...)
+            text = self.model(audio_array, chunk_length_s=50, stride_length_s=10, batch_size=8)
+        if show_dict:
+            return text
+        return text["text"]
 
     def recognize_whisper(self, audio_data, model="base", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
         """
         Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
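
Example usage with the patch applied (a minimal sketch; the WAV file name is
an assumption, not part of the patch):

    import speech_recognition as sr
    from transformers import pipeline

    r = sr.Recognizer()
    with sr.AudioFile("speech.wav") as source:
        audio = r.record(source)

    # first call downloads the Distil-Whisper weights and caches them under MODEL_PATH
    print(r.recognize_distilwhisper(audio))

    # alternatively, inject a pre-built pipeline so no loading happens inside the recognizer
    asr = pipeline("automatic-speech-recognition", model="distil-whisper/distil-medium.en")
    r2 = sr.Recognizer(model=asr)
    print(r2.recognize_distilwhisper(audio))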