# Provenance: recovered from a git patch whose line breaks had been lost.
#   commit:  b1976ba391f39523f4a99f89fc3e14c714a9c00c
#   author:  Francois Pelletier
#   date:    Sat, 9 Mar 2024 23:57:28 +0000
#   subject: [PATCH] "Ajout du fichier de script" (adds whisper.py)
#   Signed-off-by: Francois Pelletier

# %% Using Whisper to transcribe French-language podcasts
from pathlib import Path

import numpy as np
import torch
import torchaudio
import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)

# %% File paths
# Placeholder entry: replace it with the MP3 or WAV files to transcribe.
audio_paths = ["METTRE LES LIENS DES FICHIERS MP3 OU WAV ICI"]

# %% Select the PyTorch device and dtype
# Half precision is only selected when a CUDA device is available.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# %% Load model
model_name_or_path = "bofenghuang/whisper-large-v3-french"
processor = AutoProcessor.from_pretrained(model_name_or_path)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_name_or_path,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
)
model.to(device)

# %% Load draft model
# The draft model is handed to generation as `assistant_model` (see the
# pipeline's `generate_kwargs` below).
assistant_model_name_or_path = "bofenghuang/whisper-large-v3-french-distil-dec2"
assistant_model = AutoModelForCausalLM.from_pretrained(
    assistant_model_name_or_path,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
)
assistant_model.to(device)

# %% Init pipeline
# NOTE(review): no `chunk_length_s` is set and `max_new_tokens` is 128, so
# long recordings may come back truncated -- confirm against the installed
# transformers version before relying on full-length podcast transcripts.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"assistant_model": assistant_model},
    max_new_tokens=128,
)


# %% Transcript function
def transcript(audio_path) -> None:
    """Transcribe one audio file and save the text under ``whisper-large/``.

    The audio is loaded with torchaudio, mixed down to a single channel,
    resampled to 16 kHz and passed to the module-level ``pipe``. The
    resulting text is written as UTF-8 to
    ``whisper-large/<audio path without extension>_transcript_whisper.txt``;
    missing output directories are created.

    Args:
        audio_path: Path of the audio file to transcribe (``str`` or
            path-like). Any directory components are reproduced below
            ``whisper-large/``.

    Returns:
        None. The transcript is only written to disk.
    """
    # Load audio
    model_sr = 16000
    speech, sr = torchaudio.load(audio_path)

    # Average the channels instead of squeezing: squeeze() leaves stereo
    # input two-dimensional, while the pipeline is fed a 1-D waveform.
    # For mono input this yields the same samples as the former squeeze().
    mono = speech.mean(dim=0)
    speech_16000 = torchaudio.functional.resample(
        mono, orig_freq=sr, new_freq=model_sr
    )

    # Run pipeline
    result = pipe(np.asarray(speech_16000.contiguous().numpy()))

    # Save text result to file. The suffix is replaced whatever it is, so
    # ".wav" (or ".MP3") inputs no longer produce an output file that still
    # carries the audio extension.
    mirrored = Path(f"whisper-large/{audio_path}")
    transcript_path = mirrored.with_name(f"{mirrored.stem}_transcript_whisper.txt")
    transcript_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: French transcripts contain accented characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write(result["text"])


# %% Transcription loop
for audio_path in tqdm.tqdm(audio_paths):
    transcript(audio_path)