"""Transcribe podcast MP3 files into SRT subtitle files with Faster-Whisper.

Every ``.mp3`` file found in ``import_data/data/Podcast/audio`` is transcribed
(language forced to French) and written as an ``.srt`` file with the same base
name in ``import_data/data/Podcast/transcripts``.
"""

import os
from pathlib import Path

# Resolve the project root from this file's location; fall back to the current
# working directory when ``__file__`` is undefined (e.g. interactive session).
try:
    script_dir = Path(__file__).parent.parent
except NameError:
    script_dir = Path().absolute()

project_root = script_dir
podcast_dir = os.path.join(project_root, 'import_data', 'data', 'Podcast', 'audio')
output_dir = os.path.join(project_root, 'import_data', 'data', 'Podcast', 'transcripts')

# Faster-Whisper model instance; created on first use by _get_model() so that
# importing this module (e.g. to reuse format_time) stays cheap.
model = None


def _get_model():
    """Return the shared Faster-Whisper model, loading it on first call."""
    global model
    if model is None:
        # Imported here so the pure helpers in this module remain usable
        # without the transcription dependency being loaded.
        from faster_whisper import WhisperModel

        model = WhisperModel("small", device="cpu", compute_type="int8")
    return model


def transcribe_audio(audio_path):
    """Transcribe one audio file and return its segments as a list.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A list of the segment objects yielded by ``model.transcribe``.
    """
    l_segments, _ = _get_model().transcribe(audio_path, language="fr", task="transcribe")
    # transcribe() yields segments lazily; materialise them for the caller.
    return list(l_segments)


def create_srt(l_segments, output_path):
    """Write segments to ``output_path`` as numbered SRT cues.

    Args:
        l_segments: Iterable of objects exposing ``start`` and ``end``
            (in seconds) and ``text`` attributes.
        output_path: Destination file; it is overwritten, UTF-8 encoded.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        # SRT cue numbers start at 1.
        for i, segment in enumerate(l_segments, 1):
            start_time = format_time(segment.start)
            end_time = format_time(segment.end)
            text = segment.text.strip()
            f.write(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")


def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative duration in seconds (int or float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, with milliseconds rounded to the nearest unit.
    """
    # Convert to whole milliseconds first: the fractional part has to be
    # extracted before anything is truncated to int, and rounding avoids
    # float artefacts (e.g. 59.999 * 1000 landing just below 59999).
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"


def main():
    """Transcribe every MP3 in ``podcast_dir`` to an SRT in ``output_dir``."""
    from tqdm import tqdm

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Filter before handing the list to tqdm so the progress bar counts only
    # the files that are actually transcribed; sort for a stable order.
    mp3_files = sorted(
        name for name in os.listdir(podcast_dir) if name.endswith(".mp3")
    )

    for filename in tqdm(mp3_files):
        mp3_path = os.path.join(podcast_dir, filename)
        # Swap only the extension; str.replace would also rewrite any ".mp3"
        # occurring earlier in the file name.
        srt_path = os.path.join(output_dir, Path(filename).stem + ".srt")

        # tqdm.write keeps messages from corrupting the progress bar.
        tqdm.write(f"Transcribing {filename}...")
        segments = transcribe_audio(mp3_path)
        create_srt(segments, srt_path)
        tqdm.write(f"Transcription saved to {srt_path}")

    print("All podcasts have been transcribed.")


if __name__ == "__main__":
    main()