From f62eaf5ac81447af4e8f1ca850cb4d2c8b675922 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= <francois@jevalide.ca>
Date: Thu, 3 Oct 2024 00:18:10 -0400
Subject: [PATCH] Podcasts mp3 vers fichiers srt

---
 import_data/51_importation_podcast.py | 52 +++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 import_data/51_importation_podcast.py

diff --git a/import_data/51_importation_podcast.py b/import_data/51_importation_podcast.py
new file mode 100644
index 0000000..1fb1495
--- /dev/null
+++ b/import_data/51_importation_podcast.py
@@ -0,0 +1,52 @@
+import os
+from pathlib import Path
+from tqdm import tqdm
+from faster_whisper import WhisperModel
+
+# Locate the project root: the parent of the directory that contains this script
+try:
+    script_dir = Path(__file__).parent.parent
+except NameError:
+    script_dir = Path().absolute()  # __file__ is undefined (e.g. interactive session): use the current working directory
+
+project_root = script_dir
+podcast_dir = os.path.join(project_root, 'import_data', 'data', 'Podcast', 'audio')  # scanned below for .mp3 files
+output_dir = os.path.join(project_root, 'import_data', 'data', 'Podcast', 'transcripts')  # .srt files are written here
+
+# Create output directory if it doesn't exist
+os.makedirs(output_dir, exist_ok=True)
+
+# Load the "small" Faster-Whisper model on CPU with int8 compute
+model = WhisperModel("small", device="cpu", compute_type="int8")
+
+def transcribe_audio(audio_path):
+    """Transcribe the audio file at *audio_path* as French speech and return its segments as a list."""
+    return list(model.transcribe(audio_path, language="fr", task="transcribe")[0])  # [0]: segments; [1] is unused
+
+def create_srt(l_segments, output_path):
+    """Write *l_segments* to *output_path* (UTF-8) as SRT cues numbered from 1."""
+    with open(output_path, 'w', encoding='utf-8') as srt_file:
+        srt_file.writelines(
+            f"{idx}\n{format_time(seg.start)} --> {format_time(seg.end)}\n{seg.text.strip()}\n\n"
+            for idx, seg in enumerate(l_segments, 1)
+        )
+
+def format_time(seconds):
+    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm`` (rounded to the nearest ms)."""
+    total_seconds, milliseconds = divmod(int(round(seconds * 1000)), 1000)  # keep the fraction as integer ms
+    hours, remainder = divmod(total_seconds, 3600)
+    minutes, secs = divmod(remainder, 60)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
+
+# Transcribe each MP3 to a same-named .srt; only the trailing ".mp3" is swapped (not mid-name matches)
+for filename in tqdm(os.listdir(podcast_dir)):
+    if not filename.endswith(".mp3"):
+        continue  # skip non-MP3 entries
+    mp3_path = os.path.join(podcast_dir, filename)
+    srt_path = os.path.join(output_dir, filename[:-len(".mp3")] + ".srt")
+    print(f"Transcribing {filename}...")
+    segments = transcribe_audio(mp3_path)
+    create_srt(segments, srt_path)
+    print(f"Transcription saved to {srt_path}")
+
+print("All podcasts have been transcribed.")
\ No newline at end of file