Podcasts mp3 vers fichiers srt
This commit is contained in:
parent
7a74dbf413
commit
f62eaf5ac8
1 changed files with 52 additions and 0 deletions
52
import_data/51_importation_podcast.py
Normal file
52
import_data/51_importation_podcast.py
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from tqdm import tqdm
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
# Resolve the project root: the parent of the directory containing this
# script. When __file__ is not defined (NameError, e.g. when the code is
# executed from an interactive session), fall back to the current working
# directory.
try:
    script_dir = Path(__file__).parent.parent
except NameError:
    script_dir = Path.cwd()

project_root = script_dir

# Input (audio) and output (transcripts) folders, both located under
# <project_root>/import_data/data/Podcast. Both are built with os.path.join,
# so they are plain strings rather than Path objects.
podcast_dir, output_dir = (
    os.path.join(project_root, 'import_data', 'data', 'Podcast', leaf)
    for leaf in ('audio', 'transcripts')
)

# Make sure the transcript folder is there before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

# Load the Faster-Whisper model once at import time: "small" model, on CPU,
# with the int8 compute type. It is shared by every transcription below.
model = WhisperModel(
    "small",
    device="cpu",
    compute_type="int8",
)
|
||||||
|
|
||||||
|
def transcribe_audio(audio_path):
    """Transcribe one audio file with the module-level Whisper model.

    The transcription is requested in French (``language="fr"``) with the
    ``"transcribe"`` task.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.

    Returns:
        A list holding every segment produced by the model. The model call
        yields its segments lazily, so they are materialised here before
        being returned.
    """
    result = model.transcribe(audio_path, language="fr", task="transcribe")
    # The second element of the pair is not needed by this script.
    segment_stream, _unused = result
    return [segment for segment in segment_stream]
|
||||||
|
|
||||||
|
def create_srt(l_segments, output_path):
    """Write transcription segments to ``output_path`` as SRT cues.

    Each cue consists of a 1-based counter, a ``start --> end`` line whose
    timestamps come from ``format_time``, the stripped segment text, and a
    blank separator line.

    Args:
        l_segments: Iterable of objects exposing ``start``, ``end`` and
            ``text`` attributes.
        output_path: Destination file; opened in write mode as UTF-8, so any
            existing content is replaced.
    """
    with open(output_path, 'w', encoding='utf-8') as srt_file:
        cue_number = 0
        for cue in l_segments:
            cue_number += 1
            begins = format_time(cue.start)
            ends = format_time(cue.end)
            caption = cue.text.strip()
            srt_file.write(f"{cue_number}\n{begins} --> {ends}\n{caption}\n\n")
|
||||||
|
|
||||||
|
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is first converted to a whole number of milliseconds (rounded
    to the nearest millisecond) and then split into fields. Working on the
    integer total keeps the millisecond part intact and lets rounding carry
    into the seconds/minutes/hours fields (e.g. 59.9996 -> ``00:01:00,000``).

    Args:
        seconds: Non-negative duration in seconds (int or float). Negative
            values are clamped to zero, since an SRT timestamp cannot be
            negative.

    Returns:
        The timestamp string, e.g. ``"01:01:01,250"`` for ``3661.25``.
    """
    # Derive every field from one integer so the fractional part is not lost
    # when the seconds field is truncated.
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
|
||||||
|
|
||||||
|
# Process all MP3 files found in the podcast audio directory.
# The listing is filtered up front so the progress bar total matches the
# number of files that are actually transcribed.
mp3_files = [name for name in os.listdir(podcast_dir) if name.endswith(".mp3")]

for filename in tqdm(mp3_files):
    mp3_path = os.path.join(podcast_dir, filename)
    # Swap only the trailing extension: str.replace would also rewrite any
    # earlier ".mp3" occurring inside the file name.
    srt_name = filename[:-len(".mp3")] + ".srt"
    srt_path = os.path.join(output_dir, srt_name)

    print(f"Transcribing {filename}...")
    segments = transcribe_audio(mp3_path)
    create_srt(segments, srt_path)
    print(f"Transcription saved to {srt_path}")

print("All podcasts have been transcribed.")
|
Loading…
Reference in a new issue