libere-tes-chaine-de-mots/import_data/51_importation_podcast.py

import os
from pathlib import Path
from tqdm import tqdm
from faster_whisper import WhisperModel

# Locate the project root: two levels above this script file (i.e. the
# parent of the directory that contains the script).
try:
    this_file = Path(__file__)
except NameError:
    # __file__ is undefined (for example when the code is run from an
    # interactive session), so fall back to the current working directory.
    script_dir = Path().absolute()
else:
    script_dir = this_file.parent.parent

project_root = script_dir

# Audio input and transcript output live side by side under the Podcast
# data folder.
podcast_data_dir = os.path.join(project_root, 'import_data', 'data', 'Podcast')
podcast_dir = os.path.join(podcast_data_dir, 'audio')
output_dir = os.path.join(podcast_data_dir, 'transcripts')

# Make sure the transcript folder exists before anything is written to it
os.makedirs(output_dir, exist_ok=True)

# Load the "small" Faster-Whisper model on CPU with int8 computation
model = WhisperModel("small", device="cpu", compute_type="int8")

def transcribe_audio(audio_path):
    """Transcribe one audio file as French speech and return its segments.

    Calls the module-level ``model`` with ``language="fr"`` and
    ``task="transcribe"``. The second element of the result is discarded.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.

    Returns:
        A list holding every segment produced for the file.
    """
    result = model.transcribe(audio_path, language="fr", task="transcribe")
    segment_iter, _info = result
    # Materialise the lazy segment iterable so callers get a plain list
    return list(segment_iter)

def create_srt(l_segments, output_path):
    """Write transcription segments to ``output_path`` as SRT cues.

    Each segment becomes one cue: a 1-based index line, a
    ``start --> end`` timing line built with ``format_time``, the stripped
    segment text, and a blank separator line.

    Args:
        l_segments: Iterable of objects exposing ``start``, ``end`` and
            ``text`` attributes.
        output_path: Destination file; it is opened in write mode with
            UTF-8 encoding, so any existing content is replaced.
    """
    with open(output_path, 'w', encoding='utf-8') as srt_file:
        for cue_number, seg in enumerate(l_segments, start=1):
            timing = f"{format_time(seg.start)} --> {format_time(seg.end)}"
            caption = seg.text.strip()
            srt_file.write(f"{cue_number}\n{timing}\n{caption}\n\n")

def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond. Negative inputs are
    clamped to zero, since a negative timestamp cannot be expressed in
    this format.

    Args:
        seconds: Offset from the start of the audio, in seconds (int or
            float).

    Returns:
        The timestamp string, e.g. ``"00:01:02,500"`` for ``62.5``.
    """
    # Work in whole milliseconds from the start. Truncating the seconds to
    # an int first (as the previous implementation did) discards the
    # fractional part, so the millisecond field was always "000".
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

# Process all MP3 files.
# Filter first so the progress bar only counts files that will actually be
# transcribed, and sort so the processing order is deterministic
# (os.listdir returns entries in arbitrary order).
mp3_filenames = sorted(
    name for name in os.listdir(podcast_dir) if name.endswith(".mp3")
)

for filename in tqdm(mp3_filenames):
    mp3_path = os.path.join(podcast_dir, filename)
    # Swap only the trailing extension: str.replace would also rewrite any
    # ".mp3" appearing earlier in the name (e.g. "show.mp3.final.mp3").
    srt_name = filename[:-len(".mp3")] + ".srt"
    srt_path = os.path.join(output_dir, srt_name)

    print(f"Transcribing {filename}...")
    segments = transcribe_audio(mp3_path)
    create_srt(segments, srt_path)
    print(f"Transcription saved to {srt_path}")

print("All podcasts have been transcribed.")
Podcasts mp3 vers fichiers srt 2024-10-03 04:18:10 +00:00
			`import os`
			`from pathlib import Path`
			`from tqdm import tqdm`
			`from faster_whisper import WhisperModel`

			`# Get the current file's directory`
			`try:`
			`script_dir = Path(__file__).parent.parent`
			`except NameError:`
			`script_dir = Path().absolute()`

			`project_root = script_dir`
			`podcast_dir = os.path.join(project_root, 'import_data', 'data', 'Podcast', 'audio')`
			`output_dir = os.path.join(project_root, 'import_data', 'data', 'Podcast', 'transcripts')`

			`# Create output directory if it doesn't exist`
			`os.makedirs(output_dir, exist_ok=True)`

			`# Load Faster-Whisper model`
			`model = WhisperModel("small", device="cpu", compute_type="int8")`

			`def transcribe_audio(audio_path):`
			`l_segments, _ = model.transcribe(audio_path, language="fr", task="transcribe")`
			`return list(l_segments) # Convert generator to list`

			`def create_srt(l_segments, output_path):`
			`with open(output_path, 'w', encoding='utf-8') as f:`
			`for i, segment in enumerate(l_segments, 1):`
			`start_time = format_time(segment.start)`
			`end_time = format_time(segment.end)`
			`text = segment.text.strip()`
			`f.write(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")`

			`def format_time(seconds):`
			`hours = int(seconds // 3600)`
			`minutes = int((seconds % 3600) // 60)`
			`seconds = int(seconds % 60)`
			`milliseconds = int((seconds % 1) * 1000)`
			`return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"`

			`# Process all MP3 files`
			`for filename in tqdm(os.listdir(podcast_dir)):`
			`if filename.endswith(".mp3"):`
			`mp3_path = os.path.join(podcast_dir, filename)`
			`srt_path = os.path.join(output_dir, filename.replace(".mp3", ".srt"))`

			`print(f"Transcribing {filename}...")`
			`segments = transcribe_audio(mp3_path)`
			`create_srt(segments, srt_path)`
			`print(f"Transcription saved to {srt_path}")`

			`print("All podcasts have been transcribed.")`