version initiale. fonctionnelle
This commit is contained in:
parent
37af3f1915
commit
8e36769112
3 changed files with 325 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
/.idea/
|
246
app.py
Normal file
246
app.py
Normal file
|
@ -0,0 +1,246 @@
|
|||
import streamlit as st
|
||||
import os
|
||||
import tempfile
|
||||
from moviepy.editor import VideoFileClip
|
||||
import ffmpeg
|
||||
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
||||
import tqdm
|
||||
|
||||
# Load Whisper model
|
||||
@st.cache_resource
def load_whisper_model():
    """Load the Whisper 'medium' checkpoint, cached across Streamlit reruns.

    Returns:
        tuple: (processor, model) on success, or (None, None) if loading
        failed — in that case the error is surfaced in the Streamlit UI.
    """
    checkpoint = "openai/whisper-medium"
    try:
        whisper_processor = WhisperProcessor.from_pretrained(checkpoint)
        whisper_model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
    except Exception as exc:
        # Model download/instantiation can fail (network, disk, RAM);
        # report in the UI and let callers detect the None sentinel.
        st.error(f"Failed to load Whisper model: {str(exc)}")
        return None, None
    return whisper_processor, whisper_model
|
||||
|
||||
processor, model = load_whisper_model()
|
||||
|
||||
def transcribe_audio(audio_file, language, chunk_length=3):  # Changed default to 3 seconds
    """Transcribe an audio file with the module-level Whisper model, chunk by chunk.

    The audio is split into fixed-length windows and each window is
    transcribed independently. NOTE(review): words straddling a chunk
    boundary can therefore be cut or duplicated — confirm this is acceptable
    for short-caption use.

    Args:
        audio_file: Path to the audio file to transcribe.
        language: Language code forwarded to ``model.generate`` (e.g. "fr", "en").
        chunk_length: Window length in seconds (default 3, chosen for short
            reel-style captions).

    Returns:
        list[dict]: One entry per chunk with ``start``/``end`` times in
        seconds and the decoded ``text``; empty list if the model never loaded.
    """
    # Guard: the module-level loader may have returned (None, None).
    if model is None or processor is None:
        st.error("Whisper model is not loaded. Cannot transcribe audio.")
        return []

    # Load audio — AudioLoader resamples to 16 kHz mono, which is what the
    # Whisper feature extractor expects.
    audio_input, sr = AudioLoader.load_audio(audio_file)

    # Calculate number of samples per chunk
    samples_per_chunk = int(chunk_length * sr)

    segments = []
    # tqdm writes progress to the server console, not the Streamlit UI.
    for i in tqdm.tqdm(range(0, len(audio_input), samples_per_chunk)):
        chunk = audio_input[i:i+samples_per_chunk]

        # Pad/trim audio chunk to the model's fixed input length (done
        # internally by the feature extractor).
        chunk_input = processor.feature_extractor(chunk, sampling_rate=sr, return_tensors="pt").input_features

        # Generate token ids. NOTE(review): relies on transformers accepting
        # a `language=` kwarg on WhisperForConditionalGeneration.generate —
        # verify against the pinned transformers version.
        predicted_ids = model.generate(chunk_input, language=language)

        # Decode token ids to text (batch of 1 → take element 0 below).
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Chunk timing derived purely from sample offsets; the final chunk's
        # end is clamped to the real audio duration.
        start_time = i / sr
        end_time = min((i + samples_per_chunk) / sr, len(audio_input) / sr)

        segments.append({
            "start": start_time,
            "end": end_time,
            "text": transcription[0].strip()
        })

    return segments
|
||||
|
||||
def format_srt(segments):
    """Render transcription segments as an SRT document string.

    Empty-text segments are dropped. Fix over the previous version: the cue
    counter only advances for segments that are actually emitted, so the SRT
    indices stay consecutive (1, 2, 3, ...) — previously ``enumerate``
    consumed an index for skipped segments, leaving gaps, which some players
    and validators reject.

    Args:
        segments: Iterable of dicts with ``start``/``end`` (seconds, floats)
            and ``text`` keys, as produced by ``transcribe_audio``.

    Returns:
        str: SRT-formatted captions ("" if no non-empty segments).
    """
    srt_content = ""
    cue_number = 1
    for segment in tqdm.tqdm(segments):
        text = segment['text'].strip()
        if not text:
            # Skip without consuming a cue number — indices must be consecutive.
            continue
        start_time = format_timestamp(segment['start'])
        end_time = format_timestamp(segment['end'])
        srt_content += f"{cue_number}\n{start_time} --> {end_time}\n{text}\n\n"
        cue_number += 1
    return srt_content
|
||||
|
||||
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative duration in seconds (float or int).

    Returns:
        str: Zero-padded timestamp, milliseconds truncated (not rounded).
    """
    secs_in_minute = seconds % 60
    millis = int((secs_in_minute - int(secs_in_minute)) * 1000)
    hrs = int(seconds // 3600)
    mins = int((seconds % 3600) // 60)
    return "{:02d}:{:02d}:{:02d},{:03d}".format(hrs, mins, int(secs_in_minute), millis)
|
||||
|
||||
# Add this helper class for audio loading
|
||||
class AudioLoader:
    """Helper for loading audio files as waveforms suitable for Whisper."""

    @staticmethod
    def load_audio(file_path):
        """Load an audio file, resampled to 16 kHz mono.

        Args:
            file_path: Path to any audio format librosa can decode.

        Returns:
            tuple: (waveform ndarray, sample rate == 16000).
        """
        # Imported lazily so the module can be imported without librosa
        # until transcription is actually requested.
        import librosa

        waveform, sample_rate = librosa.load(file_path, sr=16000)
        return waveform, sample_rate
|
||||
|
||||
def burn_subtitles(video_path, srt_content):
    """Hard-burn SRT captions into a video with ffmpeg, preserving the audio.

    Pipeline: (1) write ``srt_content`` to a temp .srt file, (2) extract the
    source audio to AAC, (3) re-encode the video with the ``subtitles``
    filter (audio stripped), (4) recombine the subtitled video with the
    extracted audio, (5) sanity-check the result has both streams.

    Args:
        video_path: Path to the source video file.
        srt_content: Captions in SRT format (UTF-8 text).

    Returns:
        str | None: Path of ``<video>_with_captions.mp4`` on success,
        or None on any failure (error shown in the Streamlit UI).
    """
    # delete=False so the path survives the `with`; cleaned up in `finally`.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.srt') as temp_srt:
        temp_srt.write(srt_content.encode('utf-8'))
        temp_srt_path = temp_srt.name

    # All intermediates live next to the source video.
    output_path = os.path.splitext(video_path)[0] + '_with_captions.mp4'
    temp_video_path = os.path.splitext(video_path)[0] + '_temp_video.mp4'
    temp_audio_path = os.path.splitext(video_path)[0] + '_temp_audio.aac'

    try:
        # Extract video metadata
        probe = ffmpeg.probe(video_path)
        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)

        if video_stream is None:
            raise ValueError("No video stream found in the input file.")

        # Original dimensions, reused so the subtitle pass keeps the frame size.
        width = int(video_stream['width'])
        height = int(video_stream['height'])

        # Extract audio
        ffmpeg.input(video_path).output(temp_audio_path, acodec='aac', audio_bitrate='128k').overwrite_output().run(capture_stdout=True, capture_stderr=True)

        # Process video with subtitles (without audio).
        # ASS force_style string applied through the subtitles filter;
        # BorderStyle=3 draws an opaque box behind each line.
        subtitle_style = (
            'Fontname=Arial,Fontsize=18,'
            'PrimaryColour=&HFFFFFF&,'
            'OutlineColour=&H000000&,'
            'BorderStyle=3,'
            'Outline=1,'
            'Shadow=1,'
            'MarginV=20'
        )
        # an=None → `-an`, i.e. drop audio in this intermediate pass.
        ffmpeg.input(video_path).filter(
            'subtitles',
            temp_srt_path,
            force_style=subtitle_style
        ).output(
            temp_video_path,
            vcodec='libx264',
            video_bitrate='2000k',
            an=None,
            s=f'{width}x{height}'
        ).overwrite_output().run(capture_stdout=True, capture_stderr=True)

        # Combine video with subtitles and original audio.
        # NOTE(review): concat with v=1,a=1 is used here to mux one video
        # stream with one audio stream; a plain two-input output() would be
        # the more conventional mux — confirm A/V sync on long clips.
        ffmpeg.concat(
            ffmpeg.input(temp_video_path),
            ffmpeg.input(temp_audio_path),
            v=1,
            a=1
        ).output(output_path, vcodec='libx264', acodec='aac').overwrite_output().run(capture_stdout=True, capture_stderr=True)

        # Check if the output file was created and has both video and audio streams
        if os.path.exists(output_path):
            output_probe = ffmpeg.probe(output_path)
            output_video_stream = next((stream for stream in output_probe['streams'] if stream['codec_type'] == 'video'), None)
            output_audio_stream = next((stream for stream in output_probe['streams'] if stream['codec_type'] == 'audio'), None)

            if output_video_stream is None or output_audio_stream is None:
                raise ValueError("Output file is missing video or audio stream.")
        else:
            raise FileNotFoundError("Output file was not created.")

    except (ffmpeg.Error, ValueError, FileNotFoundError) as e:
        # NOTE(review): for ffmpeg.Error, str(e) omits the captured stderr
        # (e.stderr holds the useful diagnostics) — consider surfacing it.
        st.error(f"An error occurred while burning subtitles: {str(e)}")
        return None
    finally:
        # Intermediates are removed on both success and failure paths;
        # the .srt always exists at this point, the others may not.
        os.unlink(temp_srt_path)
        if os.path.exists(temp_video_path):
            os.unlink(temp_video_path)
        if os.path.exists(temp_audio_path):
            os.unlink(temp_audio_path)

    return output_path
|
||||
|
||||
def convert_to_web_compatible(input_path):
    """Re-encode a video to H.264/AAC MP4 so browsers can play it inline.

    Args:
        input_path: Path to the source video.

    Returns:
        str | None: Path of ``<input>_web.mp4`` on success, or None if
        ffmpeg failed (error shown in the Streamlit UI).
    """
    output_path = os.path.splitext(input_path)[0] + '_web.mp4'
    # Build the pipeline step by step instead of one long chain.
    source = ffmpeg.input(input_path)
    encoded = source.output(output_path, vcodec='libx264', acodec='aac',
                            video_bitrate='1000k', audio_bitrate='128k')
    try:
        encoded.overwrite_output().run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # ffmpeg's captured stderr carries the actual diagnostic.
        st.error(f"An error occurred while converting the video: {e.stderr.decode()}")
        return None
    return output_path
|
||||
|
||||
st.title("Reel Caption Maker")

# Session state keeps temp-file paths alive across Streamlit reruns so the
# upload is not re-converted on every widget interaction.
if 'temp_video_path' not in st.session_state:
    st.session_state.temp_video_path = None
if 'web_compatible_video_path' not in st.session_state:
    st.session_state.web_compatible_video_path = None

uploaded_file = st.file_uploader("Choose a video file", type=["mp4", "mov", "avi"])

if uploaded_file is not None:
    # Save the uploaded file to a temporary location if not already done
    # (also re-saves if a previous run deleted the temp file).
    if st.session_state.temp_video_path is None or not os.path.exists(st.session_state.temp_video_path):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_video:
            temp_video.write(uploaded_file.read())
            st.session_state.temp_video_path = temp_video.name

        # Convert the video to web-compatible format (H.264/AAC for st.video).
        st.session_state.web_compatible_video_path = convert_to_web_compatible(st.session_state.temp_video_path)

    # Create two columns for layout: player on the left, captions on the right.
    col1, col2 = st.columns([2, 3])

    with col1:
        st.subheader("Video Player")
        # Display the web-compatible video
        if st.session_state.web_compatible_video_path:
            st.video(st.session_state.web_compatible_video_path)
        else:
            st.error("Failed to convert video to web-compatible format.")

    with col2:
        st.subheader("Captions")
        language = st.selectbox("Select video language", ["French", "English"])
        lang_code = "fr" if language == "French" else "en"

        if st.button("Generate Captions"):
            with st.spinner("Generating captions..."):
                # Pull the audio track out with moviepy, then transcribe it.
                # NOTE(review): "temp_audio.wav" is a fixed relative path —
                # concurrent sessions would clobber each other; confirm
                # single-user deployment or switch to tempfile.
                video = VideoFileClip(st.session_state.temp_video_path)
                audio = video.audio
                audio.write_audiofile("temp_audio.wav")

                segments = transcribe_audio("temp_audio.wav", lang_code, chunk_length=3)
                srt_content = format_srt(segments)

                st.session_state.srt_content = srt_content
                st.session_state.temp_audio_path = "temp_audio.wav"  # Store the audio path

                video.close()

        if 'srt_content' in st.session_state:
            # Let the user fix up the generated captions before burning.
            edited_srt = st.text_area("Edit Captions (SRT format)", st.session_state.srt_content, height=300)

            if st.button("Burn Captions and Download"):
                with st.spinner("Burning captions onto video..."):
                    output_path = burn_subtitles(st.session_state.temp_video_path, edited_srt)

                if output_path:
                    with open(output_path, "rb") as file:
                        st.download_button(
                            label="Download Video with Captions",
                            data=file,
                            file_name="video_with_captions.mp4",
                            mime="video/mp4"
                        )

                    # Clean up the burned output and the extracted audio.
                    # NOTE(review): os.remove runs before the membership
                    # check below — if a rerun cleared temp_audio_path this
                    # would raise; confirm ordering is intentional.
                    os.remove(output_path)
                    os.remove(st.session_state.temp_audio_path)  # Remove the temporary audio file
                    if 'temp_audio_path' in st.session_state:
                        del st.session_state.temp_audio_path  # Remove the audio path from session state
                else:
                    st.error("Failed to burn captions onto the video.")

                # NOTE(review): deleting the uploaded temp video here forces
                # the top-of-script branch to re-save and re-convert on the
                # next rerun — presumably intentional cleanup; verify.
                os.remove(st.session_state.temp_video_path)
|
78
requirements.txt
Normal file
78
requirements.txt
Normal file
|
@ -0,0 +1,78 @@
|
|||
altair==5.4.1
|
||||
attrs==24.2.0
|
||||
audioread==3.0.1
|
||||
blinker==1.8.2
|
||||
cachetools==5.5.0
|
||||
certifi==2024.8.30
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.0
|
||||
click==8.1.7
|
||||
decorator==4.4.2
|
||||
ffmpeg-python==0.2.0
|
||||
filelock==3.16.1
|
||||
fsspec==2024.9.0
|
||||
future==1.0.0
|
||||
gitdb==4.0.11
|
||||
GitPython==3.1.43
|
||||
huggingface-hub==0.25.2
|
||||
idna==3.10
|
||||
imageio==2.36.0
|
||||
imageio-ffmpeg==0.5.1
|
||||
Jinja2==3.1.4
|
||||
joblib==1.4.2
|
||||
jsonschema==4.23.0
|
||||
jsonschema-specifications==2024.10.1
|
||||
lazy_loader==0.4
|
||||
librosa==0.10.2.post1
|
||||
llvmlite==0.43.0
|
||||
markdown-it-py==3.0.0
|
||||
MarkupSafe==3.0.1
|
||||
mdurl==0.1.2
|
||||
moviepy==1.0.3
|
||||
mpmath==1.3.0
|
||||
msgpack==1.1.0
|
||||
narwhals==1.9.3
|
||||
networkx==3.4.1
|
||||
numba==0.60.0
|
||||
numpy==2.0.2
|
||||
packaging==24.1
|
||||
pandas==2.2.3
|
||||
pillow==10.4.0
|
||||
platformdirs==4.3.6
|
||||
pooch==1.8.2
|
||||
proglog==0.1.10
|
||||
protobuf==5.28.2
|
||||
pyarrow==17.0.0
|
||||
pycparser==2.22
|
||||
pydeck==0.9.1
|
||||
Pygments==2.18.0
|
||||
python-dateutil==2.9.0.post0
|
||||
pytz==2024.2
|
||||
PyYAML==6.0.2
|
||||
referencing==0.35.1
|
||||
regex==2024.9.11
|
||||
requests==2.32.3
|
||||
rich==13.9.2
|
||||
rpds-py==0.20.0
|
||||
safetensors==0.4.5
|
||||
scikit-learn==1.5.2
|
||||
scipy==1.14.1
|
||||
setuptools==75.1.0
|
||||
six==1.16.0
|
||||
smmap==5.0.1
|
||||
soundfile==0.12.1
|
||||
soxr==0.5.0.post1
|
||||
streamlit==1.39.0
|
||||
sympy==1.13.3
|
||||
tenacity==9.0.0
|
||||
threadpoolctl==3.5.0
|
||||
tokenizers==0.20.1
|
||||
toml==0.10.2
|
||||
torch==2.4.1
|
||||
tornado==6.4.1
|
||||
tqdm==4.66.5
|
||||
transformers==4.45.2
|
||||
typing_extensions==4.12.2
|
||||
tzdata==2024.2
|
||||
urllib3==2.2.3
|
||||
whisper==1.1.10
|
Loading…
Reference in a new issue