"""Reel Caption Maker.

A Streamlit app that:
1. accepts an uploaded video,
2. transcribes its audio with openai/whisper-medium (HuggingFace transformers),
3. lets the user edit the generated SRT captions, and
4. burns the captions onto the video with ffmpeg for download.

NOTE(review): the original submission was a line-wrapped ``git diff`` paste
that also added ``.gitignore`` (``/.idea/``) and a pinned ``requirements.txt``;
this is the reconstructed and corrected ``app.py`` from that diff.
"""

import os
import tempfile

import ffmpeg
import streamlit as st
import tqdm
from moviepy.editor import VideoFileClip
from transformers import WhisperForConditionalGeneration, WhisperProcessor


@st.cache_resource
def load_whisper_model():
    """Load and cache the Whisper processor/model pair.

    Returns:
        (processor, model) on success, (None, None) on failure; the failure
        is reported through the Streamlit UI instead of being raised so the
        rest of the app keeps rendering.
    """
    try:
        processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
        return processor, model
    except Exception as e:
        st.error(f"Failed to load Whisper model: {str(e)}")
        return None, None


processor, model = load_whisper_model()


class AudioLoader:
    """Audio loading helper; keeps the librosa import local to one place."""

    @staticmethod
    def load_audio(file_path):
        import librosa

        # 16 kHz is the sample rate the Whisper feature extractor expects.
        audio, sr = librosa.load(file_path, sr=16000)
        return audio, sr


def transcribe_audio(audio_file, language, chunk_length=3):
    """Transcribe an audio file in fixed-length chunks.

    Args:
        audio_file: path to an audio file readable by librosa.
        language: Whisper language code, e.g. "en" or "fr".
        chunk_length: chunk size in seconds. Short chunks give caption-sized
            segments at the cost of cross-chunk context.

    Returns:
        A list of {"start", "end", "text"} dicts (times in seconds), or []
        when the model failed to load.
    """
    if model is None or processor is None:
        # FIX: this message was split across two physical lines in the paste.
        st.error("Whisper model is not loaded. Cannot transcribe audio.")
        return []

    audio_input, sr = AudioLoader.load_audio(audio_file)
    samples_per_chunk = int(chunk_length * sr)

    segments = []
    for offset in tqdm.tqdm(range(0, len(audio_input), samples_per_chunk)):
        chunk = audio_input[offset:offset + samples_per_chunk]

        # The feature extractor pads/trims each chunk to Whisper's fixed
        # input size; no manual padding is needed here.
        chunk_input = processor.feature_extractor(
            chunk, sampling_rate=sr, return_tensors="pt"
        ).input_features

        predicted_ids = model.generate(chunk_input, language=language)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        start_time = offset / sr
        end_time = min((offset + samples_per_chunk) / sr, len(audio_input) / sr)
        segments.append(
            {"start": start_time, "end": end_time, "text": transcription[0].strip()}
        )

    return segments


def format_timestamp(seconds):
    """Format a float second count as an SRT timestamp, HH:MM:SS,mmm."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return f"{hours:02d}:{minutes:02d}:{int(seconds):02d},{milliseconds:03d}"


def format_srt(segments):
    """Render transcription segments as an SRT document.

    Empty segments are dropped. FIX: cue numbers are assigned only to the
    cues actually emitted, keeping the SRT sequence gapless — the original
    numbered every segment with enumerate() and then skipped empty ones,
    producing invalid non-sequential indices. The tqdm wrapper around this
    trivial in-memory loop was also dropped (it only spammed server stdout).
    """
    cues = []
    index = 1
    for segment in segments:
        text = segment["text"].strip()
        if not text:
            continue
        start_time = format_timestamp(segment["start"])
        end_time = format_timestamp(segment["end"])
        cues.append(f"{index}\n{start_time} --> {end_time}\n{text}\n\n")
        index += 1
    return "".join(cues)


def burn_subtitles(video_path, srt_content):
    """Burn SRT captions onto a video, preserving the original audio.

    Pipeline: probe the input -> extract the audio track -> render the video
    stream with the ``subtitles`` filter -> mux the two back together, then
    sanity-check that the output has both streams.

    Returns:
        The output path on success, or None on failure (reported via the UI).
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as temp_srt:
        temp_srt.write(srt_content.encode("utf-8"))
        temp_srt_path = temp_srt.name

    base = os.path.splitext(video_path)[0]
    output_path = base + "_with_captions.mp4"
    temp_video_path = base + "_temp_video.mp4"
    temp_audio_path = base + "_temp_audio.aac"

    try:
        probe = ffmpeg.probe(video_path)
        video_stream = next(
            (s for s in probe["streams"] if s["codec_type"] == "video"), None
        )
        if video_stream is None:
            raise ValueError("No video stream found in the input file.")

        width = int(video_stream["width"])
        height = int(video_stream["height"])

        # 1) Extract the original audio so it survives the subtitle pass.
        (
            ffmpeg.input(video_path)
            .output(temp_audio_path, acodec="aac", audio_bitrate="128k")
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )

        # 2) Render the video stream with burned-in subtitles; audio is
        #    dropped here (`an`) and muxed back in step 3.
        subtitle_style = (
            "Fontname=Arial,Fontsize=18,"
            "PrimaryColour=&HFFFFFF&,"
            "OutlineColour=&H000000&,"
            "BorderStyle=3,"
            "Outline=1,"
            "Shadow=1,"
            "MarginV=20"
        )
        # NOTE(review): the SRT path is passed unescaped to the subtitles
        # filter; paths containing ':' or ',' (e.g. on Windows) would need
        # escaping — confirm the target platform.
        (
            ffmpeg.input(video_path)
            .filter("subtitles", temp_srt_path, force_style=subtitle_style)
            .output(
                temp_video_path,
                vcodec="libx264",
                video_bitrate="2000k",
                an=None,
                s=f"{width}x{height}",
            )
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )

        # 3) Mux the subtitled video with the original audio.
        (
            ffmpeg.concat(
                ffmpeg.input(temp_video_path),
                ffmpeg.input(temp_audio_path),
                v=1,
                a=1,
            )
            .output(output_path, vcodec="libx264", acodec="aac")
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )

        # Verify the result actually carries both a video and an audio stream.
        if os.path.exists(output_path):
            output_probe = ffmpeg.probe(output_path)
            has_video = any(
                s["codec_type"] == "video" for s in output_probe["streams"]
            )
            has_audio = any(
                s["codec_type"] == "audio" for s in output_probe["streams"]
            )
            if not (has_video and has_audio):
                raise ValueError("Output file is missing video or audio stream.")
        else:
            raise FileNotFoundError("Output file was not created.")

    except (ffmpeg.Error, ValueError, FileNotFoundError) as e:
        # FIX: include ffmpeg's captured stderr when available — str(ffmpeg.Error)
        # alone omits the actual cause (convert_to_web_compatible already did
        # this; made the two handlers consistent).
        detail = (
            e.stderr.decode(errors="replace")
            if getattr(e, "stderr", None)
            else str(e)
        )
        st.error(f"An error occurred while burning subtitles: {detail}")
        return None
    finally:
        # Best-effort cleanup of every intermediate, success or failure.
        for path in (temp_srt_path, temp_video_path, temp_audio_path):
            if os.path.exists(path):
                os.unlink(path)

    return output_path


def convert_to_web_compatible(input_path):
    """Re-encode a video to a browser-friendly H.264/AAC MP4.

    Returns:
        The new file path, or None on failure (reported via the UI).
    """
    output_path = os.path.splitext(input_path)[0] + "_web.mp4"
    try:
        (
            ffmpeg.input(input_path)
            .output(
                output_path,
                vcodec="libx264",
                acodec="aac",
                video_bitrate="1000k",
                audio_bitrate="128k",
            )
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        return output_path
    except ffmpeg.Error as e:
        st.error(f"An error occurred while converting the video: {e.stderr.decode()}")
        return None


st.title("Reel Caption Maker")

# Session state keeps file paths across Streamlit reruns so the upload is not
# re-saved and re-encoded on every widget interaction.
if "temp_video_path" not in st.session_state:
    st.session_state.temp_video_path = None
if "web_compatible_video_path" not in st.session_state:
    st.session_state.web_compatible_video_path = None

uploaded_file = st.file_uploader("Choose a video file", type=["mp4", "mov", "avi"])

if uploaded_file is not None:
    # Persist the upload once per session (or if the temp file disappeared).
    if st.session_state.temp_video_path is None or not os.path.exists(
        st.session_state.temp_video_path
    ):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(uploaded_file.read())
            st.session_state.temp_video_path = temp_video.name

        st.session_state.web_compatible_video_path = convert_to_web_compatible(
            st.session_state.temp_video_path
        )

    col1, col2 = st.columns([2, 3])

    with col1:
        st.subheader("Video Player")
        if st.session_state.web_compatible_video_path:
            st.video(st.session_state.web_compatible_video_path)
        else:
            st.error("Failed to convert video to web-compatible format.")

    with col2:
        st.subheader("Captions")
        language = st.selectbox("Select video language", ["French", "English"])
        lang_code = "fr" if language == "French" else "en"

        if st.button("Generate Captions"):
            with st.spinner("Generating captions..."):
                # FIX: extract audio to a per-session temp file instead of a
                # hard-coded "temp_audio.wav" in the CWD, which collides
                # between concurrent Streamlit sessions.
                audio_fd, audio_path = tempfile.mkstemp(suffix=".wav")
                os.close(audio_fd)
                video = VideoFileClip(st.session_state.temp_video_path)
                try:
                    video.audio.write_audiofile(audio_path)
                finally:
                    video.close()

                segments = transcribe_audio(audio_path, lang_code, chunk_length=3)
                st.session_state.srt_content = format_srt(segments)
                st.session_state.temp_audio_path = audio_path

        if "srt_content" in st.session_state:
            edited_srt = st.text_area(
                "Edit Captions (SRT format)", st.session_state.srt_content, height=300
            )

            if st.button("Burn Captions and Download"):
                with st.spinner("Burning captions onto video..."):
                    output_path = burn_subtitles(
                        st.session_state.temp_video_path, edited_srt
                    )

                if output_path:
                    with open(output_path, "rb") as file:
                        st.download_button(
                            label="Download Video with Captions",
                            data=file,
                            file_name="video_with_captions.mp4",
                            mime="video/mp4",
                        )

                    # FIX: cleanup is now guarded by existence checks and the
                    # stale audio path is cleared from session state.
                    os.remove(output_path)
                    audio_path = st.session_state.pop("temp_audio_path", None)
                    if audio_path and os.path.exists(audio_path):
                        os.remove(audio_path)
                else:
                    st.error("Failed to burn captions onto the video.")

    # FIX: the original unconditionally removed st.session_state.temp_video_path
    # at the end of every rerun, which forced the upload to be re-saved and
    # re-encoded on each interaction. That removal is dropped; the temp files
    # now persist for the session. NOTE(review): a session-end cleanup hook
    # would be the complete fix for the resulting temp-file accumulation.
+imageio-ffmpeg==0.5.1 +Jinja2==3.1.4 +joblib==1.4.2 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +lazy_loader==0.4 +librosa==0.10.2.post1 +llvmlite==0.43.0 +markdown-it-py==3.0.0 +MarkupSafe==3.0.1 +mdurl==0.1.2 +moviepy==1.0.3 +mpmath==1.3.0 +msgpack==1.1.0 +narwhals==1.9.3 +networkx==3.4.1 +numba==0.60.0 +numpy==2.0.2 +packaging==24.1 +pandas==2.2.3 +pillow==10.4.0 +platformdirs==4.3.6 +pooch==1.8.2 +proglog==0.1.10 +protobuf==5.28.2 +pyarrow==17.0.0 +pycparser==2.22 +pydeck==0.9.1 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +pytz==2024.2 +PyYAML==6.0.2 +referencing==0.35.1 +regex==2024.9.11 +requests==2.32.3 +rich==13.9.2 +rpds-py==0.20.0 +safetensors==0.4.5 +scikit-learn==1.5.2 +scipy==1.14.1 +setuptools==75.1.0 +six==1.16.0 +smmap==5.0.1 +soundfile==0.12.1 +soxr==0.5.0.post1 +streamlit==1.39.0 +sympy==1.13.3 +tenacity==9.0.0 +threadpoolctl==3.5.0 +tokenizers==0.20.1 +toml==0.10.2 +torch==2.4.1 +tornado==6.4.1 +tqdm==4.66.5 +transformers==4.45.2 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==2.2.3 +# whisper==1.1.10 — removed: the PyPI "whisper" package is the Graphite time-series database library, not OpenAI Whisper; app.py uses transformers' Whisper and never imports "whisper"