version initiale. fonctionnelle
This commit is contained in:
parent
37af3f1915
commit
8e36769112
3 changed files with 325 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
/.idea/
|
246
app.py
Normal file
246
app.py
Normal file
|
@ -0,0 +1,246 @@
|
|||
import streamlit as st
|
||||
import os
|
||||
import tempfile
|
||||
from moviepy.editor import VideoFileClip
|
||||
import ffmpeg
|
||||
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
||||
import tqdm
|
||||
|
||||
# Load Whisper model
|
||||
@st.cache_resource
def load_whisper_model():
    """Load the Whisper 'medium' checkpoint, cached across Streamlit reruns.

    Returns:
        tuple: (processor, model) on success, or (None, None) if loading
        failed — in that case the error is surfaced in the Streamlit UI.
    """
    checkpoint = "openai/whisper-medium"
    try:
        whisper_processor = WhisperProcessor.from_pretrained(checkpoint)
        whisper_model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
    except Exception as exc:
        # Model download/instantiation can fail (network, disk, RAM);
        # report in the UI and let callers detect the None sentinel.
        st.error(f"Failed to load Whisper model: {str(exc)}")
        return None, None
    return whisper_processor, whisper_model
|
||||
|
||||
processor, model = load_whisper_model()
|
||||
|
||||
def transcribe_audio(audio_file, language, chunk_length=3):  # Changed default to 3 seconds
    """Transcribe an audio file with the module-level Whisper model, chunk by chunk.

    The audio is split into fixed-length windows and each window is
    transcribed independently. NOTE(review): words straddling a chunk
    boundary can therefore be cut or duplicated — confirm this is acceptable
    for short-caption use.

    Args:
        audio_file: Path to the audio file to transcribe.
        language: Language code forwarded to ``model.generate`` (e.g. "fr", "en").
        chunk_length: Window length in seconds (default 3, chosen for short
            reel-style captions).

    Returns:
        list[dict]: One entry per chunk with ``start``/``end`` times in
        seconds and the decoded ``text``; empty list if the model never loaded.
    """
    # Guard: the module-level loader may have returned (None, None).
    if model is None or processor is None:
        st.error("Whisper model is not loaded. Cannot transcribe audio.")
        return []

    # Load audio — AudioLoader resamples to 16 kHz mono, which is what the
    # Whisper feature extractor expects.
    audio_input, sr = AudioLoader.load_audio(audio_file)

    # Calculate number of samples per chunk
    samples_per_chunk = int(chunk_length * sr)

    segments = []
    # tqdm writes progress to the server console, not the Streamlit UI.
    for i in tqdm.tqdm(range(0, len(audio_input), samples_per_chunk)):
        chunk = audio_input[i:i+samples_per_chunk]

        # Pad/trim audio chunk to the model's fixed input length (done
        # internally by the feature extractor).
        chunk_input = processor.feature_extractor(chunk, sampling_rate=sr, return_tensors="pt").input_features

        # Generate token ids. NOTE(review): relies on transformers accepting
        # a `language=` kwarg on WhisperForConditionalGeneration.generate —
        # verify against the pinned transformers version.
        predicted_ids = model.generate(chunk_input, language=language)

        # Decode token ids to text (batch of 1 → take element 0 below).
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Chunk timing derived purely from sample offsets; the final chunk's
        # end is clamped to the real audio duration.
        start_time = i / sr
        end_time = min((i + samples_per_chunk) / sr, len(audio_input) / sr)

        segments.append({
            "start": start_time,
            "end": end_time,
            "text": transcription[0].strip()
        })

    return segments
|
||||
|
||||
def format_srt(segments):
    """Render transcription segments as an SRT document string.

    Empty-text segments are dropped. Fix over the previous version: the cue
    counter only advances for segments that are actually emitted, so the SRT
    indices stay consecutive (1, 2, 3, ...) — previously ``enumerate``
    consumed an index for skipped segments, leaving gaps, which some players
    and validators reject.

    Args:
        segments: Iterable of dicts with ``start``/``end`` (seconds, floats)
            and ``text`` keys, as produced by ``transcribe_audio``.

    Returns:
        str: SRT-formatted captions ("" if no non-empty segments).
    """
    srt_content = ""
    cue_number = 1
    for segment in tqdm.tqdm(segments):
        text = segment['text'].strip()
        if not text:
            # Skip without consuming a cue number — indices must be consecutive.
            continue
        start_time = format_timestamp(segment['start'])
        end_time = format_timestamp(segment['end'])
        srt_content += f"{cue_number}\n{start_time} --> {end_time}\n{text}\n\n"
        cue_number += 1
    return srt_content
|
||||
|
||||
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative duration in seconds (float or int).

    Returns:
        str: Zero-padded timestamp, milliseconds truncated (not rounded).
    """
    secs_in_minute = seconds % 60
    millis = int((secs_in_minute - int(secs_in_minute)) * 1000)
    hrs = int(seconds // 3600)
    mins = int((seconds % 3600) // 60)
    return "{:02d}:{:02d}:{:02d},{:03d}".format(hrs, mins, int(secs_in_minute), millis)
|
||||
|
||||
# Add this helper class for audio loading
|
||||
class AudioLoader:
    """Helper for loading audio files as waveforms suitable for Whisper."""

    @staticmethod
    def load_audio(file_path):
        """Load an audio file, resampled to 16 kHz mono.

        Args:
            file_path: Path to any audio format librosa can decode.

        Returns:
            tuple: (waveform ndarray, sample rate == 16000).
        """
        # Imported lazily so the module can be imported without librosa
        # until transcription is actually requested.
        import librosa

        waveform, sample_rate = librosa.load(file_path, sr=16000)
        return waveform, sample_rate
|
||||
|
||||
def burn_subtitles(video_path, srt_content):
    """Hard-burn SRT captions into a video with ffmpeg, preserving the audio.

    Pipeline: (1) write ``srt_content`` to a temp .srt file, (2) extract the
    source audio to AAC, (3) re-encode the video with the ``subtitles``
    filter (audio stripped), (4) recombine the subtitled video with the
    extracted audio, (5) sanity-check the result has both streams.

    Args:
        video_path: Path to the source video file.
        srt_content: Captions in SRT format (UTF-8 text).

    Returns:
        str | None: Path of ``<video>_with_captions.mp4`` on success,
        or None on any failure (error shown in the Streamlit UI).
    """
    # delete=False so the path survives the `with`; cleaned up in `finally`.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.srt') as temp_srt:
        temp_srt.write(srt_content.encode('utf-8'))
        temp_srt_path = temp_srt.name

    # All intermediates live next to the source video.
    output_path = os.path.splitext(video_path)[0] + '_with_captions.mp4'
    temp_video_path = os.path.splitext(video_path)[0] + '_temp_video.mp4'
    temp_audio_path = os.path.splitext(video_path)[0] + '_temp_audio.aac'

    try:
        # Extract video metadata
        probe = ffmpeg.probe(video_path)
        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)

        if video_stream is None:
            raise ValueError("No video stream found in the input file.")

        # Original dimensions, reused so the subtitle pass keeps the frame size.
        width = int(video_stream['width'])
        height = int(video_stream['height'])

        # Extract audio
        ffmpeg.input(video_path).output(temp_audio_path, acodec='aac', audio_bitrate='128k').overwrite_output().run(capture_stdout=True, capture_stderr=True)

        # Process video with subtitles (without audio).
        # ASS force_style string applied through the subtitles filter;
        # BorderStyle=3 draws an opaque box behind each line.
        subtitle_style = (
            'Fontname=Arial,Fontsize=18,'
            'PrimaryColour=&HFFFFFF&,'
            'OutlineColour=&H000000&,'
            'BorderStyle=3,'
            'Outline=1,'
            'Shadow=1,'
            'MarginV=20'
        )
        # an=None → `-an`, i.e. drop audio in this intermediate pass.
        ffmpeg.input(video_path).filter(
            'subtitles',
            temp_srt_path,
            force_style=subtitle_style
        ).output(
            temp_video_path,
            vcodec='libx264',
            video_bitrate='2000k',
            an=None,
            s=f'{width}x{height}'
        ).overwrite_output().run(capture_stdout=True, capture_stderr=True)

        # Combine video with subtitles and original audio.
        # NOTE(review): concat with v=1,a=1 is used here to mux one video
        # stream with one audio stream; a plain two-input output() would be
        # the more conventional mux — confirm A/V sync on long clips.
        ffmpeg.concat(
            ffmpeg.input(temp_video_path),
            ffmpeg.input(temp_audio_path),
            v=1,
            a=1
        ).output(output_path, vcodec='libx264', acodec='aac').overwrite_output().run(capture_stdout=True, capture_stderr=True)

        # Check if the output file was created and has both video and audio streams
        if os.path.exists(output_path):
            output_probe = ffmpeg.probe(output_path)
            output_video_stream = next((stream for stream in output_probe['streams'] if stream['codec_type'] == 'video'), None)
            output_audio_stream = next((stream for stream in output_probe['streams'] if stream['codec_type'] == 'audio'), None)

            if output_video_stream is None or output_audio_stream is None:
                raise ValueError("Output file is missing video or audio stream.")
        else:
            raise FileNotFoundError("Output file was not created.")

    except (ffmpeg.Error, ValueError, FileNotFoundError) as e:
        # NOTE(review): for ffmpeg.Error, str(e) omits the captured stderr
        # (e.stderr holds the useful diagnostics) — consider surfacing it.
        st.error(f"An error occurred while burning subtitles: {str(e)}")
        return None
    finally:
        # Intermediates are removed on both success and failure paths;
        # the .srt always exists at this point, the others may not.
        os.unlink(temp_srt_path)
        if os.path.exists(temp_video_path):
            os.unlink(temp_video_path)
        if os.path.exists(temp_audio_path):
            os.unlink(temp_audio_path)

    return output_path
|
||||
|
||||
def convert_to_web_compatible(input_path):
    """Re-encode a video to H.264/AAC MP4 so browsers can play it inline.

    Args:
        input_path: Path to the source video.

    Returns:
        str | None: Path of ``<input>_web.mp4`` on success, or None if
        ffmpeg failed (error shown in the Streamlit UI).
    """
    output_path = os.path.splitext(input_path)[0] + '_web.mp4'
    # Build the pipeline step by step instead of one long chain.
    source = ffmpeg.input(input_path)
    encoded = source.output(output_path, vcodec='libx264', acodec='aac',
                            video_bitrate='1000k', audio_bitrate='128k')
    try:
        encoded.overwrite_output().run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # ffmpeg's captured stderr carries the actual diagnostic.
        st.error(f"An error occurred while converting the video: {e.stderr.decode()}")
        return None
    return output_path
|
||||
|
||||
st.title("Reel Caption Maker")

# Session state keeps temp-file paths alive across Streamlit reruns so the
# upload is not re-converted on every widget interaction.
if 'temp_video_path' not in st.session_state:
    st.session_state.temp_video_path = None
if 'web_compatible_video_path' not in st.session_state:
    st.session_state.web_compatible_video_path = None

uploaded_file = st.file_uploader("Choose a video file", type=["mp4", "mov", "avi"])

if uploaded_file is not None:
    # Save the uploaded file to a temporary location if not already done
    # (also re-saves if a previous run deleted the temp file).
    if st.session_state.temp_video_path is None or not os.path.exists(st.session_state.temp_video_path):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_video:
            temp_video.write(uploaded_file.read())
            st.session_state.temp_video_path = temp_video.name

        # Convert the video to web-compatible format (H.264/AAC for st.video).
        st.session_state.web_compatible_video_path = convert_to_web_compatible(st.session_state.temp_video_path)

    # Create two columns for layout: player on the left, captions on the right.
    col1, col2 = st.columns([2, 3])

    with col1:
        st.subheader("Video Player")
        # Display the web-compatible video
        if st.session_state.web_compatible_video_path:
            st.video(st.session_state.web_compatible_video_path)
        else:
            st.error("Failed to convert video to web-compatible format.")

    with col2:
        st.subheader("Captions")
        language = st.selectbox("Select video language", ["French", "English"])
        lang_code = "fr" if language == "French" else "en"

        if st.button("Generate Captions"):
            with st.spinner("Generating captions..."):
                # Pull the audio track out with moviepy, then transcribe it.
                # NOTE(review): "temp_audio.wav" is a fixed relative path —
                # concurrent sessions would clobber each other; confirm
                # single-user deployment or switch to tempfile.
                video = VideoFileClip(st.session_state.temp_video_path)
                audio = video.audio
                audio.write_audiofile("temp_audio.wav")

                segments = transcribe_audio("temp_audio.wav", lang_code, chunk_length=3)
                srt_content = format_srt(segments)

                st.session_state.srt_content = srt_content
                st.session_state.temp_audio_path = "temp_audio.wav"  # Store the audio path

                video.close()

        if 'srt_content' in st.session_state:
            # Let the user fix up the generated captions before burning.
            edited_srt = st.text_area("Edit Captions (SRT format)", st.session_state.srt_content, height=300)

            if st.button("Burn Captions and Download"):
                with st.spinner("Burning captions onto video..."):
                    output_path = burn_subtitles(st.session_state.temp_video_path, edited_srt)

                if output_path:
                    with open(output_path, "rb") as file:
                        st.download_button(
                            label="Download Video with Captions",
                            data=file,
                            file_name="video_with_captions.mp4",
                            mime="video/mp4"
                        )

                    # Clean up the burned output and the extracted audio.
                    # NOTE(review): os.remove runs before the membership
                    # check below — if a rerun cleared temp_audio_path this
                    # would raise; confirm ordering is intentional.
                    os.remove(output_path)
                    os.remove(st.session_state.temp_audio_path)  # Remove the temporary audio file
                    if 'temp_audio_path' in st.session_state:
                        del st.session_state.temp_audio_path  # Remove the audio path from session state
                else:
                    st.error("Failed to burn captions onto the video.")

                # NOTE(review): deleting the uploaded temp video here forces
                # the top-of-script branch to re-save and re-convert on the
                # next rerun — presumably intentional cleanup; verify.
                os.remove(st.session_state.temp_video_path)
|
78
requirements.txt
Normal file
78
requirements.txt
Normal file
|
@ -0,0 +1,78 @@
|
|||
altair==5.4.1
|
||||
attrs==24.2.0
|
||||
audioread==3.0.1
|
||||
blinker==1.8.2
|
||||
cachetools==5.5.0
|
||||
certifi==2024.8.30
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.0
|
||||
click==8.1.7
|
||||
decorator==4.4.2
|
||||
ffmpeg-python==0.2.0
|
||||
filelock==3.16.1
|
||||
fsspec==2024.9.0
|
||||
future==1.0.0
|
||||
gitdb==4.0.11
|
||||
GitPython==3.1.43
|
||||
huggingface-hub==0.25.2
|
||||
idna==3.10
|
||||
imageio==2.36.0
|
||||
imageio-ffmpeg==0.5.1
|
||||
Jinja2==3.1.4
|
||||
joblib==1.4.2
|
||||
jsonschema==4.23.0
|
||||
jsonschema-specifications==2024.10.1
|
||||
lazy_loader==0.4
|
||||
librosa==0.10.2.post1
|
||||
llvmlite==0.43.0
|
||||
markdown-it-py==3.0.0
|
||||
MarkupSafe==3.0.1
|
||||
mdurl==0.1.2
|
||||
moviepy==1.0.3
|
||||
mpmath==1.3.0
|
||||
msgpack==1.1.0
|
||||
narwhals==1.9.3
|
||||
networkx==3.4.1
|
||||
numba==0.60.0
|
||||
numpy==2.0.2
|
||||
packaging==24.1
|
||||
pandas==2.2.3
|
||||
pillow==10.4.0
|
||||
platformdirs==4.3.6
|
||||
pooch==1.8.2
|
||||
proglog==0.1.10
|
||||
protobuf==5.28.2
|
||||
pyarrow==17.0.0
|
||||
pycparser==2.22
|
||||
pydeck==0.9.1
|
||||
Pygments==2.18.0
|
||||
python-dateutil==2.9.0.post0
|
||||
pytz==2024.2
|
||||
PyYAML==6.0.2
|
||||
referencing==0.35.1
|
||||
regex==2024.9.11
|
||||
requests==2.32.3
|
||||
rich==13.9.2
|
||||
rpds-py==0.20.0
|
||||
safetensors==0.4.5
|
||||
scikit-learn==1.5.2
|
||||
scipy==1.14.1
|
||||
setuptools==75.1.0
|
||||
six==1.16.0
|
||||
smmap==5.0.1
|
||||
soundfile==0.12.1
|
||||
soxr==0.5.0.post1
|
||||
streamlit==1.39.0
|
||||
sympy==1.13.3
|
||||
tenacity==9.0.0
|
||||
threadpoolctl==3.5.0
|
||||
tokenizers==0.20.1
|
||||
toml==0.10.2
|
||||
torch==2.4.1
|
||||
tornado==6.4.1
|
||||
tqdm==4.66.5
|
||||
transformers==4.45.2
|
||||
typing_extensions==4.12.2
|
||||
tzdata==2024.2
|
||||
urllib3==2.2.3
|
||||
whisper==1.1.10
|
Loading…
Reference in a new issue