🚀 Add feature: use MinIO as file storage for both conversion input and output

This commit is contained in:
François Pelletier 2025-05-14 18:44:28 -04:00
parent afdfe1dbac
commit 579a3fe379
26 changed files with 16204 additions and 133 deletions

View file

@ -1,4 +1,11 @@
from app.config import logger
import datetime
import json
import os
from typing import Any, Dict, Union
from fastapi import APIRouter, HTTPException
from app.config import logger, minio_bucket_name, minio_client
from app.convert.convert_bluesky_car import convert_bluesky_car
from app.convert.convert_export_txt import convert_export_txt
from app.convert.convert_facebook_comments_json import (
@ -33,56 +40,225 @@ from app.convert.convert_youtube_shorts_video import (
)
from app.convert.convert_youtube_video_video import convert_youtube_video_video
from app.models import ConversionRequest, ConversionResponse
from fastapi import APIRouter
convert_router = APIRouter(prefix="/convert", tags=["Convert"])

# Dispatch table used by the /convert endpoint:
#   source name -> source format -> converter function.
# A source that accepts several formats lists one entry per format.
CONVERTERS = {
    # LinkedIn exports
    "linkedin_shares": {"csv": convert_linkedin_shares_csv},
    "linkedin_comments": {"csv": convert_linkedin_comments_csv},
    # Facebook exports
    "facebook_posts": {"json": convert_facebook_posts_json},
    "facebook_comments": {"json": convert_facebook_comments_json},
    # Instagram exports
    "instagram_posts": {"json": convert_instagram_posts_json},
    "instagram_comments": {"json": convert_instagram_comments_json},
    "instagram_stories": {
        "json": convert_instagram_stories_json,
        "image": convert_instagram_stories_image,
    },
    "instagram_reels": {
        "json": convert_instagram_reels_json,
        "video": convert_instagram_reels_video,
    },
    # Bluesky repository archive
    "bluesky": {"car": convert_bluesky_car},
    # YouTube media
    "youtube_video": {"video": convert_youtube_video_video},
    "youtube_shorts": {"video": convert_youtube_shorts_video},
    # Plain-text sources
    "markdown": {"txt": convert_markdown_txt},
    "export": {"txt": convert_export_txt},
}
def read_content_from_minio(request: ConversionRequest) -> Union[str, bytes]:
    """
    Read content from MinIO storage based on the request filename.

    The object named ``request.filename`` is fetched from the bucket
    ``minio_bucket_name``. When the response's ``content-type`` header starts
    with ``text/`` the payload is decoded as UTF-8; otherwise the raw bytes
    are returned.

    Args:
        request: The conversion request containing the filename

    Returns:
        The file content as string (for text files) or bytes (for binary files)

    Raises:
        HTTPException: 400 if the filename is empty, 404 if the storage
            backend reports the object as missing (error code ``NoSuchKey``),
            500 for any other failure while reading or decoding the object
    """
    # Reject the request before touching storage when no object name is given.
    if not request.filename:
        logger.error("Filename is empty or invalid")
        raise HTTPException(
            status_code=400, detail="Filename is required"
        )

    response = None
    try:
        logger.info(
            f"Reading file '{request.filename}' from MinIO bucket '{minio_bucket_name}'")
        response = minio_client.get_object(
            bucket_name=minio_bucket_name, object_name=request.filename
        )

        content_type = response.headers.get("content-type", "")
        logger.debug(f"File content type: {content_type}")

        raw = response.read()
        if content_type.startswith("text/"):
            # Read as text (UTF-8)
            content = raw.decode("utf-8")
            logger.debug(f"Read {len(content)} characters from text file")
            return content

        # Anything that is not declared as text is handed back untouched.
        logger.debug(f"Read {len(raw)} bytes from binary file")
        return raw
    except Exception as e:
        error_msg = f"Error reading file '{request.filename}' from MinIO: {e!s}"
        logger.error(error_msg)
        # The MinIO client's S3 errors expose the S3 error code as ``code``;
        # a missing object is a client-side problem, not a server fault.
        status_code = 404 if getattr(e, "code", None) == "NoSuchKey" else 500
        raise HTTPException(
            status_code=status_code, detail=error_msg
        ) from e
    finally:
        # The MinIO SDK asks callers to both close the response and release
        # the connection so it can go back to the HTTP connection pool.
        if response is not None:
            response.close()
            response.release_conn()
def save_to_minio(data: Dict[str, Any], source_name: str) -> str:
    """
    Save converted data to MinIO as a JSON file.

    The data is serialized to a temporary JSON file in the current working
    directory, uploaded to the bucket ``minio_bucket_name`` under the same
    name, and the temporary file is removed afterwards whether or not the
    upload succeeded.

    Args:
        data: The data to save (must be JSON-serializable)
        source_name: The source name to use in the filename

    Returns:
        The filename of the saved file, which is also the MinIO object name

    Raises:
        HTTPException: 500 if the file cannot be written or uploaded
    """
    tmp_filename = None
    try:
        # Generate a unique filename with timestamp; ':' is replaced because
        # it is not a valid filename character on every platform.
        timestamp = datetime.datetime.now(tz=datetime.UTC).isoformat().replace(
            ":", "-")
        tmp_filename = f"{source_name}_{timestamp}.json"
        logger.info(f"Saving converted data to temporary file '{tmp_filename}'")

        # Write to temporary file
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(data, f)

        # Upload to MinIO
        logger.info(
            f"Uploading '{tmp_filename}' to MinIO bucket '{minio_bucket_name}'")
        minio_client.fput_object(
            bucket_name=minio_bucket_name,
            object_name=tmp_filename,
            file_path=tmp_filename
        )
        return tmp_filename
    except Exception as e:
        error_msg = f"Error saving converted data to MinIO: {e!s}"
        logger.error(error_msg)
        raise HTTPException(
            status_code=500, detail=error_msg
        ) from e
    finally:
        # Clean up the temporary file on every path, so a failed
        # serialization or upload does not leave it behind on disk.
        if tmp_filename is not None:
            try:
                os.remove(tmp_filename)
                logger.debug(f"Removed temporary file '{tmp_filename}'")
            except FileNotFoundError:
                # The file was never created (failure before/while opening).
                pass
            except OSError as cleanup_error:
                # Best effort only: a leftover file must not fail the request.
                logger.warning(
                    f"Failed to remove temporary file '{tmp_filename}': {cleanup_error!s}")
@convert_router.post("/", response_model=ConversionResponse)
def convert_data(request: ConversionRequest):
    """
    Convert data from a source to normalized JSON and store it in MinIO.

    The source name and format are validated against ``CONVERTERS`` first,
    so unsupported requests are rejected without any storage I/O. The input
    is then read from MinIO, passed to the matching converter, and the result
    is saved back to MinIO as a JSON file.

    Args:
        request: The conversion request containing source details
            (``source_name``, ``source_format`` and ``filename``)

    Returns:
        A ConversionResponse with status and metadata. ``converted_data`` is
        left empty; the converted payload is referenced through
        ``metadata["saved_filename"]``.

    Raises:
        HTTPException: 400 if the source name or format is not supported,
            500 if conversion or any other step fails; errors raised by the
            MinIO read/save helpers are propagated unchanged
    """
    try:
        logger.info(
            f"Processing conversion request for {request.source_name} in {request.source_format} format")

        # Check if source and format are supported
        if request.source_name not in CONVERTERS:
            error_msg = f"Unsupported source name: {request.source_name}"
            logger.error(error_msg)
            raise HTTPException(status_code=400, detail=error_msg)

        if request.source_format not in CONVERTERS[request.source_name]:
            error_msg = f"Unsupported format '{request.source_format}' for source '{request.source_name}'"
            logger.error(error_msg)
            raise HTTPException(status_code=400, detail=error_msg)

        # Get the appropriate converter function
        converter = CONVERTERS[request.source_name][request.source_format]

        # Read content from MinIO
        content = read_content_from_minio(request)

        # Convert the content
        logger.info(
            f"Converting {request.source_name} data using {converter.__name__}")
        try:
            converted_data = converter(content)
        except Exception as e:
            error_msg = f"Error during conversion: {e!s}"
            logger.error(error_msg, exc_info=True)
            raise HTTPException(status_code=500, detail=error_msg) from e

        # Count once, outside the conversion try-block, so that a result
        # without a length is not misreported as a conversion failure.
        records_count = len(converted_data) if isinstance(
            converted_data, list) else 1
        logger.info(
            f"Successfully converted data with {records_count} records")

        # Save the converted data to MinIO
        saved_filename = save_to_minio(converted_data, request.source_name)

        # Return success response
        return ConversionResponse(
            converted_data={},  # Empty dict as per original implementation
            status="ok",
            metadata={
                "source": request.source_name,
                "format": request.source_format,
                "records_count": records_count,
                "saved_filename": saved_filename
            }
        )
    except HTTPException:
        # Re-raise HTTP exceptions so their status codes reach the client
        raise
    except Exception as e:
        # Catch any other exceptions
        error_msg = f"Unexpected error during conversion process: {e!s}"
        logger.exception(error_msg)
        raise HTTPException(status_code=500, detail=error_msg) from e