🚀 Add feature: Loading data into Milvus is now working
parent f9e5a6e013, commit 64832e2989
23 changed files with 354 additions and 109 deletions
@@ -1,5 +1,6 @@
import logging
import os
+import sys

import dotenv
from minio import Minio

@@ -8,6 +9,13 @@ from app.models import AvailableSource, AvailableSourcesResponse

dotenv.load_dotenv()

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    stream=sys.stdout
+)
+
logger = logging.getLogger("base_logger")

available_sources = AvailableSourcesResponse(

@@ -99,3 +107,8 @@ minio_client = Minio(
secret_key=minio_secret_key,
secure=minio_secure
)
+
+ollama_url = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")
+
+embedding_model_name = os.environ.get("OLLAMA_EMBEDDING_MODEL_NAME",
+                                      "snowflake-arctic-embed2")
@@ -16,9 +16,9 @@ class FacebookBusinessPostsConverter(BaseConverter):
posts_medias = []
for post in self.datadict:
data_post_items = post['data']
-texte_post_list = [item['post'] for item in data_post_items if
+content_post_list = [item['post'] for item in data_post_items if
item.get('post')]
-texte = "\n".join(texte_post_list)
+content = "\n".join(content_post_list)

for attachment in post['attachments']:
if attachment.get('data'):

@@ -27,7 +27,7 @@ class FacebookBusinessPostsConverter(BaseConverter):
media = data_item['media']
posts_medias.append({
"chemin": [media["uri"]],
-"texte": texte,
+"content": content,
"creation_timestamp": media[
"creation_timestamp"]
})
@@ -4,9 +4,8 @@ from typing import Union, List, Dict
import pandas as pd

from app.config import logger
from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
from app.convert.base_converter import BaseConverter
from app.convert.utils.content_from_file import content_from_file
from app.convert.utils.convert_encoding_meta import convert_encoding_meta


@@ -31,7 +30,7 @@ class FacebookCommentsConverter(BaseConverter):
if data_item.get('comment'):
comment_data = data_item['comment']
facebook_comments.append({
-"texte": comment_data["comment"],
+"content": comment_data["comment"],
"creation_timestamp": comment_data["timestamp"]
})
@@ -1,12 +1,11 @@
import json
from typing import Union, List, Dict

import pandas as pd
-import json

from app.config import logger
from app.models import ConversionResponse
from app.convert.utils.content_from_file import content_from_file
from app.convert.base_converter import BaseConverter
from app.convert.utils.content_from_file import content_from_file
from app.convert.utils.convert_encoding_meta import convert_encoding_meta


@@ -31,7 +30,7 @@ class FacebookPostsConverter(BaseConverter):

def rename_columns(self) -> None:
self.df.rename(columns={
-"description": "texte",
+"description": "content",
"uri": "chemin"
}, inplace=True)
@@ -4,10 +4,9 @@ from typing import Union, List, Dict
import pandas as pd

from app.config import logger
from app.convert.utils.convert_encoding_meta import convert_encoding_meta
from app.models import ConversionResponse
from app.convert.utils.content_from_file import content_from_file
from app.convert.base_converter import BaseConverter
from app.convert.utils.content_from_file import content_from_file
from app.convert.utils.convert_encoding_meta import convert_encoding_meta


class InstagramCommentsConverter(BaseConverter):

@@ -29,7 +28,7 @@ class InstagramCommentsConverter(BaseConverter):
[]) + self.datadict.get(
'post_comments_1', []):
ig_comments.append({
-"texte": comment['string_map_data']['Comment']['value'],
+"content": comment['string_map_data']['Comment']['value'],
'creation_timestamp': int(
comment['string_map_data']['Time']['timestamp']),
'index': self.df.index,
@@ -19,7 +19,7 @@ class InstagramPostsConverter(BaseConverter):
media = medias[0]
posts_medias.append({
"chemin": [media["uri"]],
-"texte": media["title"],
+"content": media["title"],
"creation_timestamp": media["creation_timestamp"]
})
else:

@@ -28,7 +28,7 @@ class InstagramPostsConverter(BaseConverter):
list_uris = [media['uri'] for media in medias]
posts_medias.append({
"chemin": list_uris,
-"texte": title,
+"content": title,
"creation_timestamp": creation_timestamp
})
@@ -1,13 +1,12 @@
import json
import datetime
import json
from typing import Union, List, Dict

import pandas as pd

from app.config import logger
from app.models import ConversionResponse
from app.convert.utils.content_from_file import content_from_file
from app.convert.base_converter import BaseConverter
from app.convert.utils.content_from_file import content_from_file
from app.convert.utils.convert_encoding_meta import convert_encoding_meta


@@ -34,7 +33,7 @@ class InstagramReelsConverter(BaseConverter):

def rename_columns(self) -> None:
self.df.rename(columns={
-"title": "texte",
+"title": "content",
"uri": "chemin"
}, inplace=True)
@@ -1,13 +1,12 @@
import json
import datetime
import json
from typing import Dict, Union, List

import pandas as pd

from app.config import logger
from app.models import ConversionResponse
from app.convert.utils.content_from_file import content_from_file
from app.convert.base_converter import BaseConverter
from app.convert.utils.content_from_file import content_from_file
from app.convert.utils.convert_encoding_meta import convert_encoding_meta


@@ -33,7 +32,7 @@ class InstagramStoriesConverter(BaseConverter):

def rename_columns(self) -> None:
self.df.rename(columns={
-"title": "texte",
+"title": "content",
"uri": "chemin"
}, inplace=True)
@@ -1,11 +1,11 @@
import pandas as pd
import datetime
from typing import Dict, Union, List

import pandas as pd

from app.config import logger
from app.models import ConversionResponse
from app.convert.utils.content_from_file import content_from_file
from app.convert.base_converter import BaseConverter
from app.convert.utils.content_from_file import content_from_file


class LinkedInCommentsConverter(BaseConverter):

@@ -37,11 +37,11 @@ class LinkedInCommentsConverter(BaseConverter):
def rename_columns(self) -> None:
self.df.rename(columns={
"Link": "url",
-"Message": "texte"
+"Message": "content"
}, inplace=True)

def clean_data(self) -> None:
-self.df["texte"] = self.df["texte"].apply(lambda x: str(x))
+self.df["content"] = self.df["content"].apply(lambda x: str(x))
self.df["chemin"] = ""
self.df.fillna(value="", inplace=True)
@@ -3,8 +3,8 @@ from typing import Dict, Union, List

import pandas as pd

from app.convert.utils.content_from_file import content_from_file
from app.convert.base_converter import BaseConverter
from app.convert.utils.content_from_file import content_from_file


class LinkedInSharesConverter(BaseConverter):

@@ -26,7 +26,7 @@ class LinkedInSharesConverter(BaseConverter):
def rename_columns(self) -> None:
self.df = self.df.rename(columns={
"ShareLink": "uri",
-"ShareCommentary": "texte",
+"ShareCommentary": "content",
"Date": "creation_timestamp"
})

@@ -34,8 +34,8 @@ class LinkedInSharesConverter(BaseConverter):
"""Clean and preprocess the DataFrame."""
self.df = self.df.fillna("")
self.df = self.df.drop_duplicates(
-subset=["texte", "creation_timestamp"])
-self.df = self.df[self.df["texte"].str.strip() != ""]
+subset=["content", "creation_timestamp"])
+self.df = self.df[self.df["content"].str.strip() != ""]


def convert_linkedin_shares_csv(content: Union[str, bytes]) -> List[Dict]:
@@ -53,9 +53,15 @@ class GenerateResponse(BaseModel):


class ImportRequest(BaseModel):
-type: str
-data: str
+source_type: str
+object_name: str


class ImportResponse(BaseModel):
status: str
message: str
task_id: str = None


class AvailableCollectionsResponse(BaseModel):
collections: List[str]
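The reworked ImportRequest feeds the new /import/ endpoint introduced further down: instead of posting raw data, the caller names a converted file that the convert router has already written to MinIO. A minimal sketch of such a call, assuming the API is reachable on localhost:8000; the host, port, and object name are illustrative, not part of this commit:

```python
import requests

# Hypothetical values: adjust the host/port and the MinIO object name to the actual deployment.
payload = {"source_type": "facebook_posts", "object_name": "output/facebook_posts.json"}

response = requests.post("http://localhost:8000/import/", json=payload)
print(response.json())  # e.g. {"status": "success", "message": "Import completed successfully", "task_id": None}
```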
@@ -1,7 +1,7 @@
import datetime
import json
import os
-from typing import Dict, Union, List
+from typing import Dict, List

from fastapi import APIRouter, HTTPException

@@ -43,6 +43,7 @@ from app.convert.convert_youtube_shorts_video import (
)
from app.convert.convert_youtube_video_video import convert_youtube_video_video
from app.models import ConversionRequest, ConversionResponse
+from app.routers.utils.read_content_from_minio import read_content_from_minio

convert_router = APIRouter(prefix="/convert", tags=["Convert"])

@@ -127,53 +128,6 @@ def generate_temp_file(data: List[Dict], source_type: str) -> str:

return tmp_filename

-def read_content_from_minio(request: ConversionRequest) -> Union[str, bytes]:
-"""
-Read content from MinIO storage based on the request filename.
-
-Args:
-request: The conversion request containing the filename
-
-Returns:
-The file content as string (for text files) or bytes (for binary files)
-
-Raises:
-HTTPException: If the file cannot be read or doesn't exist
-"""
-# Check if filename exists
-if not request.filename:
-logger.error("Filename is empty or invalid")
-raise HTTPException(
-status_code=400, detail="Filename is required"
-)
-
-# Read file from MinIO
-try:
-logger.info(
-f"Reading file '{request.filename}' from MinIO bucket '{minio_bucket_name}'")
-with minio_client.get_object(
-bucket_name=minio_bucket_name, object_name=request.filename
-) as response:
-content_type = response.headers.get("content-type", "")
-logger.debug(f"File content type: {content_type}")
-
-if content_type.startswith("text/"):
-# Read as text (UTF-8)
-content = response.read().decode("utf-8")
-logger.debug(f"Read {len(content)} characters from text file")
-else:
-# Read as binary
-content = response.read()
-logger.debug(f"Read {len(content)} bytes from binary file")
-
-return content
-except Exception as e:
-error_msg = f"Error reading file '{request.filename}' from MinIO: {e!s}"
-logger.error(error_msg)
-raise HTTPException(
-status_code=500, detail=error_msg
-) from e
-

def save_to_minio(data: List[Dict], source_type: str) -> str:
"""

@@ -197,7 +151,7 @@ def save_to_minio(data: List[Dict], source_type: str) -> str:
f"Uploading '{tmp_filename}' to MinIO bucket '{minio_bucket_name}'")
minio_client.fput_object(
bucket_name=minio_bucket_name,
-object_name=tmp_filename,
+object_name="output/" + tmp_filename,
file_path=tmp_filename
)

@@ -238,7 +192,7 @@ def convert_data(request: ConversionRequest):
f"Processing conversion request for {request.source_type} in {request.source_format} format")

# Read content from MinIO
-content = read_content_from_minio(request)
+content = read_content_from_minio(request.filename)

# Check if source and format are supported
if request.source_type not in CONVERTERS:
@@ -1,26 +1,123 @@
-from app.config import available_sources, logger
-from app.models import AvailableSourcesResponse, ImportRequest, ImportResponse
-from fastapi import APIRouter
+import json
+import traceback

+from fastapi import APIRouter, HTTPException
+from pymilvus import connections, Collection, FieldSchema, CollectionSchema, \
+DataType, utility

+from app.config import logger
+from app.models import ImportRequest, ImportResponse, \
+AvailableCollectionsResponse
+from app.routers.utils.generate_embeddings import generate_embeddings
+from app.routers.utils.read_content_from_minio import read_content_from_minio

import_router = APIRouter(prefix="/import", tags=["Import"])


@import_router.post("/", response_model=ImportResponse)
-def import_data(request: ImportRequest):
"""
Import data (e.g., text, files, or structured data).
"""
-logger.info(f"Receiver importation request: {request.type}")
-return ...
+async def import_data(request: ImportRequest):
+try:
+logger.info(f"Starting import process for {request.source_type}")
+
+# Check Milvus connection
+try:
+connections.connect("default", host="milvus", port="19530")
+logger.info("Successfully connected to Milvus")
+except Exception as e:
+logger.error(f"Failed to connect to Milvus: {str(e)}")
+return ImportResponse(status="error",
+message="Failed to connect to Milvus")
+
+# Fetch data from MinIO
+try:
+data = read_content_from_minio(request.object_name)
+logger.info(
+f"Successfully fetched data from MinIO: {request.object_name}")
+except Exception as e:
+logger.error(f"Failed to fetch data from MinIO: {str(e)}")
+return ImportResponse(status="error",
+message="Failed to fetch data from MinIO")
+
+# Process data
+processed_data = json.loads(data)
+logger.info("Data processed successfully")
+
+# Generate embeddings and insert into Milvus
+collection_name = f"{request.source_type}_collection"
+if not utility.has_collection(collection_name):
+create_collection(collection_name)
+
+collection = Collection(collection_name)
+
+total_items = len(processed_data)
+for i, item in enumerate(processed_data, 1):
+try:
+item["embedding"] = generate_embeddings(item)
+filtered_item = {
+"content": item.get("content", ""),
+"embedding": item["embedding"],
+"creation_timestamp": int(
+item.get("creation_timestamp", 0)),
+"index": item.get("index", ""),
+"type": item.get("type", ""),
+"network": item.get("network", ""),
+"url": item.get("url", "")
+}
+_ = collection.insert([filtered_item])
+logger.info(
+f"Inserted item {i}/{total_items} into Milvus collection {collection_name}")
+except Exception as e:
+logger.error(f"Failed to process item {i}: {str(e)}")
+
+logger.info(f"Import completed for {request.source_type}")
+return ImportResponse(status="success",
+message="Import completed successfully")
+
+except Exception as e:
+logger.error(f"Unexpected error during import: {str(e)}")
+logger.error(traceback.format_exc())
+return ImportResponse(status="error",
+message=f"Unexpected error: {str(e)}")


-@import_router.get(
-"/available_sources", response_model=AvailableSourcesResponse
-)
-def get_available_sources():
-"""
-Get available sources from database
-:return: Available sources in an AvailableSourcesResponse object
-"""
-logger.info("Get available sources from database")
-return available_sources
+def create_collection(collection_name: str):
+fields = [
+FieldSchema(name="id", dtype=DataType.INT64, is_primary=True,
+auto_id=True),
+FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
+FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1024),
+FieldSchema(name="creation_timestamp", dtype=DataType.INT64),
+FieldSchema(name="index", dtype=DataType.VARCHAR, max_length=255),
+FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=255),
+FieldSchema(name="network", dtype=DataType.VARCHAR, max_length=255),
+FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=2083),
+]
+schema = CollectionSchema(fields, "A collection for storing embeddings")
+collection = Collection(collection_name, schema)
+
+index_params = {
+"metric_type": "L2",
+"index_type": "IVF_FLAT",
+"params": {"nlist": 1024}
+}
+collection.create_index("embedding", index_params)
+logger.info(f"Created new collection: {collection_name}")
+
+
+@import_router.get("/available_collections",
+response_model=AvailableCollectionsResponse)
+def get_available_collections():
+logger.info("Getting available collections from Milvus")
+
+try:
+if not utility.has_collection("default"):
+connections.connect("default", host="milvus", port="19530")
+
+collections = utility.list_collections()
+
+logger.info(f"Found {len(collections)} collections")
+return AvailableCollectionsResponse(collections=collections)
+
+except Exception as e:
+logger.error(f"Error getting collections from Milvus: {str(e)}")
+raise HTTPException(status_code=500,
+detail=f"Error getting collections from Milvus: {str(e)}")
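Once an import has filled a {source_type}_collection, the stored vectors can be queried with the same L2 metric the IVF_FLAT index above was built with. A rough pymilvus sketch of such a lookup; this is not part of the commit, and the collection name and query text are made-up examples:

```python
from pymilvus import Collection, connections

from app.routers.utils.generate_embeddings import generate_embeddings

connections.connect("default", host="milvus", port="19530")

collection = Collection("facebook_posts_collection")  # hypothetical "{source_type}_collection"
collection.load()  # the collection must be loaded into memory before searching

results = collection.search(
    data=[generate_embeddings("birthday party photos")],
    anns_field="embedding",
    param={"metric_type": "L2", "params": {"nprobe": 16}},
    limit=5,
    output_fields=["content", "url"],
)
for hit in results[0]:
    print(hit.distance, hit.entity.get("content"))
```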
backend/app/routers/utils/__init__.py (new file, 0 lines)
backend/app/routers/utils/generate_embeddings.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import json

import requests

from app.config import ollama_url, embedding_model_name, logger


def generate_embeddings(content):
    # Convert content to string if it's not already
    if not isinstance(content, str):
        try:
            content = json.dumps(content)
        except Exception as e:
            logger.error(
                f"Error converting content to string: {str(e)}. Defaulting to string.")
            content = str(content)

    logger.info(
        f"Generating embeddings for content: {content[:100]}...")  # Log first 100 chars
    try:
        response = requests.post(f"{ollama_url}/api/embed", json={
            "model": embedding_model_name,
            "input": content
        })
        response.raise_for_status()  # Raise an exception for bad status codes
        embeddings = response.json().get('embeddings')[0]
        if embeddings:
            logger.info(
                f"Successfully generated embeddings of length {len(embeddings)}")
            return embeddings
        else:
            raise ValueError("No embeddings found in response")
    except requests.RequestException as e:
        logger.error(f"Error making request to Ollama API: {str(e)}")
        logger.error(
            f"Response content: {e.response.text if e.response else 'No response'}")
        raise
    except json.JSONDecodeError:
        logger.error(f"Error decoding JSON response: {e.response.text}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error generating embeddings: {str(e)}")
        raise
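For context, a minimal sketch of calling this helper, assuming the Ollama server behind OLLAMA_URL is running and the configured embedding model has been pulled; the input dict is an example, and the Milvus schema above expects 1024-dimensional vectors:

```python
from app.routers.utils.generate_embeddings import generate_embeddings

# Hypothetical input; any non-string content is JSON-dumped before being sent to /api/embed.
vector = generate_embeddings({"content": "Exemple de publication Facebook"})
print(len(vector))  # should match the dim=1024 declared in create_collection()
```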
backend/app/routers/utils/read_content_from_minio.py (new file, 54 lines)
@@ -0,0 +1,54 @@
from typing import Union

from fastapi import HTTPException

from app.config import logger, minio_bucket_name, minio_client


def read_content_from_minio(filename: str) -> Union[str, bytes]:
    """
    Read content from MinIO storage based on the request filename.

    Args:
        request: The conversion request containing the filename

    Returns:
        The file content as string (for text files) or bytes (for binary files)

    Raises:
        HTTPException: If the file cannot be read or doesn't exist
    :param filename:
    """
    # Check if filename exists
    if not filename:
        logger.error("Filename is empty or invalid")
        raise HTTPException(
            status_code=400, detail="Filename is required"
        )

    # Read file from MinIO
    try:
        logger.info(
            f"Reading file '{filename}' from MinIO bucket '{minio_bucket_name}'")
        with minio_client.get_object(
                bucket_name=minio_bucket_name, object_name=filename
        ) as response:
            content_type = response.headers.get("content-type", "")
            logger.debug(f"File content type: {content_type}")

            if content_type.startswith("text/"):
                # Read as text (UTF-8)
                content = response.read().decode("utf-8")
                logger.debug(f"Read {len(content)} characters from text file")
            else:
                # Read as binary
                content = response.read()
                logger.debug(f"Read {len(content)} bytes from binary file")

        return content
    except Exception as e:
        error_msg = f"Error reading file '{filename}' from MinIO: {e!s}"
        logger.error(error_msg)
        raise HTTPException(
            status_code=500, detail=error_msg
        ) from e
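To illustrate the content-type handling, a small hypothetical usage sketch; the object names are examples, both must already exist in the configured bucket, and the str/bytes split depends on the content type MinIO reports for each object:

```python
from app.routers.utils.read_content_from_minio import read_content_from_minio

# A CSV export typically stored with a text/* content type comes back as str.
csv_text = read_content_from_minio("Shares.csv")
# An image or other binary object comes back as bytes.
image_bytes = read_content_from_minio("photos/img_0001.jpg")
assert isinstance(csv_text, str) and isinstance(image_bytes, bytes)
```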
@@ -7,4 +7,7 @@ minio
python-dotenv
xmltodict
markdownify
-chardet
+chardet
+pymilvus
+requests
+tqdm