From 64832e2989eed72f86f983dd7a9d784b49eb95e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?=
Date: Wed, 21 May 2025 17:21:57 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=80=20Add=20feature:=20Loading=20into?=
 =?UTF-8?q?=20Milvus=20is=20now=20working?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .env.template                                 |   4 +-
 backend/app/config.py                         |  13 ++
 .../convert_facebook_business_posts_json.py   |   6 +-
 .../convert/convert_facebook_comments_json.py |   5 +-
 .../convert/convert_facebook_posts_json.py    |   7 +-
 .../convert_instagram_comments_json.py        |   7 +-
 .../convert/convert_instagram_posts_json.py   |   4 +-
 .../convert/convert_instagram_reels_json.py   |   7 +-
 .../convert/convert_instagram_stories_json.py |   7 +-
 .../convert/convert_linkedin_comments_csv.py  |  10 +-
 .../convert/convert_linkedin_shares_csv.py    |   8 +-
 backend/app/models.py                         |  10 +-
 backend/app/routers/convert_router.py         |  54 +------
 backend/app/routers/import_router.py          | 137 +++++++++++++++---
 backend/app/routers/utils/__init__.py         |   0
 .../app/routers/utils/generate_embeddings.py  |  43 ++++++
 .../routers/utils/read_content_from_minio.py  |  54 +++++++
 backend/requirements.txt                      |   5 +-
 docker-compose.yml                            |   5 +
 import_requests_test.http                     |  49 +++++++
 minio_sample_data/transfer_to_minio.sh        |   2 +-
 requirements.txt                              |   5 +-
 test_embedding.http                           |  21 +++
 23 files changed, 354 insertions(+), 109 deletions(-)
 create mode 100644 backend/app/routers/utils/__init__.py
 create mode 100644 backend/app/routers/utils/generate_embeddings.py
 create mode 100644 backend/app/routers/utils/read_content_from_minio.py
 create mode 100644 import_requests_test.http
 create mode 100644 test_embedding.http

diff --git a/.env.template b/.env.template
index 7b5528b..cfa7f97 100644
--- a/.env.template
+++ b/.env.template
@@ -11,4 +11,6 @@ ETCD_QUOTA_BACKEND_BYTES=
 ETCD_SNAPSHOT_COUNT=
 MINIO_ROOT_USER=
 MINIO_ROOT_PASSWORD=
-ATTU_HOST_URL=
\ No newline at end of file
+ATTU_HOST_URL=
+OLLAMA_URL=
+OLLAMA_EMBEDDING_MODEL_NAME=
\ No newline at end of file
diff --git a/backend/app/config.py b/backend/app/config.py
index c0bcca3..e804078 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -1,5 +1,6 @@
 import logging
 import os
+import sys
 
 import dotenv
 from minio import Minio
@@ -8,6 +9,13 @@ from app.models import AvailableSource, AvailableSourcesResponse
 
 dotenv.load_dotenv()
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    stream=sys.stdout
+)
+
 logger = logging.getLogger("base_logger")
 
 available_sources = AvailableSourcesResponse(
@@ -99,3 +107,8 @@ minio_client = Minio(
     secret_key=minio_secret_key,
     secure=minio_secure
 )
+
+ollama_url = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")
+
+embedding_model_name = os.environ.get("OLLAMA_EMBEDDING_MODEL_NAME",
+                                      "snowflake-arctic-embed2")
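
The two new settings wire the backend to an Ollama server and select the
embedding model. A minimal sketch of the call they configure (the /api/embed
request and the "embeddings" response key mirror how generate_embeddings.py
uses them later in this patch; the URL and model are the defaults above, and
the dimension comment assumes snowflake-arctic-embed2):

    import requests

    # One embedding vector is returned per input, under the "embeddings" key.
    resp = requests.post(
        "http://host.docker.internal:11434/api/embed",
        json={"model": "snowflake-arctic-embed2", "input": "hello world"},
    )
    resp.raise_for_status()
    vector = resp.json()["embeddings"][0]
    print(len(vector))  # expected: 1024, the dim used by the Milvus schema below
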
diff --git a/backend/app/convert/convert_facebook_business_posts_json.py b/backend/app/convert/convert_facebook_business_posts_json.py
index 6147cab..fbbea23 100644
--- a/backend/app/convert/convert_facebook_business_posts_json.py
+++ b/backend/app/convert/convert_facebook_business_posts_json.py
@@ -16,9 +16,9 @@ class FacebookBusinessPostsConverter(BaseConverter):
         posts_medias = []
         for post in self.datadict:
             data_post_items = post['data']
-            texte_post_list = [item['post'] for item in data_post_items if
+            content_post_list = [item['post'] for item in data_post_items if
                                item.get('post')]
-            texte = "\n".join(texte_post_list)
+            content = "\n".join(content_post_list)
 
             for attachment in post['attachments']:
                 if attachment.get('data'):
@@ -27,7 +27,7 @@ class FacebookBusinessPostsConverter(BaseConverter):
                         media = data_item['media']
                         posts_medias.append({
                             "chemin": [media["uri"]],
-                            "texte": texte,
+                            "content": content,
                             "creation_timestamp": media[
                                 "creation_timestamp"]
                         })
diff --git a/backend/app/convert/convert_facebook_comments_json.py b/backend/app/convert/convert_facebook_comments_json.py
index 1f47fd9..3c12575 100644
--- a/backend/app/convert/convert_facebook_comments_json.py
+++ b/backend/app/convert/convert_facebook_comments_json.py
@@ -4,9 +4,8 @@ from typing import Union, List, Dict
 import pandas as pd
 
 from app.config import logger
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
+from app.convert.utils.content_from_file import content_from_file
 from app.convert.utils.convert_encoding_meta import convert_encoding_meta
 
 
@@ -31,7 +30,7 @@ class FacebookCommentsConverter(BaseConverter):
             if data_item.get('comment'):
                 comment_data = data_item['comment']
                 facebook_comments.append({
-                    "texte": comment_data["comment"],
+                    "content": comment_data["comment"],
                     "creation_timestamp": comment_data["timestamp"]
                 })
 
diff --git a/backend/app/convert/convert_facebook_posts_json.py b/backend/app/convert/convert_facebook_posts_json.py
index 16c4e5a..e223c4e 100644
--- a/backend/app/convert/convert_facebook_posts_json.py
+++ b/backend/app/convert/convert_facebook_posts_json.py
@@ -1,12 +1,11 @@
+import json
 from typing import Union, List, Dict
 
 import pandas as pd
-import json
 
 from app.config import logger
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
+from app.convert.utils.content_from_file import content_from_file
 from app.convert.utils.convert_encoding_meta import convert_encoding_meta
 
 
@@ -31,7 +30,7 @@ class FacebookPostsConverter(BaseConverter):
 
     def rename_columns(self) -> None:
         self.df.rename(columns={
-            "description": "texte",
+            "description": "content",
             "uri": "chemin"
         }, inplace=True)
 
diff --git a/backend/app/convert/convert_instagram_comments_json.py b/backend/app/convert/convert_instagram_comments_json.py
index ac65fe0..edcab23 100644
--- a/backend/app/convert/convert_instagram_comments_json.py
+++ b/backend/app/convert/convert_instagram_comments_json.py
@@ -4,10 +4,9 @@ from typing import Union, List, Dict
 import pandas as pd
 
 from app.config import logger
-from app.convert.utils.convert_encoding_meta import convert_encoding_meta
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
+from app.convert.utils.content_from_file import content_from_file
+from app.convert.utils.convert_encoding_meta import convert_encoding_meta
 
 
 class InstagramCommentsConverter(BaseConverter):
@@ -29,7 +28,7 @@ class InstagramCommentsConverter(BaseConverter):
                 []) + self.datadict.get(
             'post_comments_1', []):
             ig_comments.append({
-                "texte": comment['string_map_data']['Comment']['value'],
+                "content": comment['string_map_data']['Comment']['value'],
                 'creation_timestamp': int(
                     comment['string_map_data']['Time']['timestamp']),
                 'index': self.df.index,
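
Across the converters, the French-era column name "texte" is renamed to
"content", so every source emits the field the new Milvus schema further down
in this patch expects. Illustratively, a converted record now looks like this
(values are made up; "chemin" keeps its original name):

    record = {
        "content": "Post text goes here",       # was "texte"
        "chemin": ["media/posts/photo_1.jpg"],  # media path(s)
        "creation_timestamp": 1747857219,       # Unix epoch seconds
    }
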
diff --git a/backend/app/convert/convert_instagram_posts_json.py b/backend/app/convert/convert_instagram_posts_json.py
index 4e299ba..03039c5 100644
--- a/backend/app/convert/convert_instagram_posts_json.py
+++ b/backend/app/convert/convert_instagram_posts_json.py
@@ -19,7 +19,7 @@ class InstagramPostsConverter(BaseConverter):
                 media = medias[0]
                 posts_medias.append({
                     "chemin": [media["uri"]],
-                    "texte": media["title"],
+                    "content": media["title"],
                     "creation_timestamp": media["creation_timestamp"]
                 })
             else:
@@ -28,7 +28,7 @@ class InstagramPostsConverter(BaseConverter):
                 list_uris = [media['uri'] for media in medias]
                 posts_medias.append({
                     "chemin": list_uris,
-                    "texte": title,
+                    "content": title,
                     "creation_timestamp": creation_timestamp
                 })
 
diff --git a/backend/app/convert/convert_instagram_reels_json.py b/backend/app/convert/convert_instagram_reels_json.py
index 4cdd44b..20cce01 100644
--- a/backend/app/convert/convert_instagram_reels_json.py
+++ b/backend/app/convert/convert_instagram_reels_json.py
@@ -1,13 +1,12 @@
-import json
 import datetime
+import json
 from typing import Union, List, Dict
 
 import pandas as pd
 
 from app.config import logger
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
+from app.convert.utils.content_from_file import content_from_file
 from app.convert.utils.convert_encoding_meta import convert_encoding_meta
 
 
@@ -34,7 +33,7 @@ class InstagramReelsConverter(BaseConverter):
 
     def rename_columns(self) -> None:
         self.df.rename(columns={
-            "title": "texte",
+            "title": "content",
             "uri": "chemin"
         }, inplace=True)
 
diff --git a/backend/app/convert/convert_instagram_stories_json.py b/backend/app/convert/convert_instagram_stories_json.py
index 074c2b6..a8c6f36 100644
--- a/backend/app/convert/convert_instagram_stories_json.py
+++ b/backend/app/convert/convert_instagram_stories_json.py
@@ -1,13 +1,12 @@
-import json
 import datetime
+import json
 from typing import Dict, Union, List
 
 import pandas as pd
 
 from app.config import logger
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
+from app.convert.utils.content_from_file import content_from_file
 from app.convert.utils.convert_encoding_meta import convert_encoding_meta
 
 
@@ -33,7 +32,7 @@ class InstagramStoriesConverter(BaseConverter):
 
     def rename_columns(self) -> None:
         self.df.rename(columns={
-            "title": "texte",
+            "title": "content",
             "uri": "chemin"
         }, inplace=True)
 
diff --git a/backend/app/convert/convert_linkedin_comments_csv.py b/backend/app/convert/convert_linkedin_comments_csv.py
index 18e4641..f6fc018 100644
--- a/backend/app/convert/convert_linkedin_comments_csv.py
+++ b/backend/app/convert/convert_linkedin_comments_csv.py
@@ -1,11 +1,11 @@
-import pandas as pd
 import datetime
 from typing import Dict, Union, List
 
+import pandas as pd
+
 from app.config import logger
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
+from app.convert.utils.content_from_file import content_from_file
 
 
 class LinkedInCommentsConverter(BaseConverter):
@@ -37,11 +37,11 @@ class LinkedInCommentsConverter(BaseConverter):
     def rename_columns(self) -> None:
         self.df.rename(columns={
             "Link": "url",
-            "Message": "texte"
+            "Message": "content"
         }, inplace=True)
 
     def clean_data(self) -> None:
-        self.df["texte"] = self.df["texte"].apply(lambda x: str(x))
+        self.df["content"] = self.df["content"].apply(lambda x: str(x))
         self.df["chemin"] = ""
         self.df.fillna(value="", inplace=True)
 
diff --git a/backend/app/convert/convert_linkedin_shares_csv.py b/backend/app/convert/convert_linkedin_shares_csv.py
index 47cf1d7..d9abdc6 100644
--- a/backend/app/convert/convert_linkedin_shares_csv.py
+++ b/backend/app/convert/convert_linkedin_shares_csv.py
@@ -3,8 +3,8 @@ from typing import Dict, Union, List
 
 import pandas as pd
 
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
+from app.convert.utils.content_from_file import content_from_file
 
 
 class LinkedInSharesConverter(BaseConverter):
@@ -26,7 +26,7 @@ class LinkedInSharesConverter(BaseConverter):
     def rename_columns(self) -> None:
         self.df = self.df.rename(columns={
             "ShareLink": "uri",
-            "ShareCommentary": "texte",
+            "ShareCommentary": "content",
             "Date": "creation_timestamp"
         })
 
@@ -34,8 +34,8 @@ class LinkedInSharesConverter(BaseConverter):
         """Clean and preprocess the DataFrame."""
         self.df = self.df.fillna("")
         self.df = self.df.drop_duplicates(
-            subset=["texte", "creation_timestamp"])
-        self.df = self.df[self.df["texte"].str.strip() != ""]
+            subset=["content", "creation_timestamp"])
+        self.df = self.df[self.df["content"].str.strip() != ""]
 
 
 def convert_linkedin_shares_csv(content: Union[str, bytes]) -> List[Dict]:
diff --git a/backend/app/models.py b/backend/app/models.py
index 9f7ec35..626c480 100644
--- a/backend/app/models.py
+++ b/backend/app/models.py
@@ -53,9 +53,15 @@ class GenerateResponse(BaseModel):
 
 
 class ImportRequest(BaseModel):
-    type: str
-    data: str
+    source_type: str
+    object_name: str
 
 
 class ImportResponse(BaseModel):
     status: str
+    message: str
+    task_id: str = None
+
+
+class AvailableCollectionsResponse(BaseModel):
+    collections: List[str]
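
The reshaped models change the import contract: instead of shipping raw data
inline, the client points at an object that already sits in MinIO. A hedged
usage sketch against a local backend (the base URL matches
import_requests_test.http below; the object name is hypothetical):

    import requests

    # POST the new ImportRequest fields to the import endpoint.
    resp = requests.post("http://localhost:8080/import/", json={
        "source_type": "linkedin_shares",
        "object_name": "output/linkedin_shares_example.json",  # hypothetical
    })
    # ImportResponse carries status/message (task_id is reserved, still unset).
    print(resp.json())
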
diff --git a/backend/app/routers/convert_router.py b/backend/app/routers/convert_router.py
index f8d7f2f..3dc1406 100644
--- a/backend/app/routers/convert_router.py
+++ b/backend/app/routers/convert_router.py
@@ -1,7 +1,7 @@
 import datetime
 import json
 import os
-from typing import Dict, Union, List
+from typing import Dict, List
 
 from fastapi import APIRouter, HTTPException
 
@@ -43,6 +43,7 @@ from app.convert.convert_youtube_shorts_video import (
 )
 from app.convert.convert_youtube_video_video import convert_youtube_video_video
 from app.models import ConversionRequest, ConversionResponse
+from app.routers.utils.read_content_from_minio import read_content_from_minio
 
 convert_router = APIRouter(prefix="/convert", tags=["Convert"])
 
@@ -127,53 +128,6 @@ def generate_temp_file(data: List[Dict], source_type: str) -> str:
     return tmp_filename
 
 
-def read_content_from_minio(request: ConversionRequest) -> Union[str, bytes]:
-    """
-    Read content from MinIO storage based on the request filename.
-
-    Args:
-        request: The conversion request containing the filename
-
-    Returns:
-        The file content as string (for text files) or bytes (for binary files)
-
-    Raises:
-        HTTPException: If the file cannot be read or doesn't exist
-    """
-    # Check if filename exists
-    if not request.filename:
-        logger.error("Filename is empty or invalid")
-        raise HTTPException(
-            status_code=400, detail="Filename is required"
-        )
-
-    # Read file from MinIO
-    try:
-        logger.info(
-            f"Reading file '{request.filename}' from MinIO bucket '{minio_bucket_name}'")
-        with minio_client.get_object(
-            bucket_name=minio_bucket_name, object_name=request.filename
-        ) as response:
-            content_type = response.headers.get("content-type", "")
-            logger.debug(f"File content type: {content_type}")
-
-            if content_type.startswith("text/"):
-                # Read as text (UTF-8)
-                content = response.read().decode("utf-8")
-                logger.debug(f"Read {len(content)} characters from text file")
-            else:
-                # Read as binary
-                content = response.read()
-                logger.debug(f"Read {len(content)} bytes from binary file")
-
-            return content
-    except Exception as e:
-        error_msg = f"Error reading file '{request.filename}' from MinIO: {e!s}"
-        logger.error(error_msg)
-        raise HTTPException(
-            status_code=500, detail=error_msg
-        ) from e
-
 
 def save_to_minio(data: List[Dict], source_type: str) -> str:
     """
@@ -197,7 +151,7 @@ def save_to_minio(data: List[Dict], source_type: str) -> str:
         f"Uploading '{tmp_filename}' to MinIO bucket '{minio_bucket_name}'")
     minio_client.fput_object(
         bucket_name=minio_bucket_name,
-        object_name=tmp_filename,
+        object_name="output/" + tmp_filename,
         file_path=tmp_filename
     )
 
@@ -238,7 +192,7 @@ def convert_data(request: ConversionRequest):
         f"Processing conversion request for {request.source_type} in {request.source_format} format")
 
     # Read content from MinIO
-    content = read_content_from_minio(request)
+    content = read_content_from_minio(request.filename)
 
     # Check if source and format are supported
     if request.source_type not in CONVERTERS:
diff --git a/backend/app/routers/import_router.py b/backend/app/routers/import_router.py
index bcc9546..491d54d 100644
--- a/backend/app/routers/import_router.py
+++ b/backend/app/routers/import_router.py
@@ -1,26 +1,123 @@
-from app.config import available_sources, logger
-from app.models import AvailableSourcesResponse, ImportRequest, ImportResponse
-from fastapi import APIRouter
+import json
+import traceback
+
+from fastapi import APIRouter, HTTPException
+from pymilvus import connections, Collection, FieldSchema, CollectionSchema, \
+    DataType, utility
+
+from app.config import logger
+from app.models import ImportRequest, ImportResponse, \
+    AvailableCollectionsResponse
+from app.routers.utils.generate_embeddings import generate_embeddings
+from app.routers.utils.read_content_from_minio import read_content_from_minio
 
 import_router = APIRouter(prefix="/import", tags=["Import"])
 
-
 @import_router.post("/", response_model=ImportResponse)
-def import_data(request: ImportRequest):
-    """
-    Import data (e.g., text, files, or structured data).
-    """
-    logger.info(f"Receiver importation request: {request.type}")
-    return ...
+async def import_data(request: ImportRequest):
+    try:
+        logger.info(f"Starting import process for {request.source_type}")
+
+        # Check Milvus connection
+        try:
+            connections.connect("default", host="milvus", port="19530")
+            logger.info("Successfully connected to Milvus")
+        except Exception as e:
+            logger.error(f"Failed to connect to Milvus: {str(e)}")
+            return ImportResponse(status="error",
+                                  message="Failed to connect to Milvus")
+
+        # Fetch data from MinIO
+        try:
+            data = read_content_from_minio(request.object_name)
+            logger.info(
+                f"Successfully fetched data from MinIO: {request.object_name}")
+        except Exception as e:
+            logger.error(f"Failed to fetch data from MinIO: {str(e)}")
+            return ImportResponse(status="error",
+                                  message="Failed to fetch data from MinIO")
+
+        # Process data
+        processed_data = json.loads(data)
+        logger.info("Data processed successfully")
+
+        # Generate embeddings and insert into Milvus
+        collection_name = f"{request.source_type}_collection"
+        if not utility.has_collection(collection_name):
+            create_collection(collection_name)
+
+        collection = Collection(collection_name)
+
+        total_items = len(processed_data)
+        for i, item in enumerate(processed_data, 1):
+            try:
+                item["embedding"] = generate_embeddings(item)
+                filtered_item = {
+                    "content": item.get("content", ""),
+                    "embedding": item["embedding"],
+                    "creation_timestamp": int(
+                        item.get("creation_timestamp", 0)),
+                    "index": item.get("index", ""),
+                    "type": item.get("type", ""),
+                    "network": item.get("network", ""),
+                    "url": item.get("url", "")
+                }
+                _ = collection.insert([filtered_item])
+                logger.info(
+                    f"Inserted item {i}/{total_items} into Milvus collection {collection_name}")
+            except Exception as e:
+                logger.error(f"Failed to process item {i}: {str(e)}")
+
+        logger.info(f"Import completed for {request.source_type}")
+        return ImportResponse(status="success",
+                              message="Import completed successfully")
+
+    except Exception as e:
+        logger.error(f"Unexpected error during import: {str(e)}")
+        logger.error(traceback.format_exc())
+        return ImportResponse(status="error",
+                              message=f"Unexpected error: {str(e)}")
 
 
-@import_router.get(
-    "/available_sources", response_model=AvailableSourcesResponse
-)
-def get_available_sources():
-    """
-    Get available sources from database
-    :return: Available sources in an AvailableSourcesResponse object
-    """
-    logger.info("Get available sources from database")
-    return available_sources
+def create_collection(collection_name: str):
+    fields = [
+        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True,
+                    auto_id=True),
+        FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
+        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1024),
+        FieldSchema(name="creation_timestamp", dtype=DataType.INT64),
+        FieldSchema(name="index", dtype=DataType.VARCHAR, max_length=255),
+        FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=255),
+        FieldSchema(name="network", dtype=DataType.VARCHAR, max_length=255),
+        FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=2083),
+    ]
+    schema = CollectionSchema(fields, "A collection for storing embeddings")
+    collection = Collection(collection_name, schema)
+
+    index_params = {
+        "metric_type": "L2",
+        "index_type": "IVF_FLAT",
+        "params": {"nlist": 1024}
+    }
+    collection.create_index("embedding", index_params)
+    logger.info(f"Created new collection: {collection_name}")
+
+
+@import_router.get("/available_collections",
+                   response_model=AvailableCollectionsResponse)
+def get_available_collections():
+    logger.info("Getting available collections from Milvus")
+
+    try:
+        # Connect on demand; has_collection cannot be used as a connection
+        # check, so test the connection alias itself.
+        if not connections.has_connection("default"):
+            connections.connect("default", host="milvus", port="19530")
+
+        collections = utility.list_collections()
+
+        logger.info(f"Found {len(collections)} collections")
+        return AvailableCollectionsResponse(collections=collections)
+
+    except Exception as e:
+        logger.error(f"Error getting collections from Milvus: {str(e)}")
+        raise HTTPException(status_code=500,
+                            detail=f"Error getting collections from Milvus: {str(e)}")
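
Once a collection is populated, the IVF_FLAT/L2 index created above is what
similarity queries run against. A minimal pymilvus sketch of the read path
(not part of this patch; the collection name and query values are
illustrative, and a real query vector would come from the same Ollama model):

    from pymilvus import connections, Collection

    connections.connect("default", host="milvus", port="19530")
    collection = Collection("linkedin_shares_collection")
    collection.load()  # collections must be loaded before searching

    query_vector = [0.0] * 1024  # stand-in for a real 1024-dim embedding
    hits = collection.search(
        data=[query_vector],
        anns_field="embedding",
        param={"metric_type": "L2", "params": {"nprobe": 16}},
        limit=5,
        output_fields=["content", "url"],
    )
    for hit in hits[0]:
        print(hit.distance, hit.entity.get("content"))
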
diff --git a/backend/app/routers/utils/__init__.py b/backend/app/routers/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/app/routers/utils/generate_embeddings.py b/backend/app/routers/utils/generate_embeddings.py
new file mode 100644
index 0000000..55605f0
--- /dev/null
+++ b/backend/app/routers/utils/generate_embeddings.py
@@ -0,0 +1,43 @@
+import json
+
+import requests
+
+from app.config import ollama_url, embedding_model_name, logger
+
+
+def generate_embeddings(content):
+    # Convert content to string if it's not already
+    if not isinstance(content, str):
+        try:
+            content = json.dumps(content)
+        except Exception as e:
+            logger.error(
+                f"Error converting content to string: {str(e)}. Defaulting to string.")
+            content = str(content)
+
+    logger.info(
+        f"Generating embeddings for content: {content[:100]}...")  # Log first 100 chars
+    try:
+        response = requests.post(f"{ollama_url}/api/embed", json={
+            "model": embedding_model_name,
+            "input": content
+        })
+        response.raise_for_status()  # Raise an exception for bad status codes
+        embeddings = (response.json().get('embeddings') or [None])[0]
+        if embeddings:
+            logger.info(
+                f"Successfully generated embeddings of length {len(embeddings)}")
+            return embeddings
+        else:
+            raise ValueError("No embeddings found in response")
+    except requests.RequestException as e:
+        logger.error(f"Error making request to Ollama API: {str(e)}")
+        logger.error(
+            f"Response content: {e.response.text if e.response else 'No response'}")
+        raise
+    except json.JSONDecodeError:
+        logger.error(f"Error decoding JSON response: {response.text}")
+        raise
+    except Exception as e:
+        logger.error(f"Unexpected error generating embeddings: {str(e)}")
+        raise
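
generate_embeddings accepts anything JSON-serializable: non-string payloads
are dumped to JSON before being embedded, which is why import_data above can
pass the whole item dict. A quick sketch (assumes a running Ollama instance
with the default model pulled):

    from app.routers.utils.generate_embeddings import generate_embeddings

    vector = generate_embeddings({"content": "Bonjour le monde"})
    assert len(vector) == 1024  # must match the FLOAT_VECTOR dim in the schema
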
diff --git a/backend/app/routers/utils/read_content_from_minio.py b/backend/app/routers/utils/read_content_from_minio.py
new file mode 100644
index 0000000..e5049b8
--- /dev/null
+++ b/backend/app/routers/utils/read_content_from_minio.py
@@ -0,0 +1,54 @@
+from typing import Union
+
+from fastapi import HTTPException
+
+from app.config import logger, minio_bucket_name, minio_client
+
+
+def read_content_from_minio(filename: str) -> Union[str, bytes]:
+    """
+    Read content from MinIO storage for the given filename.
+
+    Args:
+        filename: The name of the object to read from the MinIO bucket
+
+    Returns:
+        The file content as string (for text files) or bytes (for binary files)
+
+    Raises:
+        HTTPException: If the file cannot be read or doesn't exist
+
+    """
+    # Check if filename exists
+    if not filename:
+        logger.error("Filename is empty or invalid")
+        raise HTTPException(
+            status_code=400, detail="Filename is required"
+        )
+
+    # Read file from MinIO
+    try:
+        logger.info(
+            f"Reading file '{filename}' from MinIO bucket '{minio_bucket_name}'")
+        with minio_client.get_object(
+                bucket_name=minio_bucket_name, object_name=filename
+        ) as response:
+            content_type = response.headers.get("content-type", "")
+            logger.debug(f"File content type: {content_type}")
+
+            if content_type.startswith("text/"):
+                # Read as text (UTF-8)
+                content = response.read().decode("utf-8")
+                logger.debug(f"Read {len(content)} characters from text file")
+            else:
+                # Read as binary
+                content = response.read()
+                logger.debug(f"Read {len(content)} bytes from binary file")
+
+            return content
+    except Exception as e:
+        error_msg = f"Error reading file '{filename}' from MinIO: {e!s}"
+        logger.error(error_msg)
+        raise HTTPException(
+            status_code=500, detail=error_msg
+        ) from e
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 4d6fd0d..3e89647 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -7,4 +7,7 @@ minio
 python-dotenv
 xmltodict
 markdownify
-chardet
\ No newline at end of file
+chardet
+pymilvus
+requests
+tqdm
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index ce148db..a5c3207 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,6 +13,11 @@ services:
       - MINIO_SECURE=${MINIO_SECURE}
     depends_on:
       - "milvus"
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "5"
   frontend:
     build:
       context: ./frontend
diff --git a/import_requests_test.http b/import_requests_test.http
new file mode 100644
index 0000000..d409fd8
--- /dev/null
+++ b/import_requests_test.http
@@ -0,0 +1,49 @@
+@baseUrl = http://localhost:8080
+
+### Import LinkedIn Comments
+POST {{baseUrl}}/import
+Content-Type: application/json
+
+{
+  "source_type": "linkedin_comments",
+  "object_name": "output/linkedin_comments_2025-05-21T18-53-39.438179+00-00.json"
+}
+
+### Import LinkedIn Shares
+POST {{baseUrl}}/import
+Content-Type: application/json
+
+{
+  "source_type": "linkedin_shares",
+  "object_name": "output/linkedin_shares_2025-05-21T18-53-39.700335+00-00.json"
+}
+
+### Import WordPress Posts
+POST {{baseUrl}}/import
+Content-Type: application/json
+
+{
+  "source_type": "wordpress",
+  "object_name": "output/wordpress_2025-05-21T18-53-40.593642+00-00.json"
+}
+
+### Import Facebook Business Posts
+POST {{baseUrl}}/import
+Content-Type: application/json
+
+{
+  "source_type": "facebook_business_posts",
+  "object_name": "output/facebook_business_posts_2025-05-21T18-53-40.643167+00-00.json"
+}
+
+### Import Instagram Posts
+POST {{baseUrl}}/import
+Content-Type: application/json
+
+{
+  "source_type": "instagram_posts",
+  "object_name": "output/instagram_posts_2025-05-21T18-53-40.681279+00-00.json"
+}
+
+### Get Available Collections
+GET {{baseUrl}}/import/available_collections
\ No newline at end of file
diff --git a/minio_sample_data/transfer_to_minio.sh b/minio_sample_data/transfer_to_minio.sh
index 22c8221..5794197 100644
--- a/minio_sample_data/transfer_to_minio.sh
+++ b/minio_sample_data/transfer_to_minio.sh
@@ -13,7 +13,7 @@ if [ ! -f "$FILE_PATH" ]; then
 fi
 
 # Create the bucket if it doesn't exist
-mc alias set $MINIO_ALIAS http://localhost:9000
+mc alias set $MINIO_ALIAS http://localhost:9000 "minioadmin" "minioadmin"
 mc mb "$MINIO_ALIAS/$BUCKET_NAME" || true
 
 # Upload the file to the bucket
diff --git a/requirements.txt b/requirements.txt
index 8f6c139..aaaaaa5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,7 @@ requests~=2.32.3
 streamlit~=1.45.0
 xmltodict~=0.14.2
 markdownify~=1.1.0
-minio~=7.2.15
\ No newline at end of file
+minio~=7.2.15
+chardet~=5.2.0
+pymilvus~=2.5.9
+tqdm~=4.67.1
\ No newline at end of file
diff --git a/test_embedding.http b/test_embedding.http
new file mode 100644
index 0000000..49958c5
--- /dev/null
+++ b/test_embedding.http
@@ -0,0 +1,21 @@
+### List Available Models
+GET http://localhost:11434/api/tags
+Content-Type: application/json
+
+### Test nomic-embed-text Embedding
+POST http://localhost:11434/api/embed
+Content-Type: application/json
+
+{
+  "model": "nomic-embed-text:latest",
+  "input": "This is another test sentence to generate an embedding using a different model."
+}
+
+### Test snowflake-arctic-embed2 Embedding
+POST http://localhost:11434/api/embed
+Content-Type: application/json
+
+{
+  "model": "snowflake-arctic-embed2:latest",
+  "input": "This is a test sentence to generate an embedding."
+}
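
One caveat when trying the two models above: the Milvus schema in this patch
is fixed at dim=1024. snowflake-arctic-embed2 matches that, but
nomic-embed-text returns 768-dimensional vectors, so inserts generated with
it would be rejected. A quick check (model names as in the requests above;
dimensions are the models' published sizes, not verified by this patch):

    import requests

    for model in ("snowflake-arctic-embed2:latest", "nomic-embed-text:latest"):
        resp = requests.post("http://localhost:11434/api/embed", json={
            "model": model, "input": "dimension check"
        })
        print(model, len(resp.json()["embeddings"][0]))  # expect 1024 vs 768
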