🚀 Add feature: Instagram and Facebook business posts conversion

2025-05-21 00:05:43 -04:00 · 2025-05-21 00:05:43 -04:00 · f9e5a6e013
commit f9e5a6e013
parent 5ee50a0f0f
8 changed files with 2357 additions and 33 deletions
--- a/backend/app/convert/convert_facebook_business_posts_json.py
+++ b/backend/app/convert/convert_facebook_business_posts_json.py
@ -1,30 +1,18 @@
 import json
-import datetime
 from typing import Dict, Union, List

 import pandas as pd

 from app.config import logger
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
-from app.convert.utils.convert_encoding_meta import convert_encoding_meta
+from app.convert.utils.encode_utf8 import encode_utf8


 class FacebookBusinessPostsConverter(BaseConverter):
    def read_file(self) -> None:
-        json_file = content_from_file(self.content)
-        content = convert_encoding_meta(json_file.read())
+        content = encode_utf8(self.content)
        self.datadict = json.loads(content)

-    def add_metadata(self) -> None:
-        self.df = self.df.assign(
-            index="facebook_business_posts",
-            type="posts",
-            network="FacebookBusiness"
-        )
-
-    def convert_columns(self) -> None:
        posts_medias = []
        for post in self.datadict:
            data_post_items = post['data']
@ -45,6 +33,15 @@ class FacebookBusinessPostsConverter(BaseConverter):
                            })

        self.df = pd.DataFrame(posts_medias).explode(['chemin'])
+
+    def add_metadata(self) -> None:
+        """Stamp each row with constant index/type/network columns."""
+        self.df = self.df.assign(**{
+            "index": "facebook_business_posts", "type": "posts",
+            "network": "FacebookBusiness",
+        })
+
+    def convert_columns(self) -> None:
        self.df['creation_timestamp'] = self.df['creation_timestamp'].astype(
            int)

--- a/backend/app/convert/convert_instagram_posts_json.py
+++ b/backend/app/convert/convert_instagram_posts_json.py
@ -4,26 +4,14 @@ from typing import Union, List, Dict
 import pandas as pd

 from app.config import logger
-from app.models import ConversionResponse
-from app.convert.utils.content_from_file import content_from_file
 from app.convert.base_converter import BaseConverter
-from app.convert.utils.convert_encoding_meta import convert_encoding_meta
+from app.convert.utils.encode_utf8 import encode_utf8


 class InstagramPostsConverter(BaseConverter):
    def read_file(self) -> None:
-        json_file = content_from_file(self.content)
-        content = convert_encoding_meta(json_file.read())
+        content = encode_utf8(self.content)
        self.datadict = json.loads(content)
-
-    def add_metadata(self) -> None:
-        self.df = self.df.assign(
-            index="instagram_posts",
-            type="posts",
-            network="Instagram"
-        )
-
-    def convert_columns(self) -> None:
        posts_medias = []
        for post in self.datadict:
            medias = post['media']
@ -45,15 +33,22 @@ class InstagramPostsConverter(BaseConverter):
                })

        self.df = pd.DataFrame(posts_medias).explode(['chemin'])
-        self.df['creation_timestamp'] = self.df['creation_timestamp'].astype(
-            int)
+
+    def add_metadata(self) -> None:
+        """Stamp each row with constant index/type/network columns."""
+        self.df = self.df.assign(**{
+            "index": "instagram_posts", "type": "posts",
+            "network": "Instagram",
+        })
+
+    def convert_columns(self) -> None:
+        stamps = self.df['creation_timestamp']  # cast timestamps to int
+        self.df['creation_timestamp'] = stamps.astype(int)

    def rename_columns(self) -> None:
-        # No column renaming needed for this converter
-        pass
+        self.df = self.df.rename(columns={})

    def clean_data(self) -> None:
-        super().clean_data()
        self.df['url'] = ""
        self.df.fillna(value="", inplace=True)

--- a/backend/app/convert/utils/encode_utf8.py
+++ b/backend/app/convert/utils/encode_utf8.py
@ -0,0 +1,18 @@
+import chardet
+
+
+def encode_utf8(raw_data):
+    """Return raw_data (bytes) re-encoded as UTF-8 bytes.
+
+    The source encoding is guessed with chardet. Raises ValueError when
+    the guess is missing or is not UTF-8, Latin-1 or ASCII.
+    """
+    encoding = chardet.detect(raw_data)['encoding']
+    # chardet reports names such as 'ISO-8859-1' or 'UTF-8-SIG', so
+    # normalise case before comparing.
+    name = (encoding or '').lower()
+    if name in ('utf-8', 'utf-8-sig', 'ascii'):
+        return raw_data.decode(name).encode('utf-8')
+    if name in ('latin-1', 'iso-8859-1'):
+        return raw_data.decode('latin-1').encode('utf-8')
+    raise ValueError(f"Unsupported encoding: {encoding}")
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -7,3 +7,4 @@ minio
 python-dotenv
 xmltodict
 markdownify
+chardet