commit 922ee57e65ebb5b920ebcb422118e9e2e62f9eb2
Author: François Pelletier
Date:   Wed Nov 8 15:03:04 2023 -0500

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f13565
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/messages_a778dec5-8e80-43e0-a1e7-f1019ef4e5e8.csv
+/chatgpt-backup.iml
+/.idea/
diff --git a/extract.py b/extract.py
new file mode 100644
index 0000000..67db800
--- /dev/null
+++ b/extract.py
@@ -0,0 +1,83 @@
+"""Export the messages of a shared conversation page to a CSV file.
+
+The page embeds its state as JSON in the ``__NEXT_DATA__`` script tag. This
+script downloads the page, reads that JSON and writes one row per message to
+``messages_{sharedConversationId}.csv``, sorted by message creation time.
+"""
+
+import json
+
+import requests
+import bs4 as bs
+import pandas as pd
+
+# %% URL of the shared link
+url = ""
+
+# %%
+if not url:
+    raise ValueError("Set `url` to the shared-link URL before running.")
+
+# A timeout keeps the script from hanging on an unresponsive server, and
+# raise_for_status() avoids parsing an HTTP error page as a conversation.
+response = requests.get(url, timeout=30)
+response.raise_for_status()
+r = response.content
+b = bs.BeautifulSoup(r, "html.parser")
+
+# %%
+next_data = b.find("script", {"id": "__NEXT_DATA__"})
+if next_data is None:
+    raise ValueError(f"No __NEXT_DATA__ script tag found at {url!r}")
+j = json.loads(next_data.string)
+
+# %%
+page_props = j["props"]["pageProps"]
+data = page_props["serverResponse"]["data"]
+sharedConversationId = page_props["sharedConversationId"]
+title = data["title"]
+mapping = data["mapping"]
+
+# %%
+messages = []
+for key, m in mapping.items():
+    try:
+        msg = m["message"]
+        if not msg:
+            # Nodes without a message have nothing to export.
+            continue
+        messages.append(
+            {
+                "sharedConversationId": sharedConversationId,
+                "title": title,
+                "id": msg["id"],
+                "time": msg["create_time"],
+                "role": msg["author"]["role"],
+                "parts": msg["content"]["parts"],
+                "parent": m["parent"],
+                "children": m["children"],
+            }
+        )
+    except (KeyError, TypeError) as e:
+        # Best effort: report the node that lacks an expected field and go on.
+        print(f"Skipping mapping node {key}: {e!r}")
+
+# %%
+# Fixing the columns up front keeps the sort (and the CSV header) valid even
+# when no message was collected; an empty frame would have no "time" column.
+columns = [
+    "sharedConversationId",
+    "title",
+    "id",
+    "time",
+    "role",
+    "parts",
+    "parent",
+    "children",
+]
+messages_df = (
+    pd.DataFrame(messages, columns=columns)
+    .sort_values(by=["time"])
+    .reset_index(drop=True)
+)
+messages_df.to_csv(f"messages_{sharedConversationId}.csv", index=False)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5df8a7e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+requests~=2.31.0
+beautifulsoup4~=4.12.2
+pandas~=2.1.2
\ No newline at end of file