Initial commit

This commit is contained in:
François Pelletier 2023-11-08 15:03:04 -05:00
commit 922ee57e65
3 changed files with 56 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
# CSV exports written by extract.py (one file per shared conversation id)
/messages_*.csv
# JetBrains IDE project files
/chatgpt-backup.iml
/.idea/

50
extract.py Normal file
View file

@ -0,0 +1,50 @@
import json
import requests
import bs4 as bs
import pandas as pd
# %% URL of the shared conversation link (must be filled in before running)
url = ""
# %% Download the shared page and parse its HTML
if not url:
    raise ValueError("Set `url` to the shared conversation link before running")
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops early instead of parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.content
b = bs.BeautifulSoup(r, "html.parser")
# %% Decode the JSON payload embedded in the <script id="__NEXT_DATA__"> tag
next_data_tag = b.find("script", {"id": "__NEXT_DATA__"})
if next_data_tag is None or not next_data_tag.string:
    raise ValueError("No __NEXT_DATA__ script tag with content found in the page")
j = json.loads(next_data_tag.string)
# %% Conversation-level fields
page_props = j["props"]["pageProps"]
conversation = page_props["serverResponse"]["data"]
sharedConversationId = page_props["sharedConversationId"]
title = conversation["title"]
mapping = conversation["mapping"]
mapping_keys = mapping.keys()
# %% Flatten every mapping node that carries a message into one row
message_columns = [
    "sharedConversationId",
    "title",
    "id",
    "time",
    "role",
    "parts",
    "parent",
    "children",
]
messages = []
for key, m in mapping.items():
    try:
        message_data = m["message"]
        if not message_data:
            # Nodes without a message payload produce no row.
            continue
        message = {
            "sharedConversationId": sharedConversationId,
            "title": title,
            "id": message_data["id"],
            "time": message_data["create_time"],
            "role": message_data["author"]["role"],
            "parts": message_data["content"]["parts"],
            "parent": m["parent"],
            "children": m["children"],
        }
    except (KeyError, TypeError) as e:
        # Best effort: a node missing an expected field is reported and
        # skipped so that one odd node does not abort the whole export.
        print(f"Skipping node {key}: {e!r}")
    else:
        messages.append(message)
# %% Sort chronologically and export to CSV
# Passing explicit columns keeps sort_values() from raising KeyError when no
# message was collected (an empty frame would otherwise have no 'time' column).
messages_df = (
    pd.DataFrame(messages, columns=message_columns)
    .sort_values(by=["time"])
    .reset_index(drop=True)
)
messages_df.to_csv(f"messages_{sharedConversationId}.csv", index=False)

3
requirements.txt Normal file
View file

@ -0,0 +1,3 @@
# Third-party packages imported by extract.py (bs4 is provided by beautifulsoup4).
# "~=" is the compatible-release operator: e.g. ~=2.31.0 allows 2.31.x only.
requests~=2.31.0
beautifulsoup4~=4.12.2
pandas~=2.1.2