chatgpt-backup/extract.py

50 lines
1.6 KiB
Python
Raw Normal View History

2023-11-08 20:03:04 +00:00
import json
import requests
import bs4 as bs
import pandas as pd
# %% URL of the shared link
url = ""
# %%
# Fetch the shared-conversation page. The timeout keeps the script from
# hanging forever on an unresponsive server, and raise_for_status() stops
# early on an HTTP error instead of trying to parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.content
b = bs.BeautifulSoup(r, "html.parser")
# %%
# The conversation data is read as JSON from the <script id="__NEXT_DATA__">
# tag; fail with an explicit message when that tag is absent.
next_data_tag = b.find("script", {"id": "__NEXT_DATA__"})
if next_data_tag is None:
    raise ValueError('No <script id="__NEXT_DATA__"> tag found in the fetched page')
j = json.loads(next_data_tag.string)
# %%
# Walk down the parsed page data step by step and pull out the fields
# the rest of the script relies on.
page_props = j["props"]["pageProps"]
sharedConversationId = page_props["sharedConversationId"]
conversation_data = page_props["serverResponse"]["data"]
title = conversation_data["title"]
mapping_keys = conversation_data["mapping"].keys()
# %%
# Flatten the conversation's node mapping into one record per message.
# The mapping lookup is hoisted out of the loop since it never changes.
mapping = j["props"]["pageProps"]['serverResponse']['data']['mapping']
messages = []
for key in mapping_keys:
    try:
        node = mapping[key]
        node_message = node["message"]
        if not node_message:
            # Nodes with an empty/None message have nothing to export.
            continue
        record = {
            "sharedConversationId": sharedConversationId,
            "title": title,
            "id": node_message["id"],
            "time": node_message["create_time"],
            "role": node_message["author"]["role"],
            "parts": node_message["content"]["parts"],
            "parent": node["parent"],
            "children": node["children"],
        }
    except (KeyError, TypeError) as e:
        # Best-effort export: a node missing an expected field (or holding
        # None where a dict is expected) is reported and skipped rather
        # than aborting the whole run.
        print(f"Skipping node {key}: {e!r}")
        continue
    messages.append(record)
# %%
# Build the table with an explicit column list so the "time" column exists
# even when no messages were collected; without it, sort_values() raises
# KeyError on an empty DataFrame.
message_columns = [
    "sharedConversationId",
    "title",
    "id",
    "time",
    "role",
    "parts",
    "parent",
    "children",
]
messages_df = (
    pd.DataFrame(messages, columns=message_columns)
    .sort_values(by=["time"])
    .reset_index(drop=True)
)
# Write one CSV per shared conversation, named after its id.
messages_df.to_csv(f"messages_{sharedConversationId}.csv", index=False)