From 218afb2013ec349ce3f8b085eacc513243e8e8f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= <francois@jevalide.ca>
Date: Wed, 31 Jul 2024 20:14:04 -0400
Subject: [PATCH 1/2] =?UTF-8?q?d=C3=A9but=20de=20l'analyse=20des=20donn?=
 =?UTF-8?q?=C3=A9es?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 analyse.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 analyse.py

diff --git a/analyse.py b/analyse.py
new file mode 100644
index 0000000..950ce17
--- /dev/null
+++ b/analyse.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+# %% Step 1: Load the jsonl dataset in a pandas DataFrame
+results = pd.read_json('results.jsonl', lines=True)
+
+# %% Step 2: Convert the parent column to a DataFrame
+parent_df = results['parent'].apply(pd.Series)
+
+# %% Step 3: Explode the nested array in the ns column into a new DataFrame
+ns_df = pd.DataFrame({'ns': parent_df['ns']}).explode('ns').dropna()['ns'].apply(pd.Series)
+
+# %% Step 4: Extract the IPv4 addresses from the nested dictionaries
+ns_df['ipv4_'] = ns_df['ipv4'].fillna('').apply(lambda x: [a['ip'] for a in x])
+del ns_df['ipv4']
+
+# %% Step 5: Extract the IPv6 addresses from the nested dictionaries
+ns_df['ipv6_'] = ns_df['ipv6'].fillna('').apply(lambda x: [a['ip'] for a in x])
+del ns_df['ipv6']
+
+# %% Extract values from results column
+results_df = results['results'].apply(lambda x: x['DNS_LOCAL']).apply(pd.Series)
-- 
2.39.5


From d76ab0c91475606c989685dfa47c87e4bb69cdb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= <francois@jevalide.ca>
Date: Wed, 31 Jul 2024 20:52:19 -0400
Subject: [PATCH 2/2] =?UTF-8?q?extraction=20des=20donn=C3=A9es=20dans=20un?=
 =?UTF-8?q?e=20base=20SQLite?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore             |  1 +
 analyse.py             | 21 ----------------
 extract_to_database.py | 55 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 21 deletions(-)
 delete mode 100644 analyse.py
 create mode 100644 extract_to_database.py

diff --git a/.gitignore b/.gitignore
index 7c9bbd3..337b0ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 /dns-backup-tool.iml
 /venv/
 /.idea/
+/*.db
diff --git a/analyse.py b/analyse.py
deleted file mode 100644
index 950ce17..0000000
--- a/analyse.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pandas as pd
-
-# %% Step 1: Load the jsonl dataset in a pandas DataFrame
-results = pd.read_json('results.jsonl', lines=True)
-
-# %% Step 2: Convert the parent column to a DataFrame
-parent_df = results['parent'].apply(pd.Series)
-
-# %% Step 3: Explode the nested array in the ns column into a new DataFrame
-ns_df = pd.DataFrame({'ns': parent_df['ns']}).explode('ns').dropna()['ns'].apply(pd.Series)
-
-# %% Step 4: Extract the IPv4 addresses from the nested dictionaries
-ns_df['ipv4_'] = ns_df['ipv4'].fillna('').apply(lambda x: [a['ip'] for a in x])
-del ns_df['ipv4']
-
-# %% Step 5: Extract the IPv6 addresses from the nested dictionaries
-ns_df['ipv6_'] = ns_df['ipv6'].fillna('').apply(lambda x: [a['ip'] for a in x])
-del ns_df['ipv6']
-
-# %% Extract values from results column
-results_df = results['results'].apply(lambda x: x['DNS_LOCAL']).apply(pd.Series)
diff --git a/extract_to_database.py b/extract_to_database.py
new file mode 100644
index 0000000..1d8d1b0
--- /dev/null
+++ b/extract_to_database.py
@@ -0,0 +1,55 @@
+"""Flatten the nested records of results.jsonl into the tables of a dated SQLite database."""
+import pandas as pd
+from sqlalchemy import create_engine
+
+# %% Step 1: Load the jsonl dataset in a pandas DataFrame
+results = pd.read_json('results.jsonl', lines=True)
+
+# %% Step 2: Convert the parent column to a DataFrame
+parent_df = results['parent'].apply(pd.Series)
+
+# %% Step 3: Explode the nested array in the ns column into a new DataFrame
+ns_df_ = pd.DataFrame({'ns': parent_df['ns']}).explode('ns').dropna()['ns'].apply(pd.Series)
+
+# %% Step 4: Extract the IPv4 and IPv6 addresses from the nested dictionaries
+for col in ('ipv4', 'ipv6'):
+    # fillna('') makes a missing entry iterate as empty, which yields an empty list
+    ns_df_[f'{col}_'] = ns_df_[col].fillna('').apply(lambda x: [a['ip'] for a in x])
+ns_df_ = ns_df_.drop(columns=['ipv4', 'ipv6'])
+
+# %% Step 5: Pad both address lists of a row to the same length with None,
+# because a multi-column explode raises ValueError on differing element counts
+sizes = [max(len(a), len(b)) for a, b in zip(ns_df_['ipv4_'], ns_df_['ipv6_'])]
+for col in ('ipv4_', 'ipv6_'):
+    ns_df_[col] = [ips + [None] * (n - len(ips)) for ips, n in zip(ns_df_[col], sizes)]
+
+# %% Extract values from results column
+results_df = results['results'].apply(lambda x: x['DNS_LOCAL']).apply(pd.Series)
+
+# %% Prepare the final DataFrames
+base_df = results.drop(columns=['parent', 'results']).copy()
+ns_df = ns_df_.explode(['ipv4_', 'ipv6_']).copy()
+mail_df = results_df['MAIL'].explode().dropna().apply(pd.Series).add_prefix('MAIL_')
+web4_df = results_df['WEB4'].explode().dropna().apply(pd.Series).add_prefix('WEB4_')
+web4www_df = results_df['WEB4_www'].explode().dropna().apply(pd.Series).add_prefix('WEB4_www_')
+web6_df = results_df['WEB6'].explode().dropna().apply(pd.Series).add_prefix('WEB6_')
+web6www_df = results_df['WEB6_www'].explode().dropna().apply(pd.Series).add_prefix('WEB6_www_')
+txt_df = results_df['TXT'].explode().dropna().apply(pd.Series).add_prefix('TXT_')
+
+# %% Drop GeoIP columns; errors='ignore' tolerates data in which no record has a geoip key
+web4_df = web4_df.drop(columns=['WEB4_geoip'], errors='ignore')
+web4www_df = web4www_df.drop(columns=['WEB4_www_geoip'], errors='ignore')
+web6_df = web6_df.drop(columns=['WEB6_geoip'], errors='ignore')
+web6www_df = web6www_df.drop(columns=['WEB6_www_geoip'], errors='ignore')
+
+# %% Combine all DataFrames in a SQLite database
+date_today = pd.to_datetime('today').strftime('%Y-%m-%d')
+engine = create_engine(f'sqlite:///dns_results_{date_today}.db')
+
+tables = {'base_df': base_df, 'ns_df': ns_df, 'mail_df': mail_df, 'web4_df': web4_df,
+          'web4www_df': web4www_df, 'web6_df': web6_df, 'web6www_df': web6www_df, 'txt_df': txt_df}
+for name, frame in tables.items():
+    # if_exists='replace' overwrites a table left by an earlier run on the same day
+    frame.to_sql(name, engine, if_exists='replace', index=True)
+
+print('Data saved to sqlite database.')
-- 
2.39.5