# Provenance: contents of analyse.py as added by commit
# 218afb2013ec349ce3f8b085eacc513243e8e8f8 (François Pelletier,
# Wed, 31 Jul 2024 20:14:04 -0400). Commit subject, translated from French:
# "start of the data analysis".
import pandas as pd


def _expand_dicts(column):
    """Expand a Series of dicts into a DataFrame with one column per key.

    The frame is built with a single constructor call rather than
    ``column.apply(pd.Series)``: that idiom creates one Series per row, adds
    a stray ``0`` column when an entry is not a dict (e.g. NaN), and returns
    a Series instead of a DataFrame when *column* is empty.

    Entries that are not dicts become all-NaN rows. The index of *column*
    (including duplicate labels) is preserved.
    """
    records = [entry if isinstance(entry, dict) else {} for entry in column]
    return pd.DataFrame(records, index=column.index)


def _extract_ips(frame, column):
    """Return a Series with, for each row, the list of ``'ip'`` values.

    Each cell of ``frame[column]`` is expected to be a list of dicts that
    carry an ``'ip'`` key. Cells that are not lists (e.g. NaN) yield an
    empty list, and so does every row when *column* is absent from *frame*.
    The returned Series shares the index of *frame*.
    """
    if column not in frame.columns:
        empty_lists = [[] for _ in range(len(frame))]
        return pd.Series(empty_lists, index=frame.index, dtype=object)
    return frame[column].apply(
        lambda addresses: (
            [address['ip'] for address in addresses]
            if isinstance(addresses, list)
            else []
        )
    )


# %% Step 1: Load the jsonl dataset in a pandas DataFrame
results = pd.read_json('results.jsonl', lines=True)

# %% Step 2: Convert the parent column to a DataFrame
parent_df = _expand_dicts(results['parent'])

# %% Step 3: Explode the nested array in the ns column into a new DataFrame
# One row per entry of each ``ns`` list; rows keep the index label of the
# record they came from, so labels repeat. Missing/empty lists are dropped.
ns_df = _expand_dicts(parent_df['ns'].explode().dropna())

# %% Step 4: Extract the IPv4 addresses from the nested dictionaries
ns_df['ipv4_'] = _extract_ips(ns_df, 'ipv4')
ns_df = ns_df.drop(columns=['ipv4'], errors='ignore')

# %% Step 5: Extract the IPv6 addresses from the nested dictionaries
ns_df['ipv6_'] = _extract_ips(ns_df, 'ipv6')
ns_df = ns_df.drop(columns=['ipv6'], errors='ignore')

# %% Step 6: Extract values from results column
# Records without a dict under 'DNS_LOCAL' produce an all-NaN row instead of
# aborting the whole extraction.
results_df = _expand_dicts(
    results['results'].apply(
        lambda record: record.get('DNS_LOCAL') if isinstance(record, dict) else None
    )
)