début de l'analyse des données

This commit is contained in:
François Pelletier 2024-07-31 20:14:04 -04:00
parent 7eaa953dc4
commit 218afb2013

21
analyse.py Normal file
View file

@ -0,0 +1,21 @@
import pandas as pd
# %% Step 1: Read the JSON Lines file (one JSON document per line) into a DataFrame
results = pd.read_json('results.jsonl', lines=True)

# %% Step 2: Expand every value of the 'parent' column into its own set of columns
# NOTE(review): presumably each 'parent' value is a dict -- confirm against the data.
parent_df = results['parent'].apply(pd.Series)

# %% Step 3: Give each element of the nested 'ns' arrays its own row, then expand it
# explode() repeats the original row label for every element of a list, dropna()
# discards rows that had no 'ns' entry, and apply(pd.Series) turns each remaining
# element into a row of columns.
ns_df = (
    parent_df['ns']
    .explode()
    .dropna()
    .apply(pd.Series)
)

# %% Step 4: Collect the 'ip' value of every IPv4 record into a plain list
# Missing cells are replaced by '' first: iterating an empty string yields
# nothing, so those rows end up with an empty list instead of raising.
ns_df['ipv4_'] = (
    ns_df['ipv4']
    .fillna('')
    .apply(lambda records: [record['ip'] for record in records])
)
ns_df = ns_df.drop(columns='ipv4')

# %% Step 5: Collect the 'ip' value of every IPv6 record into a plain list
# Same treatment as the IPv4 column above.
ns_df['ipv6_'] = (
    ns_df['ipv6']
    .fillna('')
    .apply(lambda records: [record['ip'] for record in records])
)
ns_df = ns_df.drop(columns='ipv6')

# %% Take the 'DNS_LOCAL' entry of each 'results' value and expand it into columns
results_df = (
    results['results']
    .apply(lambda entry: entry['DNS_LOCAL'])
    .apply(pd.Series)
)