"""Flatten the nested records of ``results.jsonl`` into pandas DataFrames.

Written as a cell-style script (``# %%`` markers). Each cell leaves its
result in a module-level name: ``results``, ``parent_df``, ``ns_df`` and
``results_df``.
"""
import pandas as pd

# %% Step 1: Read the JSON Lines file, one record per row
results = pd.read_json('results.jsonl', lines=True)

# %% Step 2: Expand each entry of the parent column into its own columns
parent_df = results['parent'].apply(pd.Series)

# %% Step 3: One row per element of the ns arrays, expanded into columns
# NaN entries produced by the explode (e.g. missing or empty ns values) are
# dropped before the remaining elements are expanded.
ns_df = parent_df['ns'].explode().dropna().apply(pd.Series)

# %% Steps 4 and 5: Replace the ipv4 / ipv6 columns with lists of 'ip' values
for family in ('ipv4', 'ipv6'):
    # pop() removes the raw column; missing values become '' so that the
    # comprehension below yields an empty list for them.
    addresses = ns_df.pop(family).fillna('')
    ns_df[family + '_'] = addresses.apply(
        lambda entries: [entry['ip'] for entry in entries]
    )

# %% Extract values from results column
# Take the 'DNS_LOCAL' entry of every result, then expand it into columns.
dns_local = results['results'].apply(lambda record: record['DNS_LOCAL'])
results_df = dns_local.apply(pd.Series)