extraction des données dans une base SQLite

This commit is contained in:
François Pelletier 2024-07-31 20:52:19 -04:00
parent 218afb2013
commit d76ab0c914
3 changed files with 56 additions and 21 deletions

1
.gitignore vendored
View file

@ -5,3 +5,4 @@
/dns-backup-tool.iml
/venv/
/.idea/
/*.db

View file

@ -1,21 +0,0 @@
import pandas as pd
# %% Step 1: Read the line-delimited JSON results into a pandas DataFrame
results = pd.read_json('results.jsonl', lines=True)
# %% Step 2: Expand every value of the parent column into its own columns
parent_df = results['parent'].apply(pd.Series)
# %% Step 3: One row per element of the ns lists, each expanded into columns
ns_rows = pd.DataFrame({'ns': parent_df['ns']}).explode('ns').dropna()
ns_df = ns_rows['ns'].apply(pd.Series)
# %% Steps 4 and 5: Replace the ipv4 / ipv6 columns by plain lists of 'ip' values
for family in ('ipv4', 'ipv6'):
    # Missing cells become '' so that iterating them yields an empty list.
    address_records = ns_df[family].fillna('')
    ns_df[family + '_'] = address_records.apply(
        lambda entries: [entry['ip'] for entry in entries]
    )
    del ns_df[family]
# %% Pull the DNS_LOCAL entry out of the results column and expand it into columns
dns_local = results['results'].apply(lambda entry: entry['DNS_LOCAL'])
results_df = dns_local.apply(pd.Series)

55
extract_to_database.py Normal file
View file

@ -0,0 +1,55 @@
import pandas as pd
from sqlalchemy import create_engine
# %% Step 1: Read the line-delimited JSON results into a pandas DataFrame
results = pd.read_json('results.jsonl', lines=True)
# %% Step 2: Expand every value of the parent column into its own columns
parent_df = results['parent'].apply(pd.Series)
# %% Step 3: One row per element of the ns lists, each expanded into columns
ns_rows = pd.DataFrame({'ns': parent_df['ns']}).explode('ns').dropna()
ns_df_ = ns_rows['ns'].apply(pd.Series)


def _ip_addresses(entries):
    """Return the 'ip' value of every record in *entries* as a list."""
    return [entry['ip'] for entry in entries]


# %% Step 4: Swap the ipv4 column for a list of plain IPv4 addresses
# Missing cells become '' so that iterating them yields an empty list.
ns_df_['ipv4_'] = ns_df_.pop('ipv4').fillna('').apply(_ip_addresses)
# %% Step 5: Swap the ipv6 column for a list of plain IPv6 addresses
ns_df_['ipv6_'] = ns_df_.pop('ipv6').fillna('').apply(_ip_addresses)
# %% Pull the DNS_LOCAL entry out of the results column and expand it into columns
dns_local = results['results'].apply(lambda entry: entry['DNS_LOCAL'])
results_df = dns_local.apply(pd.Series)
# %% Prepare the final DataFrames
def _records_to_frame(records, prefix, drop=()):
    """Flatten a column of record lists into one prefixed row per record.

    Args:
        records: Series whose cells are lists of dict-like records (or NaN).
        prefix: Text prepended to every resulting column name.
        drop: Un-prefixed record keys to leave out of the result.

    Returns:
        The exploded records with NaN rows removed, expanded into columns
        named ``prefix + key``. The index keeps the row label of the
        originating cell, so it repeats when a cell held several records.
    """
    frame = records.explode().dropna().apply(pd.Series).add_prefix(prefix)
    if drop:
        # errors='ignore': a key that no record carries must not abort the run.
        frame = frame.drop(
            columns=[prefix + key for key in drop], errors='ignore'
        )
    return frame


def _pad_lists(frame, columns):
    """Return a copy of *frame* whose list cells in *columns* share one length per row.

    Shorter lists are right-padded with None. Rows whose lists are all empty
    are left untouched.
    """
    padded = frame.copy()
    widths = [
        max(map(len, cells))
        for cells in zip(*(padded[column] for column in columns))
    ]
    for column in columns:
        padded[column] = pd.Series(
            [
                list(cell) + [None] * (width - len(cell))
                for cell, width in zip(padded[column], widths)
            ],
            index=padded.index,
            dtype=object,
        )
    return padded


base_df = results.drop(columns=['parent', 'results'])
# DataFrame.explode on several columns raises ValueError when the lists in a
# row differ in length (e.g. two IPv4 addresses but one IPv6 address), so the
# address lists are padded to a common length first.
ns_df = _pad_lists(ns_df_, ['ipv4_', 'ipv6_']).explode(['ipv4_', 'ipv6_'])
mail_df = _records_to_frame(results_df['MAIL'], 'MAIL_')
txt_df = _records_to_frame(results_df['TXT'], 'TXT_')
# %% Web frames, with the GeoIP columns dropped
web4_df = _records_to_frame(results_df['WEB4'], 'WEB4_', drop=('geoip',))
web4www_df = _records_to_frame(
    results_df['WEB4_www'], 'WEB4_www_', drop=('geoip',)
)
web6_df = _records_to_frame(results_df['WEB6'], 'WEB6_', drop=('geoip',))
web6www_df = _records_to_frame(
    results_df['WEB6_www'], 'WEB6_www_', drop=('geoip',)
)
# %% Combine all DataFrames in a SQLite database
# The database file name carries today's date (YYYY-MM-DD).
date_today = pd.to_datetime('today').strftime('%Y-%m-%d')
engine = create_engine(f'sqlite:///dns_results_{date_today}.db')
# Table name -> frame to store under that name.
tables = {
    'base_df': base_df,
    'ns_df': ns_df,
    'mail_df': mail_df,
    'web4_df': web4_df,
    'web4www_df': web4www_df,
    'web6_df': web6_df,
    'web6www_df': web6www_df,
    'txt_df': txt_df,
}
try:
    for table_name, frame in tables.items():
        # if_exists='replace' overwrites a table left by an earlier run into
        # the same file; index=True stores the frame's index as a column
        # (presumably the link back to the rows of base_df - confirm).
        frame.to_sql(table_name, engine, if_exists='replace', index=True)
finally:
    # Release the engine's pooled connections even when a write fails.
    engine.dispose()
print('Data saved to sqlite database.')