diff --git a/.gitignore b/.gitignore
index 7c9bbd3..337b0ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 /dns-backup-tool.iml
 /venv/
 /.idea/
+/*.db
diff --git a/extract_to_database.py b/extract_to_database.py
new file mode 100644
index 0000000..1d8d1b0
--- /dev/null
+++ b/extract_to_database.py
@@ -0,0 +1,92 @@
+"""Flatten the DNS results in ``results.jsonl`` into a dated SQLite database.
+
+Reads ``results.jsonl`` from the working directory and writes one table per
+record family to ``dns_results_<YYYY-MM-DD>.db``. The ``# %%`` markers split
+the script into cells so that it can also be run step by step.
+"""
+import pandas as pd
+from sqlalchemy import create_engine
+
+
+def _pad(values, length):
+    """Return ``values`` right-padded with ``None`` up to ``length`` items."""
+    return values + [None] * (length - len(values))
+
+
+def _explode_records(column, prefix):
+    """Return one row per record of ``column``, prefixing every column name."""
+    return column.explode().dropna().apply(pd.Series).add_prefix(prefix)
+
+
+# %% Step 1: Load the jsonl dataset in a pandas DataFrame
+results = pd.read_json('results.jsonl', lines=True)
+
+# %% Step 2: Convert the parent column to a DataFrame
+parent_df = results['parent'].apply(pd.Series)
+
+# %% Step 3: Explode the nested array in the ns column into a new DataFrame
+ns_df_ = pd.DataFrame({'ns': parent_df['ns']}).explode('ns').dropna()['ns'].apply(pd.Series)
+
+# %% Steps 4 and 5: Extract the IPv4 and IPv6 addresses from the nested dictionaries
+for family in ('ipv4', 'ipv6'):
+    # fillna('') turns a missing address list into an empty iterable.
+    ns_df_[f'{family}_'] = ns_df_[family].fillna('').apply(
+        lambda records: [record['ip'] for record in records]
+    )
+    del ns_df_[family]
+
+# %% Pad the address lists so both have the same length in every row
+# DataFrame.explode() on several columns raises a ValueError when the lists in
+# one row differ in length, e.g. two IPv4 addresses but only one IPv6 address.
+row_lengths = [
+    max(len(ipv4), len(ipv6))
+    for ipv4, ipv6 in zip(ns_df_['ipv4_'], ns_df_['ipv6_'])
+]
+for column in ('ipv4_', 'ipv6_'):
+    ns_df_[column] = [
+        _pad(addresses, length)
+        for addresses, length in zip(ns_df_[column], row_lengths)
+    ]
+
+# %% Extract values from results column
+results_df = results['results'].apply(lambda x: x['DNS_LOCAL']).apply(pd.Series)
+
+# %% Prepare the final DataFrames
+base_df = results.drop(columns=['parent', 'results']).copy()
+ns_df = ns_df_.explode(['ipv4_', 'ipv6_']).copy()
+mail_df = _explode_records(results_df['MAIL'], 'MAIL_')
+web4_df = _explode_records(results_df['WEB4'], 'WEB4_')
+web4www_df = _explode_records(results_df['WEB4_www'], 'WEB4_www_')
+web6_df = _explode_records(results_df['WEB6'], 'WEB6_')
+web6www_df = _explode_records(results_df['WEB6_www'], 'WEB6_www_')
+txt_df = _explode_records(results_df['TXT'], 'TXT_')
+
+# %% Drop GeoIP columns
+# errors='ignore' keeps the script running when no record carried GeoIP data.
+web4_df = web4_df.drop(columns=['WEB4_geoip'], errors='ignore')
+web4www_df = web4www_df.drop(columns=['WEB4_www_geoip'], errors='ignore')
+web6_df = web6_df.drop(columns=['WEB6_geoip'], errors='ignore')
+web6www_df = web6www_df.drop(columns=['WEB6_www_geoip'], errors='ignore')
+
+# %% Combine all DataFrames in a SQLite database
+date_today = pd.to_datetime('today').strftime('%Y-%m-%d')
+engine = create_engine(f'sqlite:///dns_results_{date_today}.db')
+
+tables = {
+    'base_df': base_df,
+    'ns_df': ns_df,
+    'mail_df': mail_df,
+    'web4_df': web4_df,
+    'web4www_df': web4www_df,
+    'web6_df': web6_df,
+    'web6www_df': web6www_df,
+    'txt_df': txt_df,
+}
+try:
+    for table_name, frame in tables.items():
+        frame.to_sql(table_name, engine, if_exists='replace', index=True)
+finally:
+    # Release the pooled SQLite connection even when a write fails.
+    engine.dispose()
+
+print('Data saved to sqlite database.')