add RQ0 normalization function

This commit is contained in:
Frank Xu
2026-01-27 22:34:29 -05:00
parent 7a41a766ff
commit 6fd8ac1789
31 changed files with 430 additions and 169 deletions

View File

@@ -0,0 +1,122 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "c2d824a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_commerce_20260127T175911Z.jsonl\n",
"PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
"PII_A1_wa_20260127T180213Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_wa_20260127T180213Z.jsonl\n",
"PII_A2_core_20260127T180339Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_core_20260127T180339Z.jsonl\n",
"PII_A2_journal_20260127T180440Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_journal_20260127T180440Z.jsonl\n",
"PII_A2_main_20260127T180710Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_main_20260127T180710Z.jsonl\n",
"PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
"PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
"PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
"PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
"PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
"PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
"PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
"PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
"PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
"PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
"PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
"PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> batch_results_normalized\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
"PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> batch_results_normalized\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
"PII_I3_sms_20260127T183606Z.jsonl: 5 records -> batch_results_normalized\\PII_I3_sms_20260127T183606Z.jsonl\n",
"PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> batch_results_normalized\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
"PII_I4_History_20260127T183727Z.jsonl: 5 records -> batch_results_normalized\\PII_I4_History_20260127T183727Z.jsonl\n",
"PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> batch_results_normalized\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
"PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> batch_results_normalized\\PII_I5_Extras_20260127T183857Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\stats\\batch_results_normalized\n"
]
}
],
"source": [
"import json\n",
"from pathlib import Path\n",
"\n",
"from stats_utils import normalize_and_slim_record\n",
"\n",
"IN_DIR = Path(r\"..\\\\batch_results\")\n",
"OUT_DIR = Path(r\"batch_results_normalized\")\n",
"\n",
"def process_file(in_path: Path, out_path: Path) -> int:\n",
" n = 0\n",
" with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
" for line in fin:\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
" obj = json.loads(line)\n",
" if not isinstance(obj, dict):\n",
" continue\n",
" slim = normalize_and_slim_record(obj)\n",
" fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
" n += 1\n",
" return n\n",
"\n",
"def main() -> None:\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
" files = sorted(IN_DIR.glob(\"*.jsonl\"))\n",
" if not files:\n",
" print(f\"No .jsonl files found in: {IN_DIR.resolve()}\")\n",
" return\n",
"\n",
" total_files = 0\n",
" total_records = 0\n",
"\n",
" for fp in files:\n",
" out_fp = OUT_DIR / fp.name\n",
" n = process_file(fp, out_fp)\n",
" print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
" total_files += 1\n",
" total_records += n\n",
"\n",
" print(f\"Done. Files: {total_files}, Records: {total_records}\")\n",
" print(f\"Output folder: {OUT_DIR.resolve()}\")\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0074eda4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -180,7 +180,7 @@
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" out = aggregate_jsonl_folder(r\"..\\batch_results\", \"aggregated_by_app_and_type.jsonl\")\n",
" out = aggregate_jsonl_folder(r\"batch_results_normalized\", \"aggregated_by_app_and_type.jsonl\")\n",
" print(f\"Wrote: {out.resolve()}\")\n"
]
}

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A1_msgstore.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A1_msgstore.db", "PII_type": "PHONE", "PII": ["2023133725", "9106995488", "14244990541", "14359905938", "16467602090", "13346095713", "17622338037"], "Num_of_PII": 7, "source_columns": ["message.text_data", "chat.subject", "call_log.call_id", "message_text.description"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\A1_msgstore.db", "PII_type": "USERNAME", "PII": ["wealthbuildersclub", "btcoinmastersjiminvestmentteam"], "Num_of_PII": 2, "source_columns": ["chat.subject", "message.text_data", "jid.user", "user_device.user_jid_row_id"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\A1_msgstore.db", "PII_type": "PERSON_NAME", "PII": ["jim investment team", "mary garcia", "jim anderson", "professor jim", "mr. jim", "lorie logan", "benjamin", "mary", "ron desantis", "cathy wood", "michael saylor", "gary gensler", "harvey jones", "abbas al qattan", "jennifer farer", "damodaran", "moe", "abe", "matt", "ruth", "wilson", "rebeca"], "Num_of_PII": 22, "source_columns": ["chat.subject", "message.text_data", "message_text.description", "message_vcard.vcard"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\A1_msgstore.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": ["message.text_data", "message_text.description", "message_location.place_address", "message_vcard.vcard"], "Num_of_source_columns": 4}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "PHONE", "PII": ["5713298742", "5713349815", "8085096467", "8624338328", "7034241981", "2028177932", "5715917168", "2025692832", "6106046786", "6263678865", "2065937224", "9199037779", "8056377243", "9735203731", "8136743027", "7423794330"], "Num_of_PII": 16, "source_columns": ["wa_contacts.number", "wa_address_book.number"], "Num_of_source_columns": 2}
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "PERSON_NAME", "PII": ["svetlana chernoff", "ltc whalen", "capt don wayne", "karen tate", "brian reynolds", "goldie kahn", "vladamir stravinsky", "mary garcia", "abe rudder", "russ philby"], "Num_of_PII": 10, "source_columns": ["wa_contacts.display_name", "wa_contacts.given_name", "wa_contacts.family_name", "wa_contacts.nickname", "wa_address_book.display_name", "wa_address_book.given_name", "wa_address_book.family_name", "wa_address_book.nickname", "wa_biz_profiles.business_description", "wa_biz_profiles.location_name"], "Num_of_source_columns": 10}
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A2_core.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A2_core.db", "PII_type": "PHONE", "PII": ["18624338329"], "Num_of_PII": 1, "source_columns": ["configetag.etag", "configrule.config_id", "deltaforcesync.client_key", "preferences.key", "snapuserstore.textval", "snapchatuserproperties.textval"], "Num_of_source_columns": 6}
{"db_path": "selectedDBs\\A2_core.db", "PII_type": "USERNAME", "PII": ["oneil3607", "no_skin_tone", "static_image", "chqkedmxnde2nmq3zddlytu3ngyqag", "memories_and_camera_roll", "caegqokatayoqokatazaaggp", "cncbaridx8kb", "caego7zptqyoo7zptqy", "caiyascjvm+1bije0tm1bkdg0tm1bkjg0tm1blab", "camgqokatayo09lztqy", "caigqrzptqyo09lztqy", "wgiqag"], "Num_of_PII": 12, "source_columns": ["snapuserstore.textval", "snapchatuserproperties.textval"], "Num_of_source_columns": 2}
{"db_path": "selectedDBs\\A2_core.db", "PII_type": "PERSON_NAME", "PII": ["sharon oneil"], "Num_of_PII": 1, "source_columns": ["configetag.etag", "preferences.stringvalue", "snapuserstore.textval", "snapchatuserproperties.textval"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\A2_core.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A2_main.db", "PII_type": "EMAIL", "PII": ["copyright@snap.com"], "Num_of_PII": 1, "source_columns": ["billboardstrings.message", "combinedusername.originalusername", "contact.displayname", "notificationdata.userid", "story.displayname", "suggestedfriend.userid"], "Num_of_source_columns": 6}
{"db_path": "selectedDBs\\A2_main.db", "PII_type": "PHONE", "PII": ["2065937224", "8624338328", "9199037779", "8085096467", "5713298742", "2028177932", "2025692832", "5713349815", "5715917168", "6106046786", "6263678865", "8056377243", "7423794330"], "Num_of_PII": 13, "source_columns": ["contact.phone", "billboardstrings.stringkey", "legalagreementstrings.message", "story.displayname", "storysnap.displayname"], "Num_of_source_columns": 5}
{"db_path": "selectedDBs\\A2_main.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A2_main.db", "PII_type": "PERSON_NAME", "PII": ["abe rudder", "karen tate", "russ philby", "capt don wayne", "svetlana chernoff", "brian reynolds", "vladamir stravinsky", "ltc whalen", "goldie kahn", "joey", "mary garcia", "sharon oneil"], "Num_of_PII": 12, "source_columns": ["contact.displayname", "combinedusername.originalusername", "feed.specifiedname", "story.displayname", "storysnap.displayname", "suggestedfriend.suggestionreason"], "Num_of_source_columns": 6}
{"db_path": "selectedDBs\\A2_main.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "EMAIL", "PII": ["heather@cellebrite.com", "hmahalik@gmail.com"], "Num_of_PII": 2, "source_columns": ["tokens.value", "tokens_content.c1value", "tokens_stat.value", "cacheinfo.affinity_response_context"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": ["tokens.value", "tokens_content.c1value", "tokens_stat.value"], "Num_of_source_columns": 3}
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "USERNAME", "PII": ["heather", "hmahalik"], "Num_of_PII": 2, "source_columns": ["cacheinfo.affinity_response_context", "contacts.type", "tokens.value", "tokens_content.c1value", "tokens_stat.value"], "Num_of_source_columns": 5}
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["tokens.value", "tokens_stat.value", "contacts.id"], "Num_of_source_columns": 3}
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "EMAIL", "PII": ["sharononeil368@gmail.com"], "Num_of_PII": 1, "source_columns": ["bookmarks.account_name", "internet_sync.sync_key", "sync_state.account_name", "tabs.tab_title", "android_metadata.locale"], "Num_of_source_columns": 5}
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "USERNAME", "PII": ["syncinternetdata", "syncbookmarks", "syncopenpages", "syncsavedpages"], "Num_of_PII": 4, "source_columns": ["bookmarks.account_name", "tabs.account_name", "sync_state.account_name", "internet_sync.sync_key"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["bookmarks.title", "bookmarks.account_name", "bookmarks.account_type", "sync_state.data", "tabs.tab_title", "tabs.account_name", "tabs.account_type"], "Num_of_source_columns": 7}
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": ["zwaaggregatecallevent.zlinktoken", "zwacdcallevent.zcallidstring", "zwacdcallevent.zgroupcallcreatoruserjidstring", "zwacdcallevent.zgroupjidstring", "zwacdcalleventparticipant.zjidstring"], "Num_of_source_columns": 5}
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": ["zwacdcallevent.zcallidstring", "zwacdcalleventparticipant.zjidstring", "zwaaggregatecallevent.zlinktoken"], "Num_of_source_columns": 3}
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_type": "PERSON_NAME", "PII": ["rick", "otto", "reynolds", "emily", "sharon", "lisena gocaj", "andy sieg", "christian justiniano", "david wilson", "robechucks raul", "abner", "nia yuniar", "william stevenson", "amit sharma"], "Num_of_PII": 14, "source_columns": ["zwamessage.ztext", "zwamessagedataitem.zcontent1", "zwaprofilepushname.zpushname"], "Num_of_source_columns": 3}
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_type": "POSTAL_ADDRESS", "PII": ["12503 e via de palmas, chandler, az", "8500 peña blvd, denver, co"], "Num_of_PII": 2, "source_columns": ["zwamessage.ztext", "zwamessagedataitem.zcontent1", "zwamessagedataitem.zcontent2"], "Num_of_source_columns": 3}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": ["zwaaddressbookcontact.zphonenumber", "zwaaddressbookcontact.zlocalizedphonenumber"], "Num_of_source_columns": 2}
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["zwaaddressbookcontact.zfullname", "zwaaddressbookcontact.zgivenname", "zwaaddressbookcontact.zlastname", "zwaaddressbookcontact.znotes"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I3_sms.db", "PII_type": "EMAIL", "PII": ["ottomatik1234@gmail.com"], "Num_of_PII": 1, "source_columns": ["message.text", "chat.account_login", "kvtable.key", "attachment.guid", "chat.display_name"], "Num_of_source_columns": 5}
{"db_path": "selectedDBs\\I3_sms.db", "PII_type": "PHONE", "PII": ["12065937224", "12185715037", "12484345508", "13017157263", "14155346421", "14197574625", "14259796297", "14325353346", "14847353029", "15162879924", "17709195681", "17852533080", "17868720110", "18553965089", "18589330241", "18624338324", "18624338329", "19195796456", "19195796465", "19199037779"], "Num_of_PII": 20, "source_columns": ["message.text", "chat.chat_identifier"], "Num_of_source_columns": 2}
{"db_path": "selectedDBs\\I3_sms.db", "PII_type": "USERNAME", "PII": ["chat70863266579689223", "chat818965589567390604"], "Num_of_PII": 2, "source_columns": ["chat.chat_identifier", "handle.id", "message.text"], "Num_of_source_columns": 3}
{"db_path": "selectedDBs\\I3_sms.db", "PII_type": "PERSON_NAME", "PII": ["anya", "william", "eddie v", "otto", "tracy", "anna", "ronen engler", "joe church", "ella rutman doligo", "sharon oneil"], "Num_of_PII": 10, "source_columns": ["chat.display_name", "handle.id", "message.text"], "Num_of_source_columns": 3}
{"db_path": "selectedDBs\\I3_sms.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I4_History.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I4_History.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I4_History.db", "PII_type": "USERNAME", "PII": ["q1146771", "q1155133", "q104851", "q46383", "q1066823", "q81809", "q190120", "q336532", "q108143", "q27686", "q1190598", "q185165", "q170448", "q13233", "q524679", "q1076874", "q2766", "q388483", "q3180957", "q570871", "q682144", "q861508", "q15709638", "q35855", "q1229287", "q1605306", "q281", "q1390866", "q19363883", "q259011", "q154168", "q977090", "q1060705", "q22866", "q1273800", "q603481", "q1809515", "q181479", "q273027", "q2047030", "q908923", "q47616", "q372852", "q705450", "q1988120", "q16939396", "q908666", "q241987", "q62458", "q747493", "q3107826", "17ebu7rghueusrylzr6u3ccjwmmqprk28k", "cached_sync_circle_size", "current_generation", "fetch_throttler_data", "last_maintenance_date", "last_synced_generation", "profile_server_map", "push_notifications_initialized", "push_throttler_data", "server_change_token", "sync_circle_size_retrieval_throttler_data", "sync_with_manatee_container"], "Num_of_PII": 63, "source_columns": ["history_event_listeners.listener_name", "history_items.id", "history_items_to_tags.history_item", "history_tags.identifier", "history_visits.title", "metadata.key"], "Num_of_source_columns": 6}
{"db_path": "selectedDBs\\I4_History.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I4_History.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "EMAIL", "PII": ["ottomatik1234@gmail.com"], "Num_of_PII": 1, "source_columns": ["alarm.email_address", "calendar.shared_owner_address", "calendar.self_identity_email", "calendar.owner_identity_email"], "Num_of_source_columns": 4}
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "selectedDBs\\I5_Extras.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I5_Extras.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I5_Extras.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I5_Extras.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
{"db_path": "selectedDBs\\I5_Extras.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -1,118 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "346b7f2a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"==================================================\n",
"DATABASE INVENTORY: users4.db\n",
"==================================================\n",
"\n",
"TABLE: users\n",
"COLUMNS (10): user_id, first_name, last_name, email, phone, street, city, state, zip_code, created_at\n",
"--------------------------------------------------\n",
"TABLE: messages\n",
"COLUMNS (4): message_id, sender_id, content, sent_at\n",
"--------------------------------------------------\n",
"\n",
"SUMMARY:\n",
"Total Tables: 2\n",
"Total Columns: 14\n",
"==================================================\n",
"\n"
]
}
],
"source": [
"import sqlite3\n",
"\n",
"def print_database_inventory(db_path: str):\n",
" \"\"\"\n",
" Connects to a SQLite database and prints every table \n",
" along with its associated columns.\n",
" \"\"\"\n",
" try:\n",
" # 1. Connect to the database\n",
" conn = sqlite3.connect(db_path)\n",
" cur = conn.cursor()\n",
"\n",
" # 2. Get all table names\n",
" cur.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
" tables = [row[0] for row in cur.fetchall()]\n",
"\n",
" print(f\"\\n{'='*50}\")\n",
" print(f\"DATABASE INVENTORY: {db_path}\")\n",
" print(f\"{'='*50}\\n\")\n",
"\n",
" total_cols = 0\n",
"\n",
" # 3. Iterate through each table to find columns\n",
" for table_name in tables:\n",
" # PRAGMA table_info returns (id, name, type, notnull, default_value, pk)\n",
" cur.execute(f\"PRAGMA table_info('{table_name}');\")\n",
" columns = cur.fetchall()\n",
" \n",
" col_names = [col[1] for col in columns]\n",
" total_cols += len(col_names)\n",
"\n",
" # 4. Print the result in a clean format\n",
" print(f\"TABLE: {table_name}\")\n",
" print(f\"COLUMNS ({len(col_names)}): {', '.join(col_names)}\")\n",
" print(\"-\" * 50)\n",
"\n",
" print(f\"\\nSUMMARY:\")\n",
" print(f\"Total Tables: {len(tables)}\")\n",
" print(f\"Total Columns: {total_cols}\")\n",
" print(f\"{'='*50}\\n\")\n",
"\n",
" except sqlite3.Error as e:\n",
" print(f\"Database error: {e}\")\n",
" finally:\n",
" if conn:\n",
" conn.close()\n",
"\n",
"# Usage:\n",
"# print_database_inventory(\"msgstore.db\")\n",
"# Example usage:\n",
"DB_PATH = r\"users4.db\"\n",
"count = print_database_inventory(DB_PATH)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f200703",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

132
stats/stats_utils.py Normal file
View File

@@ -0,0 +1,132 @@
import re
from typing import Any, Dict, List, Optional
# Shape check for a bare email address: exactly one "@", no whitespace
# anywhere, and a "." with at least one character on each side in the part
# after the "@". This is a loose sanity check, not an RFC 5322 validator.
_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
def _collapse_spaces(s: str) -> str:
return re.sub(r"\s+", " ", s).strip()
def _dedupe_preserve_order(values: List[str]) -> List[str]:
seen = set()
out: List[str] = []
for v in values:
if v in seen:
continue
seen.add(v)
out.append(v)
return out
def normalize_email(value: Any) -> Optional[str]:
    """Return a canonical lowercase email address, or None if *value* is not one.

    *value* may be of any type; it is converted with ``str``.  Surrounding
    whitespace, enclosing angle brackets and a leading ``mailto:`` scheme are
    removed in either nesting order, so ``"<mailto:A@B.com>"`` and
    ``"mailto:<A@B.com>"`` both normalize to ``"a@b.com"``.

    Returns:
        The cleaned address if it matches ``_EMAIL_RE``; otherwise ``None``
        (this includes ``None`` and blank input).
    """
    if value is None:
        return None
    s = str(value).strip()
    if not s:
        return None
    # Peel angle brackets *before* looking for "mailto:"; otherwise
    # "<mailto:a@b.com>" keeps its scheme and still passes _EMAIL_RE.
    s = s.strip("<>").strip()
    s = re.sub(r"^mailto:\s*", "", s, flags=re.IGNORECASE)
    # Second pass covers "mailto:<a@b.com>" and whitespace left inside brackets.
    s = _collapse_spaces(s.lower()).strip("<>").strip()
    return s if _EMAIL_RE.match(s) else None
def normalize_phone_keep_all(value: Any) -> Optional[str]:
    """Reduce *value* to its digit characters.

    *value* is converted with ``str``; every non-digit character is dropped
    and the digits are kept in their original order.  Returns ``None`` for
    ``None``, blank input, or input that contains no digit.
    """
    if value is None:
        return None
    text = str(value).strip()
    digits_only = re.sub(r"\D", "", text) if text else ""
    return digits_only if digits_only else None
def normalize_username(value: Any) -> Optional[str]:
    """
    USERNAME: convert to ``str``, trim surrounding whitespace, lowercase.

    Returns ``None`` when *value* is ``None`` or blank after trimming.
    """
    trimmed = "" if value is None else str(value).strip()
    return trimmed.lower() if trimmed else None
def normalize_person_name(value: Any) -> Optional[str]:
    """
    PERSON_NAME: convert to ``str``, lowercase, and collapse whitespace runs.

    Returns ``None`` when *value* is ``None`` or blank after trimming.
    """
    if value is None:
        return None
    trimmed = str(value).strip()
    return _collapse_spaces(trimmed.lower()) if trimmed else None
def normalize_postal_address(value: Any) -> Optional[str]:
    """
    POSTAL_ADDRESS: lowercase, collapse whitespace runs, and rewrite every
    comma (with any whitespace around it) as a comma followed by one space.

    Returns ``None`` when *value* is ``None`` or nothing is left after cleaning.
    """
    if value is None:
        return None
    address = _collapse_spaces(str(value)).lower()
    if address:
        # Re-collapse afterwards: a trailing comma would otherwise leave ", " at the end.
        address = _collapse_spaces(re.sub(r"\s*,\s*", ", ", address))
    return address if address else None
def normalize_source_column(value: Any) -> Optional[str]:
    """Return *value* as a lowercase, whitespace-collapsed string.

    Returns ``None`` when *value* is ``None`` or blank.
    """
    if value is None:
        return None
    column = _collapse_spaces(str(value)).lower()
    return column if column else None
def normalize_pii_value(pii_type: str, value: Any) -> Optional[str]:
    """Normalize *value* with the normalizer that belongs to *pii_type*.

    *pii_type* is matched after trimming and uppercasing; a falsy type is
    treated as the empty string.  Recognized types are EMAIL, PHONE,
    USERNAME, PERSON_NAME and POSTAL_ADDRESS.  Any other type yields
    ``None``, as does a value the chosen normalizer rejects.
    """
    normalizers = {
        "EMAIL": normalize_email,
        "PHONE": normalize_phone_keep_all,
        "USERNAME": normalize_username,
        "PERSON_NAME": normalize_person_name,
        "POSTAL_ADDRESS": normalize_postal_address,
    }
    normalizer = normalizers.get((pii_type or "").strip().upper())
    return None if normalizer is None else normalizer(value)
def normalize_and_slim_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a slimmed, normalized copy of one result record.

    The output contains only:
    db_path, PII_type, PII, Num_of_PII, source_columns, Num_of_source_columns

    PII values are normalized according to the record's (trimmed, uppercased)
    PII_type, and source columns are normalized too.  Values that normalize
    to None are dropped, duplicates are removed keeping first occurrence, and
    both counts are recomputed from the cleaned lists.  A non-list ``PII`` or
    ``source_columns`` entry is treated as a single item (or as empty when it
    is None).
    """
    path = rec.get("db_path", "")
    kind = (rec.get("PII_type") or "").strip().upper()

    raw_values = rec.get("PII", [])
    if not isinstance(raw_values, list):
        raw_values = [] if raw_values is None else [raw_values]

    raw_columns = rec.get("source_columns", [])
    if not isinstance(raw_columns, list):
        raw_columns = [] if raw_columns is None else [raw_columns]

    candidate_values = (normalize_pii_value(kind, item) for item in raw_values)
    clean_values: List[str] = _dedupe_preserve_order(
        [item for item in candidate_values if item is not None]
    )

    candidate_columns = (normalize_source_column(col) for col in raw_columns)
    clean_columns: List[str] = _dedupe_preserve_order(
        [col for col in candidate_columns if col is not None]
    )

    return {
        "db_path": path,
        "PII_type": kind,
        "PII": clean_values,
        "Num_of_PII": len(clean_values),
        "source_columns": clean_columns,
        "Num_of_source_columns": len(clean_columns),
    }