Files
mobile-pii-discovery-agent/RQs/RQ0/RQ0_1_results_normalization.ipynb
2026-02-18 16:22:33 -05:00

187 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c2d824a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PII_A1_commerce_20260211T022802Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A1_commerce_20260211T022802Z.jsonl\n",
"PII_A1_msgstore_20260211T024003Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A1_msgstore_20260211T024003Z.jsonl\n",
"PII_A1_wa_20260211T024706Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A1_wa_20260211T024706Z.jsonl\n",
"PII_A2_core_20260211T023156Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A2_core_20260211T023156Z.jsonl\n",
"PII_A2_journal_20260211T023216Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A2_journal_20260211T023216Z.jsonl\n",
"PII_A2_main_20260211T023611Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A2_main_20260211T023611Z.jsonl\n",
"PII_A3_account1cache4_20260211T023627Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A3_account1cache4_20260211T023627Z.jsonl\n",
"PII_A3_account2cache4_20260211T023643Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A3_account2cache4_20260211T023643Z.jsonl\n",
"PII_A3_account3cache4_20260211T023659Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A3_account3cache4_20260211T023659Z.jsonl\n",
"PII_A4_gmm_myplaces_20260211T025539Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A4_gmm_myplaces_20260211T025539Z.jsonl\n",
"PII_A4_gmm_storage_20260211T025558Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A4_gmm_storage_20260211T025558Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260211T025625Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260211T025625Z.jsonl\n",
"PII_A5_SBrowser2_20260211T025812Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A5_SBrowser2_20260211T025812Z.jsonl\n",
"PII_A5_SBrowser_20260211T025741Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A5_SBrowser_20260211T025741Z.jsonl\n",
"PII_A5_searchengine_20260211T025835Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A5_searchengine_20260211T025835Z.jsonl\n",
"PII_I1_CallHistory_20260211T025226Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I1_CallHistory_20260211T025226Z.jsonl\n",
"PII_I1_ChatStorage_20260211T025417Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I1_ChatStorage_20260211T025417Z.jsonl\n",
"PII_I1_ContactsV2_20260211T030947Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I1_ContactsV2_20260211T030947Z.jsonl\n",
"PII_I2_AddressBook_20260211T030143Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I2_AddressBook_20260211T030143Z.jsonl\n",
"PII_I2_AddressBookImages_20260211T030157Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I2_AddressBookImages_20260211T030157Z.jsonl\n",
"PII_I3_sms_20260211T030251Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I3_sms_20260211T030251Z.jsonl\n",
"PII_I4_CloudTabs_20260211T030321Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I4_CloudTabs_20260211T030321Z.jsonl\n",
"PII_I4_History_20260211T030358Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I4_History_20260211T030358Z.jsonl\n",
"PII_I5_Calendar_20260211T030514Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I5_Calendar_20260211T030514Z.jsonl\n",
"PII_I5_Extras_20260211T030537Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I5_Extras_20260211T030537Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\n",
"PII_A1_commerce.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_commerce.jsonl\n",
"PII_A1_msgstore.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_msgstore.jsonl\n",
"PII_A1_wa.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_wa.jsonl\n",
"PII_A2_core.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_core.jsonl\n",
"PII_A2_journal.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_journal.jsonl\n",
"PII_A2_main.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_main.jsonl\n",
"PII_A3_account1cache4.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account1cache4.jsonl\n",
"PII_A3_account2cache4.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account2cache4.jsonl\n",
"PII_A3_account3cache4.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account3cache4.jsonl\n",
"PII_A4_gmm_myplaces.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_myplaces.jsonl\n",
"PII_A4_gmm_storage.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_storage.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14.jsonl\n",
"PII_A5_SBrowser.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser.jsonl\n",
"PII_A5_SBrowser2.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser2.jsonl\n",
"PII_A5_searchengine.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_searchengine.jsonl\n",
"PII_I1_CallHistory.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_CallHistory.jsonl\n",
"PII_I1_ChatStorage.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ChatStorage.jsonl\n",
"PII_I1_ContactsV2.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ContactsV2.jsonl\n",
"PII_I2_AddressBook.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBook.jsonl\n",
"PII_I2_AddressBookImages.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBookImages.jsonl\n",
"PII_I3_sms.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I3_sms.jsonl\n",
"PII_I4_CloudTabs.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_CloudTabs.jsonl\n",
"PII_I4_History.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_History.jsonl\n",
"PII_I5_Calendar.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Calendar.jsonl\n",
"PII_I5_Extras.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Extras.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\n"
]
}
],
"source": [
"import json\n",
"import shutil\n",
"import sys\n",
"from pathlib import Path\n",
"from typing import Callable, Tuple\n",
"\n",
"\n",
"def process_file_jsonl(\n",
" in_path: Path,\n",
" out_path: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
") -> int:\n",
" n = 0\n",
" with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
" for line in fin:\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
" obj = json.loads(line)\n",
" if not isinstance(obj, dict):\n",
" continue\n",
" slim = normalize_record_fn(obj)\n",
" fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
" n += 1\n",
" return n\n",
"\n",
"\n",
"def normalize_jsonl_folder(\n",
" in_dir: Path,\n",
" out_dir: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
" *,\n",
" delete_out_dir_first: bool = True,\n",
") -> Tuple[int, int]:\n",
" \"\"\"\n",
" Normalize every *.jsonl file in `in_dir` and write outputs (same filenames) to `out_dir`.\n",
"\n",
" Returns: (num_files_processed, num_records_written)\n",
" \"\"\"\n",
" if delete_out_dir_first and out_dir.exists():\n",
" if out_dir.is_dir():\n",
" shutil.rmtree(out_dir)\n",
" else:\n",
" out_dir.unlink()\n",
"\n",
" out_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
" files = sorted(in_dir.glob(\"*.jsonl\"))\n",
" if not files:\n",
" print(f\"No .jsonl files found in: {in_dir.resolve()}\")\n",
" return (0, 0)\n",
"\n",
" total_records = 0\n",
" for fp in files:\n",
" out_fp = out_dir / fp.name\n",
" n = process_file_jsonl(fp, out_fp, normalize_record_fn)\n",
" print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
" total_records += n\n",
"\n",
" print(f\"Done. Files: {len(files)}, Records: {total_records}\")\n",
" print(f\"Output folder: {out_dir.resolve()}\")\n",
" return (len(files), total_records)\n",
"\n",
"\n",
"# ---- Example usage (your exact paths) ----\n",
"if __name__ == \"__main__\":\n",
" STATS_DIR = Path(\"..\") .resolve()# folder containing stats_utils.py\n",
" sys.path.insert(0, str(STATS_DIR))\n",
"\n",
" from stats_utils import normalize_and_slim_record\n",
"\n",
" IN_DIR = Path(r\"..\\..\\model_PII_results\\GPT-5.1\")\n",
" RESULTS_DIR = Path(r\"normalized_PII_results\\GPT-5.1\\db_level\")\n",
" OUT_DIR = STATS_DIR/ RESULTS_DIR \n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" \n",
" IN_DIR = Path(r\"..\\..\\model_PII_results\\ground_truth\")\n",
" RESULTS_DIR = Path(r\"normalized_PII_results\\ground_truth\\db_level\")\n",
" OUT_DIR = STATS_DIR/ RESULTS_DIR \n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0074eda4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}