Files
mobile-pii-discovery-agent/RQs/RQ0/RQ0_1_results_normalization.ipynb

187 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "c2d824a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PII_A1_commerce_20260131T203324Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A1_commerce_20260131T203324Z.jsonl\n",
"PII_A1_msgstore_20260131T203502Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A1_msgstore_20260131T203502Z.jsonl\n",
"PII_A1_wa_20260131T203943Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A1_wa_20260131T203943Z.jsonl\n",
"PII_A2_core_20260131T204055Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A2_core_20260131T204055Z.jsonl\n",
"PII_A2_journal_20260131T204142Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A2_journal_20260131T204142Z.jsonl\n",
"PII_A2_main_20260131T204345Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A2_main_20260131T204345Z.jsonl\n",
"PII_A3_account1cache4_20260131T204410Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A3_account1cache4_20260131T204410Z.jsonl\n",
"PII_A3_account2cache4_20260131T204617Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A3_account2cache4_20260131T204617Z.jsonl\n",
"PII_A3_account3cache4_20260131T204642Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A3_account3cache4_20260131T204642Z.jsonl\n",
"PII_A4_gmm_myplaces_20260131T204709Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A4_gmm_myplaces_20260131T204709Z.jsonl\n",
"PII_A4_gmm_storage_20260131T204738Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A4_gmm_storage_20260131T204738Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260131T204833Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260131T204833Z.jsonl\n",
"PII_A5_SBrowser2_20260131T205010Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A5_SBrowser2_20260131T205010Z.jsonl\n",
"PII_A5_SBrowser_20260131T204925Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A5_SBrowser_20260131T204925Z.jsonl\n",
"PII_A5_searchengine_20260131T205101Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A5_searchengine_20260131T205101Z.jsonl\n",
"PII_I1_CallHistory_20260131T205155Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I1_CallHistory_20260131T205155Z.jsonl\n",
"PII_I1_ChatStorage_20260131T205309Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I1_ChatStorage_20260131T205309Z.jsonl\n",
"PII_I1_ContactsV2_20260131T210034Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I1_ContactsV2_20260131T210034Z.jsonl\n",
"PII_I2_AddressBook_20260131T210607Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I2_AddressBook_20260131T210607Z.jsonl\n",
"PII_I2_AddressBookImages_20260131T210641Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I2_AddressBookImages_20260131T210641Z.jsonl\n",
"PII_I3_sms_20260131T210735Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I3_sms_20260131T210735Z.jsonl\n",
"PII_I4_CloudTabs_20260131T210821Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I4_CloudTabs_20260131T210821Z.jsonl\n",
"PII_I4_History_20260131T210912Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I4_History_20260131T210912Z.jsonl\n",
"PII_I5_Calendar_20260131T211008Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I5_Calendar_20260131T211008Z.jsonl\n",
"PII_I5_Extras_20260131T211054Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I5_Extras_20260131T211054Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\n",
"PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_commerce_20260127T175911Z.jsonl\n",
"PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
"PII_A1_wa_20260127T180213Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_wa_20260127T180213Z.jsonl\n",
"PII_A2_core_20260127T180339Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_core_20260127T180339Z.jsonl\n",
"PII_A2_journal_20260127T180440Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_journal_20260127T180440Z.jsonl\n",
"PII_A2_main_20260127T180710Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_main_20260127T180710Z.jsonl\n",
"PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
"PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
"PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
"PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
"PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
"PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
"PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
"PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
"PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
"PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
"PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
"PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
"PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
"PII_I3_sms_20260127T183606Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I3_sms_20260127T183606Z.jsonl\n",
"PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
"PII_I4_History_20260127T183727Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_History_20260127T183727Z.jsonl\n",
"PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
"PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Extras_20260127T183857Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\n"
]
}
],
"source": [
"import json\n",
"import shutil\n",
"import sys\n",
"from pathlib import Path\n",
"from typing import Callable, Tuple\n",
"\n",
"\n",
"def process_file_jsonl(\n",
" in_path: Path,\n",
" out_path: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
") -> int:\n",
" n = 0\n",
" with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
" for line in fin:\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
" obj = json.loads(line)\n",
" if not isinstance(obj, dict):\n",
" continue\n",
" slim = normalize_record_fn(obj)\n",
" fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
" n += 1\n",
" return n\n",
"\n",
"\n",
"def normalize_jsonl_folder(\n",
" in_dir: Path,\n",
" out_dir: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
" *,\n",
" delete_out_dir_first: bool = True,\n",
") -> Tuple[int, int]:\n",
" \"\"\"\n",
" Normalize every *.jsonl file in `in_dir` and write outputs (same filenames) to `out_dir`.\n",
"\n",
" Returns: (num_files_processed, num_records_written)\n",
" \"\"\"\n",
" if delete_out_dir_first and out_dir.exists():\n",
" if out_dir.is_dir():\n",
" shutil.rmtree(out_dir)\n",
" else:\n",
" out_dir.unlink()\n",
"\n",
" out_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
" files = sorted(in_dir.glob(\"*.jsonl\"))\n",
" if not files:\n",
" print(f\"No .jsonl files found in: {in_dir.resolve()}\")\n",
" return (0, 0)\n",
"\n",
" total_records = 0\n",
" for fp in files:\n",
" out_fp = out_dir / fp.name\n",
" n = process_file_jsonl(fp, out_fp, normalize_record_fn)\n",
" print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
" total_records += n\n",
"\n",
" print(f\"Done. Files: {len(files)}, Records: {total_records}\")\n",
" print(f\"Output folder: {out_dir.resolve()}\")\n",
" return (len(files), total_records)\n",
"\n",
"\n",
"# ---- Example usage (your exact paths) ----\n",
"if __name__ == \"__main__\":\n",
" STATS_DIR = Path(\"..\") .resolve()# folder containing stats_utils.py\n",
" sys.path.insert(0, str(STATS_DIR))\n",
"\n",
" from stats_utils import normalize_and_slim_record\n",
"\n",
" IN_DIR = Path(r\"..\\..\\model_PII_results\\gpt4o\")\n",
" RESULTS_DIR = Path(r\"normalized_PII_results\\gpt4o\\db_level\")\n",
" OUT_DIR = STATS_DIR/ RESULTS_DIR \n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" \n",
" IN_DIR = Path(r\"..\\..\\model_PII_results\\ground_truth\")\n",
" RESULTS_DIR = Path(r\"normalized_PII_results\\ground_truth\\db_level\")\n",
" OUT_DIR = STATS_DIR/ RESULTS_DIR \n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0074eda4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}