mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
187 lines
13 KiB
Plaintext
187 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "c2d824a6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"PII_A1_commerce_20260211T022802Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A1_commerce_20260211T022802Z.jsonl\n",
|
|
"PII_A1_msgstore_20260211T024003Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A1_msgstore_20260211T024003Z.jsonl\n",
|
|
"PII_A1_wa_20260211T024706Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A1_wa_20260211T024706Z.jsonl\n",
|
|
"PII_A2_core_20260211T023156Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A2_core_20260211T023156Z.jsonl\n",
|
|
"PII_A2_journal_20260211T023216Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A2_journal_20260211T023216Z.jsonl\n",
|
|
"PII_A2_main_20260211T023611Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A2_main_20260211T023611Z.jsonl\n",
|
|
"PII_A3_account1cache4_20260211T023627Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A3_account1cache4_20260211T023627Z.jsonl\n",
|
|
"PII_A3_account2cache4_20260211T023643Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A3_account2cache4_20260211T023643Z.jsonl\n",
|
|
"PII_A3_account3cache4_20260211T023659Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A3_account3cache4_20260211T023659Z.jsonl\n",
|
|
"PII_A4_gmm_myplaces_20260211T025539Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A4_gmm_myplaces_20260211T025539Z.jsonl\n",
|
|
"PII_A4_gmm_storage_20260211T025558Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A4_gmm_storage_20260211T025558Z.jsonl\n",
|
|
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260211T025625Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260211T025625Z.jsonl\n",
|
|
"PII_A5_SBrowser2_20260211T025812Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A5_SBrowser2_20260211T025812Z.jsonl\n",
|
|
"PII_A5_SBrowser_20260211T025741Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A5_SBrowser_20260211T025741Z.jsonl\n",
|
|
"PII_A5_searchengine_20260211T025835Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_A5_searchengine_20260211T025835Z.jsonl\n",
|
|
"PII_I1_CallHistory_20260211T025226Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I1_CallHistory_20260211T025226Z.jsonl\n",
|
|
"PII_I1_ChatStorage_20260211T025417Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I1_ChatStorage_20260211T025417Z.jsonl\n",
|
|
"PII_I1_ContactsV2_20260211T030947Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I1_ContactsV2_20260211T030947Z.jsonl\n",
|
|
"PII_I2_AddressBook_20260211T030143Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I2_AddressBook_20260211T030143Z.jsonl\n",
|
|
"PII_I2_AddressBookImages_20260211T030157Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I2_AddressBookImages_20260211T030157Z.jsonl\n",
|
|
"PII_I3_sms_20260211T030251Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I3_sms_20260211T030251Z.jsonl\n",
|
|
"PII_I4_CloudTabs_20260211T030321Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I4_CloudTabs_20260211T030321Z.jsonl\n",
|
|
"PII_I4_History_20260211T030358Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I4_History_20260211T030358Z.jsonl\n",
|
|
"PII_I5_Calendar_20260211T030514Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I5_Calendar_20260211T030514Z.jsonl\n",
|
|
"PII_I5_Extras_20260211T030537Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\\PII_I5_Extras_20260211T030537Z.jsonl\n",
|
|
"Done. Files: 25, Records: 125\n",
|
|
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\db_level\n",
|
|
"PII_A1_commerce.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_commerce.jsonl\n",
|
|
"PII_A1_msgstore.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_msgstore.jsonl\n",
|
|
"PII_A1_wa.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_wa.jsonl\n",
|
|
"PII_A2_core.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_core.jsonl\n",
|
|
"PII_A2_journal.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_journal.jsonl\n",
|
|
"PII_A2_main.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_main.jsonl\n",
|
|
"PII_A3_account1cache4.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account1cache4.jsonl\n",
|
|
"PII_A3_account2cache4.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account2cache4.jsonl\n",
|
|
"PII_A3_account3cache4.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account3cache4.jsonl\n",
|
|
"PII_A4_gmm_myplaces.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_myplaces.jsonl\n",
|
|
"PII_A4_gmm_storage.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_storage.jsonl\n",
|
|
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14.jsonl\n",
|
|
"PII_A5_SBrowser.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser.jsonl\n",
|
|
"PII_A5_SBrowser2.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser2.jsonl\n",
|
|
"PII_A5_searchengine.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_searchengine.jsonl\n",
|
|
"PII_I1_CallHistory.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_CallHistory.jsonl\n",
|
|
"PII_I1_ChatStorage.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ChatStorage.jsonl\n",
|
|
"PII_I1_ContactsV2.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ContactsV2.jsonl\n",
|
|
"PII_I2_AddressBook.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBook.jsonl\n",
|
|
"PII_I2_AddressBookImages.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBookImages.jsonl\n",
|
|
"PII_I3_sms.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I3_sms.jsonl\n",
|
|
"PII_I4_CloudTabs.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_CloudTabs.jsonl\n",
|
|
"PII_I4_History.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_History.jsonl\n",
|
|
"PII_I5_Calendar.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Calendar.jsonl\n",
|
|
"PII_I5_Extras.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Extras.jsonl\n",
|
|
"Done. Files: 25, Records: 125\n",
|
|
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import json\n",
|
|
"import shutil\n",
|
|
"import sys\n",
|
|
"from pathlib import Path\n",
|
|
"from typing import Callable, Tuple\n",
|
|
"\n",
|
|
"\n",
|
def process_file_jsonl(
    in_path: Path,
    out_path: Path,
    normalize_record_fn: Callable[[dict], dict],
) -> int:
    """Normalize one JSONL file record by record and write the result as JSONL.

    Every non-blank line of `in_path` is parsed as JSON. Lines whose top-level
    value is not a JSON object are skipped. Each object is passed through
    `normalize_record_fn` and the returned value is written to `out_path` as a
    single JSON line (non-ASCII characters are kept as-is, not escaped).

    Args:
        in_path: JSONL file to read (UTF-8, with or without a leading BOM).
        out_path: JSONL file to create or overwrite (UTF-8).
        normalize_record_fn: Callable applied to each parsed JSON object; its
            return value must be JSON-serializable.

    Returns:
        The number of records written to `out_path`.

    Raises:
        ValueError: If `in_path` and `out_path` resolve to the same file.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    # Opening the output in "w" mode truncates it immediately; if it is the
    # same file as the input, every record would be lost before being read.
    if in_path.resolve() == out_path.resolve():
        raise ValueError(f"Input and output are the same file: {in_path}")

    written = 0
    # "utf-8-sig" drops a leading byte-order mark if present and otherwise
    # behaves like plain UTF-8; with plain "utf-8" the BOM would stay glued to
    # the first record and make json.loads reject it.
    with in_path.open("r", encoding="utf-8-sig") as fin, out_path.open("w", encoding="utf-8") as fout:
        for line_no, raw_line in enumerate(fin, start=1):
            text = raw_line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                # Same exception type as before, but with enough context to
                # locate the offending line.
                raise json.JSONDecodeError(
                    f"{in_path} line {line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            if not isinstance(record, dict):
                continue
            slim = normalize_record_fn(record)
            fout.write(json.dumps(slim, ensure_ascii=False) + "\n")
            written += 1
    return written
|
|
"\n",
|
|
"\n",
|
def normalize_jsonl_folder(
    in_dir: Path,
    out_dir: Path,
    normalize_record_fn: Callable[[dict], dict],
    *,
    delete_out_dir_first: bool = True,
) -> Tuple[int, int]:
    """Normalize every *.jsonl file in `in_dir` into `out_dir` (same filenames).

    Files are processed in sorted order and one progress line is printed per
    file, followed by a summary. Only files directly inside `in_dir` are
    considered (no recursion).

    Args:
        in_dir: Folder containing the input *.jsonl files.
        out_dir: Folder that receives the normalized files; created if missing.
        normalize_record_fn: Callable applied to every JSON object record.
        delete_out_dir_first: When True, an existing `out_dir` (directory or
            plain file) is removed before anything is written.

    Returns:
        (num_files_processed, num_records_written)

    Raises:
        ValueError: If `in_dir` and `out_dir` are the same folder, or if
            `delete_out_dir_first` is set and `in_dir` lies inside `out_dir`.
            In both cases the inputs would be destroyed before being read.
    """
    src_root = in_dir.resolve()
    dst_root = out_dir.resolve()
    if src_root == dst_root:
        raise ValueError(f"in_dir and out_dir are the same folder: {src_root}")
    if delete_out_dir_first and dst_root in src_root.parents:
        raise ValueError(
            f"Refusing to delete out_dir {dst_root}: it contains in_dir {src_root}"
        )

    if delete_out_dir_first and out_dir.exists():
        if out_dir.is_dir():
            shutil.rmtree(out_dir)
        else:
            out_dir.unlink()

    out_dir.mkdir(parents=True, exist_ok=True)

    files = sorted(in_dir.glob("*.jsonl"))
    if not files:
        print(f"No .jsonl files found in: {in_dir.resolve()}")
        return (0, 0)

    total_records = 0
    for src_file in files:
        dst_file = out_dir / src_file.name
        written = process_file_jsonl(src_file, dst_file, normalize_record_fn)
        print(f"{src_file.name}: {written} records -> {dst_file}")
        total_records += written

    print(f"Done. Files: {len(files)}, Records: {total_records}")
    print(f"Output folder: {out_dir.resolve()}")
    return (len(files), total_records)
|
|
"\n",
|
|
"\n",
|
|
"# ---- Example usage (your exact paths) ----\n",
|
if __name__ == "__main__":
    # Folder containing stats_utils.py (the parent of the working directory);
    # it is put on sys.path so the import below resolves.
    STATS_DIR = Path("..").resolve()
    sys.path.insert(0, str(STATS_DIR))

    from stats_utils import normalize_and_slim_record

    # One pass per result set. Paths are assembled with "/" so they work on
    # both Windows and POSIX; a raw string with backslashes is a single,
    # non-existent path component on POSIX.
    for result_set in ("GPT-5.1", "ground_truth"):
        IN_DIR = Path("..") / ".." / "model_PII_results" / result_set
        RESULTS_DIR = Path("normalized_PII_results") / result_set / "db_level"
        OUT_DIR = STATS_DIR / RESULTS_DIR

        normalize_jsonl_folder(
            IN_DIR,
            OUT_DIR,
            normalize_and_slim_record,
            delete_out_dir_first=True,
        )
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0074eda4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.18"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|