Refactor code: mainly organize files into folders.

This commit is contained in:
Frank Xu
2026-02-01 19:35:36 -05:00
parent 5643228ff6
commit 583c27ba0b
67 changed files with 535 additions and 231 deletions

View File

@@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "c2d824a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PII_A1_commerce_20260131T203324Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A1_commerce_20260131T203324Z.jsonl\n",
"PII_A1_msgstore_20260131T203502Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A1_msgstore_20260131T203502Z.jsonl\n",
"PII_A1_wa_20260131T203943Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A1_wa_20260131T203943Z.jsonl\n",
"PII_A2_core_20260131T204055Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A2_core_20260131T204055Z.jsonl\n",
"PII_A2_journal_20260131T204142Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A2_journal_20260131T204142Z.jsonl\n",
"PII_A2_main_20260131T204345Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A2_main_20260131T204345Z.jsonl\n",
"PII_A3_account1cache4_20260131T204410Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A3_account1cache4_20260131T204410Z.jsonl\n",
"PII_A3_account2cache4_20260131T204617Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A3_account2cache4_20260131T204617Z.jsonl\n",
"PII_A3_account3cache4_20260131T204642Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A3_account3cache4_20260131T204642Z.jsonl\n",
"PII_A4_gmm_myplaces_20260131T204709Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A4_gmm_myplaces_20260131T204709Z.jsonl\n",
"PII_A4_gmm_storage_20260131T204738Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A4_gmm_storage_20260131T204738Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260131T204833Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260131T204833Z.jsonl\n",
"PII_A5_SBrowser2_20260131T205010Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A5_SBrowser2_20260131T205010Z.jsonl\n",
"PII_A5_SBrowser_20260131T204925Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A5_SBrowser_20260131T204925Z.jsonl\n",
"PII_A5_searchengine_20260131T205101Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_A5_searchengine_20260131T205101Z.jsonl\n",
"PII_I1_CallHistory_20260131T205155Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I1_CallHistory_20260131T205155Z.jsonl\n",
"PII_I1_ChatStorage_20260131T205309Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I1_ChatStorage_20260131T205309Z.jsonl\n",
"PII_I1_ContactsV2_20260131T210034Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I1_ContactsV2_20260131T210034Z.jsonl\n",
"PII_I2_AddressBook_20260131T210607Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I2_AddressBook_20260131T210607Z.jsonl\n",
"PII_I2_AddressBookImages_20260131T210641Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I2_AddressBookImages_20260131T210641Z.jsonl\n",
"PII_I3_sms_20260131T210735Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I3_sms_20260131T210735Z.jsonl\n",
"PII_I4_CloudTabs_20260131T210821Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I4_CloudTabs_20260131T210821Z.jsonl\n",
"PII_I4_History_20260131T210912Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I4_History_20260131T210912Z.jsonl\n",
"PII_I5_Calendar_20260131T211008Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I5_Calendar_20260131T211008Z.jsonl\n",
"PII_I5_Extras_20260131T211054Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\\PII_I5_Extras_20260131T211054Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\db_level\n",
"PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_commerce_20260127T175911Z.jsonl\n",
"PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
"PII_A1_wa_20260127T180213Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A1_wa_20260127T180213Z.jsonl\n",
"PII_A2_core_20260127T180339Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_core_20260127T180339Z.jsonl\n",
"PII_A2_journal_20260127T180440Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_journal_20260127T180440Z.jsonl\n",
"PII_A2_main_20260127T180710Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A2_main_20260127T180710Z.jsonl\n",
"PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
"PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
"PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
"PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
"PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
"PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
"PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
"PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
"PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
"PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
"PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
"PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
"PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
"PII_I3_sms_20260127T183606Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I3_sms_20260127T183606Z.jsonl\n",
"PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
"PII_I4_History_20260127T183727Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I4_History_20260127T183727Z.jsonl\n",
"PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
"PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\\PII_I5_Extras_20260127T183857Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\db_level\n"
]
}
],
"source": [
"import json\n",
"import shutil\n",
"import sys\n",
"from pathlib import Path\n",
"from typing import Callable, Tuple\n",
"\n",
"\n",
"def process_file_jsonl(\n",
"    in_path: Path,\n",
"    out_path: Path,\n",
"    normalize_record_fn: Callable[[dict], dict],\n",
") -> int:\n",
"    \"\"\"Normalize one JSONL file record by record; return the number of records written.\"\"\"\n",
"    n = 0\n",
"    with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
"        for line in fin:\n",
"            line = line.strip()\n",
"            if not line:\n",
"                continue  # skip blank lines\n",
"            obj = json.loads(line)\n",
"            if not isinstance(obj, dict):\n",
"                continue  # only dict records are normalized; other JSON values are dropped\n",
"            slim = normalize_record_fn(obj)\n",
"            fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
"            n += 1\n",
"    return n\n",
"\n",
"\n",
"def normalize_jsonl_folder(\n",
"    in_dir: Path,\n",
"    out_dir: Path,\n",
"    normalize_record_fn: Callable[[dict], dict],\n",
"    *,\n",
"    delete_out_dir_first: bool = True,\n",
") -> Tuple[int, int]:\n",
"    \"\"\"\n",
"    Normalize every *.jsonl file in `in_dir` and write outputs (same filenames) to `out_dir`.\n",
"\n",
"    Returns: (num_files_processed, num_records_written)\n",
"    \"\"\"\n",
"    if delete_out_dir_first and out_dir.exists():\n",
"        if out_dir.is_dir():\n",
"            shutil.rmtree(out_dir)\n",
"        else:\n",
"            out_dir.unlink()  # a stray file occupying the output folder's name\n",
"\n",
"    out_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
"    files = sorted(in_dir.glob(\"*.jsonl\"))\n",
"    if not files:\n",
"        print(f\"No .jsonl files found in: {in_dir.resolve()}\")\n",
"        return (0, 0)\n",
"\n",
"    total_records = 0\n",
"    for fp in files:\n",
"        out_fp = out_dir / fp.name\n",
"        n = process_file_jsonl(fp, out_fp, normalize_record_fn)\n",
"        print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
"        total_records += n\n",
"\n",
"    print(f\"Done. Files: {len(files)}, Records: {total_records}\")\n",
"    print(f\"Output folder: {out_dir.resolve()}\")\n",
"    return (len(files), total_records)\n",
"\n",
"\n",
"# ---- Example usage ----\n",
"if __name__ == \"__main__\":\n",
"    # Folder containing stats_utils.py (the notebook's parent directory;\n",
"    # resolved relative to the notebook's working directory).\n",
"    STATS_DIR = Path(\"..\").resolve()\n",
"    sys.path.insert(0, str(STATS_DIR))\n",
"\n",
"    from stats_utils import normalize_and_slim_record\n",
"\n",
"    # Each run: (input folder, results folder relative to STATS_DIR).\n",
"    RUNS = [\n",
"        (Path(r\"..\\..\\batch_results_gpt4o\"), Path(r\"normalized_PII_results\\gpt4o\\db_level\")),\n",
"        (Path(r\"..\\..\\ground_truth\"), Path(r\"normalized_PII_results\\ground_truth\\db_level\")),\n",
"    ]\n",
"    for in_dir, results_dir in RUNS:\n",
"        out_dir = STATS_DIR / results_dir\n",
"        normalize_jsonl_folder(in_dir, out_dir, normalize_and_slim_record, delete_out_dir_first=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0074eda4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "234eed3f",
"metadata": {},
"outputs": [
@@ -10,8 +10,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ2\\RQ2_app_level_gpt4o.jsonl\n",
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ2\\RQ2_app_level_ground_truth.jsonl\n"
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\app_level\\app_level.jsonl\n",
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\app_level\\app_level.jsonl\n"
]
}
],
@@ -180,24 +180,24 @@
" return out_path\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
"if __name__ == \"__main__\": \n",
" # --- Aggregate GPT-4o results ---\n",
" IN_DIR = Path(r\"..\\batch_results_gpt4o_normalized\")\n",
" OUT_DIR = Path(r\".\") # pick whatever folder you want\n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\gpt4o\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\gpt4o\\app_level\")\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
" out_path = OUT_DIR / \"RQ2_app_level_gpt4o.jsonl\"\n",
" out_path = OUT_DIR / \"app_level.jsonl\"\n",
"\n",
" out = aggregate_jsonl_folder(IN_DIR, out_path)\n",
" print(f\"Wrote: {out.resolve()}\")\n",
" \n",
" # --- Aggregate ground truth as well ---\n",
" \n",
" IN_DIR = Path(r\"..\\ground_truth_normalized\")\n",
" OUT_DIR = Path(r\".\") # pick whatever folder you want\n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\app_level\")\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
" out_path = OUT_DIR / \"RQ2_app_level_ground_truth.jsonl\"\n",
" out_path = OUT_DIR / \"app_level.jsonl\"\n",
"\n",
" out = aggregate_jsonl_folder(IN_DIR, out_path)\n",
" print(f\"Wrote: {out.resolve()}\")\n",

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "234eed3f",
"metadata": {},
"outputs": [
@@ -10,8 +10,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_gpt4o.jsonl\n",
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_ground_truth.jsonl\n"
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\gpt4o\\corpus_level\\corpus_level.jsonl\n",
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\corpus_level\\corpus_level.jsonl\n"
]
}
],
@@ -146,17 +146,21 @@
"\n",
"if __name__ == \"__main__\":\n",
" # Aggregate GPT-4o results\n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\gpt4o\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\gpt4o\\corpus_level\\corpus_level.jsonl\")\n",
" out = aggregate_jsonl_folder_corpus_level(\n",
" r\"..\\batch_results_gpt4o_normalized\",\n",
" \"RQ3_corpus_level_gpt4o.jsonl\",\n",
" IN_DIR,\n",
" OUT_DIR\n",
" )\n",
" print(f\"Wrote: {out.resolve()}\")\n",
" \n",
" \n",
" # Aggregate Ground Truth results\n",
"# # Aggregate Ground Truth results\n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\corpus_level\\corpus_level.jsonl\")\n",
" out = aggregate_jsonl_folder_corpus_level(\n",
" r\"..\\ground_truth_normalized\",\n",
" \"RQ3_corpus_level_ground_truth.jsonl\",\n",
" IN_DIR,\n",
" OUT_DIR\n",
" )\n",
" print(f\"Wrote: {out.resolve()}\")"
]

View File

@@ -1,184 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "c2d824a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PII_A1_commerce_20260131T203324Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A1_commerce_20260131T203324Z.jsonl\n",
"PII_A1_msgstore_20260131T203502Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A1_msgstore_20260131T203502Z.jsonl\n",
"PII_A1_wa_20260131T203943Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A1_wa_20260131T203943Z.jsonl\n",
"PII_A2_core_20260131T204055Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A2_core_20260131T204055Z.jsonl\n",
"PII_A2_journal_20260131T204142Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A2_journal_20260131T204142Z.jsonl\n",
"PII_A2_main_20260131T204345Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A2_main_20260131T204345Z.jsonl\n",
"PII_A3_account1cache4_20260131T204410Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A3_account1cache4_20260131T204410Z.jsonl\n",
"PII_A3_account2cache4_20260131T204617Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A3_account2cache4_20260131T204617Z.jsonl\n",
"PII_A3_account3cache4_20260131T204642Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A3_account3cache4_20260131T204642Z.jsonl\n",
"PII_A4_gmm_myplaces_20260131T204709Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A4_gmm_myplaces_20260131T204709Z.jsonl\n",
"PII_A4_gmm_storage_20260131T204738Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A4_gmm_storage_20260131T204738Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260131T204833Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260131T204833Z.jsonl\n",
"PII_A5_SBrowser2_20260131T205010Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A5_SBrowser2_20260131T205010Z.jsonl\n",
"PII_A5_SBrowser_20260131T204925Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A5_SBrowser_20260131T204925Z.jsonl\n",
"PII_A5_searchengine_20260131T205101Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_A5_searchengine_20260131T205101Z.jsonl\n",
"PII_I1_CallHistory_20260131T205155Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I1_CallHistory_20260131T205155Z.jsonl\n",
"PII_I1_ChatStorage_20260131T205309Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I1_ChatStorage_20260131T205309Z.jsonl\n",
"PII_I1_ContactsV2_20260131T210034Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I1_ContactsV2_20260131T210034Z.jsonl\n",
"PII_I2_AddressBook_20260131T210607Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I2_AddressBook_20260131T210607Z.jsonl\n",
"PII_I2_AddressBookImages_20260131T210641Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I2_AddressBookImages_20260131T210641Z.jsonl\n",
"PII_I3_sms_20260131T210735Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I3_sms_20260131T210735Z.jsonl\n",
"PII_I4_CloudTabs_20260131T210821Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I4_CloudTabs_20260131T210821Z.jsonl\n",
"PII_I4_History_20260131T210912Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I4_History_20260131T210912Z.jsonl\n",
"PII_I5_Calendar_20260131T211008Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I5_Calendar_20260131T211008Z.jsonl\n",
"PII_I5_Extras_20260131T211054Z.jsonl: 5 records -> ..\\batch_results_gpt4o_normalized\\PII_I5_Extras_20260131T211054Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\batch_results_gpt4o_normalized\n",
"PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A1_commerce_20260127T175911Z.jsonl\n",
"PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
"PII_A1_wa_20260127T180213Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A1_wa_20260127T180213Z.jsonl\n",
"PII_A2_core_20260127T180339Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A2_core_20260127T180339Z.jsonl\n",
"PII_A2_journal_20260127T180440Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A2_journal_20260127T180440Z.jsonl\n",
"PII_A2_main_20260127T180710Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A2_main_20260127T180710Z.jsonl\n",
"PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
"PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
"PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
"PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
"PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
"PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
"PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
"PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
"PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
"PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
"PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
"PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
"PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
"PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
"PII_I3_sms_20260127T183606Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I3_sms_20260127T183606Z.jsonl\n",
"PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
"PII_I4_History_20260127T183727Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I4_History_20260127T183727Z.jsonl\n",
"PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
"PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> ..\\ground_truth_normalized\\PII_I5_Extras_20260127T183857Z.jsonl\n",
"Done. Files: 25, Records: 125\n",
"Output folder: I:\\project2026\\llmagent\\RQs\\ground_truth_normalized\n"
]
}
],
"source": [
"import json\n",
"import shutil\n",
"import sys\n",
"from pathlib import Path\n",
"from typing import Callable, Tuple\n",
"\n",
"\n",
"def process_file_jsonl(\n",
" in_path: Path,\n",
" out_path: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
") -> int:\n",
" n = 0\n",
" with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
" for line in fin:\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
" obj = json.loads(line)\n",
" if not isinstance(obj, dict):\n",
" continue\n",
" slim = normalize_record_fn(obj)\n",
" fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
" n += 1\n",
" return n\n",
"\n",
"\n",
"def normalize_jsonl_folder(\n",
" in_dir: Path,\n",
" out_dir: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
" *,\n",
" delete_out_dir_first: bool = True,\n",
") -> Tuple[int, int]:\n",
" \"\"\"\n",
" Normalize every *.jsonl file in `in_dir` and write outputs (same filenames) to `out_dir`.\n",
"\n",
" Returns: (num_files_processed, num_records_written)\n",
" \"\"\"\n",
" if delete_out_dir_first and out_dir.exists():\n",
" if out_dir.is_dir():\n",
" shutil.rmtree(out_dir)\n",
" else:\n",
" out_dir.unlink()\n",
"\n",
" out_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
" files = sorted(in_dir.glob(\"*.jsonl\"))\n",
" if not files:\n",
" print(f\"No .jsonl files found in: {in_dir.resolve()}\")\n",
" return (0, 0)\n",
"\n",
" total_records = 0\n",
" for fp in files:\n",
" out_fp = out_dir / fp.name\n",
" n = process_file_jsonl(fp, out_fp, normalize_record_fn)\n",
" print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
" total_records += n\n",
"\n",
" print(f\"Done. Files: {len(files)}, Records: {total_records}\")\n",
" print(f\"Output folder: {out_dir.resolve()}\")\n",
" return (len(files), total_records)\n",
"\n",
"\n",
"# ---- Example usage (your exact paths) ----\n",
"if __name__ == \"__main__\":\n",
" STATS_DIR = Path(r\"I:\\project2026\\llmagent\\RQs\").resolve() # folder containing stats_utils.py\n",
" sys.path.insert(0, str(STATS_DIR))\n",
"\n",
" from stats_utils import normalize_and_slim_record\n",
"\n",
" IN_DIR = Path(r\"..\\..\\batch_results_gpt4o\")\n",
" OUT_DIR = Path(r\"..\\batch_results_gpt4o_normalized\")\n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" \n",
" IN_DIR = Path(r\"..\\..\\ground_truth\")\n",
" OUT_DIR = Path(r\"..\\ground_truth_normalized\")\n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0074eda4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "e15e3ffe",
"metadata": {},
"outputs": [
@@ -306,7 +306,7 @@
"\n",
"\n",
"def main() -> None:\n",
" in_dir = Path(r\"..\\batch_results_gpt4o_normalized\")\n",
" in_dir = Path(r\"..\\normalized_PII_results\\gpt4o\\db_level\")\n",
" out_tex = Path(\"RQ1_t4.tex\")\n",
" out_csv = Path(\"RQ1_t4_plain.csv\")\n",
" generate_db_level_pii_tables(in_dir, out_tex, out_csv)\n",

View File

@@ -27,5 +27,9 @@ I4 & Safari & 74 & 7 & 90.54\% \\
\hline
I5 & Calendar & 541 & 0 & 100.00\% \\
\hline
test2 & test2 & 12 & 0 & 100.00\% \\
\hline
users & users & 3 & 0 & 100.00\% \\
\hline
\end{tabular}
\end{table}

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "a30eef73",
"metadata": {},
"outputs": [
@@ -10,20 +10,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
"ID Apps CandidateCols ColsScanned Reduc(%)\n",
"----------------------------------------------------------\n",
"A1 WhatsApp 1637 14 99.14%\n",
"A2 Snapchat 848 2 99.76%\n",
"A3 Telegram 1197 0 100.00%\n",
"A4 Google Maps 80 2 97.50%\n",
"A5 Samsung Internet 185 11 94.05%\n",
"I1 WhatsApp 328 6 98.17%\n",
"I2 Contacts 13 13 0.00%\n",
"I3 Apple Messages 186 0 100.00%\n",
"I4 Safari 74 7 90.54%\n",
"I5 Calendar 541 0 100.00%\n",
"ID Apps CandidateCols ColsScanned Reduc(%)\n",
"-------------------------------------------------------------\n",
"A1 WhatsApp 1637 14 99.14%\n",
"A2 Snapchat 848 2 99.76%\n",
"A3 Telegram 1197 0 100.00%\n",
"A4 Google Maps 80 2 97.50%\n",
"A5 Samsung Internet 185 11 94.05%\n",
"I1 WhatsApp 328 6 98.17%\n",
"I2 Contacts 13 13 0.00%\n",
"I3 Apple Messages 186 0 100.00%\n",
"I4 Safari 74 7 90.54%\n",
"I5 Calendar 541 0 100.00%\n",
"test2 test2 12 0 100.00%\n",
"users users 3 0 100.00%\n",
"\n",
"Wrote LaTeX: I:\\project2026\\llmagent\\RQs\\RQ2\\search_space_reduction_gpt4o.tex\n"
"Wrote LaTeX: I:\\project2026\\llmagent\\RQs\\RQ2\\RQ2_search_space_reduction_gpt4o.tex\n"
]
}
],
@@ -35,7 +37,9 @@
"from typing import Dict\n",
"\n",
"CSV_PATH = Path(r\"app_total_columns.csv\")\n",
"JSONL_PATH = Path(r\"RQ2_app_level_gpt4o.jsonl\")\n",
"\n",
"BASE_DIR=Path(r\"..\\normalized_PII_results\\gpt4o\\app_level\")\n",
"JSONL_PATH = BASE_DIR /Path(r\"app_level.jsonl\")\n",
"OUT_TEX = Path(\"RQ2_search_space_reduction_gpt4o.tex\")\n",
"\n",
"APP_NAME_PLAIN = OrderedDict([\n",

View File

@@ -8,4 +8,6 @@ I1,WhatsApp,328
I2,Contacts,13
I3,Apple Messages,186
I4,Safari,74
I5,Calendar,541
I5,Calendar,541
test2,test2,12
users,users,3
1 app_code app_name total_columns
8 I2 Contacts 13
9 I3 Apple Messages 186
10 I4 Safari 74
11 I5 Calendar 541
12 test2 test2 12
13 users users 3

File diff suppressed because one or more lines are too long

View File

@@ -17,13 +17,13 @@
"\\textbf{Recall} &\n",
"\\textbf{Precision} \\\\\n",
"\\hline\n",
"Email Address & 10 & 9 & 9 & 90.0\\% & 100.0\\% \\\\\n",
"Email Address & 10 & 10 & 10 & 100.0\\% & 100.0\\% \\\\\n",
"\\hline\n",
"Phone Number & 1050 & 1050 & 1050 & 100.0\\% & 100.0\\% \\\\\n",
"Phone Number & 1050 & 801 & 225 & 21.4\\% & 28.1\\% \\\\\n",
"\\hline\n",
"User Name & 85 & 85 & 85 & 100.0\\% & 100.0\\% \\\\\n",
"User Name & 85 & 751 & 8 & 9.4\\% & 1.1\\% \\\\\n",
"\\hline\n",
"Person Name & 909 & 909 & 909 & 100.0\\% & 100.0\\% \\\\\n",
"Person Name & 909 & 787 & 783 & 86.1\\% & 99.5\\% \\\\\n",
"\\hline\n",
"Postal Address & 2 & 2 & 2 & 100.0\\% & 100.0\\% \\\\\n",
"\\hline\n"
@@ -35,8 +35,8 @@
"from pathlib import Path\n",
"from typing import Dict, Set, List\n",
"\n",
"SYSTEM_PATH = Path(\"RQ3_corpus_level_gpt4o.jsonl\")\n",
"GT_PATH = Path(\"RQ3_corpus_level_ground_truth.jsonl\")\n",
"SYSTEM_PATH = Path(r\"..\\normalized_PII_results\\gpt4o\\corpus_level\\corpus_level.jsonl\")\n",
"GT_PATH = Path(r\"..\\normalized_PII_results\\ground_truth\\corpus_level\\corpus_level.jsonl\")\n",
"\n",
"PII_TYPE_ORDER = [\"EMAIL\", \"PHONE\", \"USERNAME\", \"PERSON_NAME\", \"POSTAL_ADDRESS\"]\n",
"PII_TYPE_DISPLAY = {\n",

238
RQs/RQ3/RQ_t8.ipynb Normal file
View File

@@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1affac71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PLAIN TEXT TABLE\n",
"\n",
"PII Type | GT DBs | System DBs | Overlap | Coverage\n",
"---------------+--------+------------+---------+---------\n",
"Email Address | 0 | 6 | 0 | 0.0% \n",
"Phone Number | 0 | 6 | 0 | 0.0% \n",
"User Name | 0 | 10 | 0 | 0.0% \n",
"Person Name | 0 | 7 | 0 | 0.0% \n",
"Postal Address | 0 | 1 | 0 | 0.0% \n",
"\n",
"LATEX TABULAR\n",
"\n",
"\\begin{tabular}{|l|p{1.2cm}|p{1.5cm}|p{1.0cm}|p{1.2cm}|}\n",
"\\hline\n",
"\\textbf{PII Type} &\\textbf{DBs with PII (GT)} &\\textbf{DBs with discoveries (System)} &\\textbf{Overlap} &\\textbf{Coverage} \\\\\n",
"\\hline\n",
"Email Address & 0 & 6 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"Phone Number & 0 & 6 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"User Name & 0 & 10 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"Person Name & 0 & 7 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"Postal Address & 0 & 1 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"\\end{tabular}\n"
]
}
],
"source": [
"\n",
"import json\n",
"from collections import defaultdict\n",
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"from typing import Dict, Iterable, List, Mapping, Set, Tuple\n",
"\n",
"\n",
"# ---- Configurable PII types (JSONL -> table label) ----\n",
"PII_TYPES: List[Tuple[str, str]] = [\n",
" (\"EMAIL\", \"Email Address\"),\n",
" (\"PHONE\", \"Phone Number\"),\n",
" (\"USERNAME\", \"User Name\"),\n",
" (\"PERSON_NAME\", \"Person Name\"),\n",
" (\"POSTAL_ADDRESS\", \"Postal Address\"),\n",
"]\n",
"\n",
"\n",
"def _db_key_from_record(rec: Mapping) -> str:\n",
" \"\"\"\n",
" Prefer db_path from JSONL, fall back to 'unknown_db' if missing.\n",
" Example db_path: 'selectedDBs\\\\A1_msgstore.db' -> 'A1_msgstore'\n",
" \"\"\"\n",
" db_path = str(rec.get(\"db_path\", \"\")).strip()\n",
" if not db_path:\n",
" return \"unknown_db\"\n",
" return Path(db_path).stem\n",
"\n",
"\n",
"def _has_any_pii(rec: Mapping) -> bool:\n",
" \"\"\"\n",
" Treat a PII type as present in a DB if the record has at least one entity.\n",
" Uses the PII list when available; falls back to Num_of_PII.\n",
" \"\"\"\n",
" pii_list = rec.get(\"PII\", None)\n",
" if isinstance(pii_list, list):\n",
" return len(pii_list) > 0\n",
" try:\n",
" return int(rec.get(\"Num_of_PII\", 0)) > 0\n",
" except Exception:\n",
" return False\n",
"\n",
"\n",
"def collect_db_sets(folder: Path, pii_types: Iterable[str]) -> Dict[str, Set[str]]:\n",
" \"\"\"\n",
" Returns: pii_type -> {db_key, ...} where that pii_type appears at least once.\n",
" \"\"\"\n",
" wanted = set(pii_types)\n",
" db_sets: Dict[str, Set[str]] = defaultdict(set)\n",
"\n",
" files = sorted(folder.glob(\"*.jsonl\"))\n",
" if not files:\n",
" raise FileNotFoundError(f\"No .jsonl files found in: {folder}\")\n",
"\n",
" for fp in files:\n",
" with fp.open(\"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
" rec = json.loads(line)\n",
" pii_type = str(rec.get(\"PII_type\", \"\")).strip()\n",
" if pii_type not in wanted:\n",
" continue\n",
" if _has_any_pii(rec):\n",
" db_sets[pii_type].add(_db_key_from_record(rec))\n",
"\n",
" for t in wanted:\n",
" db_sets.setdefault(t, set())\n",
"\n",
" return db_sets\n",
"\n",
"\n",
"@dataclass(frozen=True)\n",
"class CoverageRow:\n",
" label: str\n",
" gt: int\n",
" sys: int\n",
" overlap: int\n",
" coverage_pct: float\n",
"\n",
"\n",
"def compute_coverage(\n",
" gt_sets: Dict[str, Set[str]],\n",
" sys_sets: Dict[str, Set[str]],\n",
" pii_types: List[Tuple[str, str]],\n",
") -> List[CoverageRow]:\n",
" rows: List[CoverageRow] = []\n",
" for key, label in pii_types:\n",
" dg = gt_sets.get(key, set())\n",
" ds = sys_sets.get(key, set())\n",
" inter = dg & ds\n",
" cov = (len(inter) / len(dg) * 100.0) if len(dg) else 0.0\n",
" rows.append(CoverageRow(label, len(dg), len(ds), len(inter), cov))\n",
" return rows\n",
"\n",
"\n",
"def render_latex_tabular(rows: List[CoverageRow]) -> str:\n",
" \"\"\"\n",
" Print only the tabular environment (as requested).\n",
" \"\"\"\n",
" lines: List[str] = []\n",
" lines.append(r\"\\begin{tabular}{|l|p{1.2cm}|p{1.5cm}|p{1.0cm}|p{1.2cm}|}\")\n",
" lines.append(r\"\\hline\")\n",
" lines.append(\n",
" r\"\\textbf{PII Type} &\"\n",
" r\"\\textbf{DBs with PII (GT)} &\"\n",
" r\"\\textbf{DBs with discoveries (System)} &\"\n",
" r\"\\textbf{Overlap} &\"\n",
" r\"\\textbf{Coverage} \\\\\"\n",
" )\n",
" lines.append(r\"\\hline\")\n",
"\n",
" for r in rows:\n",
" lines.append(\n",
" f\"{r.label} & {r.gt} & {r.sys} & {r.overlap} & {r.coverage_pct:.1f}\\\\% \\\\\\\\\"\n",
" )\n",
" lines.append(r\"\\hline\")\n",
"\n",
" lines.append(r\"\\end{tabular}\")\n",
" return \"\\n\".join(lines)\n",
"\n",
"\n",
"def render_plain_text_table(rows: List[CoverageRow]) -> str:\n",
" \"\"\"\n",
" Simple fixed-width table for quick reading in terminal.\n",
" \"\"\"\n",
" headers = [\"PII Type\", \"GT DBs\", \"System DBs\", \"Overlap\", \"Coverage\"]\n",
" data = [\n",
" [r.label, str(r.gt), str(r.sys), str(r.overlap), f\"{r.coverage_pct:.1f}%\"]\n",
" for r in rows\n",
" ]\n",
"\n",
" # compute column widths\n",
" widths = [len(h) for h in headers]\n",
" for row in data:\n",
" for i, cell in enumerate(row):\n",
" widths[i] = max(widths[i], len(cell))\n",
"\n",
" def fmt_row(row: List[str]) -> str:\n",
" return \" | \".join(cell.ljust(widths[i]) for i, cell in enumerate(row))\n",
"\n",
" sep = \"-+-\".join(\"-\" * w for w in widths)\n",
"\n",
" out: List[str] = []\n",
" out.append(fmt_row(headers))\n",
" out.append(sep)\n",
" for row in data:\n",
" out.append(fmt_row(row))\n",
" return \"\\n\".join(out)\n",
"\n",
"\n",
"def main() -> None:\n",
" # Define these inside main so importing this module has no side effects.\n",
" SYSTEM_DIR = Path(r\"..\\normalized_PII_results\\gpt4o\\db_level\")\n",
" GT_DIR = Path(r\"..\\normalized_PII_results\\gpt4o\\app_level\")\n",
" \n",
" gt_sets = collect_db_sets(GT_DIR, [k for k, _ in PII_TYPES])\n",
" sys_sets = collect_db_sets(SYSTEM_DIR, [k for k, _ in PII_TYPES])\n",
"\n",
" rows = compute_coverage(gt_sets, sys_sets, PII_TYPES)\n",
"\n",
" print(\"PLAIN TEXT TABLE\\n\")\n",
" print(render_plain_text_table(rows))\n",
" print(\"\\nLATEX TABULAR\\n\")\n",
" print(render_latex_tabular(rows))\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long