Files
mobile-pii-discovery-agent/RQs/RQ3/RQ3_t8.ipynb
2026-02-11 22:29:04 -05:00

239 lines
8.3 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1affac71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PLAIN TEXT TABLE\n",
"\n",
"PII Type | GT DBs | System DBs | Overlap | Coverage\n",
"---------------+--------+------------+---------+---------\n",
"Email Address | 0 | 6 | 0 | 0.0% \n",
"Phone Number | 0 | 6 | 0 | 0.0% \n",
"User Name | 0 | 10 | 0 | 0.0% \n",
"Person Name | 0 | 7 | 0 | 0.0% \n",
"Postal Address | 0 | 1 | 0 | 0.0% \n",
"\n",
"LATEX TABULAR\n",
"\n",
"\\begin{tabular}{|l|p{1.2cm}|p{1.5cm}|p{1.0cm}|p{1.2cm}|}\n",
"\\hline\n",
"\\textbf{PII Type} &\\textbf{DBs with PII (GT)} &\\textbf{DBs with discoveries (System)} &\\textbf{Overlap} &\\textbf{Coverage} \\\\\n",
"\\hline\n",
"Email Address & 0 & 6 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"Phone Number & 0 & 6 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"User Name & 0 & 10 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"Person Name & 0 & 7 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"Postal Address & 0 & 1 & 0 & 0.0\\% \\\\\n",
"\\hline\n",
"\\end{tabular}\n"
]
}
],
"source": [
"\n",
"import json\n",
"from collections import defaultdict\n",
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"from typing import Dict, Iterable, List, Mapping, Set, Tuple\n",
"\n",
"\n",
"# ---- Configurable PII types (JSONL -> table label) ----\n",
"PII_TYPES: List[Tuple[str, str]] = [\n",
" (\"EMAIL\", \"Email Address\"),\n",
" (\"PHONE\", \"Phone Number\"),\n",
" (\"USERNAME\", \"User Name\"),\n",
" (\"PERSON_NAME\", \"Person Name\"),\n",
" (\"POSTAL_ADDRESS\", \"Postal Address\"),\n",
"]\n",
"\n",
"\n",
"def _db_key_from_record(rec: Mapping) -> str:\n",
" \"\"\"\n",
" Prefer db_path from JSONL, fall back to 'unknown_db' if missing.\n",
" Example db_path: 'selectedDBs\\\\A1_msgstore.db' -> 'A1_msgstore'\n",
" \"\"\"\n",
" db_path = str(rec.get(\"db_path\", \"\")).strip()\n",
" if not db_path:\n",
" return \"unknown_db\"\n",
" return Path(db_path).stem\n",
"\n",
"\n",
"def _has_any_pii(rec: Mapping) -> bool:\n",
" \"\"\"\n",
" Treat a PII type as present in a DB if the record has at least one entity.\n",
" Uses the PII list when available; falls back to Num_of_PII.\n",
" \"\"\"\n",
" pii_list = rec.get(\"PII\", None)\n",
" if isinstance(pii_list, list):\n",
" return len(pii_list) > 0\n",
" try:\n",
" return int(rec.get(\"Num_of_PII\", 0)) > 0\n",
" except Exception:\n",
" return False\n",
"\n",
"\n",
"def collect_db_sets(folder: Path, pii_types: Iterable[str]) -> Dict[str, Set[str]]:\n",
" \"\"\"\n",
" Returns: pii_type -> {db_key, ...} where that pii_type appears at least once.\n",
" \"\"\"\n",
" wanted = set(pii_types)\n",
" db_sets: Dict[str, Set[str]] = defaultdict(set)\n",
"\n",
" files = sorted(folder.glob(\"*.jsonl\"))\n",
" if not files:\n",
" raise FileNotFoundError(f\"No .jsonl files found in: {folder}\")\n",
"\n",
" for fp in files:\n",
" with fp.open(\"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
" rec = json.loads(line)\n",
" pii_type = str(rec.get(\"PII_type\", \"\")).strip()\n",
" if pii_type not in wanted:\n",
" continue\n",
" if _has_any_pii(rec):\n",
" db_sets[pii_type].add(_db_key_from_record(rec))\n",
"\n",
" for t in wanted:\n",
" db_sets.setdefault(t, set())\n",
"\n",
" return db_sets\n",
"\n",
"\n",
"@dataclass(frozen=True)\n",
"class CoverageRow:\n",
" label: str\n",
" gt: int\n",
" sys: int\n",
" overlap: int\n",
" coverage_pct: float\n",
"\n",
"\n",
"def compute_coverage(\n",
" gt_sets: Dict[str, Set[str]],\n",
" sys_sets: Dict[str, Set[str]],\n",
" pii_types: List[Tuple[str, str]],\n",
") -> List[CoverageRow]:\n",
" rows: List[CoverageRow] = []\n",
" for key, label in pii_types:\n",
" dg = gt_sets.get(key, set())\n",
" ds = sys_sets.get(key, set())\n",
" inter = dg & ds\n",
" cov = (len(inter) / len(dg) * 100.0) if len(dg) else 0.0\n",
" rows.append(CoverageRow(label, len(dg), len(ds), len(inter), cov))\n",
" return rows\n",
"\n",
"\n",
"def render_latex_tabular(rows: List[CoverageRow]) -> str:\n",
" \"\"\"\n",
" Print only the tabular environment (as requested).\n",
" \"\"\"\n",
" lines: List[str] = []\n",
" lines.append(r\"\\begin{tabular}{|l|p{1.2cm}|p{1.5cm}|p{1.0cm}|p{1.2cm}|}\")\n",
" lines.append(r\"\\hline\")\n",
" lines.append(\n",
" r\"\\textbf{PII Type} &\"\n",
" r\"\\textbf{DBs with PII (GT)} &\"\n",
" r\"\\textbf{DBs with discoveries (System)} &\"\n",
" r\"\\textbf{Overlap} &\"\n",
" r\"\\textbf{Coverage} \\\\\"\n",
" )\n",
" lines.append(r\"\\hline\")\n",
"\n",
" for r in rows:\n",
" lines.append(\n",
" f\"{r.label} & {r.gt} & {r.sys} & {r.overlap} & {r.coverage_pct:.1f}\\\\% \\\\\\\\\"\n",
" )\n",
" lines.append(r\"\\hline\")\n",
"\n",
" lines.append(r\"\\end{tabular}\")\n",
" return \"\\n\".join(lines)\n",
"\n",
"\n",
"def render_plain_text_table(rows: List[CoverageRow]) -> str:\n",
" \"\"\"\n",
" Simple fixed-width table for quick reading in terminal.\n",
" \"\"\"\n",
" headers = [\"PII Type\", \"GT DBs\", \"System DBs\", \"Overlap\", \"Coverage\"]\n",
" data = [\n",
" [r.label, str(r.gt), str(r.sys), str(r.overlap), f\"{r.coverage_pct:.1f}%\"]\n",
" for r in rows\n",
" ]\n",
"\n",
" # compute column widths\n",
" widths = [len(h) for h in headers]\n",
" for row in data:\n",
" for i, cell in enumerate(row):\n",
" widths[i] = max(widths[i], len(cell))\n",
"\n",
" def fmt_row(row: List[str]) -> str:\n",
" return \" | \".join(cell.ljust(widths[i]) for i, cell in enumerate(row))\n",
"\n",
" sep = \"-+-\".join(\"-\" * w for w in widths)\n",
"\n",
" out: List[str] = []\n",
" out.append(fmt_row(headers))\n",
" out.append(sep)\n",
" for row in data:\n",
" out.append(fmt_row(row))\n",
" return \"\\n\".join(out)\n",
"\n",
"\n",
"def main() -> None:\n",
" # Define these inside main so importing this module has no side effects.\n",
" SYSTEM_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\db_level\")\n",
" GT_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\app_level\")\n",
" \n",
" gt_sets = collect_db_sets(GT_DIR, [k for k, _ in PII_TYPES])\n",
" sys_sets = collect_db_sets(SYSTEM_DIR, [k for k, _ in PII_TYPES])\n",
"\n",
" rows = compute_coverage(gt_sets, sys_sets, PII_TYPES)\n",
"\n",
" print(\"PLAIN TEXT TABLE\\n\")\n",
" print(render_plain_text_table(rows))\n",
" print(\"\\nLATEX TABULAR\\n\")\n",
" print(render_latex_tabular(rows))\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}