mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
239 lines
8.3 KiB
Plaintext
239 lines
8.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1affac71",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"PLAIN TEXT TABLE\n",
|
|
"\n",
|
|
"PII Type | GT DBs | System DBs | Overlap | Coverage\n",
|
|
"---------------+--------+------------+---------+---------\n",
|
|
"Email Address | 0 | 6 | 0 | 0.0% \n",
|
|
"Phone Number | 0 | 6 | 0 | 0.0% \n",
|
|
"User Name | 0 | 10 | 0 | 0.0% \n",
|
|
"Person Name | 0 | 7 | 0 | 0.0% \n",
|
|
"Postal Address | 0 | 1 | 0 | 0.0% \n",
|
|
"\n",
|
|
"LATEX TABULAR\n",
|
|
"\n",
|
|
"\\begin{tabular}{|l|p{1.2cm}|p{1.5cm}|p{1.0cm}|p{1.2cm}|}\n",
|
|
"\\hline\n",
|
|
"\\textbf{PII Type} &\\textbf{DBs with PII (GT)} &\\textbf{DBs with discoveries (System)} &\\textbf{Overlap} &\\textbf{Coverage} \\\\\n",
|
|
"\\hline\n",
|
|
"Email Address & 0 & 6 & 0 & 0.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Phone Number & 0 & 6 & 0 & 0.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"User Name & 0 & 10 & 0 & 0.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Person Name & 0 & 7 & 0 & 0.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Postal Address & 0 & 1 & 0 & 0.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"\\end{tabular}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"\n",
|
|
"import json\n",
|
|
"from collections import defaultdict\n",
|
|
"from dataclasses import dataclass\n",
|
|
"from pathlib import Path\n",
|
|
"from typing import Dict, Iterable, List, Mapping, Set, Tuple\n",
|
|
"\n",
|
|
"\n",
|
|
"# ---- Configurable PII types (JSONL PII_type value -> table label) ----\n",
|
|
"PII_TYPES: List[Tuple[str, str]] = [\n",
|
|
" (\"EMAIL\", \"Email Address\"),\n",
|
|
" (\"PHONE\", \"Phone Number\"),\n",
|
|
" (\"USERNAME\", \"User Name\"),\n",
|
|
" (\"PERSON_NAME\", \"Person Name\"),\n",
|
|
" (\"POSTAL_ADDRESS\", \"Postal Address\"),\n",
|
|
"]\n",
|
|
"\n",
|
|
"\n",
|
|
"def _db_key_from_record(rec: Mapping) -> str:\n",
|
|
" \"\"\"\n",
|
|
" Prefer db_path from JSONL, fall back to 'unknown_db' if missing.\n",
|
|
"    Example db_path: 'selectedDBs\\\\A1_msgstore.db' -> 'A1_msgstore' (on Windows; a POSIX Path does not split on backslashes, so the folder prefix stays in the stem).\n",
|
|
" \"\"\"\n",
|
|
" db_path = str(rec.get(\"db_path\", \"\")).strip()\n",
|
|
" if not db_path:\n",
|
|
" return \"unknown_db\"\n",
|
|
" return Path(db_path).stem\n",
|
|
"\n",
|
|
"\n",
|
|
"def _has_any_pii(rec: Mapping) -> bool:\n",
|
|
" \"\"\"\n",
|
|
" Treat a PII type as present in a DB if the record has at least one entity.\n",
|
|
"    Uses the PII list when it is a list; otherwise falls back to Num_of_PII (values that cannot be parsed as int count as no PII).\n",
|
|
" \"\"\"\n",
|
|
" pii_list = rec.get(\"PII\", None)\n",
|
|
" if isinstance(pii_list, list):\n",
|
|
" return len(pii_list) > 0\n",
|
|
" try:\n",
|
|
" return int(rec.get(\"Num_of_PII\", 0)) > 0\n",
|
|
" except Exception:\n",
|
|
" return False\n",
|
|
"\n",
|
|
"\n",
|
|
"def collect_db_sets(folder: Path, pii_types: Iterable[str]) -> Dict[str, Set[str]]:\n",
|
|
" \"\"\"\n",
|
|
"    Returns: pii_type -> {db_key, ...} for DBs where that pii_type has at least one record with PII; every requested type gets a (possibly empty) set.\n",
|
|
" \"\"\"\n",
|
|
" wanted = set(pii_types)\n",
|
|
" db_sets: Dict[str, Set[str]] = defaultdict(set)\n",
|
|
"\n",
|
|
" files = sorted(folder.glob(\"*.jsonl\"))\n",
|
|
" if not files:\n",
|
|
" raise FileNotFoundError(f\"No .jsonl files found in: {folder}\")\n",
|
|
"\n",
|
|
" for fp in files:\n",
|
|
" with fp.open(\"r\", encoding=\"utf-8\") as f:\n",
|
|
" for line in f:\n",
|
|
" line = line.strip()\n",
|
|
" if not line:\n",
|
|
" continue\n",
|
|
" rec = json.loads(line)\n",
|
|
" pii_type = str(rec.get(\"PII_type\", \"\")).strip()\n",
|
|
" if pii_type not in wanted:\n",
|
|
" continue\n",
|
|
" if _has_any_pii(rec):\n",
|
|
" db_sets[pii_type].add(_db_key_from_record(rec))\n",
|
|
"\n",
|
|
" for t in wanted:\n",
|
|
" db_sets.setdefault(t, set())\n",
|
|
"\n",
|
|
" return db_sets\n",
|
|
"\n",
|
|
"\n",
|
|
"@dataclass(frozen=True)\n",
|
|
"class CoverageRow:\n",
|
|
" label: str\n",
|
|
" gt: int\n",
|
|
" sys: int\n",
|
|
" overlap: int\n",
|
|
" coverage_pct: float\n",
|
|
"\n",
|
|
"\n",
|
|
"def compute_coverage(\n",
|
|
" gt_sets: Dict[str, Set[str]],\n",
|
|
" sys_sets: Dict[str, Set[str]],\n",
|
|
" pii_types: List[Tuple[str, str]],\n",
|
|
") -> List[CoverageRow]:\n",
|
|
" rows: List[CoverageRow] = []\n",
|
|
" for key, label in pii_types:\n",
|
|
" dg = gt_sets.get(key, set())\n",
|
|
" ds = sys_sets.get(key, set())\n",
|
|
" inter = dg & ds\n",
|
|
" cov = (len(inter) / len(dg) * 100.0) if len(dg) else 0.0\n",
|
|
" rows.append(CoverageRow(label, len(dg), len(ds), len(inter), cov))\n",
|
|
" return rows\n",
|
|
"\n",
|
|
"\n",
|
|
"def render_latex_tabular(rows: List[CoverageRow]) -> str:\n",
|
|
" \"\"\"\n",
|
|
"    Return only the tabular environment as a single string; the caller prints it.\n",
|
|
" \"\"\"\n",
|
|
" lines: List[str] = []\n",
|
|
" lines.append(r\"\\begin{tabular}{|l|p{1.2cm}|p{1.5cm}|p{1.0cm}|p{1.2cm}|}\")\n",
|
|
" lines.append(r\"\\hline\")\n",
|
|
" lines.append(\n",
|
|
" r\"\\textbf{PII Type} &\"\n",
|
|
" r\"\\textbf{DBs with PII (GT)} &\"\n",
|
|
" r\"\\textbf{DBs with discoveries (System)} &\"\n",
|
|
" r\"\\textbf{Overlap} &\"\n",
|
|
" r\"\\textbf{Coverage} \\\\\"\n",
|
|
" )\n",
|
|
" lines.append(r\"\\hline\")\n",
|
|
"\n",
|
|
" for r in rows:\n",
|
|
" lines.append(\n",
|
|
" f\"{r.label} & {r.gt} & {r.sys} & {r.overlap} & {r.coverage_pct:.1f}\\\\% \\\\\\\\\"\n",
|
|
" )\n",
|
|
" lines.append(r\"\\hline\")\n",
|
|
"\n",
|
|
" lines.append(r\"\\end{tabular}\")\n",
|
|
" return \"\\n\".join(lines)\n",
|
|
"\n",
|
|
"\n",
|
|
"def render_plain_text_table(rows: List[CoverageRow]) -> str:\n",
|
|
" \"\"\"\n",
|
|
"    Return a simple fixed-width table as a string for quick reading in a terminal.\n",
|
|
" \"\"\"\n",
|
|
" headers = [\"PII Type\", \"GT DBs\", \"System DBs\", \"Overlap\", \"Coverage\"]\n",
|
|
" data = [\n",
|
|
" [r.label, str(r.gt), str(r.sys), str(r.overlap), f\"{r.coverage_pct:.1f}%\"]\n",
|
|
" for r in rows\n",
|
|
" ]\n",
|
|
"\n",
|
|
" # compute column widths\n",
|
|
" widths = [len(h) for h in headers]\n",
|
|
" for row in data:\n",
|
|
" for i, cell in enumerate(row):\n",
|
|
" widths[i] = max(widths[i], len(cell))\n",
|
|
"\n",
|
|
" def fmt_row(row: List[str]) -> str:\n",
|
|
" return \" | \".join(cell.ljust(widths[i]) for i, cell in enumerate(row))\n",
|
|
"\n",
|
|
" sep = \"-+-\".join(\"-\" * w for w in widths)\n",
|
|
"\n",
|
|
" out: List[str] = []\n",
|
|
" out.append(fmt_row(headers))\n",
|
|
" out.append(sep)\n",
|
|
" for row in data:\n",
|
|
" out.append(fmt_row(row))\n",
|
|
" return \"\\n\".join(out)\n",
|
|
"\n",
|
|
"\n",
|
|
"def main() -> None:\n",
|
|
" # Define these inside main so importing this module has no side effects.\n",
|
|
" SYSTEM_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\db_level\")\n",
|
|
" GT_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\app_level\")\n",
|
|
" \n",
|
|
" gt_sets = collect_db_sets(GT_DIR, [k for k, _ in PII_TYPES])\n",
|
|
" sys_sets = collect_db_sets(SYSTEM_DIR, [k for k, _ in PII_TYPES])\n",
|
|
"\n",
|
|
" rows = compute_coverage(gt_sets, sys_sets, PII_TYPES)\n",
|
|
"\n",
|
|
" print(\"PLAIN TEXT TABLE\\n\")\n",
|
|
" print(render_plain_text_table(rows))\n",
|
|
" print(\"\\nLATEX TABULAR\\n\")\n",
|
|
" print(render_latex_tabular(rows))\n",
|
|
"\n",
|
|
"\n",
|
|
"if __name__ == \"__main__\":\n",
|
|
" main()\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.18"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|