mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
233 lines
7.9 KiB
Plaintext
233 lines
7.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "ebe9dbb1",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\\begin{table*}\n",
|
|
"\\centering\n",
|
|
"\\small\n",
|
|
"\\caption{Distribution of distinct PII entities discovered across evaluated methods and PII categories.}\n",
|
|
"\\label{tab:model_yield}\n",
|
|
"\\begin{tabular}{|l|l|l|l|l|l|l|l|}\n",
|
|
"\\hline\n",
|
|
"\\multicolumn{2}{|c|}{\\textbf{Method/LLM}} & \\textbf{Email} & \\textbf{Phone} & \\textbf{User Name} & \\textbf{Real Name} & \\textbf{Precision} & \\textbf{Recall} \\\\\n",
|
|
"\\hline\n",
|
|
"Gemini-2.5-Pro & 88.4\\% & 10 & 791 & 1664 & 1076 & 76.50\\% & 49.03\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-3.5-Turbo & xx & 4 & 13 & 0 & 276 & 24.23\\% & 1.29\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-4.1 & xx & 1289 & 680 & 531 & 683 & 33.30\\% & 19.19\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-4o-mini & 82.0\\% & 14 & 22 & 875 & 1928 & 39.98\\% & 20.54\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-5.1 & xx & 16 & 1184 & 1234 & 2154 & 51.20\\% & 42.52\\% \\\\\n",
|
|
"\\hline\n",
|
|
"LLaMA-3.1-8B-Instruct & xx & 0 & 0 & 2 & 15 & 88.24\\% & 0.27\\% \\\\\n",
|
|
"\\hline\n",
|
|
"LLaMA-3.1-70B-Instruct & xx & 6 & 6 & 34 & 15 & 39.34\\% & 0.43\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Mistral-Large & xx & 2 & 989 & 2121 & 1 & 59.56\\% & 33.56\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Mixtral-8x7B & xx & 6 & 6 & 2302 & 98 & 48.34\\% & 21.10\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Mixtral-8x22B & xx & 0 & 6 & 58 & 15 & 25.32\\% & 0.36\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Qwen2.5-72B & 82.3\\% & 961 & 45 & 528 & 1203 & 38.87\\% & 19.26\\% \\\\\n",
|
|
"\\hline\n",
|
|
"\\end{tabular}\n",
|
|
"\\end{table*}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import json\n",
|
|
"import re\n",
|
|
"from pathlib import Path\n",
|
|
"from typing import Dict, Set\n",
|
|
"\n",
|
|
"# =========================\n",
|
|
"# CONFIG\n",
|
|
"# =========================\n",
|
|
"\n",
|
|
"BASE_DIR = Path(r\"C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\\normalized_results\")\n",
|
|
"\n",
|
|
"GT_PATH = BASE_DIR / \"ground_truth\" / \"corpus_level\" / \"corpus_level.jsonl\"\n",
|
|
"\n",
|
|
"METHODS = {\n",
|
|
" \"Gemini-2.5-Pro\": {\"path\": \"gemini_2_5_pro\", \"bm\": \"88.4\\\\%\"},\n",
|
|
" \"GPT-3.5-Turbo\": {\"path\": \"gpt_3_5_turbo\", \"bm\": \"xx\"},\n",
|
|
" \"GPT-4.1\": {\"path\": \"gpt_4_1\", \"bm\": \"xx\"},\n",
|
|
" \"GPT-4o-mini\": {\"path\": \"gpt_4o_mini\", \"bm\": \"82.0\\\\%\"},\n",
|
|
" \"GPT-5.1\": {\"path\": \"gpt_5_1\", \"bm\": \"xx\"},\n",
|
|
" \"LLaMA-3.1-8B-Instruct\": {\"path\": \"llama_3_1_8b\", \"bm\": \"xx\"},\n",
|
|
" \"LLaMA-3.1-70B-Instruct\": {\"path\": \"llama_3_1_70b\", \"bm\": \"xx\"},\n",
|
|
" \"Mistral-Large\": {\"path\": \"mistral_large\", \"bm\": \"xx\"},\n",
|
|
" \"Mixtral-8x7B\": {\"path\": \"mixtral_8x7b\", \"bm\": \"xx\"},\n",
|
|
" \"Mixtral-8x22B\": {\"path\": \"mixtral_8x22b\", \"bm\": \"xx\"},\n",
|
|
" \"Qwen2.5-72B\": {\"path\": \"qwen_2_5_72b\", \"bm\": \"82.3\\\\%\"},\n",
|
|
"}\n",
|
|
"\n",
|
|
"PII_ORDER = [\"EMAIL\", \"PHONE\", \"USERNAME\", \"PERSON_NAME\"]\n",
|
|
"\n",
|
|
"# =========================\n",
|
|
"# CANONICALIZATION\n",
|
|
"# =========================\n",
|
|
"\n",
|
|
"def canonicalize(val: str, pii_type: str) -> str:\n",
|
|
" val = val.strip()\n",
|
|
"\n",
|
|
" if pii_type == \"EMAIL\":\n",
|
|
" return val.lower()\n",
|
|
"\n",
|
|
" if pii_type == \"PHONE\":\n",
|
|
" plus = val.startswith(\"+\")\n",
|
|
" digits = re.sub(r\"\\D\", \"\", val)\n",
|
|
" return \"+\" + digits if plus else digits\n",
|
|
"\n",
|
|
" if pii_type in {\"USERNAME\", \"PERSON_NAME\"}:\n",
|
|
" val = val.lower()\n",
|
|
" val = re.sub(r\"\\b(mr|ms|mrs|dr|prof)\\.?\\b\", \"\", val)\n",
|
|
" val = re.sub(r\"[^\\w\\s]\", \"\", val)\n",
|
|
" val = re.sub(r\"\\s+\", \" \", val)\n",
|
|
" return val.strip()\n",
|
|
"\n",
|
|
" return val\n",
|
|
"\n",
|
|
"\n",
|
|
"# =========================\n",
|
|
"# LOAD CORPUS\n",
|
|
"# =========================\n",
|
|
"\n",
|
|
"def load_corpus(path: Path) -> Dict[str, Set[str]]:\n",
|
|
" data = {t: set() for t in PII_ORDER}\n",
|
|
" if not path.exists():\n",
|
|
" return data\n",
|
|
"\n",
|
|
" with path.open(\"r\", encoding=\"utf-8\") as f:\n",
|
|
" for line in f:\n",
|
|
" rec = json.loads(line)\n",
|
|
" pii_type = (rec.get(\"PII_type\") or \"\").upper()\n",
|
|
" if pii_type not in PII_ORDER:\n",
|
|
" continue\n",
|
|
"\n",
|
|
" vals = rec.get(\"PII_unique\") or rec.get(\"PII_all\") or []\n",
|
|
" canon_vals = {\n",
|
|
" canonicalize(v, pii_type)\n",
|
|
" for v in vals\n",
|
|
" if isinstance(v, str) and v.strip()\n",
|
|
" }\n",
|
|
"\n",
|
|
" data[pii_type].update(canon_vals)\n",
|
|
"\n",
|
|
" return data\n",
|
|
"\n",
|
|
"# =========================\n",
|
|
"# LOAD GT\n",
|
|
"# =========================\n",
|
|
"\n",
|
|
"GT = load_corpus(GT_PATH)\n",
|
|
"\n",
|
|
"# =========================\n",
|
|
"# COMPUTE TABLE\n",
|
|
"# =========================\n",
|
|
"\n",
|
|
"rows = []\n",
|
|
"\n",
|
|
"for name, info in METHODS.items():\n",
|
|
"\n",
|
|
" corpus_path = BASE_DIR / info[\"path\"] / \"corpus_level\" / \"corpus_level.jsonl\"\n",
|
|
" SYS = load_corpus(corpus_path)\n",
|
|
"\n",
|
|
" total_sys = 0\n",
|
|
" total_gt = 0\n",
|
|
" total_overlap = 0\n",
|
|
" counts = {}\n",
|
|
"\n",
|
|
" for t in PII_ORDER:\n",
|
|
"\n",
|
|
" gt_set = GT[t]\n",
|
|
" sys_set = SYS[t]\n",
|
|
"\n",
|
|
" counts[t] = len(sys_set)\n",
|
|
"\n",
|
|
" overlap = gt_set.intersection(sys_set)\n",
|
|
"\n",
|
|
" total_sys += len(sys_set)\n",
|
|
" total_gt += len(gt_set)\n",
|
|
" total_overlap += len(overlap)\n",
|
|
"\n",
|
|
" precision = (total_overlap / total_sys) if total_sys else 0.0\n",
|
|
" recall = (total_overlap / total_gt) if total_gt else 0.0\n",
|
|
"\n",
|
|
" rows.append((\n",
|
|
" name,\n",
|
|
" info[\"bm\"],\n",
|
|
" counts[\"EMAIL\"],\n",
|
|
" counts[\"PHONE\"],\n",
|
|
" counts[\"USERNAME\"],\n",
|
|
" counts[\"PERSON_NAME\"],\n",
|
|
" f\"{precision*100:.2f}\\\\%\",\n",
|
|
" f\"{recall*100:.2f}\\\\%\",\n",
|
|
" ))\n",
|
|
"\n",
|
|
"# =========================\n",
|
|
"# EMIT LATEX\n",
|
|
"# =========================\n",
|
|
"\n",
|
|
"print(r\"\\begin{table*}\")\n",
|
|
"print(r\"\\centering\")\n",
|
|
"print(r\"\\small\")\n",
|
|
"print(r\"\\caption{Distribution of distinct PII entities discovered across evaluated methods and PII categories.}\")\n",
|
|
"print(r\"\\label{tab:model_yield}\")\n",
|
|
"print(r\"\\begin{tabular}{|l|l|l|l|l|l|l|l|}\")\n",
|
|
"print(r\"\\hline\")\n",
|
|
"print(r\"\\multicolumn{2}{|c|}{\\textbf{Method/LLM}} & \\textbf{Email} & \\textbf{Phone} & \\textbf{User Name} & \\textbf{Real Name} & \\textbf{Precision} & \\textbf{Recall} \\\\\")\n",
|
|
"print(r\"\\hline\")\n",
|
|
"\n",
|
|
"for r in rows:\n",
|
|
" print(\" & \".join(map(str, r)) + r\" \\\\\")\n",
|
|
" print(r\"\\hline\")\n",
|
|
"\n",
|
|
"print(r\"\\end{tabular}\")\n",
|
|
"print(r\"\\end{table*}\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "07108960",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|