"""Build the LaTeX "model yield" table for corpus-level PII extraction results.

For every entry in METHODS this cell loads that method's corpus-level JSONL
file, canonicalizes the PII values it contains, compares them with the
ground-truth corpus, and prints one LaTeX table row holding the number of
distinct values per PII type plus precision and recall pooled over all types.
"""

import json
import os
import re
import warnings
from pathlib import Path
from typing import Dict, List, Set, Tuple

# =========================
# CONFIG
# =========================

# The results directory can be overridden with the PII_RESULTS_DIR environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset (or empty) the original location is used.
_DEFAULT_BASE_DIR = r"C:\Users\cyfij\OneDrive\Desktop\DFRWS 2026\Agent\RQs\normalized_results"
BASE_DIR = Path(os.environ.get("PII_RESULTS_DIR") or _DEFAULT_BASE_DIR)

GT_PATH = BASE_DIR / "ground_truth" / "corpus_level" / "corpus_level.jsonl"

# Display name -> {"path": results sub-directory under BASE_DIR,
#                  "bm": pre-formatted text printed verbatim as the second column}.
# NOTE(review): "xx" looks like a placeholder for values not yet filled in - confirm.
METHODS = {
    "Gemini-2.5-Pro": {"path": "gemini_2_5_pro", "bm": "88.4\\%"},
    "GPT-3.5-Turbo": {"path": "gpt_3_5_turbo", "bm": "xx"},
    "GPT-4.1": {"path": "gpt_4_1", "bm": "xx"},
    "GPT-4o-mini": {"path": "gpt_4o_mini", "bm": "82.0\\%"},
    "GPT-5.1": {"path": "gpt_5_1", "bm": "xx"},
    "LLaMA-3.1-8B-Instruct": {"path": "llama_3_1_8b", "bm": "xx"},
    "LLaMA-3.1-70B-Instruct": {"path": "llama_3_1_70b", "bm": "xx"},
    "Mistral-Large": {"path": "mistral_large", "bm": "xx"},
    "Mixtral-8x7B": {"path": "mixtral_8x7b", "bm": "xx"},
    "Mixtral-8x22B": {"path": "mixtral_8x22b", "bm": "xx"},
    "Qwen2.5-72B": {"path": "qwen_2_5_72b", "bm": "82.3\\%"},
}

# PII types that are scored, in the order their count columns are printed.
PII_ORDER = ["EMAIL", "PHONE", "USERNAME", "PERSON_NAME"]

# =========================
# CANONICALIZATION
# =========================

def canonicalize(val: str, pii_type: str) -> str:
    """Return the normalized form of ``val`` used for set comparison.

    Args:
        val: Raw PII string.
        pii_type: Upper-case PII type name.

    Returns:
        * ``EMAIL``: stripped and lower-cased.
        * ``PHONE``: digits only, keeping a single leading ``+`` if the
          stripped input started with one.
        * ``USERNAME`` / ``PERSON_NAME``: lower-cased, with the titles
          mr/ms/mrs/dr/prof removed, punctuation removed and whitespace
          runs collapsed to one space.
        * any other type: the stripped input.

        The result can be the empty string (for example a phone value that
        contains no digits); callers decide what to do with it.
    """
    val = val.strip()

    if pii_type == "EMAIL":
        return val.lower()

    if pii_type == "PHONE":
        plus = val.startswith("+")
        digits = re.sub(r"\D", "", val)
        return "+" + digits if plus else digits

    if pii_type in {"USERNAME", "PERSON_NAME"}:
        val = val.lower()
        val = re.sub(r"\b(mr|ms|mrs|dr|prof)\.?\b", "", val)
        val = re.sub(r"[^\w\s]", "", val)
        val = re.sub(r"\s+", " ", val)
        return val.strip()

    return val


# =========================
# LOAD CORPUS
# =========================

def load_corpus(path: Path) -> Dict[str, Set[str]]:
    """Read a corpus-level JSONL file into canonicalized value sets.

    Each non-blank line must be a JSON object. Its ``PII_type`` selects the
    bucket (case-insensitively; types outside PII_ORDER are ignored) and its
    values come from ``PII_unique`` or, when that is missing or empty, from
    ``PII_all``.

    Args:
        path: Location of the JSONL file.

    Returns:
        A dict with one set of canonical strings for every type in PII_ORDER.
        All sets are empty when ``path`` does not exist; a warning is issued
        in that case so an all-zero table row is not mistaken for a result.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data: Dict[str, Set[str]] = {t: set() for t in PII_ORDER}
    if not path.exists():
        warnings.warn(f"corpus file not found, treating as empty: {path}", stacklevel=2)
        return data

    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue

            rec = json.loads(line)
            if not isinstance(rec, dict):
                continue

            raw_type = rec.get("PII_type")
            pii_type = raw_type.upper() if isinstance(raw_type, str) else ""
            if pii_type not in data:
                continue

            vals = rec.get("PII_unique") or rec.get("PII_all") or []
            if isinstance(vals, str):
                # A bare string is one value, not a sequence of characters.
                vals = [vals]
            elif not isinstance(vals, (list, tuple, set)):
                continue

            for v in vals:
                if not isinstance(v, str) or not v.strip():
                    continue
                canon = canonicalize(v, pii_type)
                # Values that normalize to nothing (e.g. phone "N/A") are not
                # entities; keeping "" would inflate counts and let unrelated
                # junk values match each other.
                if canon:
                    data[pii_type].add(canon)

    return data


# =========================
# SCORING
# =========================

def score_method(
    gt: Dict[str, Set[str]], system: Dict[str, Set[str]]
) -> Tuple[Dict[str, int], float, float]:
    """Compare one method's value sets with the ground truth.

    Args:
        gt: Ground-truth sets keyed by every type in PII_ORDER.
        system: The method's sets keyed by every type in PII_ORDER.

    Returns:
        ``(counts, precision, recall)`` where ``counts`` maps each PII type to
        the number of distinct system values, ``precision`` is the summed
        overlap divided by the summed system counts, and ``recall`` is the
        summed overlap divided by the summed ground-truth counts. A ratio
        whose denominator is zero is reported as 0.0.
    """
    counts = {t: len(system[t]) for t in PII_ORDER}
    total_sys = sum(counts.values())
    total_gt = sum(len(gt[t]) for t in PII_ORDER)
    total_overlap = sum(len(gt[t] & system[t]) for t in PII_ORDER)

    precision = (total_overlap / total_sys) if total_sys else 0.0
    recall = (total_overlap / total_gt) if total_gt else 0.0
    return counts, precision, recall


# =========================
# LOAD GT
# =========================

GT = load_corpus(GT_PATH)

# =========================
# COMPUTE TABLE
# =========================

rows: List[Tuple] = []

for name, info in METHODS.items():
    corpus_path = BASE_DIR / info["path"] / "corpus_level" / "corpus_level.jsonl"
    counts, precision, recall = score_method(GT, load_corpus(corpus_path))

    rows.append((
        name,
        info["bm"],
        counts["EMAIL"],
        counts["PHONE"],
        counts["USERNAME"],
        counts["PERSON_NAME"],
        f"{precision*100:.2f}\\%",
        f"{recall*100:.2f}\\%",
    ))

# =========================
# EMIT LATEX
# =========================

print(r"\begin{table*}")
print(r"\centering")
print(r"\small")
print(r"\caption{Distribution of distinct PII entities discovered across evaluated methods and PII categories.}")
print(r"\label{tab:model_yield}")
print(r"\begin{tabular}{|l|l|l|l|l|l|l|l|}")
print(r"\hline")
print(r"\multicolumn{2}{|c|}{\textbf{Method/LLM}} & \textbf{Email} & \textbf{Phone} & \textbf{User Name} & \textbf{Real Name} & \textbf{Precision} & \textbf{Recall} \\")
print(r"\hline")

for r in rows:
    print(" & ".join(map(str, r)) + r" \\")
    print(r"\hline")

print(r"\end{tabular}")
print(r"\end{table*}")