This commit is contained in:
Frank Xu
2026-01-30 20:44:31 -05:00
parent 5fd4a6033f
commit 8fbeceb03d
4 changed files with 139 additions and 3 deletions

File diff suppressed because one or more lines are too long

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "234eed3f",
"metadata": {},
"outputs": [

View File

@@ -10,7 +10,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_gpt4o.jsonl\n"
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_gpt4o.jsonl\n",
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_ground_truth.jsonl\n"
]
}
],
@@ -144,13 +145,15 @@
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" # Aggregate GPT-4o results\n",
" out = aggregate_jsonl_folder_corpus_level(\n",
" r\"..\\batch_results_gpt4o_normalized\",\n",
" \"RQ3_corpus_level_gpt4o.jsonl\",\n",
" )\n",
" print(f\"Wrote: {out.resolve()}\")\n",
" \n",
" \n",
" \n",
" # Aggregate Ground Truth results\n",
" out = aggregate_jsonl_folder_corpus_level(\n",
" r\"..\\ground_truth_normalized\",\n",
" \"RQ3_corpus_level_ground_truth.jsonl\",\n",

View File

@@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "234eed3f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\textbf{PII Type} &\n",
"\\textbf{GT} &\n",
"\\textbf{System} &\n",
"\\textbf{Overlap} &\n",
"\\textbf{Recall} &\n",
"\\textbf{Precision} \\\\\n",
"\\hline\n",
"Email Address & 10 & 10 & 10 & 100.0\\% & 100.0\\% \\\\\n",
"\\hline\n",
"Phone Number & 1050 & 1050 & 1050 & 100.0\\% & 100.0\\% \\\\\n",
"\\hline\n",
"User Name & 85 & 85 & 85 & 100.0\\% & 100.0\\% \\\\\n",
"\\hline\n",
"Person Name & 909 & 909 & 909 & 100.0\\% & 100.0\\% \\\\\n",
"\\hline\n",
"Postal Address & 2 & 2 & 2 & 100.0\\% & 100.0\\% \\\\\n",
"\\hline\n"
]
}
],
"source": [
"import json\n",
"from pathlib import Path\n",
"from typing import Dict, Set, List\n",
"\n",
"# JSONL inputs for the system output and the ground truth (relative paths).\n",
"SYSTEM_PATH = Path(\"RQ3_corpus_level_gpt4o.jsonl\")\n",
"GT_PATH = Path(\"RQ3_corpus_level_ground_truth.jsonl\")\n",
"\n",
"# (type key, display label) pairs, listed in the order rows are emitted.\n",
"_PII_TYPES = (\n",
"    (\"EMAIL\", \"Email Address\"),\n",
"    (\"PHONE\", \"Phone Number\"),\n",
"    (\"USERNAME\", \"User Name\"),\n",
"    (\"PERSON_NAME\", \"Person Name\"),\n",
"    (\"POSTAL_ADDRESS\", \"Postal Address\"),\n",
")\n",
"PII_TYPE_ORDER = [key for key, _label in _PII_TYPES]\n",
"PII_TYPE_DISPLAY = dict(_PII_TYPES)\n",
"\n",
"def load_pii_unique_sets(path: Path) -> Dict[str, Set[str]]:\n",
"    \"\"\"Load a JSONL file into a mapping of PII type -> set of unique values.\n",
"\n",
"    Each non-blank line is parsed as one JSON record. Records that are not\n",
"    objects, or whose \"PII_type\" is missing, empty, or not a string, are\n",
"    skipped. Type names are stripped and upper-cased. A non-list\n",
"    \"PII_unique\" value is treated as a one-item list (None as empty), and\n",
"    non-string entries are ignored. Records that repeat a PII type are\n",
"    merged into one set instead of the last record replacing earlier ones.\n",
"\n",
"    Raises json.JSONDecodeError if a non-blank line is not valid JSON.\n",
"    \"\"\"\n",
"    out: Dict[str, Set[str]] = {}\n",
"    with path.open(\"r\", encoding=\"utf-8\") as f:\n",
"        for line in f:\n",
"            line = line.strip()\n",
"            if not line:\n",
"                continue\n",
"            rec = json.loads(line)\n",
"            if not isinstance(rec, dict):\n",
"                continue\n",
"            raw_type = rec.get(\"PII_type\")\n",
"            # A non-string type (e.g. a number) cannot be normalized; skip the\n",
"            # record instead of raising AttributeError on .strip().\n",
"            if not isinstance(raw_type, str):\n",
"                continue\n",
"            t = raw_type.strip().upper()\n",
"            if not t:\n",
"                continue\n",
"            vals = rec.get(\"PII_unique\", [])\n",
"            if not isinstance(vals, list):\n",
"                vals = [vals] if vals is not None else []\n",
"            # Union into any existing set so a repeated PII type does not\n",
"            # silently discard values loaded from an earlier record.\n",
"            out.setdefault(t, set()).update(v for v in vals if isinstance(v, str))\n",
"    return out\n",
"\n",
"def fmt_pct(x: float) -> str:\n",
"    \"\"\"Render a fraction as a percentage with one decimal and a LaTeX-escaped percent sign.\"\"\"\n",
"    percent = x * 100\n",
"    return \"{:.1f}\\\\%\".format(percent)\n",
"\n",
"def make_rows(gt_sets: Dict[str, Set[str]], sys_sets: Dict[str, Set[str]]) -> List[str]:\n",
"    \"\"\"Build one LaTeX table row per PII type, in PII_TYPE_ORDER.\n",
"\n",
"    Each row holds the display name, ground-truth count, system count, the\n",
"    size of their intersection, and recall and precision formatted by\n",
"    fmt_pct. A type missing from either mapping is treated as an empty set,\n",
"    and a ratio whose denominator is zero is reported as 0.0.\n",
"    \"\"\"\n",
"    def ratio(numerator: int, denominator: int) -> float:\n",
"        # An empty set would divide by zero; report 0.0 in that case.\n",
"        return numerator / denominator if denominator else 0.0\n",
"\n",
"    table_rows: List[str] = []\n",
"    for pii_type in PII_TYPE_ORDER:\n",
"        truth = gt_sets.get(pii_type, set())\n",
"        found = sys_sets.get(pii_type, set())\n",
"        shared = len(truth & found)\n",
"        cells = [\n",
"            PII_TYPE_DISPLAY.get(pii_type, pii_type),\n",
"            str(len(truth)),\n",
"            str(len(found)),\n",
"            str(shared),\n",
"            fmt_pct(ratio(shared, len(truth))),\n",
"            fmt_pct(ratio(shared, len(found))),\n",
"        ]\n",
"        # Cells are joined with LaTeX column separators; the row ends with \\\\.\n",
"        table_rows.append(\" & \".join(cells) + \" \\\\\\\\\")\n",
"    return table_rows\n",
"\n",
"if __name__ == \"__main__\":\n",
"    # Load the system and ground-truth value sets from their JSONL files.\n",
"    sys_sets = load_pii_unique_sets(SYSTEM_PATH)\n",
"    gt_sets = load_pii_unique_sets(GT_PATH)\n",
"\n",
"    # LaTeX header cells, one per printed line, followed by a horizontal rule.\n",
"    header_lines = (\n",
"        r\"\\textbf{PII Type} &\",\n",
"        r\"\\textbf{GT} &\",\n",
"        r\"\\textbf{System} &\",\n",
"        r\"\\textbf{Overlap} &\",\n",
"        r\"\\textbf{Recall} &\",\n",
"        r\"\\textbf{Precision} \\\\\",\n",
"        r\"\\hline\",\n",
"    )\n",
"    for header_line in header_lines:\n",
"        print(header_line)\n",
"\n",
"    # Every data row is followed by its own horizontal rule.\n",
"    for row in make_rows(gt_sets, sys_sets):\n",
"        print(row)\n",
"        print(r\"\\hline\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}