mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
add RQ3
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"id": "234eed3f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
||||
@@ -10,7 +10,8 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_gpt4o.jsonl\n"
|
||||
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_gpt4o.jsonl\n",
|
||||
"Wrote: I:\\project2026\\llmagent\\RQs\\RQ3\\RQ3_corpus_level_ground_truth.jsonl\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -144,13 +145,15 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" # Aggregate GPT-4o results\n",
|
||||
" out = aggregate_jsonl_folder_corpus_level(\n",
|
||||
" r\"..\\batch_results_gpt4o_normalized\",\n",
|
||||
" \"RQ3_corpus_level_gpt4o.jsonl\",\n",
|
||||
" )\n",
|
||||
" print(f\"Wrote: {out.resolve()}\")\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" \n",
|
||||
" # Aggregate Ground Truth results\n",
|
||||
" out = aggregate_jsonl_folder_corpus_level(\n",
|
||||
" r\"..\\ground_truth_normalized\",\n",
|
||||
" \"RQ3_corpus_level_ground_truth.jsonl\",\n",
|
||||
|
||||
128
RQs/RQ3/RQ3_t7_corpus_level_Recall_Perc.ipynb
Normal file
128
RQs/RQ3/RQ3_t7_corpus_level_Recall_Perc.ipynb
Normal file
@@ -0,0 +1,128 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "234eed3f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\\textbf{PII Type} &\n",
|
||||
"\\textbf{GT} &\n",
|
||||
"\\textbf{System} &\n",
|
||||
"\\textbf{Overlap} &\n",
|
||||
"\\textbf{Recall} &\n",
|
||||
"\\textbf{Precision} \\\\\n",
|
||||
"\\hline\n",
|
||||
"Email Address & 10 & 10 & 10 & 100.0\\% & 100.0\\% \\\\\n",
|
||||
"\\hline\n",
|
||||
"Phone Number & 1050 & 1050 & 1050 & 100.0\\% & 100.0\\% \\\\\n",
|
||||
"\\hline\n",
|
||||
"User Name & 85 & 85 & 85 & 100.0\\% & 100.0\\% \\\\\n",
|
||||
"\\hline\n",
|
||||
"Person Name & 909 & 909 & 909 & 100.0\\% & 100.0\\% \\\\\n",
|
||||
"\\hline\n",
|
||||
"Postal Address & 2 & 2 & 2 & 100.0\\% & 100.0\\% \\\\\n",
|
||||
"\\hline\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from pathlib import Path\n",
|
||||
"from typing import Dict, Set, List\n",
|
||||
"\n",
|
||||
"SYSTEM_PATH = Path(\"RQ3_corpus_level_gpt4o.jsonl\")\n",
|
||||
"GT_PATH = Path(\"RQ3_corpus_level_ground_truth.jsonl\")\n",
|
||||
"\n",
|
||||
"PII_TYPE_ORDER = [\"EMAIL\", \"PHONE\", \"USERNAME\", \"PERSON_NAME\", \"POSTAL_ADDRESS\"]\n",
|
||||
"PII_TYPE_DISPLAY = {\n",
|
||||
" \"EMAIL\": \"Email Address\",\n",
|
||||
" \"PHONE\": \"Phone Number\",\n",
|
||||
" \"USERNAME\": \"User Name\",\n",
|
||||
" \"PERSON_NAME\": \"Person Name\",\n",
|
||||
" \"POSTAL_ADDRESS\": \"Postal Address\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"def load_pii_unique_sets(path: Path) -> Dict[str, Set[str]]:\n",
|
||||
" out: Dict[str, Set[str]] = {}\n",
|
||||
" with path.open(\"r\", encoding=\"utf-8\") as f:\n",
|
||||
" for line in f:\n",
|
||||
" line = line.strip()\n",
|
||||
" if not line:\n",
|
||||
" continue\n",
|
||||
" rec = json.loads(line)\n",
|
||||
" if not isinstance(rec, dict):\n",
|
||||
" continue\n",
|
||||
" t = (rec.get(\"PII_type\") or \"\").strip().upper()\n",
|
||||
" if not t:\n",
|
||||
" continue\n",
|
||||
" vals = rec.get(\"PII_unique\", [])\n",
|
||||
" if not isinstance(vals, list):\n",
|
||||
" vals = [vals] if vals is not None else []\n",
|
||||
" out[t] = set(v for v in vals if isinstance(v, str))\n",
|
||||
" return out\n",
|
||||
"\n",
|
||||
"def fmt_pct(x: float) -> str:\n",
|
||||
" return f\"{x*100:.1f}\\\\%\"\n",
|
||||
"\n",
|
||||
"def make_rows(gt_sets: Dict[str, Set[str]], sys_sets: Dict[str, Set[str]]) -> List[str]:\n",
|
||||
" rows: List[str] = []\n",
|
||||
" for t in PII_TYPE_ORDER:\n",
|
||||
" G = gt_sets.get(t, set())\n",
|
||||
" S = sys_sets.get(t, set())\n",
|
||||
" overlap = len(G & S)\n",
|
||||
" gt_n = len(G)\n",
|
||||
" sys_n = len(S)\n",
|
||||
" recall = (overlap / gt_n) if gt_n else 0.0\n",
|
||||
" precision = (overlap / sys_n) if sys_n else 0.0\n",
|
||||
"\n",
|
||||
" name = PII_TYPE_DISPLAY.get(t, t)\n",
|
||||
" rows.append(\n",
|
||||
" f\"{name} & {gt_n} & {sys_n} & {overlap} & {fmt_pct(recall)} & {fmt_pct(precision)} \\\\\\\\\"\n",
|
||||
" )\n",
|
||||
" return rows\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" sys_sets = load_pii_unique_sets(SYSTEM_PATH)\n",
|
||||
" gt_sets = load_pii_unique_sets(GT_PATH)\n",
|
||||
"\n",
|
||||
" print(r\"\\textbf{PII Type} &\")\n",
|
||||
" print(r\"\\textbf{GT} &\")\n",
|
||||
" print(r\"\\textbf{System} &\")\n",
|
||||
" print(r\"\\textbf{Overlap} &\")\n",
|
||||
" print(r\"\\textbf{Recall} &\")\n",
|
||||
" print(r\"\\textbf{Precision} \\\\\")\n",
|
||||
" print(r\"\\hline\")\n",
|
||||
"\n",
|
||||
" for row in make_rows(gt_sets, sys_sets):\n",
|
||||
" print(row)\n",
|
||||
" print(r\"\\hline\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user