mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
254 lines
8.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "03fbbc9f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CSV path: C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\\RQ2\\app_total_columns.csv\n",
|
|
"Exists: True\n",
|
|
"Using BASE_DIR: C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\n",
|
|
"Using CSV: C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\\RQ2\\app_total_columns.csv\n",
|
|
"CSV exists: True\n",
|
|
"\\begin{tabular}{|l|p{1.4cm}|p{1.8cm}|}\n",
|
|
"\\hline\n",
|
|
"\\textbf{Method/LLM} &\n",
|
|
"\\textbf{Avg. Cols Examined} &\n",
|
|
"\\textbf{Avg. Search Space Reduc.} \\\\\n",
|
|
"\\hline\n",
|
|
"bulk\\_extractor-v1.6 (baseline) & NA & 0.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-4o-mini & 9.1 & 85.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Gemini-2.5-Pro & 0 & 100.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Qwen2.5-72B & 31.5 & 41.2\\% \\\\\n",
|
|
"\\hline\n",
|
|
"LLaMA-3.1-70B-Instruct & 1 & 97.5\\% \\\\\n",
|
|
"\\hline\n",
|
|
"LLaMA-3.1-8B-Instruct & 0.4 & 99.9\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Mixtral-8x22B & 3.2 & 90.3\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Mixtral-8x7B & 2.6 & 97.2\\% \\\\\n",
|
|
"\\hline\n",
|
|
"Mistral-Large & 24.9 & 75.6\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-5.1 & 29.9 & 81.0\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-4.1 & 20.5 & 69.7\\% \\\\\n",
|
|
"\\hline\n",
|
|
"GPT-3.5-turbo & 2.8 & 98.6\\% \\\\\n",
|
|
"\\hline\n",
|
|
"\\end{tabular}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
import json
import csv
from pathlib import Path
from statistics import mean

# -------------------------------------------------
# Auto-detect project root (Agent directory)
# -------------------------------------------------
# NOTE(review): BASE_DIR is a hardcoded absolute Windows path, so this
# notebook only runs on this one machine — consider deriving it from the
# notebook's location or an environment variable.
BASE_DIR = Path(r"C:\Users\cyfij\OneDrive\Desktop\DFRWS 2026\Agent\RQs")


# CSV of candidate-column totals per app (expects columns: app_code,total_columns).
CANDIDATE_CSV = BASE_DIR / "RQ2" / "app_total_columns.csv"

print("CSV path:", CANDIDATE_CSV)
print("Exists:", CANDIDATE_CSV.exists())


# NOTE(review): these three prints repeat the information printed just above;
# harmless, but the cell output shows the same paths twice.
print("Using BASE_DIR:", BASE_DIR)
print("Using CSV:", CANDIDATE_CSV)
print("CSV exists:", CANDIDATE_CSV.exists())

# -------------------------------------------------
# MODELS
# -------------------------------------------------
# Maps result-directory slug -> display name for the LaTeX table.
# Insertion order determines the row order of the printed table.
MODELS = {
    "gpt_4o_mini": "GPT-4o-mini",
    "gemini_2_5_pro": "Gemini-2.5-Pro",
    "qwen_2_5_72b": "Qwen2.5-72B",
    "llama_3_1_70b": "LLaMA-3.1-70B-Instruct",
    "llama_3_1_8b": "LLaMA-3.1-8B-Instruct",
    "mixtral_8x22b": "Mixtral-8x22B",
    "mixtral_8x7b": "Mixtral-8x7B",
    "mistral_large": "Mistral-Large",
    "gpt_5_1": "GPT-5.1",
    "gpt_4_1": "GPT-4.1",
    "gpt_3_5_turbo": "GPT-3.5-turbo",
}
|
|
# -------------------------------------------------
# Load candidate totals per app
# CSV must contain columns:
#   app_code,total_columns
# -------------------------------------------------
def load_candidate_totals(csv_path):
    """Load the total candidate-column count for each app.

    Parameters:
        csv_path: str or Path to a CSV containing (at least) the
            columns ``app_code`` and ``total_columns``.

    Returns:
        dict: mapping ``{app_code: total_columns}`` with int values.
        Rows with a missing app code, a missing total, or a
        non-numeric total are skipped instead of aborting the run
        (the original ``int(total)`` crashed on any malformed row).
    """
    totals = {}
    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            app = (row.get("app_code") or "").strip()
            total = (row.get("total_columns") or "").strip()
            if not app or not total:
                continue
            try:
                totals[app] = int(total)
            except ValueError:
                # Malformed count (stray text, repeated header) — skip row.
                continue
    return totals
|
|
# -------------------------------------------------
# Load scanned columns per app
# From normalized_results/<model>/app_level/app_level.jsonl
# -------------------------------------------------
def load_scanned_cols(app_level_jsonl: Path):
    """
    Read app_level.jsonl (corpus/app-level format) and return how many
    columns were actually examined per app.

    Expected record shape (one JSON object per line):

        {
            "db_path": "selectedDBs\\A1",
            ...
            "Num_of_source_columns_unique": 29
        }

    Returns:
        dict: { "A1": scanned_columns } — for each app, the maximum
        count seen across all records for that app.
    """

    scanned = {}

    with app_level_jsonl.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines between records

            rec = json.loads(line)

            # ---- Extract app code (e.g. "A1") from db_path ----
            db_path = rec.get("db_path", "")
            if not db_path:
                continue

            # NOTE(review): db_path uses backslashes ("selectedDBs\\A1");
            # Path(...).name only splits on "\\" when run on Windows — confirm
            # this notebook never runs on POSIX.
            app = Path(db_path).name  # "A1"

            # ---- Prefer the UNIQUE column count for efficiency ----
            n = rec.get("Num_of_source_columns_unique")
            if n is None:
                n = rec.get("Num_of_source_columns", 0)

            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit): only coercion failures
            # should fall back to 0.
            try:
                n = int(n)
            except (TypeError, ValueError):
                n = 0

            # Keep maximum seen per app
            scanned[app] = max(scanned.get(app, 0), n)

    return scanned
|
|
# -------------------------------------------------
# Main computation
# -------------------------------------------------
candidate_totals = load_candidate_totals(CANDIDATE_CSV)

results = []

for slug, display_name in MODELS.items():
    # Per-model normalized output lives under normalized_results/<slug>/app_level/.
    app_jsonl = BASE_DIR / "normalized_results" / slug / "app_level" / "app_level.jsonl"

    if not app_jsonl.exists():
        # No normalized output for this model yet — emit a placeholder row.
        results.append((display_name, "xx", "xx"))
        continue

    scanned_cols = load_scanned_cols(app_jsonl)

    examined_vals = []
    reduction_vals = []
    for app, total_cols in candidate_totals.items():
        scanned = scanned_cols.get(app, 0)
        examined_vals.append(scanned)
        # Search-space reduction is only defined for apps with a nonzero
        # candidate total.
        if total_cols > 0:
            reduction_vals.append(1 - scanned / total_cols)

    if examined_vals and reduction_vals:
        avg_examined = round(mean(examined_vals), 1)
        avg_reduction = round(mean(reduction_vals) * 100, 1)
        results.append((display_name, avg_examined, f"{avg_reduction}\\%"))
    else:
        results.append((display_name, "xx", "xx"))

# -------------------------------------------------
# Print LaTeX table
# -------------------------------------------------
# Fixed header plus the bulk_extractor baseline row.
table_header = [
    r"\begin{tabular}{|l|p{1.4cm}|p{1.8cm}|}",
    r"\hline",
    r"\textbf{Method/LLM} &",
    r"\textbf{Avg. Cols Examined} &",
    r"\textbf{Avg. Search Space Reduc.} \\",
    r"\hline",
    r"bulk\_extractor-v1.6 (baseline) & NA & 0.0\% \\",
    r"\hline",
]
print("\n".join(table_header))

# One table row (plus separator) per model, in MODELS order.
for name, cols, reduc in results:
    print(f"{name} & {cols} & {reduc} \\\\")
    print(r"\hline")

print(r"\end{tabular}")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d17221d0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|