mobile-pii-discovery-agent/RQs/RQ0/RQ0_2_app_level_aggregation.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "234eed3f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\app_level\\app_level.jsonl\n",
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\app_level\\app_level.jsonl\n"
]
}
],
"source": [
"import json\n",
"from pathlib import Path\n",
"from typing import Any, Dict, Tuple\n",
"\n",
"IGNORE_FIELDS = {\"Raw_rows_first_100\", \"Exploration_sql\", \"Extraction_sql\", \"PII_Prompt\"}\n",
"\n",
"\n",
"def get_app_code(db_path: str) -> str:\n",
" \"\"\"\n",
" selectedDBs\\\\A2_journal.db -> A2\n",
" selectedDBs/A1_msgstore.db -> A1\n",
" \"\"\"\n",
" stem = Path(db_path).stem # A2_journal\n",
" return stem.split(\"_\", 1)[0]\n",
"\n",
"\n",
"def _dedupe_preserve_order(items):\n",
" seen = set()\n",
" out = []\n",
" for x in items:\n",
" key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x\n",
" if key in seen:\n",
" continue\n",
" seen.add(key)\n",
" out.append(x)\n",
" return out\n",
"\n",
"\n",
"def prefix_source_columns(db_path: str, cols: list) -> list:\n",
" \"\"\"\n",
" Prefix each source column with the database filename to avoid ambiguity\n",
" after aggregating multiple DB files under the same app.\n",
"\n",
" Example:\n",
" db_path = selectedDBs\\\\A1_msgstore.db\n",
" col = message.text_data\n",
" -> A1_msgstore.db:message.text_data\n",
" \"\"\"\n",
" db_file = Path(db_path).name # includes extension\n",
" out = []\n",
" for c in cols:\n",
" if isinstance(c, str) and c:\n",
" out.append(f\"{db_file}:{c}\")\n",
" return out\n",
"\n",
"\n",
"def aggregate_jsonl_folder(in_dir: str | Path, out_path: str | Path) -> Path:\n",
" \"\"\"\n",
" Read all *.jsonl files under in_dir and aggregate records by:\n",
" (app_code derived from db_path, PII_type)\n",
"\n",
" Output per group:\n",
" - db_path: \"selectedDBs\\\\<APP_CODE>\"\n",
" - PII_type\n",
" - PII_all: with duplicates\n",
" - PII_unique: deduped (exact match)\n",
" - Num_of_PII_all: with duplicates (sum of per-record Num_of_PII or len(PII))\n",
" - Num_of_PII_unique: len(PII_unique)\n",
" - source_columns: deduped, prefixed with db filename\n",
" - other list fields: deduped\n",
" - other numeric fields: summed\n",
" - ignores Raw_rows_first_100, Exploration_sql, Extraction_sql\n",
" \"\"\"\n",
" in_dir = Path(in_dir)\n",
" out_path = Path(out_path)\n",
" out_path.parent.mkdir(parents=True, exist_ok=True)\n",
"\n",
" grouped: Dict[Tuple[str, str], Dict[str, Any]] = {}\n",
"\n",
" for jsonl_file in sorted(in_dir.glob(\"*.jsonl\")):\n",
" with jsonl_file.open(\"r\", encoding=\"utf-8\") as f:\n",
" for line_no, line in enumerate(f, start=1):\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
"\n",
" try:\n",
" rec = json.loads(line)\n",
" except json.JSONDecodeError as e:\n",
" raise ValueError(f\"Bad JSON in {jsonl_file} line {line_no}: {e}\") from e\n",
"\n",
" dbp = rec.get(\"db_path\", \"\")\n",
" pii_type = rec.get(\"PII_type\")\n",
" if not pii_type:\n",
" continue\n",
"\n",
" app_code = get_app_code(dbp)\n",
" key = (app_code, pii_type)\n",
"\n",
" if key not in grouped:\n",
" grouped[key] = {\n",
" \"db_path\": f\"selectedDBs\\\\{app_code}\",\n",
" \"PII_type\": pii_type,\n",
" \"PII_all\": [],\n",
" \"PII_unique\": [],\n",
" \"Num_of_PII_all\": 0,\n",
" \"Num_of_PII_unique\": 0,\n",
" }\n",
"\n",
" agg = grouped[key]\n",
"\n",
" # --- Special handling: PII + counts ---\n",
" pii_list = rec.get(\"PII\", [])\n",
" if isinstance(pii_list, list):\n",
" agg[\"PII_all\"].extend(pii_list)\n",
"\n",
" n = rec.get(\"Num_of_PII\")\n",
" if isinstance(n, (int, float)) and not isinstance(n, bool):\n",
" agg[\"Num_of_PII_all\"] += int(n)\n",
" else:\n",
" agg[\"Num_of_PII_all\"] += len(pii_list) if isinstance(pii_list, list) else 0\n",
"\n",
" # --- Aggregate other fields (arrays/numbers only) ---\n",
" for k, v in rec.items():\n",
" if k in IGNORE_FIELDS:\n",
" continue\n",
" if k in (\"db_path\", \"PII_type\", \"PII\", \"Num_of_PII\"):\n",
" continue\n",
"\n",
" # Prefix source_columns with db filename\n",
" if k == \"source_columns\":\n",
" cols = v if isinstance(v, list) else []\n",
" v = prefix_source_columns(dbp, cols)\n",
"\n",
" if isinstance(v, list):\n",
" if k not in agg:\n",
" agg[k] = []\n",
" if isinstance(agg[k], list):\n",
" agg[k].extend(v)\n",
"\n",
" elif isinstance(v, (int, float)) and not isinstance(v, bool):\n",
" if k not in agg:\n",
" agg[k] = 0\n",
" if isinstance(agg[k], (int, float)) and not isinstance(agg[k], bool):\n",
" agg[k] += v\n",
"\n",
" # ignore non-list, non-numeric values\n",
"\n",
" # --- Finalize: dedupe lists + compute unique PII fields ---\n",
" for agg in grouped.values():\n",
" agg[\"PII_unique\"] = _dedupe_preserve_order(agg[\"PII_all\"])\n",
" agg[\"Num_of_PII_unique\"] = len(agg[\"PII_unique\"])\n",
"\n",
" for k, v in list(agg.items()):\n",
" if isinstance(v, list) and k not in (\"PII_all\", \"PII_unique\"):\n",
" agg[k] = _dedupe_preserve_order(v)\n",
"\n",
" # source_columns counts\n",
" src = agg.get(\"source_columns\", [])\n",
" if isinstance(src, list):\n",
" agg[\"Num_of_source_columns_unique\"] = len(src)\n",
" # optional: with-dup count (before dedupe) is not available anymore here\n",
" # unless you track it separately.\n",
" else:\n",
" agg[\"Num_of_source_columns_unique\"] = 0\n",
"\n",
" # --- Write aggregated JSONL ---\n",
" with out_path.open(\"w\", encoding=\"utf-8\") as f:\n",
" for (app_code, pii_type) in sorted(grouped.keys()):\n",
" f.write(json.dumps(grouped[(app_code, pii_type)], ensure_ascii=False) + \"\\n\")\n",
"\n",
" return out_path\n",
"\n",
"\n",
"if __name__ == \"__main__\": \n",
" # --- Aggregate GPT-4o results ---\n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\app_level\")\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
" out_path = OUT_DIR / \"app_level.jsonl\"\n",
"\n",
" out = aggregate_jsonl_folder(IN_DIR, out_path)\n",
" print(f\"Wrote: {out.resolve()}\")\n",
" \n",
" # --- Aggregate ground truth as well ---\n",
" \n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\app_level\")\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
" out_path = OUT_DIR / \"app_level.jsonl\"\n",
"\n",
" out = aggregate_jsonl_folder(IN_DIR, out_path)\n",
" print(f\"Wrote: {out.resolve()}\")\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}