mobile-pii-discovery-agent/RQs/RQ0/RQ0_2_app_level_aggregation.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "234eed3f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\GPT-5.1\\app_level\\app_level.jsonl\n",
"Wrote: I:\\project2026\\llmagent\\RQs\\normalized_PII_results\\ground_truth\\app_level\\app_level.jsonl\n"
]
}
],
"source": [
"import json\n",
"from pathlib import Path\n",
"from typing import Any, Dict, Tuple\n",
"\n",
"IGNORE_FIELDS = {\"Raw_rows_first_100\", \"Exploration_sql\", \"Extraction_sql\", \"PII_Prompt\"}\n",
"\n",
"\n",
"def get_app_code(db_path: str) -> str:\n",
" \"\"\"\n",
" selectedDBs\\\\A2_journal.db -> A2\n",
" selectedDBs/A1_msgstore.db -> A1\n",
" \"\"\"\n",
" stem = Path(db_path).stem # A2_journal\n",
" return stem.split(\"_\", 1)[0]\n",
"\n",
"\n",
"def _dedupe_preserve_order(items):\n",
" seen = set()\n",
" out = []\n",
" for x in items:\n",
" key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x\n",
" if key in seen:\n",
" continue\n",
" seen.add(key)\n",
" out.append(x)\n",
" return out\n",
"\n",
"\n",
"def prefix_source_columns(db_path: str, cols: list) -> list:\n",
" \"\"\"\n",
" Prefix each source column with the database filename to avoid ambiguity\n",
" after aggregating multiple DB files under the same app.\n",
"\n",
" Example:\n",
" db_path = selectedDBs\\\\A1_msgstore.db\n",
" col = message.text_data\n",
" -> A1_msgstore.db:message.text_data\n",
" \"\"\"\n",
" db_file = Path(db_path).name # includes extension\n",
" out = []\n",
" for c in cols:\n",
" if isinstance(c, str) and c:\n",
" out.append(f\"{db_file}:{c}\")\n",
" return out\n",
"\n",
"\n",
"def aggregate_jsonl_folder(in_dir: str | Path, out_path: str | Path) -> Path:\n",
" \"\"\"\n",
" Read all *.jsonl files under in_dir and aggregate records by:\n",
" (app_code derived from db_path, PII_type)\n",
"\n",
" Output per group:\n",
" - db_path: \"selectedDBs\\\\<APP_CODE>\"\n",
" - PII_type\n",
" - PII_all: with duplicates\n",
" - PII_unique: deduped (exact match)\n",
" - Num_of_PII_all: with duplicates (sum of per-record Num_of_PII or len(PII))\n",
" - Num_of_PII_unique: len(PII_unique)\n",
" - source_columns: deduped, prefixed with db filename\n",
" - other list fields: deduped\n",
" - other numeric fields: summed\n",
" - ignores Raw_rows_first_100, Exploration_sql, Extraction_sql\n",
" \"\"\"\n",
" in_dir = Path(in_dir)\n",
" out_path = Path(out_path)\n",
" out_path.parent.mkdir(parents=True, exist_ok=True)\n",
"\n",
" grouped: Dict[Tuple[str, str], Dict[str, Any]] = {}\n",
"\n",
" for jsonl_file in sorted(in_dir.glob(\"*.jsonl\")):\n",
" with jsonl_file.open(\"r\", encoding=\"utf-8\") as f:\n",
" for line_no, line in enumerate(f, start=1):\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
"\n",
" try:\n",
" rec = json.loads(line)\n",
" except json.JSONDecodeError as e:\n",
" raise ValueError(f\"Bad JSON in {jsonl_file} line {line_no}: {e}\") from e\n",
"\n",
" dbp = rec.get(\"db_path\", \"\")\n",
" pii_type = rec.get(\"PII_type\")\n",
" if not pii_type:\n",
" continue\n",
"\n",
" app_code = get_app_code(dbp)\n",
" key = (app_code, pii_type)\n",
"\n",
" if key not in grouped:\n",
" grouped[key] = {\n",
" \"db_path\": f\"selectedDBs\\\\{app_code}\",\n",
" \"PII_type\": pii_type,\n",
" \"PII_all\": [],\n",
" \"PII_unique\": [],\n",
" \"Num_of_PII_all\": 0,\n",
" \"Num_of_PII_unique\": 0,\n",
" }\n",
"\n",
" agg = grouped[key]\n",
"\n",
" # --- Special handling: PII + counts ---\n",
" pii_list = rec.get(\"PII\", [])\n",
" if isinstance(pii_list, list):\n",
" agg[\"PII_all\"].extend(pii_list)\n",
"\n",
" n = rec.get(\"Num_of_PII\")\n",
" if isinstance(n, (int, float)) and not isinstance(n, bool):\n",
" agg[\"Num_of_PII_all\"] += int(n)\n",
" else:\n",
" agg[\"Num_of_PII_all\"] += len(pii_list) if isinstance(pii_list, list) else 0\n",
"\n",
" # --- Aggregate other fields (arrays/numbers only) ---\n",
" for k, v in rec.items():\n",
" if k in IGNORE_FIELDS:\n",
" continue\n",
" if k in (\"db_path\", \"PII_type\", \"PII\", \"Num_of_PII\"):\n",
" continue\n",
"\n",
" # Prefix source_columns with db filename\n",
" if k == \"source_columns\":\n",
" cols = v if isinstance(v, list) else []\n",
" v = prefix_source_columns(dbp, cols)\n",
"\n",
" if isinstance(v, list):\n",
" if k not in agg:\n",
" agg[k] = []\n",
" if isinstance(agg[k], list):\n",
" agg[k].extend(v)\n",
"\n",
" elif isinstance(v, (int, float)) and not isinstance(v, bool):\n",
" if k not in agg:\n",
" agg[k] = 0\n",
" if isinstance(agg[k], (int, float)) and not isinstance(agg[k], bool):\n",
" agg[k] += v\n",
"\n",
" # ignore non-list, non-numeric values\n",
"\n",
" # --- Finalize: dedupe lists + compute unique PII fields ---\n",
" for agg in grouped.values():\n",
" agg[\"PII_unique\"] = _dedupe_preserve_order(agg[\"PII_all\"])\n",
" agg[\"Num_of_PII_unique\"] = len(agg[\"PII_unique\"])\n",
"\n",
" for k, v in list(agg.items()):\n",
" if isinstance(v, list) and k not in (\"PII_all\", \"PII_unique\"):\n",
" agg[k] = _dedupe_preserve_order(v)\n",
"\n",
" # source_columns counts\n",
" src = agg.get(\"source_columns\", [])\n",
" if isinstance(src, list):\n",
" agg[\"Num_of_source_columns_unique\"] = len(src)\n",
" # optional: with-dup count (before dedupe) is not available anymore here\n",
" # unless you track it separately.\n",
" else:\n",
" agg[\"Num_of_source_columns_unique\"] = 0\n",
"\n",
" # --- Write aggregated JSONL ---\n",
" with out_path.open(\"w\", encoding=\"utf-8\") as f:\n",
" for (app_code, pii_type) in sorted(grouped.keys()):\n",
" f.write(json.dumps(grouped[(app_code, pii_type)], ensure_ascii=False) + \"\\n\")\n",
"\n",
" return out_path\n",
"\n",
"\n",
"if __name__ == \"__main__\": \n",
" # --- Aggregate GPT-4o results ---\n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\GPT-5.1\\app_level\")\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
" out_path = OUT_DIR / \"app_level.jsonl\"\n",
"\n",
" out = aggregate_jsonl_folder(IN_DIR, out_path)\n",
" print(f\"Wrote: {out.resolve()}\")\n",
" \n",
" # --- Aggregate ground truth as well ---\n",
" \n",
" IN_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\db_level\")\n",
" OUT_DIR = Path(r\"..\\normalized_PII_results\\ground_truth\\app_level\")\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
" out_path = OUT_DIR / \"app_level.jsonl\"\n",
"\n",
" out = aggregate_jsonl_folder(IN_DIR, out_path)\n",
" print(f\"Wrote: {out.resolve()}\")\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}