Mirror of https://github.com/frankwxu/mobile-pii-discovery-agent.git — synced 2026-02-20 13:40:41 +00:00
reorganize RQs into different folders
138  RQs/RQ0/RQ0_batch_results_normalization.ipynb  Normal file
@@ -0,0 +1,138 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2d824a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A1_commerce_20260127T175911Z.jsonl\n",
      "PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
      "PII_A1_wa_20260127T180213Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A1_wa_20260127T180213Z.jsonl\n",
      "PII_A2_core_20260127T180339Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A2_core_20260127T180339Z.jsonl\n",
      "PII_A2_journal_20260127T180440Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A2_journal_20260127T180440Z.jsonl\n",
      "PII_A2_main_20260127T180710Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A2_main_20260127T180710Z.jsonl\n",
      "PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
      "PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
      "PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
      "PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
      "PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
      "PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
      "PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
      "PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
      "PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
      "PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
      "PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
      "PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
      "PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
      "PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
      "PII_I3_sms_20260127T183606Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I3_sms_20260127T183606Z.jsonl\n",
      "PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
      "PII_I4_History_20260127T183727Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I4_History_20260127T183727Z.jsonl\n",
      "PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
      "PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I5_Extras_20260127T183857Z.jsonl\n",
      "Done. Files: 25, Records: 125\n",
      "Output folder: I:\\project2026\\llmagent\\RQs\\batch_results_normalized\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "import sys\n",
    "import shutil\n",
    "\n",
    "# IMPORTANT: sys.path needs a DIRECTORY, not the .py file itself\n",
    "STATS_DIR = Path(r\"I:\\project2026\\llmagent\\RQs\").resolve()  # folder containing stats_utils.py\n",
    "sys.path.insert(0, str(STATS_DIR))\n",
    "\n",
    "from stats_utils import normalize_and_slim_record\n",
    "\n",
    "IN_DIR = Path(r\"..\\..\\batch_results\")\n",
    "OUT_DIR = Path(r\"..\\batch_results_normalized\")\n",
    "\n",
    "\n",
    "def process_file(in_path: Path, out_path: Path) -> int:\n",
    "    n = 0\n",
    "    with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
    "        for line in fin:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue\n",
    "            obj = json.loads(line)\n",
    "            if not isinstance(obj, dict):\n",
    "                continue\n",
    "            slim = normalize_and_slim_record(obj)\n",
    "            fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
    "            n += 1\n",
    "    return n\n",
    "\n",
    "\n",
    "def main() -> None:\n",
    "    # Delete OUT_DIR if it exists, then recreate it cleanly\n",
    "    if OUT_DIR.exists():\n",
    "        if OUT_DIR.is_dir():\n",
    "            shutil.rmtree(OUT_DIR)\n",
    "        else:\n",
    "            OUT_DIR.unlink()\n",
    "\n",
    "    OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    files = sorted(IN_DIR.glob(\"*.jsonl\"))\n",
    "    if not files:\n",
    "        print(f\"No .jsonl files found in: {IN_DIR.resolve()}\")\n",
    "        return\n",
    "\n",
    "    total_files = 0\n",
    "    total_records = 0\n",
    "\n",
    "    for fp in files:\n",
    "        out_fp = OUT_DIR / fp.name\n",
    "        n = process_file(fp, out_fp)\n",
    "        print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
    "        total_files += 1\n",
    "        total_records += n\n",
    "\n",
    "    print(f\"Done. Files: {total_files}, Records: {total_records}\")\n",
    "    print(f\"Output folder: {OUT_DIR.resolve()}\")\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0074eda4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -1,122 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c2d824a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_commerce_20260127T175911Z.jsonl\n",
      "PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
      "PII_A1_wa_20260127T180213Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_wa_20260127T180213Z.jsonl\n",
      "PII_A2_core_20260127T180339Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_core_20260127T180339Z.jsonl\n",
      "PII_A2_journal_20260127T180440Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_journal_20260127T180440Z.jsonl\n",
      "PII_A2_main_20260127T180710Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_main_20260127T180710Z.jsonl\n",
      "PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
      "PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
      "PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
      "PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
      "PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
      "PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
      "PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
      "PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
      "PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
      "PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
      "PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
      "PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
      "PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> batch_results_normalized\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
      "PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> batch_results_normalized\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
      "PII_I3_sms_20260127T183606Z.jsonl: 5 records -> batch_results_normalized\\PII_I3_sms_20260127T183606Z.jsonl\n",
      "PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> batch_results_normalized\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
      "PII_I4_History_20260127T183727Z.jsonl: 5 records -> batch_results_normalized\\PII_I4_History_20260127T183727Z.jsonl\n",
      "PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> batch_results_normalized\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
      "PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> batch_results_normalized\\PII_I5_Extras_20260127T183857Z.jsonl\n",
      "Done. Files: 25, Records: 125\n",
      "Output folder: I:\\project2026\\llmagent\\stats\\batch_results_normalized\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "from stats_utils import normalize_and_slim_record\n",
    "\n",
    "IN_DIR = Path(r\"..\\\\batch_results\")\n",
    "OUT_DIR = Path(r\"batch_results_normalized\")\n",
    "\n",
    "def process_file(in_path: Path, out_path: Path) -> int:\n",
    "    n = 0\n",
    "    with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
    "        for line in fin:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue\n",
    "            obj = json.loads(line)\n",
    "            if not isinstance(obj, dict):\n",
    "                continue\n",
    "            slim = normalize_and_slim_record(obj)\n",
    "            fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
    "            n += 1\n",
    "    return n\n",
    "\n",
    "def main() -> None:\n",
    "    OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    files = sorted(IN_DIR.glob(\"*.jsonl\"))\n",
    "    if not files:\n",
    "        print(f\"No .jsonl files found in: {IN_DIR.resolve()}\")\n",
    "        return\n",
    "\n",
    "    total_files = 0\n",
    "    total_records = 0\n",
    "\n",
    "    for fp in files:\n",
    "        out_fp = OUT_DIR / fp.name\n",
    "        n = process_file(fp, out_fp)\n",
    "        print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
    "        total_files += 1\n",
    "        total_records += n\n",
    "\n",
    "    print(f\"Done. Files: {total_files}, Records: {total_records}\")\n",
    "    print(f\"Output folder: {OUT_DIR.resolve()}\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0074eda4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
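(Note: stats_utils.py itself is not part of this diff, so the behavior of normalize_and_slim_record is not shown. For orientation only, a minimal sketch of what such a normalize-and-slim step could look like, assuming the record fields visible in RQ3_t7_corpus_level.ipynb below — PII, Num_of_PII, source_columns, db_path — and its IGNORE_FIELDS set; the real function may differ.)

# Hypothetical stand-in for stats_utils.normalize_and_slim_record -- NOT the
# repo's implementation, just a sketch consistent with the fields that the
# RQ3 notebooks read back out of batch_results_normalized.
import json

IGNORE_FIELDS = {"Raw_rows_first_100", "Exploration_sql", "Extraction_sql", "PII_Prompt"}

def normalize_and_slim_record(obj: dict) -> dict:
    # Drop the bulky prompt/SQL fields, keep everything else.
    slim = {k: v for k, v in obj.items() if k not in IGNORE_FIELDS}
    # Dedupe the PII list while preserving order, mirroring the downstream
    # _dedupe_preserve_order helper, and keep Num_of_PII consistent with it.
    pii = slim.get("PII", [])
    if isinstance(pii, list):
        seen, unique = set(), []
        for x in pii:
            key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x
            if key not in seen:
                seen.add(key)
                unique.append(x)
        slim["PII"] = unique
        slim["Num_of_PII"] = len(unique)
    return slim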
5  RQs/RQ3/RQ3_aggregated_corpus_by_type.jsonl  Normal file
File diff suppressed because one or more lines are too long
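(The suppressed RQ3_aggregated_corpus_by_type.jsonl holds one JSON object per PII type, in the schema produced by aggregate_jsonl_folder_corpus_level in the notebook below. An illustrative record, with invented values:)

# Illustrative only -- the field names come from the aggregator below; the
# values here are made up, not taken from the real corpus.
example_record = {
    "PII_type": "email",
    "PII_all": ["alice@example.com", "alice@example.com"],
    "PII_unique": ["alice@example.com"],
    "Num_of_PII_all": 2,
    "Num_of_PII_unique": 1,
    "source_columns": ["A1_msgstore.db:message.text_data"],
    "Num_of_source_columns": 1,
}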
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "234eed3f",
    "metadata": {},
    "outputs": [
@@ -10,7 +10,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Wrote: I:\\project2026\\llmagent\\stats\\aggregated_by_app_and_type.jsonl\n"
+      "Wrote: I:\\project2026\\llmagent\\stats\\RQ3_aggregated_by_app_and_type.jsonl\n"
      ]
     }
    ],
@@ -180,7 +180,7 @@
     "\n",
     "\n",
     "if __name__ == \"__main__\":\n",
-    "    out = aggregate_jsonl_folder(r\"batch_results_normalized\", \"aggregated_by_app_and_type.jsonl\")\n",
+    "    out = aggregate_jsonl_folder(r\"batch_results_normalized\", \"RQ3_aggregated_by_app_and_type.jsonl\")\n",
     "    print(f\"Wrote: {out.resolve()}\")\n"
     ]
    }
176  RQs/RQ3/RQ3_t7_corpus_level.ipynb  Normal file
@@ -0,0 +1,176 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "234eed3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wrote: I:\\project2026\\llmagent\\stats\\RQ3_aggregated_corpus_by_type.jsonl\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from typing import Any, Dict, Tuple\n",
    "\n",
    "IGNORE_FIELDS = {\"Raw_rows_first_100\", \"Exploration_sql\", \"Extraction_sql\", \"PII_Prompt\"}\n",
    "\n",
    "\n",
    "def _dedupe_preserve_order(items):\n",
    "    \"\"\"\n",
    "    Stable dedupe for lists that may contain scalars, dicts, or lists.\n",
    "    \"\"\"\n",
    "    seen = set()\n",
    "    out = []\n",
    "    for x in items:\n",
    "        key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x\n",
    "        if key in seen:\n",
    "            continue\n",
    "        seen.add(key)\n",
    "        out.append(x)\n",
    "    return out\n",
    "\n",
    "\n",
    "def prefix_source_columns(db_path: str, cols: list) -> list:\n",
    "    \"\"\"\n",
    "    Prefix each source column with the database filename to avoid ambiguity\n",
    "    after aggregating across many DBs.\n",
    "\n",
    "    Example:\n",
    "        db_path = selectedDBs\\\\A1_msgstore.db\n",
    "        col = message.text_data\n",
    "        -> A1_msgstore.db:message.text_data\n",
    "    \"\"\"\n",
    "    db_file = Path(db_path).name\n",
    "    out = []\n",
    "    for c in cols:\n",
    "        if isinstance(c, str) and c:\n",
    "            out.append(f\"{db_file}:{c}\")\n",
    "    return out\n",
    "\n",
    "\n",
    "def aggregate_jsonl_folder_corpus_level(in_dir: str | Path, out_path: str | Path) -> Path:\n",
    "    \"\"\"\n",
    "    Corpus-level aggregation across all *.jsonl files in in_dir, grouped ONLY by PII_type.\n",
    "\n",
    "    Input records are expected to already be normalized (your batch_results_normalized),\n",
    "    but this function still performs dedupe at aggregation time.\n",
    "\n",
    "    Output per PII_type keeps:\n",
    "      - PII_type\n",
    "      - PII_all: concatenated across corpus (with duplicates)\n",
    "      - PII_unique: deduped\n",
    "      - Num_of_PII_all: total count with duplicates (sum of per-record Num_of_PII or len(PII))\n",
    "      - Num_of_PII_unique: len(PII_unique)\n",
    "      - source_columns: deduped, prefixed with db filename\n",
    "      - Num_of_source_columns: len(source_columns)\n",
    "\n",
    "    It ignores IGNORE_FIELDS and discards all other keys.\n",
    "    \"\"\"\n",
    "    in_dir = Path(in_dir)\n",
    "    out_path = Path(out_path)\n",
    "    out_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    grouped: Dict[str, Dict[str, Any]] = {}\n",
    "\n",
    "    for jsonl_file in sorted(in_dir.glob(\"*.jsonl\")):\n",
    "        with jsonl_file.open(\"r\", encoding=\"utf-8\") as f:\n",
    "            for line_no, line in enumerate(f, start=1):\n",
    "                line = line.strip()\n",
    "                if not line:\n",
    "                    continue\n",
    "\n",
    "                try:\n",
    "                    rec = json.loads(line)\n",
    "                except json.JSONDecodeError as e:\n",
    "                    raise ValueError(f\"Bad JSON in {jsonl_file} line {line_no}: {e}\") from e\n",
    "\n",
    "                pii_type = rec.get(\"PII_type\")\n",
    "                if not pii_type:\n",
    "                    continue\n",
    "\n",
    "                if pii_type not in grouped:\n",
    "                    grouped[pii_type] = {\n",
    "                        \"PII_type\": pii_type,\n",
    "                        \"PII_all\": [],\n",
    "                        \"PII_unique\": [],\n",
    "                        \"Num_of_PII_all\": 0,\n",
    "                        \"Num_of_PII_unique\": 0,\n",
    "                        \"source_columns\": [],\n",
    "                        \"Num_of_source_columns\": 0,\n",
    "                    }\n",
    "\n",
    "                agg = grouped[pii_type]\n",
    "\n",
    "                # --- PII + count (with-dup) ---\n",
    "                pii_list = rec.get(\"PII\", [])\n",
    "                if isinstance(pii_list, list):\n",
    "                    agg[\"PII_all\"].extend(pii_list)\n",
    "\n",
    "                n = rec.get(\"Num_of_PII\")\n",
    "                if isinstance(n, (int, float)) and not isinstance(n, bool):\n",
    "                    agg[\"Num_of_PII_all\"] += int(n)\n",
    "                else:\n",
    "                    agg[\"Num_of_PII_all\"] += len(pii_list) if isinstance(pii_list, list) else 0\n",
    "\n",
    "                # --- source_columns (with-dup) ---\n",
    "                dbp = rec.get(\"db_path\", \"\")\n",
    "                cols = rec.get(\"source_columns\", [])\n",
    "                if isinstance(cols, list):\n",
    "                    agg[\"source_columns\"].extend(prefix_source_columns(dbp, cols))\n",
    "\n",
    "                # ignore everything else (and IGNORE_FIELDS)\n",
    "\n",
    "    # --- Finalize: dedupe lists + compute unique counts ---\n",
    "    for agg in grouped.values():\n",
    "        agg[\"PII_unique\"] = _dedupe_preserve_order(agg[\"PII_all\"])\n",
    "        agg[\"Num_of_PII_unique\"] = len(agg[\"PII_unique\"])\n",
    "\n",
    "        agg[\"source_columns\"] = _dedupe_preserve_order(agg[\"source_columns\"])\n",
    "        agg[\"Num_of_source_columns\"] = len(agg[\"source_columns\"])\n",
    "\n",
    "    # --- Write aggregated JSONL ---\n",
    "    with out_path.open(\"w\", encoding=\"utf-8\") as f:\n",
    "        for pii_type in sorted(grouped.keys()):\n",
    "            f.write(json.dumps(grouped[pii_type], ensure_ascii=False) + \"\\n\")\n",
    "\n",
    "    return out_path\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    out = aggregate_jsonl_folder_corpus_level(\n",
    "        r\"batch_results_normalized\",\n",
    "        \"RQ3_aggregated_corpus_by_type.jsonl\",\n",
    "    )\n",
    "    print(f\"Wrote: {out.resolve()}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
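(A quick smoke test of the two helpers defined in the notebook above, runnable in the same kernel. The database path is illustrative and uses a forward slash so Path().name also splits it correctly off Windows:)

# Assumes _dedupe_preserve_order and prefix_source_columns from the cell
# above are already defined in the running kernel.

# Stable dedupe keeps the first occurrence and handles unhashable items
# (dicts, lists) by keying them on their sorted-key JSON encoding.
assert _dedupe_preserve_order(["a", "a", {"x": 1}, {"x": 1}, "b"]) == ["a", {"x": 1}, "b"]

# Source columns get the database filename prepended, so the same column
# name from different DBs stays distinguishable after aggregation.
assert prefix_source_columns("selectedDBs/A1_msgstore.db", ["message.text_data"]) == [
    "A1_msgstore.db:message.text_data"
]
print("helpers behave as documented")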
Binary file not shown.