Mirror of https://github.com/frankwxu/mobile-pii-discovery-agent.git (synced 2026-02-20 13:40:41 +00:00)
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e15e3ffe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID  Application       Database              Email  Phone  UserName  PersonName  PostalAddr  Total\n",
      "-------------------------------------------------------------------------------------------------\n",
      "A1  WhatsApp          commerce.db               0      0         0           0           0      0\n",
      "A1  WhatsApp          msgstore.db               0      0         3          26           0     29\n",
      "A1  WhatsApp          wa.db                     0    408         0         129           0    537\n",
      "A2  Snapchat          core.db                   1      1         1           0           0      3\n",
      "A2  Snapchat          journal.db                0      0         0           0           0      0\n",
      "A2  Snapchat          main.db                   1     13      1229         252           0   1495\n",
      "A3  Telegram          account1cache4.db         0      0         0           0           0      0\n",
      "A3  Telegram          account2cache4.db         0      0         0           0           0      0\n",
      "A3  Telegram          account3cache4.db         0      0         0           0           0      0\n",
      "A4  Google Maps       gmm_myplaces.db           0      0         0           0           0      0\n",
      "A4  Google Maps       gmm_storage.db            0      0         0           0           0      0\n",
      "A4  Google Maps       peopleCache_sh....db      2      1         2           2           0      7\n",
      "A5  Samsung Internet  SBrowser.db               1      0         0           2           0      3\n",
      "A5  Samsung Internet  SBrowser2.db              0      0         0           1           0      1\n",
      "A5  Samsung Internet  searchengine.db           0      0         0           0           0      0\n",
      "I1  WhatsApp          CallHistory.sqlite        0      3         0           0           0      3\n",
      "I1  WhatsApp          ChatStorage.sqlite        0     63         0          66           2    131\n",
      "I1  WhatsApp          ContactsV2.sqlite         0      0         0        1350           0   1350\n",
      "I2  Contacts          AddressBook.sqlitedb      6    716         0         733           0   1455\n",
      "I2  Contacts          AddressB....sqlitedb      0      0         0           0           0      0\n",
      "I3  Apple Messages    sms.db                    7      0         0          14           0     21\n",
      "I4  Safari            CloudTabs.db              0      0         0           0           0      0\n",
      "I4  Safari            History.db                0      0         0           0           0      0\n",
      "I5  Calendar          Calendar.sqlitedb         1      0         0           1           0      2\n",
      "I5  Calendar          Extras.db                 0      0         0           0           0      0\n",
      "\n",
      "Wrote LaTeX: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4.tex\n",
      "Wrote CSV: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4_plain.csv\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import csv\n",
    "from pathlib import Path\n",
    "from collections import defaultdict, OrderedDict\n",
    "from typing import Dict, Tuple, List\n",
    "\n",
    "\n",
    "# Per-database PII count tables for RQ1: aggregate the normalized JSONL results,\n",
    "# then emit a LaTeX table, a plain CSV, and an aligned console table.\n",
    "\n",
    "# App code -> display name; multi-word names use a nested tabular so they break across two lines in LaTeX.\n",
    "APP_NAME = OrderedDict([\n",
    "    (\"A1\", \"WhatsApp\"),\n",
    "    (\"A2\", \"Snapchat\"),\n",
    "    (\"A3\", \"Telegram\"),\n",
    "    (\"A4\", r\"\\begin{tabular}[c]{@{}l@{}}Google \\\\Maps\\end{tabular}\"),\n",
    "    (\"A5\", r\"\\begin{tabular}[c]{@{}l@{}}Samsung \\\\Internet\\end{tabular}\"),\n",
    "    (\"I1\", \"WhatsApp\"),\n",
    "    (\"I2\", \"Contacts\"),\n",
    "    (\"I3\", r\"\\begin{tabular}[c]{@{}l@{}}Apple \\\\Messages\\end{tabular}\"),\n",
    "    (\"I4\", \"Safari\"),\n",
    "    (\"I5\", \"Calendar\"),\n",
    "])\n",
    "\n",
    "# Plain-text app names for the console table and the CSV.\n",
    "APP_NAME_PLAIN = OrderedDict([\n",
    "    (\"A1\", \"WhatsApp\"),\n",
    "    (\"A2\", \"Snapchat\"),\n",
    "    (\"A3\", \"Telegram\"),\n",
    "    (\"A4\", \"Google Maps\"),\n",
    "    (\"A5\", \"Samsung Internet\"),\n",
    "    (\"I1\", \"WhatsApp\"),\n",
    "    (\"I2\", \"Contacts\"),\n",
    "    (\"I3\", \"Apple Messages\"),\n",
    "    (\"I4\", \"Safari\"),\n",
    "    (\"I5\", \"Calendar\"),\n",
    "])\n",
    "\n",
    "# Normalized PII_type key -> column header.\n",
    "PII_COLS = OrderedDict([\n",
    "    (\"EMAIL\", \"Email\"),\n",
    "    (\"PHONE\", \"Phone\"),\n",
    "    (\"USERNAME\", \"UserName\"),\n",
    "    (\"PERSON_NAME\", \"PersonName\"),\n",
    "    (\"POSTAL_ADDRESS\", \"PostalAddr\"),\n",
    "])\n",
    "\n",
    "# Database names longer than this are truncated for display.\n",
    "MAX_DB_NAME_LEN = 20\n",
    "\n",
    "\n",
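    "# Input-path convention assumed by the helpers below: each record's db_path is\n",
    "# expected to start with an app code (A1-A5 and I1-I5, which appear to denote the\n",
    "# Android and iOS sources) joined to the database file name by \"_\" or \"-\",\n",
    "# e.g. a hypothetical \"A1_msgstore.db\" yields app code \"A1\" and database \"msgstore.db\".\n",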
    "def get_app_code(db_path: str) -> str:\n",
    "    \"\"\"Return the app code prefix (e.g. A1) from a prefixed db_path.\"\"\"\n",
    "    stem = Path(db_path).stem\n",
    "    return stem.split(\"_\", 1)[0] if \"_\" in stem else stem.split(\"-\", 1)[0]\n",
    "\n",
    "\n",
    "def get_db_filename(db_path: str) -> str:\n",
    "    \"\"\"Return the database file name with any app-code prefix stripped.\"\"\"\n",
    "    name = Path(db_path).name\n",
    "    if \"_\" in name:\n",
    "        return name.split(\"_\", 1)[1]\n",
    "    return name\n",
    "\n",
    "\n",
    "def latex_escape(s: str) -> str:\n",
    "    \"\"\"Escape backslashes and underscores so file names render safely in LaTeX.\"\"\"\n",
    "    return s.replace(\"\\\\\", r\"\\textbackslash{}\").replace(\"_\", r\"\\_\")\n",
    "\n",
    "\n",
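    "# Illustration with a hypothetical name and MAX_DB_NAME_LEN = 20:\n",
    "#   shorten_db_name(\"a_very_long_database_name.db\") -> \"a_very_long_da....db\"\n",
    "# (keep 20 - len(\".db\") - 3 = 14 stem characters, then \"...\" plus the suffix).\n",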
    "def shorten_db_name(db_file: str, max_len: int = MAX_DB_NAME_LEN) -> str:\n",
    "    s = db_file.strip()\n",
    "    if len(s) <= max_len:\n",
    "        return s\n",
    "\n",
    "    p = Path(s)\n",
    "    stem = p.stem\n",
    "    suffix = p.suffix\n",
    "    if not suffix:\n",
    "        return s[: max_len - 3] + \"...\"\n",
    "\n",
    "    keep = max_len - len(suffix) - 3\n",
    "    if keep <= 0:\n",
    "        return s[: max_len - 3] + \"...\"\n",
    "    return stem[:keep] + \"...\" + suffix\n",
    "\n",
    "\n",
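    "# Each *.jsonl line is parsed as one JSON object; only the keys read below are used.\n",
    "# Assumed record shape (values here are illustrative, not taken from the dataset):\n",
    "#   {\"db_path\": \"A1_msgstore.db\", \"PII_type\": \"PHONE\", \"Num_of_PII\": 3}\n",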
    "def load_db_level_counts(folder: Path) -> Dict[Tuple[str, str], Dict[str, int]]:\n",
    "    \"\"\"Aggregate PII counts per (app code, database file) across all JSONL files.\"\"\"\n",
    "    counts: Dict[Tuple[str, str], Dict[str, int]] = defaultdict(lambda: {t: 0 for t in PII_COLS.keys()})\n",
    "\n",
    "    for fp in sorted(folder.glob(\"*.jsonl\")):\n",
    "        with fp.open(\"r\", encoding=\"utf-8\") as f:\n",
    "            for line in f:\n",
    "                line = line.strip()\n",
    "                if not line:\n",
    "                    continue\n",
    "                rec = json.loads(line)\n",
    "                if not isinstance(rec, dict):\n",
    "                    continue\n",
    "\n",
    "                db_path = rec.get(\"db_path\", \"\")\n",
    "                pii_type = (rec.get(\"PII_type\") or \"\").strip().upper()\n",
    "                if not db_path or pii_type not in PII_COLS:\n",
    "                    continue\n",
    "\n",
    "                app = get_app_code(db_path)\n",
    "                db_file = get_db_filename(db_path)\n",
    "\n",
    "                # Treat missing or non-numeric counts (including booleans) as zero.\n",
    "                n = rec.get(\"Num_of_PII\", 0)\n",
    "                if isinstance(n, bool) or not isinstance(n, (int, float)):\n",
    "                    n = 0\n",
    "                counts[(app, db_file)][pii_type] += int(n)\n",
    "\n",
    "    return counts\n",
    "\n",
    "\n",
    "def build_rows_for_plain_and_csv(\n",
    "    counts: Dict[Tuple[str, str], Dict[str, int]],\n",
    ") -> List[List[str]]:\n",
    "    \"\"\"Flatten the nested counts into header + data rows for the console table and CSV.\"\"\"\n",
    "    by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
    "    for (app, db_file), c in counts.items():\n",
    "        by_app[app].append((db_file, c))\n",
    "\n",
    "    for app in by_app:\n",
    "        by_app[app].sort(key=lambda x: x[0].lower())\n",
    "\n",
    "    header = [\"ID\", \"Application\", \"Database\"] + list(PII_COLS.values()) + [\"Total\"]\n",
    "    rows: List[List[str]] = [header]\n",
    "\n",
    "    # Known app codes first, then any unexpected codes in sorted order.\n",
    "    app_order = list(APP_NAME_PLAIN.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME_PLAIN]\n",
    "\n",
    "    for app in app_order:\n",
    "        if app not in by_app:\n",
    "            continue\n",
    "        app_disp = APP_NAME_PLAIN.get(app, app)\n",
    "\n",
    "        for db_file, cdict in by_app[app]:\n",
    "            vals = [int(cdict[k]) for k in PII_COLS.keys()]\n",
    "            total = sum(vals)\n",
    "            db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
    "\n",
    "            rows.append(\n",
    "                [app, app_disp, db_show] + [str(v) for v in vals] + [str(total)]\n",
    "            )\n",
    "\n",
    "    return rows\n",
    "\n",
    "\n",
    "def build_plain_text_table_from_rows(rows: List[List[str]]) -> str:\n",
    "    \"\"\"Render rows as a fixed-width text table with a dashed rule under the header.\"\"\"\n",
    "    widths = [0] * len(rows[0])\n",
    "    for r in rows:\n",
    "        for i, cell in enumerate(r):\n",
    "            widths[i] = max(widths[i], len(cell))\n",
    "\n",
    "    def fmt_row(r: List[str]) -> str:\n",
    "        parts = []\n",
    "        for i, cell in enumerate(r):\n",
    "            # Text columns (ID, Application, Database) left-aligned; counts right-aligned.\n",
    "            if i <= 2:\n",
    "                parts.append(cell.ljust(widths[i]))\n",
    "            else:\n",
    "                parts.append(cell.rjust(widths[i]))\n",
    "        return \"  \".join(parts)\n",
    "\n",
    "    lines = [fmt_row(rows[0])]\n",
    "    lines.append(\"-\" * len(lines[0]))\n",
    "    for r in rows[1:]:\n",
    "        lines.append(fmt_row(r))\n",
    "    return \"\\n\".join(lines)\n",
    "\n",
    "\n",
    "def write_csv(rows: List[List[str]], out_csv: Path) -> None:\n",
    "    \"\"\"Write the header + data rows to out_csv, creating parent folders if needed.\"\"\"\n",
    "    out_csv.parent.mkdir(parents=True, exist_ok=True)\n",
    "    with out_csv.open(\"w\", encoding=\"utf-8\", newline=\"\") as f:\n",
    "        csv.writer(f).writerows(rows)\n",
    "\n",
    "\n",
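    "# An app's first row is emitted roughly as (illustrative, using the A1 commerce.db counts above):\n",
    "#   \\multirow{3}{*}{A1} & \\multirow{3}{*}{WhatsApp} & commerce.db & 0 & 0 & 0 & 0 & 0 & \\textbf{0} \\\\\n",
    "# Later rows for the same app leave the first two cells empty under the multirow.\n",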
    "def format_row_tex(\n",
    "    app_code: str,\n",
    "    app_display: str,\n",
    "    db_file: str,\n",
    "    counts_for_db: Dict[str, int],\n",
    "    is_first_row_for_app: bool,\n",
    "    nrows_for_app: int,\n",
    ") -> str:\n",
    "    \"\"\"Format one LaTeX body row; an app's first row carries the multirow ID and name cells.\"\"\"\n",
    "    vals = {k: int(counts_for_db[k]) for k in PII_COLS.keys()}\n",
    "    total = sum(vals.values())\n",
    "\n",
    "    db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
    "    db_tex = latex_escape(db_show)\n",
    "\n",
    "    if is_first_row_for_app:\n",
    "        return (\n",
    "            rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_code}}} & \"\n",
    "            rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_display}}} & \"\n",
    "            rf\"{db_tex} & \"\n",
    "            + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
    "            + rf\" & \\textbf{{{total}}} \\\\\"\n",
    "        )\n",
    "\n",
    "    return (\n",
    "        rf\" & & {db_tex} & \"\n",
    "        + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
    "        + rf\" & \\textbf{{{total}}} \\\\\"\n",
    "    )\n",
    "\n",
    "\n",
    "def build_table_tex(counts: Dict[Tuple[str, str], Dict[str, int]]) -> str:\n",
    "    \"\"\"Build the full LaTeX table* environment, grouping database rows under each app.\"\"\"\n",
    "    by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
    "    for (app, db_file), c in counts.items():\n",
    "        by_app[app].append((db_file, c))\n",
    "\n",
    "    for app in by_app:\n",
    "        by_app[app].sort(key=lambda x: x[0].lower())\n",
    "\n",
    "    lines: List[str] = []\n",
    "    lines.append(r\"\\begin{table*}[th]\")\n",
    "    lines.append(r\"\\centering\")\n",
    "    lines.append(r\"\\small\")\n",
    "    lines.append(r\"\\caption{PII Discovered Per Application and Database (ChatGPT 4o-mini)}\")\n",
    "    lines.append(r\"\\label{tab:pii_per_app_gpt_details}\")\n",
    "    lines.append(r\"\\begin{tabular}{|c|l|l|r|r|r|r|r|r|}\")\n",
    "    lines.append(r\"\\hline\")\n",
    "    lines.append(\n",
    "        r\"\\textbf{ID} & \\textbf{Application} & \\textbf{Database} & \"\n",
    "        r\"\\textbf{Email} & \\textbf{Phone} & \\textbf{User Name} & \\textbf{Person Name} & \"\n",
    "        r\"\\textbf{Postal Address} & \\textbf{Total PII} \\\\\"\n",
    "    )\n",
    "    lines.append(r\"\\hline\")\n",
    "\n",
    "    # Known app codes first, then any unexpected codes in sorted order.\n",
    "    app_order = list(APP_NAME.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME]\n",
    "\n",
    "    for app in app_order:\n",
    "        if app not in by_app:\n",
    "            continue\n",
    "        db_rows = by_app[app]\n",
    "        nrows = len(db_rows)\n",
    "        app_display = APP_NAME.get(app, app)\n",
    "\n",
    "        for i, (db_file, cdict) in enumerate(db_rows):\n",
    "            lines.append(format_row_tex(app, app_display, db_file, cdict, i == 0, nrows))\n",
    "            # Partial rule between databases of the same app, full rule after its last one.\n",
    "            if i < nrows - 1:\n",
    "                lines.append(r\"\\cline{3-9}\")\n",
    "            else:\n",
    "                lines.append(r\"\\hline\")\n",
    "\n",
    "    lines.append(r\"\\end{tabular}\")\n",
    "    lines.append(r\"\\end{table*}\")\n",
    "    return \"\\n\".join(lines)\n",
    "\n",
    "\n",
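    "# Driver: aggregate the JSONL counts, write the LaTeX table and the plain CSV,\n",
    "# and echo the aligned text table plus the resolved output paths to the console.\n",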
    "def generate_db_level_pii_tables(\n",
    "    in_dir: Path,\n",
    "    out_tex: Path,\n",
    "    out_csv: Path,\n",
    ") -> None:\n",
    "    counts = load_db_level_counts(in_dir)\n",
    "\n",
    "    tex = build_table_tex(counts)\n",
    "    out_tex.parent.mkdir(parents=True, exist_ok=True)\n",
    "    out_tex.write_text(tex, encoding=\"utf-8\")\n",
    "\n",
    "    rows = build_rows_for_plain_and_csv(counts)\n",
    "    write_csv(rows, out_csv)\n",
    "\n",
    "    print(build_plain_text_table_from_rows(rows))\n",
    "    print(f\"\\nWrote LaTeX: {out_tex.resolve()}\")\n",
    "    print(f\"Wrote CSV: {out_csv.resolve()}\")\n",
    "\n",
    "\n",
    "def main() -> None:\n",
    "    in_dir = Path(r\"..\\normalized_PII_results\\GPT-5.1\\db_level\")\n",
    "    out_tex = Path(\"RQ1_t4.tex\")\n",
    "    out_csv = Path(\"RQ1_t4_plain.csv\")\n",
    "    generate_db_level_pii_tables(in_dir, out_tex, out_csv)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}