{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e15e3ffe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID Application Database Email Phone UserName PersonName PostalAddr Total\n",
"-------------------------------------------------------------------------------------------------\n",
"A1 WhatsApp commerce.db 0 0 0 0 0 0\n",
"A1 WhatsApp msgstore.db 0 0 3 26 0 29\n",
"A1 WhatsApp wa.db 0 408 0 129 0 537\n",
"A2 Snapchat core.db 1 1 1 0 0 3\n",
"A2 Snapchat journal.db 0 0 0 0 0 0\n",
"A2 Snapchat main.db 1 13 1229 252 0 1495\n",
"A3 Telegram account1cache4.db 0 0 0 0 0 0\n",
"A3 Telegram account2cache4.db 0 0 0 0 0 0\n",
"A3 Telegram account3cache4.db 0 0 0 0 0 0\n",
"A4 Google Maps gmm_myplaces.db 0 0 0 0 0 0\n",
"A4 Google Maps gmm_storage.db 0 0 0 0 0 0\n",
"A4 Google Maps peopleCache_sh....db 2 1 2 2 0 7\n",
"A5 Samsung Internet SBrowser.db 1 0 0 2 0 3\n",
"A5 Samsung Internet SBrowser2.db 0 0 0 1 0 1\n",
"A5 Samsung Internet searchengine.db 0 0 0 0 0 0\n",
"I1 WhatsApp CallHistory.sqlite 0 3 0 0 0 3\n",
"I1 WhatsApp ChatStorage.sqlite 0 63 0 66 2 131\n",
"I1 WhatsApp ContactsV2.sqlite 0 0 0 1350 0 1350\n",
"I2 Contacts AddressBook.sqlitedb 6 716 0 733 0 1455\n",
"I2 Contacts AddressB....sqlitedb 0 0 0 0 0 0\n",
"I3 Apple Messages sms.db 7 0 0 14 0 21\n",
"I4 Safari CloudTabs.db 0 0 0 0 0 0\n",
"I4 Safari History.db 0 0 0 0 0 0\n",
"I5 Calendar Calendar.sqlitedb 1 0 0 1 0 2\n",
"I5 Calendar Extras.db 0 0 0 0 0 0\n",
"\n",
"Wrote LaTeX: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4.tex\n",
"Wrote CSV: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4_plain.csv\n"
]
}
],
"source": [
"import json\n",
"import csv\n",
"from pathlib import Path\n",
"from collections import defaultdict, OrderedDict\n",
"from typing import Dict, Tuple, List\n",
"\n",
"\n",
"\n",
"APP_NAME = OrderedDict([\n",
" (\"A1\", \"WhatsApp\"),\n",
" (\"A2\", \"Snapchat\"),\n",
" (\"A3\", \"Telegram\"),\n",
" (\"A4\", r\"\\begin{tabular}[c]{@{}l@{}}Google \\\\Maps\\end{tabular}\"),\n",
" (\"A5\", r\"\\begin{tabular}[c]{@{}l@{}}Samsung \\\\Internet\\end{tabular}\"),\n",
" (\"I1\", \"WhatsApp\"),\n",
" (\"I2\", \"Contacts\"),\n",
" (\"I3\", r\"\\begin{tabular}[c]{@{}l@{}}Apple \\\\Messages\\end{tabular}\"),\n",
" (\"I4\", \"Safari\"),\n",
" (\"I5\", \"Calendar\"),\n",
"])\n",
"\n",
"APP_NAME_PLAIN = OrderedDict([\n",
" (\"A1\", \"WhatsApp\"),\n",
" (\"A2\", \"Snapchat\"),\n",
" (\"A3\", \"Telegram\"),\n",
" (\"A4\", \"Google Maps\"),\n",
" (\"A5\", \"Samsung Internet\"),\n",
" (\"I1\", \"WhatsApp\"),\n",
" (\"I2\", \"Contacts\"),\n",
" (\"I3\", \"Apple Messages\"),\n",
" (\"I4\", \"Safari\"),\n",
" (\"I5\", \"Calendar\"),\n",
"])\n",
"\n",
"PII_COLS = OrderedDict([\n",
" (\"EMAIL\", \"Email\"),\n",
" (\"PHONE\", \"Phone\"),\n",
" (\"USERNAME\", \"UserName\"),\n",
" (\"PERSON_NAME\", \"PersonName\"),\n",
" (\"POSTAL_ADDRESS\", \"PostalAddr\"),\n",
"])\n",
"\n",
"MAX_DB_NAME_LEN = 20\n",
"\n",
"\n",
"def get_app_code(db_path: str) -> str:\n",
" stem = Path(db_path).stem\n",
" return stem.split(\"_\", 1)[0] if \"_\" in stem else stem.split(\"-\", 1)[0]\n",
"\n",
"\n",
"def get_db_filename(db_path: str) -> str:\n",
" name = Path(db_path).name\n",
" if \"_\" in name:\n",
" return name.split(\"_\", 1)[1]\n",
" return name\n",
"\n",
"\n",
"def latex_escape(s: str) -> str:\n",
" return s.replace(\"\\\\\", r\"\\textbackslash{}\").replace(\"_\", r\"\\_\")\n",
"\n",
"\n",
"def shorten_db_name(db_file: str, max_len: int = MAX_DB_NAME_LEN) -> str:\n",
" s = db_file.strip()\n",
" if len(s) <= max_len:\n",
" return s\n",
"\n",
" p = Path(s)\n",
" stem = p.stem\n",
" suffix = p.suffix\n",
" if not suffix:\n",
" return s[: max_len - 3] + \"...\"\n",
"\n",
" keep = max_len - len(suffix) - 3\n",
" if keep <= 0:\n",
" return s[: max_len - 3] + \"...\"\n",
" return stem[:keep] + \"...\" + suffix\n",
"\n",
"\n",
"def load_db_level_counts(folder: Path) -> Dict[Tuple[str, str], Dict[str, int]]:\n",
" counts: Dict[Tuple[str, str], Dict[str, int]] = defaultdict(lambda: {t: 0 for t in PII_COLS.keys()})\n",
"\n",
" for fp in sorted(folder.glob(\"*.jsonl\")):\n",
" with fp.open(\"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
" rec = json.loads(line)\n",
" if not isinstance(rec, dict):\n",
" continue\n",
"\n",
" db_path = rec.get(\"db_path\", \"\")\n",
" pii_type = (rec.get(\"PII_type\") or \"\").strip().upper()\n",
" if not db_path or pii_type not in PII_COLS:\n",
" continue\n",
"\n",
" app = get_app_code(db_path)\n",
" db_file = get_db_filename(db_path)\n",
"\n",
" n = rec.get(\"Num_of_PII\", 0)\n",
" if isinstance(n, bool) or not isinstance(n, (int, float)):\n",
" n = 0\n",
" counts[(app, db_file)][pii_type] += int(n)\n",
"\n",
" return counts\n",
"\n",
"\n",
"def build_rows_for_plain_and_csv(\n",
" counts: Dict[Tuple[str, str], Dict[str, int]],\n",
") -> List[List[str]]:\n",
" by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
" for (app, db_file), c in counts.items():\n",
" by_app[app].append((db_file, c))\n",
"\n",
" for app in by_app:\n",
" by_app[app].sort(key=lambda x: x[0].lower())\n",
"\n",
" header = [\"ID\", \"Application\", \"Database\"] + list(PII_COLS.values()) + [\"Total\"]\n",
" rows: List[List[str]] = [header]\n",
"\n",
" app_order = list(APP_NAME_PLAIN.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME_PLAIN]\n",
"\n",
" for app in app_order:\n",
" if app not in by_app:\n",
" continue\n",
" app_disp = APP_NAME_PLAIN.get(app, app)\n",
"\n",
" for db_file, cdict in by_app[app]:\n",
" vals = [int(cdict[k]) for k in PII_COLS.keys()]\n",
" total = sum(vals)\n",
" db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
"\n",
" rows.append(\n",
" [app, app_disp, db_show] + [str(v) for v in vals] + [str(total)]\n",
" )\n",
"\n",
" return rows\n",
"\n",
"\n",
"def build_plain_text_table_from_rows(rows: List[List[str]]) -> str:\n",
" widths = [0] * len(rows[0])\n",
" for r in rows:\n",
" for i, cell in enumerate(r):\n",
" widths[i] = max(widths[i], len(cell))\n",
"\n",
" def fmt_row(r: List[str]) -> str:\n",
" parts = []\n",
" for i, cell in enumerate(r):\n",
" # text columns\n",
" if i <= 2:\n",
" parts.append(cell.ljust(widths[i]))\n",
" else:\n",
" parts.append(cell.rjust(widths[i]))\n",
" return \" \".join(parts)\n",
"\n",
" lines = [fmt_row(rows[0])]\n",
" lines.append(\"-\" * len(lines[0]))\n",
" for r in rows[1:]:\n",
" lines.append(fmt_row(r))\n",
" return \"\\n\".join(lines)\n",
"\n",
"\n",
"def write_csv(rows: List[List[str]], out_csv: Path) -> None:\n",
" out_csv.parent.mkdir(parents=True, exist_ok=True)\n",
" with out_csv.open(\"w\", encoding=\"utf-8\", newline=\"\") as f:\n",
" csv.writer(f).writerows(rows)\n",
"\n",
"\n",
"def format_row_tex(\n",
" app_code: str,\n",
" app_display: str,\n",
" db_file: str,\n",
" counts_for_db: Dict[str, int],\n",
" is_first_row_for_app: bool,\n",
" nrows_for_app: int,\n",
") -> str:\n",
" vals = {k: int(counts_for_db[k]) for k in PII_COLS.keys()}\n",
" total = sum(vals.values())\n",
"\n",
" db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
" db_tex = latex_escape(db_show)\n",
"\n",
" if is_first_row_for_app:\n",
" return (\n",
" rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_code}}} & \"\n",
" rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_display}}} & \"\n",
" rf\"{db_tex} & \"\n",
" + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
" + rf\" & \\textbf{{{total}}} \\\\\"\n",
" )\n",
"\n",
" return (\n",
" rf\" & & {db_tex} & \"\n",
" + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
" + rf\" & \\textbf{{{total}}} \\\\\"\n",
" )\n",
"\n",
"\n",
"def build_table_tex(counts: Dict[Tuple[str, str], Dict[str, int]]) -> str:\n",
" by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
" for (app, db_file), c in counts.items():\n",
" by_app[app].append((db_file, c))\n",
"\n",
" for app in by_app:\n",
" by_app[app].sort(key=lambda x: x[0].lower())\n",
"\n",
" lines: List[str] = []\n",
" lines.append(r\"\\begin{table*}[th]\")\n",
" lines.append(r\"\\centering\")\n",
" lines.append(r\"\\small\")\n",
" lines.append(r\"\\caption{PII Discovered Per Application and Database (ChatGPT 4o-mini)}\")\n",
" lines.append(r\"\\label{tab:pii_per_app_gpt_details}\")\n",
" lines.append(r\"\\begin{tabular}{|c|l|l|r|r|r|r|r|r|}\")\n",
" lines.append(r\"\\hline\")\n",
" lines.append(\n",
" r\"\\textbf{ID} & \\textbf{Application} & \\textbf{Database} & \"\n",
" r\"\\textbf{Email} & \\textbf{Phone} & \\textbf{User Name} & \\textbf{Person Name} & \"\n",
" r\"\\textbf{Postal Address} & \\textbf{Total PII} \\\\\"\n",
" )\n",
" lines.append(r\"\\hline\")\n",
"\n",
" app_order = list(APP_NAME.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME]\n",
"\n",
" for app in app_order:\n",
" if app not in by_app:\n",
" continue\n",
" db_rows = by_app[app]\n",
" nrows = len(db_rows)\n",
" app_display = APP_NAME.get(app, app)\n",
"\n",
" for i, (db_file, cdict) in enumerate(db_rows):\n",
" lines.append(format_row_tex(app, app_display, db_file, cdict, i == 0, nrows))\n",
" if i < nrows - 1:\n",
" lines.append(r\"\\cline{3-9}\")\n",
" else:\n",
" lines.append(r\"\\hline\")\n",
"\n",
" lines.append(r\"\\end{tabular}\")\n",
" lines.append(r\"\\end{table*}\")\n",
" return \"\\n\".join(lines)\n",
"\n",
"\n",
"def generate_db_level_pii_tables(\n",
" in_dir: Path,\n",
" out_tex: Path,\n",
" out_csv: Path,\n",
") -> None:\n",
" counts = load_db_level_counts(in_dir)\n",
"\n",
" tex = build_table_tex(counts)\n",
" out_tex.parent.mkdir(parents=True, exist_ok=True)\n",
" out_tex.write_text(tex, encoding=\"utf-8\")\n",
"\n",
" rows = build_rows_for_plain_and_csv(counts)\n",
" write_csv(rows, out_csv)\n",
"\n",
" print(build_plain_text_table_from_rows(rows))\n",
" print(f\"\\nWrote LaTeX: {out_tex.resolve()}\")\n",
" print(f\"Wrote CSV: {out_csv.resolve()}\")\n",
"\n",
"\n",
"def main() -> None:\n",
" in_dir = Path(r\"..\\normalized_PII_results\\GPT-5.1\\db_level\")\n",
" out_tex = Path(\"RQ1_t4.tex\")\n",
" out_csv = Path(\"RQ1_t4_plain.csv\")\n",
" generate_db_level_pii_tables(in_dir, out_tex, out_csv)\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}