Mirror of https://github.com/frankwxu/mobile-pii-discovery-agent.git (synced 2026-02-20 13:40:41 +00:00)
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e15e3ffe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID  Application       Database              Email  Phone  UserName  PersonName  PostalAddr  Total\n",
      "-------------------------------------------------------------------------------------------------\n",
      "A1  WhatsApp          commerce.db               0      0         0           0           0      0\n",
      "A1  WhatsApp          msgstore.db               0      0         3          26           0     29\n",
      "A1  WhatsApp          wa.db                     0    408         0         129           0    537\n",
      "A2  Snapchat          core.db                   1      1         1           0           0      3\n",
      "A2  Snapchat          journal.db                0      0         0           0           0      0\n",
      "A2  Snapchat          main.db                   1     13      1229         252           0   1495\n",
      "A3  Telegram          account1cache4.db         0      0         0           0           0      0\n",
      "A3  Telegram          account2cache4.db         0      0         0           0           0      0\n",
      "A3  Telegram          account3cache4.db         0      0         0           0           0      0\n",
      "A4  Google Maps       gmm_myplaces.db           0      0         0           0           0      0\n",
      "A4  Google Maps       gmm_storage.db            0      0         0           0           0      0\n",
      "A4  Google Maps       peopleCache_sh....db      2      1         2           2           0      7\n",
      "A5  Samsung Internet  SBrowser.db               1      0         0           2           0      3\n",
      "A5  Samsung Internet  SBrowser2.db              0      0         0           1           0      1\n",
      "A5  Samsung Internet  searchengine.db           0      0         0           0           0      0\n",
      "I1  WhatsApp          CallHistory.sqlite        0      3         0           0           0      3\n",
      "I1  WhatsApp          ChatStorage.sqlite        0     63         0          66           2    131\n",
      "I1  WhatsApp          ContactsV2.sqlite         0      0         0        1350           0   1350\n",
      "I2  Contacts          AddressBook.sqlitedb      6    716         0         733           0   1455\n",
      "I2  Contacts          AddressB....sqlitedb      0      0         0           0           0      0\n",
      "I3  Apple Messages    sms.db                    7      0         0          14           0     21\n",
      "I4  Safari            CloudTabs.db              0      0         0           0           0      0\n",
      "I4  Safari            History.db                0      0         0           0           0      0\n",
      "I5  Calendar          Calendar.sqlitedb         1      0         0           1           0      2\n",
      "I5  Calendar          Extras.db                 0      0         0           0           0      0\n",
      "\n",
      "Wrote LaTeX: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4.tex\n",
      "Wrote CSV: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4_plain.csv\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import csv\n",
    "from pathlib import Path\n",
    "from collections import defaultdict, OrderedDict\n",
    "from typing import Dict, Tuple, List\n",
    "\n",
    "\n",
    "# Per-database PII count tables for RQ1: aggregate the normalized JSONL results,\n",
    "# then emit a LaTeX table, a plain CSV, and an aligned console table.\n",
    "\n",
    "# App code -> display name; multi-word names use a nested tabular so they break across two lines in LaTeX.\n",
    "APP_NAME = OrderedDict([\n",
    "    (\"A1\", \"WhatsApp\"),\n",
    "    (\"A2\", \"Snapchat\"),\n",
    "    (\"A3\", \"Telegram\"),\n",
    "    (\"A4\", r\"\\begin{tabular}[c]{@{}l@{}}Google \\\\Maps\\end{tabular}\"),\n",
    "    (\"A5\", r\"\\begin{tabular}[c]{@{}l@{}}Samsung \\\\Internet\\end{tabular}\"),\n",
    "    (\"I1\", \"WhatsApp\"),\n",
    "    (\"I2\", \"Contacts\"),\n",
    "    (\"I3\", r\"\\begin{tabular}[c]{@{}l@{}}Apple \\\\Messages\\end{tabular}\"),\n",
    "    (\"I4\", \"Safari\"),\n",
    "    (\"I5\", \"Calendar\"),\n",
    "])\n",
    "\n",
    "# Plain-text app names for the console table and the CSV.\n",
    "APP_NAME_PLAIN = OrderedDict([\n",
    "    (\"A1\", \"WhatsApp\"),\n",
    "    (\"A2\", \"Snapchat\"),\n",
    "    (\"A3\", \"Telegram\"),\n",
    "    (\"A4\", \"Google Maps\"),\n",
    "    (\"A5\", \"Samsung Internet\"),\n",
    "    (\"I1\", \"WhatsApp\"),\n",
    "    (\"I2\", \"Contacts\"),\n",
    "    (\"I3\", \"Apple Messages\"),\n",
    "    (\"I4\", \"Safari\"),\n",
    "    (\"I5\", \"Calendar\"),\n",
    "])\n",
    "\n",
    "# Normalized PII_type key -> column header.\n",
    "PII_COLS = OrderedDict([\n",
    "    (\"EMAIL\", \"Email\"),\n",
    "    (\"PHONE\", \"Phone\"),\n",
    "    (\"USERNAME\", \"UserName\"),\n",
    "    (\"PERSON_NAME\", \"PersonName\"),\n",
    "    (\"POSTAL_ADDRESS\", \"PostalAddr\"),\n",
    "])\n",
    "\n",
    "# Database names longer than this are truncated for display.\n",
    "MAX_DB_NAME_LEN = 20\n",
    "\n",
    "\n",
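    "# Input-path convention assumed by the helpers below: each record's db_path is\n",
    "# expected to start with an app code (A1-A5 and I1-I5, which appear to denote the\n",
    "# Android and iOS sources) joined to the database file name by \"_\" or \"-\",\n",
    "# e.g. a hypothetical \"A1_msgstore.db\" yields app code \"A1\" and database \"msgstore.db\".\n",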
    "def get_app_code(db_path: str) -> str:\n",
    "    \"\"\"Return the app code prefix (e.g. A1) from a prefixed db_path.\"\"\"\n",
    "    stem = Path(db_path).stem\n",
    "    return stem.split(\"_\", 1)[0] if \"_\" in stem else stem.split(\"-\", 1)[0]\n",
    "\n",
    "\n",
    "def get_db_filename(db_path: str) -> str:\n",
    "    \"\"\"Return the database file name with any app-code prefix stripped.\"\"\"\n",
    "    name = Path(db_path).name\n",
    "    if \"_\" in name:\n",
    "        return name.split(\"_\", 1)[1]\n",
    "    return name\n",
    "\n",
    "\n",
    "def latex_escape(s: str) -> str:\n",
    "    \"\"\"Escape backslashes and underscores so file names render safely in LaTeX.\"\"\"\n",
    "    return s.replace(\"\\\\\", r\"\\textbackslash{}\").replace(\"_\", r\"\\_\")\n",
    "\n",
    "\n",
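    "# Illustration with a hypothetical name and MAX_DB_NAME_LEN = 20:\n",
    "#   shorten_db_name(\"a_very_long_database_name.db\") -> \"a_very_long_da....db\"\n",
    "# (keep 20 - len(\".db\") - 3 = 14 stem characters, then \"...\" plus the suffix).\n",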
    "def shorten_db_name(db_file: str, max_len: int = MAX_DB_NAME_LEN) -> str:\n",
    "    s = db_file.strip()\n",
    "    if len(s) <= max_len:\n",
    "        return s\n",
    "\n",
    "    p = Path(s)\n",
    "    stem = p.stem\n",
    "    suffix = p.suffix\n",
    "    if not suffix:\n",
    "        return s[: max_len - 3] + \"...\"\n",
    "\n",
    "    keep = max_len - len(suffix) - 3\n",
    "    if keep <= 0:\n",
    "        return s[: max_len - 3] + \"...\"\n",
    "    return stem[:keep] + \"...\" + suffix\n",
    "\n",
    "\n",
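    "# Each *.jsonl line is parsed as one JSON object; only the keys read below are used.\n",
    "# Assumed record shape (values here are illustrative, not taken from the dataset):\n",
    "#   {\"db_path\": \"A1_msgstore.db\", \"PII_type\": \"PHONE\", \"Num_of_PII\": 3}\n",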
    "def load_db_level_counts(folder: Path) -> Dict[Tuple[str, str], Dict[str, int]]:\n",
    "    \"\"\"Aggregate PII counts per (app code, database file) across all JSONL files.\"\"\"\n",
    "    counts: Dict[Tuple[str, str], Dict[str, int]] = defaultdict(lambda: {t: 0 for t in PII_COLS.keys()})\n",
    "\n",
    "    for fp in sorted(folder.glob(\"*.jsonl\")):\n",
    "        with fp.open(\"r\", encoding=\"utf-8\") as f:\n",
    "            for line in f:\n",
    "                line = line.strip()\n",
    "                if not line:\n",
    "                    continue\n",
    "                rec = json.loads(line)\n",
    "                if not isinstance(rec, dict):\n",
    "                    continue\n",
    "\n",
    "                db_path = rec.get(\"db_path\", \"\")\n",
    "                pii_type = (rec.get(\"PII_type\") or \"\").strip().upper()\n",
    "                if not db_path or pii_type not in PII_COLS:\n",
    "                    continue\n",
    "\n",
    "                app = get_app_code(db_path)\n",
    "                db_file = get_db_filename(db_path)\n",
    "\n",
    "                # Treat missing or non-numeric counts (including booleans) as zero.\n",
    "                n = rec.get(\"Num_of_PII\", 0)\n",
    "                if isinstance(n, bool) or not isinstance(n, (int, float)):\n",
    "                    n = 0\n",
    "                counts[(app, db_file)][pii_type] += int(n)\n",
    "\n",
    "    return counts\n",
    "\n",
    "\n",
    "def build_rows_for_plain_and_csv(\n",
    "    counts: Dict[Tuple[str, str], Dict[str, int]],\n",
    ") -> List[List[str]]:\n",
    "    \"\"\"Flatten the nested counts into header + data rows for the console table and CSV.\"\"\"\n",
    "    by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
    "    for (app, db_file), c in counts.items():\n",
    "        by_app[app].append((db_file, c))\n",
    "\n",
    "    for app in by_app:\n",
    "        by_app[app].sort(key=lambda x: x[0].lower())\n",
    "\n",
    "    header = [\"ID\", \"Application\", \"Database\"] + list(PII_COLS.values()) + [\"Total\"]\n",
    "    rows: List[List[str]] = [header]\n",
    "\n",
    "    # Known app codes first, then any unexpected codes in sorted order.\n",
    "    app_order = list(APP_NAME_PLAIN.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME_PLAIN]\n",
    "\n",
    "    for app in app_order:\n",
    "        if app not in by_app:\n",
    "            continue\n",
    "        app_disp = APP_NAME_PLAIN.get(app, app)\n",
    "\n",
    "        for db_file, cdict in by_app[app]:\n",
    "            vals = [int(cdict[k]) for k in PII_COLS.keys()]\n",
    "            total = sum(vals)\n",
    "            db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
    "\n",
    "            rows.append(\n",
    "                [app, app_disp, db_show] + [str(v) for v in vals] + [str(total)]\n",
    "            )\n",
    "\n",
    "    return rows\n",
    "\n",
    "\n",
    "def build_plain_text_table_from_rows(rows: List[List[str]]) -> str:\n",
    "    \"\"\"Render rows as a fixed-width text table with a dashed rule under the header.\"\"\"\n",
    "    widths = [0] * len(rows[0])\n",
    "    for r in rows:\n",
    "        for i, cell in enumerate(r):\n",
    "            widths[i] = max(widths[i], len(cell))\n",
    "\n",
    "    def fmt_row(r: List[str]) -> str:\n",
    "        parts = []\n",
    "        for i, cell in enumerate(r):\n",
    "            # Text columns (ID, Application, Database) left-aligned; counts right-aligned.\n",
    "            if i <= 2:\n",
    "                parts.append(cell.ljust(widths[i]))\n",
    "            else:\n",
    "                parts.append(cell.rjust(widths[i]))\n",
    "        return \"  \".join(parts)\n",
    "\n",
    "    lines = [fmt_row(rows[0])]\n",
    "    lines.append(\"-\" * len(lines[0]))\n",
    "    for r in rows[1:]:\n",
    "        lines.append(fmt_row(r))\n",
    "    return \"\\n\".join(lines)\n",
    "\n",
    "\n",
    "def write_csv(rows: List[List[str]], out_csv: Path) -> None:\n",
    "    \"\"\"Write the header + data rows to out_csv, creating parent folders if needed.\"\"\"\n",
    "    out_csv.parent.mkdir(parents=True, exist_ok=True)\n",
    "    with out_csv.open(\"w\", encoding=\"utf-8\", newline=\"\") as f:\n",
    "        csv.writer(f).writerows(rows)\n",
    "\n",
    "\n",
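    "# An app's first row is emitted roughly as (illustrative, using the A1 commerce.db counts above):\n",
    "#   \\multirow{3}{*}{A1} & \\multirow{3}{*}{WhatsApp} & commerce.db & 0 & 0 & 0 & 0 & 0 & \\textbf{0} \\\\\n",
    "# Later rows for the same app leave the first two cells empty under the multirow.\n",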
    "def format_row_tex(\n",
    "    app_code: str,\n",
    "    app_display: str,\n",
    "    db_file: str,\n",
    "    counts_for_db: Dict[str, int],\n",
    "    is_first_row_for_app: bool,\n",
    "    nrows_for_app: int,\n",
    ") -> str:\n",
    "    \"\"\"Format one LaTeX body row; an app's first row carries the multirow ID and name cells.\"\"\"\n",
    "    vals = {k: int(counts_for_db[k]) for k in PII_COLS.keys()}\n",
    "    total = sum(vals.values())\n",
    "\n",
    "    db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
    "    db_tex = latex_escape(db_show)\n",
    "\n",
    "    if is_first_row_for_app:\n",
    "        return (\n",
    "            rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_code}}} & \"\n",
    "            rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_display}}} & \"\n",
    "            rf\"{db_tex} & \"\n",
    "            + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
    "            + rf\" & \\textbf{{{total}}} \\\\\"\n",
    "        )\n",
    "\n",
    "    return (\n",
    "        rf\" & & {db_tex} & \"\n",
    "        + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
    "        + rf\" & \\textbf{{{total}}} \\\\\"\n",
    "    )\n",
    "\n",
    "\n",
    "def build_table_tex(counts: Dict[Tuple[str, str], Dict[str, int]]) -> str:\n",
    "    \"\"\"Build the full LaTeX table* environment, grouping database rows under each app.\"\"\"\n",
    "    by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
    "    for (app, db_file), c in counts.items():\n",
    "        by_app[app].append((db_file, c))\n",
    "\n",
    "    for app in by_app:\n",
    "        by_app[app].sort(key=lambda x: x[0].lower())\n",
    "\n",
    "    lines: List[str] = []\n",
    "    lines.append(r\"\\begin{table*}[th]\")\n",
    "    lines.append(r\"\\centering\")\n",
    "    lines.append(r\"\\small\")\n",
    "    lines.append(r\"\\caption{PII Discovered Per Application and Database (ChatGPT 4o-mini)}\")\n",
    "    lines.append(r\"\\label{tab:pii_per_app_gpt_details}\")\n",
    "    lines.append(r\"\\begin{tabular}{|c|l|l|r|r|r|r|r|r|}\")\n",
    "    lines.append(r\"\\hline\")\n",
    "    lines.append(\n",
    "        r\"\\textbf{ID} & \\textbf{Application} & \\textbf{Database} & \"\n",
    "        r\"\\textbf{Email} & \\textbf{Phone} & \\textbf{User Name} & \\textbf{Person Name} & \"\n",
    "        r\"\\textbf{Postal Address} & \\textbf{Total PII} \\\\\"\n",
    "    )\n",
    "    lines.append(r\"\\hline\")\n",
    "\n",
    "    # Known app codes first, then any unexpected codes in sorted order.\n",
    "    app_order = list(APP_NAME.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME]\n",
    "\n",
    "    for app in app_order:\n",
    "        if app not in by_app:\n",
    "            continue\n",
    "        db_rows = by_app[app]\n",
    "        nrows = len(db_rows)\n",
    "        app_display = APP_NAME.get(app, app)\n",
    "\n",
    "        for i, (db_file, cdict) in enumerate(db_rows):\n",
    "            lines.append(format_row_tex(app, app_display, db_file, cdict, i == 0, nrows))\n",
    "            # Partial rule between databases of the same app, full rule after its last one.\n",
    "            if i < nrows - 1:\n",
    "                lines.append(r\"\\cline{3-9}\")\n",
    "            else:\n",
    "                lines.append(r\"\\hline\")\n",
    "\n",
    "    lines.append(r\"\\end{tabular}\")\n",
    "    lines.append(r\"\\end{table*}\")\n",
    "    return \"\\n\".join(lines)\n",
    "\n",
    "\n",
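    "# Driver: aggregate the JSONL counts, write the LaTeX table and the plain CSV,\n",
    "# and echo the aligned text table plus the resolved output paths to the console.\n",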
    "def generate_db_level_pii_tables(\n",
    "    in_dir: Path,\n",
    "    out_tex: Path,\n",
    "    out_csv: Path,\n",
    ") -> None:\n",
    "    counts = load_db_level_counts(in_dir)\n",
    "\n",
    "    tex = build_table_tex(counts)\n",
    "    out_tex.parent.mkdir(parents=True, exist_ok=True)\n",
    "    out_tex.write_text(tex, encoding=\"utf-8\")\n",
    "\n",
    "    rows = build_rows_for_plain_and_csv(counts)\n",
    "    write_csv(rows, out_csv)\n",
    "\n",
    "    print(build_plain_text_table_from_rows(rows))\n",
    "    print(f\"\\nWrote LaTeX: {out_tex.resolve()}\")\n",
    "    print(f\"Wrote CSV: {out_csv.resolve()}\")\n",
    "\n",
    "\n",
    "def main() -> None:\n",
    "    in_dir = Path(r\"..\\normalized_PII_results\\GPT-5.1\\db_level\")\n",
    "    out_tex = Path(\"RQ1_t4.tex\")\n",
    "    out_csv = Path(\"RQ1_t4_plain.csv\")\n",
    "    generate_db_level_pii_tables(in_dir, out_tex, out_csv)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}