mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
377 lines
12 KiB
Plaintext
377 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## RQ3: Table 9 - Application-level source coverage by PII type"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"import os\n",
|
|
"import glob\n",
|
|
"import pandas as pd\n",
|
|
"from collections import defaultdict\n",
|
|
"import re"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import sys\n",
|
|
"import os\n",
|
|
"# Add the parent directory (RQs) to the path to find the config file\n",
|
|
"if '..' not in sys.path:\n",
|
|
" sys.path.insert(1, os.path.abspath('..'))\n",
|
|
"import config"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def parse_filename(filepath):\n",
|
|
" \"\"\"Parses a filename to extract the app ID and database name.\"\"\"\n",
|
|
" base_name = os.path.basename(filepath)\n",
|
|
" # Format: PII_{APP_ID}_{DB_NAME}_{TIMESTAMP}.jsonl\n",
|
|
" match = re.match(r'PII_([A-Z0-9]+)_(.*)_\\d{8}T\\d{6}Z\\.jsonl', base_name)\n",
|
|
" if match:\n",
|
|
" app_id = match.group(1)\n",
|
|
" db_name = match.group(2)\n",
|
|
" return app_id, db_name\n",
|
|
" return None, None\n",
|
|
"\n",
|
|
"def load_data(path):\n",
|
|
" \"\"\"Loads PII presence data from a directory of jsonl files.\"\"\"\n",
|
|
" # Structure: {app_id: {db_name: {pii_type: has_pii_bool}}}\n",
|
|
" data = defaultdict(lambda: defaultdict(lambda: defaultdict(bool)))\n",
|
|
" files = glob.glob(os.path.join(path, '*.jsonl'))\n",
|
|
" for f_path in files:\n",
|
|
" app_id, db_name = parse_filename(f_path)\n",
|
|
" if not app_id or not db_name:\n",
|
|
" continue\n",
|
|
" with open(f_path, 'r') as f:\n",
|
|
" for line in f:\n",
|
|
" record = json.loads(line)\n",
|
|
" pii_type = record['PII_type']\n",
|
|
" if len(record['PII']) > 0:\n",
|
|
" data[app_id][db_name][pii_type] = True\n",
|
|
" return data\n",
|
|
"\n",
|
|
"gt_data = load_data(os.path.join('..', config.GROUND_TRUTH_DIR))\n",
|
|
"system_data = load_data(os.path.join('..', config.GPT4O_RESULTS_DIR))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"table_data = []\n",
|
|
"\n",
|
|
"for app_id, app_name in config.APP_MAPPING.items():\n",
|
|
" row = {'ID': app_id, 'Application': app_name}\n",
|
|
" \n",
|
|
" app_dbs_in_gt = gt_data.get(app_id, {}).keys()\n",
|
|
"\n",
|
|
" # --- Per-PII Type Calculation ---\n",
|
|
" for pii_type in config.PII_TYPES:\n",
|
|
" col_name = config.COLUMN_MAPPING[pii_type]\n",
|
|
" \n",
|
|
" # DG(a,t): set of databases for app 'a' that contain pii_type 't' in ground truth\n",
|
|
" gt_dbs_with_pii = {db for db in app_dbs_in_gt if gt_data.get(app_id, {}).get(db, {}).get(pii_type, False)}\n",
|
|
" \n",
|
|
" # DS(a,t): set of databases for app 'a' that contain pii_type 't' in system output\n",
|
|
" system_dbs_with_pii = {db for db in app_dbs_in_gt if system_data.get(app_id, {}).get(db, {}).get(pii_type, False)}\n",
|
|
" \n",
|
|
" gt_count = len(gt_dbs_with_pii)\n",
|
|
" \n",
|
|
" if gt_count == 0:\n",
|
|
" row[col_name] = '-'\n",
|
|
" else:\n",
|
|
" # covered = |DG(a,t) ∩ DS(a,t)|\n",
|
|
" covered_count = len(gt_dbs_with_pii.intersection(system_dbs_with_pii))\n",
|
|
" row[col_name] = f\"{covered_count}/{gt_count}\"\n",
|
|
"\n",
|
|
" # --- All PII Calculation ---\n",
|
|
" # Databases in GT for this app that have *any* PII type\n",
|
|
" gt_dbs_with_any_pii = {\n",
|
|
" db for db in app_dbs_in_gt \n",
|
|
" if any(gt_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in config.PII_TYPES)\n",
|
|
" }\n",
|
|
" \n",
|
|
" # Databases in system output for this app that have *any* PII type\n",
|
|
" system_dbs_with_any_pii = {\n",
|
|
" db for db in app_dbs_in_gt\n",
|
|
" if any(system_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in config.PII_TYPES)\n",
|
|
" }\n",
|
|
"\n",
|
|
" all_gt_count = len(gt_dbs_with_any_pii)\n",
|
|
" if all_gt_count == 0:\n",
|
|
" row['All PII'] = '-'\n",
|
|
" else:\n",
|
|
" all_covered_count = len(gt_dbs_with_any_pii.intersection(system_dbs_with_any_pii))\n",
|
|
" row['All PII'] = f\"{all_covered_count}/{all_gt_count}\"\n",
|
|
" \n",
|
|
" table_data.append(row)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Application</th>\n",
|
|
" <th>Email</th>\n",
|
|
" <th>Phone</th>\n",
|
|
" <th>User Name</th>\n",
|
|
" <th>Person Name</th>\n",
|
|
" <th>Postal Address</th>\n",
|
|
" <th>All PII</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>ID</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>A1</th>\n",
|
|
" <td>WhatsApp</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>2/2</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>2/2</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>2/2</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>A2</th>\n",
|
|
" <td>Snapchat</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>2/2</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>2/2</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>2/2</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>A3</th>\n",
|
|
" <td>Telegram</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>A4</th>\n",
|
|
" <td>Google Maps</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>A5</th>\n",
|
|
" <td>Samsung Internet</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>I1</th>\n",
|
|
" <td>WhatsApp (iOS)</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>I2</th>\n",
|
|
" <td>Contacts</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>I3</th>\n",
|
|
" <td>Apple Messages</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>0/1</td>\n",
|
|
" <td>0/1</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>I4</th>\n",
|
|
" <td>Safari</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>I5</th>\n",
|
|
" <td>Calendar</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>-</td>\n",
|
|
" <td>1/1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Application Email Phone User Name Person Name Postal Address All PII\n",
|
|
"ID \n",
|
|
"A1 WhatsApp - 2/2 1/1 2/2 - 2/2\n",
|
|
"A2 Snapchat 1/1 2/2 1/1 2/2 - 2/2\n",
|
|
"A3 Telegram - - - - - -\n",
|
|
"A4 Google Maps 1/1 - 1/1 - - 1/1\n",
|
|
"A5 Samsung Internet 1/1 - 1/1 - - 1/1\n",
|
|
"I1 WhatsApp (iOS) - - - 1/1 1/1 1/1\n",
|
|
"I2 Contacts 1/1 1/1 - 1/1 - 1/1\n",
|
|
"I3 Apple Messages 1/1 0/1 0/1 1/1 - 1/1\n",
|
|
"I4 Safari - - 1/1 - - 1/1\n",
|
|
"I5 Calendar 1/1 - - - - 1/1"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df = pd.DataFrame(table_data)\n",
|
|
"\n",
|
|
"# Reorder columns to match Table 9\n",
|
|
"final_columns = ['ID', 'Application'] + [config.COLUMN_MAPPING[pt] for pt in config.PII_TYPES] + ['All PII']\n",
|
|
"df = df[final_columns]\n",
|
|
"\n",
|
|
"df = df.set_index('ID')\n",
|
|
"\n",
|
|
"# Display the dataframe\n",
|
|
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\\begin{table}\n",
|
|
"\\caption{Application-level source coverage by PII type.}\n",
|
|
"\\label{tab:app_level_coverage}\n",
|
|
"\\begin{tabular}{llccccccc}\n",
|
|
"\\toprule\n",
|
|
" & Application & Email & Phone & User Name & Person Name & Postal Address & All PII \\\\\n",
|
|
"ID & & & & & & & \\\\\n",
|
|
"\\midrule\n",
|
|
"A1 & WhatsApp & - & 2/2 & 1/1 & 2/2 & - & 2/2 \\\\\n",
|
|
"A2 & Snapchat & 1/1 & 2/2 & 1/1 & 2/2 & - & 2/2 \\\\\n",
|
|
"A3 & Telegram & - & - & - & - & - & - \\\\\n",
|
|
"A4 & Google Maps & 1/1 & - & 1/1 & - & - & 1/1 \\\\\n",
|
|
"A5 & Samsung Internet & 1/1 & - & 1/1 & - & - & 1/1 \\\\\n",
|
|
"I1 & WhatsApp (iOS) & - & - & - & 1/1 & 1/1 & 1/1 \\\\\n",
|
|
"I2 & Contacts & 1/1 & 1/1 & - & 1/1 & - & 1/1 \\\\\n",
|
|
"I3 & Apple Messages & 1/1 & 0/1 & 0/1 & 1/1 & - & 1/1 \\\\\n",
|
|
"I4 & Safari & - & - & 1/1 & - & - & 1/1 \\\\\n",
|
|
"I5 & Calendar & 1/1 & - & - & - & - & 1/1 \\\\\n",
|
|
"\\bottomrule\n",
|
|
"\\end{tabular}\n",
|
|
"\\end{table}\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Optional: Save to LaTeX\n",
|
|
"latex_output = df.to_latex(index=True, caption='Application-level source coverage by PII type.', label='tab:app_level_coverage', column_format='ll' + 'c' * (len(df.columns)))\n",
|
|
"print(latex_output)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.18"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|