Files
mobile-pii-discovery-agent/RQs/RQ3/RQ3_t10.ipynb
2026-02-11 22:29:04 -05:00

391 lines
13 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RQ3: Table 10 - Per-application distinct recall"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import glob\n",
"import pandas as pd\n",
"from collections import defaultdict\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"# Add the parent directory (RQs) to the path to find the config file\n",
"if '..' not in sys.path:\n",
" sys.path.insert(1, os.path.abspath('..'))\n",
"import config"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def canonicalize(pii, pii_type):\n",
" \"\"\"Applies canonicalization rules to a PII string based on its type.\"\"\"\n",
" if not isinstance(pii, str):\n",
" pii = str(pii)\n",
" \n",
" pii = pii.strip()\n",
" \n",
" if pii_type == 'EMAIL':\n",
" return pii.lower()\n",
" elif pii_type == 'PHONE':\n",
" is_plus_prefix = pii.startswith('+')\n",
" digits = re.sub(r'\\D', '', pii)\n",
" return ('+' + digits) if is_plus_prefix else digits\n",
" elif pii_type == 'PERSON_NAME':\n",
" return pii.lower()\n",
" \n",
" return pii # Default for USERNAME, POSTAL_ADDRESS etc. is just stripping whitespace\n",
"\n",
"def parse_filename_app_id(filepath):\n",
" \"\"\"Parses a filename to extract just the app ID.\"\"\"\n",
" base_name = os.path.basename(filepath)\n",
" match = re.match(r'PII_([A-Z0-9]+)_', base_name)\n",
" if match:\n",
" return match.group(1)\n",
" return None\n",
"\n",
"def load_pii_data(path):\n",
" \"\"\"Loads and aggregates all distinct canonicalized PII from a directory.\"\"\"\n",
" # Structure: {app_id: {pii_type: {set of pii_strings}}}\n",
" data = defaultdict(lambda: defaultdict(set))\n",
" files = glob.glob(os.path.join(path, '*.jsonl'))\n",
" \n",
" for f_path in files:\n",
" app_id = parse_filename_app_id(f_path)\n",
" if not app_id:\n",
" continue\n",
" with open(f_path, 'r', encoding='utf-8') as f:\n",
" for line in f:\n",
" try:\n",
" record = json.loads(line)\n",
" pii_type = record['PII_type']\n",
" if pii_type in config.PII_TYPES:\n",
" for pii_item in record.get('PII', []):\n",
" canon_pii = canonicalize(pii_item, pii_type)\n",
" if canon_pii:\n",
" data[app_id][pii_type].add(canon_pii)\n",
" except json.JSONDecodeError:\n",
" print(f\"Warning: Could not decode JSON from line in {f_path}\")\n",
" return data\n",
"\n",
"gt_pii_sets = load_pii_data(os.path.join('..', config.GROUND_TRUTH_DIR))\n",
"system_pii_sets = load_pii_data(os.path.join('..', config.GPT4O_RESULTS_DIR))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"table_data = []\n",
"\n",
"for app_id, app_name in config.APP_MAPPING.items():\n",
" row = {'ID': app_id, 'Application': app_name}\n",
" \n",
" all_gt_pii_for_app = set()\n",
" all_system_pii_for_app = set()\n",
"\n",
" # --- Per-PII Type Recall Calculation ---\n",
" for pii_type in config.PII_TYPES:\n",
" col_name = config.COLUMN_MAPPING[pii_type]\n",
" \n",
" gt_set = gt_pii_sets.get(app_id, {}).get(pii_type, set())\n",
" system_set = system_pii_sets.get(app_id, {}).get(pii_type, set())\n",
" \n",
" all_gt_pii_for_app.update(gt_set)\n",
" all_system_pii_for_app.update(system_set)\n",
" \n",
" if not gt_set:\n",
" row[col_name] = '-'\n",
" else:\n",
" # Ra,t = |Ga,t ∩ Sa,t| / |Ga,t|\n",
" recall = len(gt_set.intersection(system_set)) / len(gt_set)\n",
" row[col_name] = recall\n",
"\n",
" # --- All PII Recall Calculation ---\n",
" if not all_gt_pii_for_app:\n",
" row['All PII'] = '-'\n",
" else:\n",
" # Ra,all = |(U Ga,t) ∩ (U Sa,t)| / |U Ga,t|\n",
" all_recall = len(all_gt_pii_for_app.intersection(all_system_pii_for_app)) / len(all_gt_pii_for_app)\n",
" row['All PII'] = all_recall\n",
" \n",
" table_data.append(row)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Application</th>\n",
" <th>Email</th>\n",
" <th>Phone</th>\n",
" <th>User Name</th>\n",
" <th>Person Name</th>\n",
" <th>Postal Address</th>\n",
" <th>All PII</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A1</th>\n",
" <td>WhatsApp</td>\n",
" <td>-</td>\n",
" <td>0.96</td>\n",
" <td>0.50</td>\n",
" <td>0.68</td>\n",
" <td>-</td>\n",
" <td>0.79</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A2</th>\n",
" <td>Snapchat</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.33</td>\n",
" <td>1.00</td>\n",
" <td>-</td>\n",
" <td>0.79</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A3</th>\n",
" <td>Telegram</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A4</th>\n",
" <td>Google Maps</td>\n",
" <td>1.00</td>\n",
" <td>-</td>\n",
" <td>1.00</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A5</th>\n",
" <td>Samsung Internet</td>\n",
" <td>1.00</td>\n",
" <td>-</td>\n",
" <td>0.00</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>0.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I1</th>\n",
" <td>WhatsApp (iOS)</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I2</th>\n",
" <td>Contacts</td>\n",
" <td>1.00</td>\n",
" <td>0.47</td>\n",
" <td>-</td>\n",
" <td>0.86</td>\n",
" <td>-</td>\n",
" <td>0.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I3</th>\n",
" <td>Apple Messages</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>-</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I4</th>\n",
" <td>Safari</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>0.02</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>0.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I5</th>\n",
" <td>Calendar</td>\n",
" <td>1.00</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Application Email Phone User Name Person Name Postal Address All PII\n",
"ID \n",
"A1 WhatsApp - 0.96 0.50 0.68 - 0.79\n",
"A2 Snapchat 1.00 1.00 0.33 1.00 - 0.79\n",
"A3 Telegram - - - - - -\n",
"A4 Google Maps 1.00 - 1.00 - - 1.00\n",
"A5 Samsung Internet 1.00 - 0.00 - - 0.20\n",
"I1 WhatsApp (iOS) - - - 1.00 1.00 1.00\n",
"I2 Contacts 1.00 0.47 - 0.86 - 0.65\n",
"I3 Apple Messages 1.00 0.00 0.00 1.00 - 0.33\n",
"I4 Safari - - 0.02 - - 0.02\n",
"I5 Calendar 1.00 - - - - 1.00"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(table_data)\n",
"\n",
"# Reorder columns to match Table 10\n",
"final_columns = ['ID', 'Application'] + [config.COLUMN_MAPPING[pt] for pt in config.PII_TYPES] + ['All PII']\n",
"df = df[final_columns]\n",
"\n",
"df = df.set_index('ID')\n",
"\n",
"# Format numbers to 2 decimal places, replacing non-numeric placeholders with '-\n",
"for col in df.columns:\n",
" if col != 'Application':\n",
" df[col] = df[col].apply(lambda x: f\"{x:.2f}\" if isinstance(x, float) else x)\n",
"\n",
"# Display the dataframe\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\begin{table}\n",
"\\caption{Per-application distinct recall.}\n",
"\\label{tab:app_level_recall}\n",
"\\begin{tabular}{llllllll}\n",
"\\toprule\n",
" & Application & Email & Phone & User Name & Person Name & Postal Address & All PII \\\\\n",
"ID & & & & & & & \\\\\n",
"\\midrule\n",
"A1 & WhatsApp & - & 0.96 & 0.50 & 0.68 & - & 0.79 \\\\\n",
"A2 & Snapchat & 1.00 & 1.00 & 0.33 & 1.00 & - & 0.79 \\\\\n",
"A3 & Telegram & - & - & - & - & - & - \\\\\n",
"A4 & Google Maps & 1.00 & - & 1.00 & - & - & 1.00 \\\\\n",
"A5 & Samsung Internet & 1.00 & - & 0.00 & - & - & 0.20 \\\\\n",
"I1 & WhatsApp (iOS) & - & - & - & 1.00 & 1.00 & 1.00 \\\\\n",
"I2 & Contacts & 1.00 & 0.47 & - & 0.86 & - & 0.65 \\\\\n",
"I3 & Apple Messages & 1.00 & 0.00 & 0.00 & 1.00 & - & 0.33 \\\\\n",
"I4 & Safari & - & - & 0.02 & - & - & 0.02 \\\\\n",
"I5 & Calendar & 1.00 & - & - & - & - & 1.00 \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"\\end{table}\n",
"\n"
]
}
],
"source": [
"# Optional: Save to LaTeX\n",
"latex_output = df.to_latex(index=True, caption='Per-application distinct recall.', label='tab:app_level_recall', na_rep='-')\n",
"print(latex_output)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}