# RQ3: Table 10 - Per-application distinct recall
#
# Loads the ground-truth and GPT-4o PII extractions (one JSONL file per
# database, grouped by application ID), canonicalizes every PII string and
# computes the per-application distinct recall for each PII type and overall.
import glob
import json
import os
import re
from collections import defaultdict

import pandas as pd

GPT4O_RESULTS_DIR = '../normalized_PII_results/gpt4o/db_level/'
GROUND_TRUTH_DIR = '../normalized_PII_results/ground_truth/db_level/'

PII_TYPES = ['EMAIL', 'PHONE', 'USERNAME', 'PERSON_NAME', 'POSTAL_ADDRESS']

APP_MAPPING = {
    'A1': 'WhatsApp',
    'A2': 'Snapchat',
    'A3': 'Telegram',
    'A4': 'Google Maps',
    'A5': 'Samsung Internet',
    'I1': 'WhatsApp (iOS)',
    'I2': 'Contacts',
    'I3': 'Apple Messages',
    'I4': 'Safari',
    'I5': 'Calendar'
}

COLUMN_MAPPING = {
    'EMAIL': 'Email',
    'PHONE': 'Phone',
    'USERNAME': 'User Name',
    'PERSON_NAME': 'Person Name',
    'POSTAL_ADDRESS': 'Postal Address'
}


def canonicalize(pii, pii_type):
    """Apply the canonicalization rules for ``pii_type`` to one PII value.

    Args:
        pii: The raw PII value; non-string values are converted with ``str``.
        pii_type: One of the ``PII_TYPES`` labels.

    Returns:
        The canonical string: surrounding whitespace is always stripped;
        ``EMAIL`` and ``PERSON_NAME`` are lower-cased; ``PHONE`` keeps only
        its digits plus a leading ``+`` when the input started with one.
        Every other type is returned stripped but otherwise unchanged.
    """
    if not isinstance(pii, str):
        pii = str(pii)

    pii = pii.strip()

    if pii_type in ('EMAIL', 'PERSON_NAME'):
        return pii.lower()
    if pii_type == 'PHONE':
        digits = re.sub(r'\D', '', pii)
        return ('+' + digits) if pii.startswith('+') else digits

    return pii  # Default for USERNAME, POSTAL_ADDRESS etc. is just stripping whitespace


def parse_filename_app_id(filepath):
    """Extract the application ID from a ``PII_<APPID>_...`` file name.

    Returns:
        The ID (upper-case letters and digits) or ``None`` when the base name
        does not follow that pattern.
    """
    base_name = os.path.basename(filepath)
    match = re.match(r'PII_([A-Z0-9]+)_', base_name)
    if match:
        return match.group(1)
    return None


def _as_pii_items(raw):
    """Normalize the ``PII`` field of a record into a list of usable items.

    A list/tuple is used as-is minus ``None`` entries (which would otherwise
    be canonicalized to the literal string ``'None'``). A bare string or
    number is treated as a single item instead of being iterated character by
    character. Anything else (missing field, ``null``, objects, booleans)
    yields no items.
    """
    if isinstance(raw, (list, tuple)):
        return [item for item in raw if item is not None]
    if isinstance(raw, (str, int, float)) and not isinstance(raw, bool):
        return [raw]
    return []


def load_pii_data(path):
    """Load and aggregate all distinct canonicalized PII from a directory.

    Every ``*.jsonl`` file in ``path`` whose name yields an application ID is
    read line by line. Blank lines are ignored; lines that are not valid JSON
    and records without a ``PII_type`` are reported with a warning and
    skipped, so one malformed record cannot abort the whole load.

    Args:
        path: Directory containing the ``PII_<APPID>_*.jsonl`` files.

    Returns:
        A nested mapping ``{app_id: {pii_type: set_of_canonical_strings}}``
        (built from ``defaultdict``), restricted to the types in
        ``PII_TYPES``.
    """
    # Structure: {app_id: {pii_type: {set of pii_strings}}}
    data = defaultdict(lambda: defaultdict(set))

    # Sorted so that warnings appear in a reproducible order.
    for f_path in sorted(glob.glob(os.path.join(path, '*.jsonl'))):
        app_id = parse_filename_app_id(f_path)
        if not app_id:
            continue
        with open(f_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # Blank separator lines are not errors.
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Warning: Could not decode JSON from line in {f_path}")
                    continue

                pii_type = record.get('PII_type') if isinstance(record, dict) else None
                if pii_type is None:
                    print(f"Warning: Skipping record without 'PII_type' in {f_path}")
                    continue
                if pii_type not in PII_TYPES:
                    continue

                for pii_item in _as_pii_items(record.get('PII')):
                    canon_pii = canonicalize(pii_item, pii_type)
                    if canon_pii:
                        data[app_id][pii_type].add(canon_pii)
    return data


def _distinct_recall(gt_items, system_items):
    """Return ``|G ∩ S| / |G|``, or ``'-'`` when the ground-truth set is empty."""
    if not gt_items:
        return '-'
    return len(gt_items.intersection(system_items)) / len(gt_items)


def build_recall_table(gt_pii_sets, system_pii_sets):
    """Compute one Table 10 row per application in ``APP_MAPPING``.

    Args:
        gt_pii_sets: Ground truth, ``{app_id: {pii_type: set}}``.
        system_pii_sets: System output in the same shape.

    Returns:
        A list of dicts with the keys ``'ID'``, ``'Application'``, one column
        per PII type (named via ``COLUMN_MAPPING``) and ``'All PII'``. Each
        metric is a float recall, or ``'-'`` when there is no ground truth
        for that cell.
    """
    rows = []
    for app_id, app_name in APP_MAPPING.items():
        row = {'ID': app_id, 'Application': app_name}
        gt_by_type = gt_pii_sets.get(app_id, {})
        system_by_type = system_pii_sets.get(app_id, {})

        all_gt_pii_for_app = set()
        all_system_pii_for_app = set()

        # --- Per-PII Type Recall Calculation ---
        for pii_type in PII_TYPES:
            gt_set = gt_by_type.get(pii_type, set())
            system_set = system_by_type.get(pii_type, set())

            all_gt_pii_for_app.update(gt_set)
            all_system_pii_for_app.update(system_set)

            # Ra,t = |Ga,t ∩ Sa,t| / |Ga,t|
            row[COLUMN_MAPPING[pii_type]] = _distinct_recall(gt_set, system_set)

        # --- All PII Recall Calculation ---
        # Ra,all = |(U Ga,t) ∩ (U Sa,t)| / |U Ga,t|
        row['All PII'] = _distinct_recall(all_gt_pii_for_app, all_system_pii_for_app)

        rows.append(row)
    return rows


gt_pii_sets = load_pii_data(GROUND_TRUTH_DIR)
system_pii_sets = load_pii_data(GPT4O_RESULTS_DIR)

table_data = build_recall_table(gt_pii_sets, system_pii_sets)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ApplicationEmailPhoneUser NamePerson NamePostal AddressAll PII
ID
A1WhatsApp-0.780.500.68-0.71
A2Snapchat1.001.000.331.00-0.79
A3Telegram------
A4Google Maps1.00-1.00--1.00
A5Samsung Internet1.00-0.00--0.20
I1WhatsApp (iOS)---1.001.001.00
I2Contacts1.000.03-0.86-0.42
I3Apple Messages1.000.000.001.00-0.33
I4Safari--0.02--0.02
I5Calendar1.00----1.00
\n", + "
# Table 10 layout: one metric column per PII type, followed by the aggregate.
metric_columns = [COLUMN_MAPPING[pii_type] for pii_type in PII_TYPES]
metric_columns.append('All PII')

# Build the frame in the published column order and index it by application ID.
df = pd.DataFrame(table_data)[['ID', 'Application', *metric_columns]].set_index('ID')


def _two_decimals(value):
    """Render float recalls with two decimals; pass other values (e.g. '-') through."""
    if isinstance(value, float):
        return f"{value:.2f}"
    return value


# Only the metric columns are formatted; 'Application' stays untouched.
for column in metric_columns:
    df[column] = df[column].apply(_two_decimals)

# Display the dataframe
df
# Render the formatted table as a LaTeX `table` environment and print it
# (the result is only printed here, not written to a file).
latex_options = dict(
    index=True,
    caption='Per-application distinct recall.',
    label='tab:app_level_recall',
    na_rep='-',
)
latex_output = df.to_latex(**latex_options)
print(latex_output)