# %% [markdown]
# ## RQ3: Table 10 - Per-application distinct recall

# %%
import json
import os
import glob
import pandas as pd
from collections import defaultdict
import re

# %%
GPT4O_RESULTS_DIR = '../normalized_PII_results/gpt4o/db_level/'
GROUND_TRUTH_DIR = '../normalized_PII_results/ground_truth/db_level/'

PII_TYPES = ['EMAIL', 'PHONE', 'USERNAME', 'PERSON_NAME', 'POSTAL_ADDRESS']

APP_MAPPING = {
    'A1': 'WhatsApp',
    'A2': 'Snapchat',
    'A3': 'Telegram',
    'A4': 'Google Maps',
    'A5': 'Samsung Internet',
    'I1': 'WhatsApp (iOS)',
    'I2': 'Contacts',
    'I3': 'Apple Messages',
    'I4': 'Safari',
    'I5': 'Calendar'
}

COLUMN_MAPPING = {
    'EMAIL': 'Email',
    'PHONE': 'Phone',
    'USERNAME': 'User Name',
    'PERSON_NAME': 'Person Name',
    'POSTAL_ADDRESS': 'Postal Address'
}

# %%
# Compiled once: both patterns are applied to every PII item / file name.
_NON_DIGIT_RE = re.compile(r'\D')
_APP_ID_RE = re.compile(r'PII_([A-Z0-9]+)_')


def canonicalize(pii, pii_type):
    """Return the canonical form of a PII value for the given PII type.

    Rules:
      * every value is converted to ``str`` and stripped of surrounding
        whitespace;
      * ``EMAIL`` and ``PERSON_NAME`` are lower-cased;
      * ``PHONE`` keeps only digits, plus a leading ``+`` when the stripped
        input started with one;
      * any other type (``USERNAME``, ``POSTAL_ADDRESS``, ...) is only
        stripped.

    Returns an empty string when there is nothing meaningful to keep:
    ``None`` input (e.g. a JSON ``null``), or a phone value without any
    digit.  Callers treat the empty string as "skip this item".
    """
    # A JSON null must not become the literal string 'None'.
    if pii is None:
        return ''
    if not isinstance(pii, str):
        pii = str(pii)

    pii = pii.strip()

    if pii_type in ('EMAIL', 'PERSON_NAME'):
        return pii.lower()

    if pii_type == 'PHONE':
        digits = _NON_DIGIT_RE.sub('', pii)
        # Without digits there is no phone number, with or without '+'.
        if not digits:
            return ''
        return '+' + digits if pii.startswith('+') else digits

    return pii


def parse_filename_app_id(filepath):
    """Extract the app ID from a file name of the form ``PII_<APPID>_...``.

    Only the base name of ``filepath`` is inspected.  Returns the app ID
    (upper-case letters and digits) or ``None`` when the name does not match.
    """
    match = _APP_ID_RE.match(os.path.basename(filepath))
    return match.group(1) if match else None


def _as_pii_items(value):
    """Normalize the ``PII`` field of a record to a list of items."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return value
    # A scalar (typically a single string) is one item; iterating a string
    # would wrongly yield its individual characters.
    return [value]


def load_pii_data(path):
    """Load all distinct canonicalized PII values from the ``*.jsonl`` files in ``path``.

    Each line of each file is expected to be a JSON object with a
    ``PII_type`` key and a ``PII`` key holding a list of values.  Files whose
    name yields no app ID are ignored, as are records whose type is not in
    ``PII_TYPES``.  Blank lines are skipped; malformed lines and malformed
    records are reported with a printed warning and skipped instead of
    aborting the whole load.

    Returns a nested ``defaultdict`` of the shape
    ``{app_id: {pii_type: {canonical_pii, ...}}}``.
    """
    data = defaultdict(lambda: defaultdict(set))

    # Sorted so that warnings appear in a reproducible order.
    for f_path in sorted(glob.glob(os.path.join(path, '*.jsonl'))):
        app_id = parse_filename_app_id(f_path)
        if not app_id:
            continue

        with open(f_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Warning: Could not decode JSON from line in {f_path}")
                    continue

                if not isinstance(record, dict) or 'PII_type' not in record:
                    print(f"Warning: Skipping record without 'PII_type' in {f_path}")
                    continue

                pii_type = record['PII_type']
                if pii_type not in PII_TYPES:
                    continue

                for pii_item in _as_pii_items(record.get('PII')):
                    canon_pii = canonicalize(pii_item, pii_type)
                    if canon_pii:
                        data[app_id][pii_type].add(canon_pii)
    return data


gt_pii_sets = load_pii_data(GROUND_TRUTH_DIR)
system_pii_sets = load_pii_data(GPT4O_RESULTS_DIR)

# %%
def distinct_recall(gt_set, system_set):
    """Return ``|G ∩ S| / |G|``, or ``'-'`` when the ground-truth set is empty."""
    if not gt_set:
        return '-'
    return len(gt_set.intersection(system_set)) / len(gt_set)


def build_recall_rows(gt_sets, system_sets):
    """Build one table row per application in ``APP_MAPPING``.

    ``gt_sets`` and ``system_sets`` map ``app_id -> pii_type -> set`` (the
    shape returned by ``load_pii_data``).  Each row is a dict with the keys
    ``'ID'``, ``'Application'``, one column per entry of ``PII_TYPES`` (named
    through ``COLUMN_MAPPING``) and ``'All PII'``.  A cell holds the recall as
    a float, or ``'-'`` when the application has no ground truth for it.
    """
    rows = []
    for app_id, app_name in APP_MAPPING.items():
        # .get() rather than indexing: indexing a defaultdict would insert
        # empty entries for applications that have no data.
        gt_by_type = gt_sets.get(app_id, {})
        system_by_type = system_sets.get(app_id, {})

        row = {'ID': app_id, 'Application': app_name}
        all_gt_pii_for_app = set()
        all_system_pii_for_app = set()

        # Per-type recall: Ra,t = |Ga,t ∩ Sa,t| / |Ga,t|
        for pii_type in PII_TYPES:
            gt_set = gt_by_type.get(pii_type, set())
            system_set = system_by_type.get(pii_type, set())
            all_gt_pii_for_app.update(gt_set)
            all_system_pii_for_app.update(system_set)
            row[COLUMN_MAPPING[pii_type]] = distinct_recall(gt_set, system_set)

        # Overall recall: Ra,all = |(U Ga,t) ∩ (U Sa,t)| / |U Ga,t|
        # The unions are over plain strings, so a value counts as found even
        # when the system reported it under a different PII type.
        row['All PII'] = distinct_recall(all_gt_pii_for_app, all_system_pii_for_app)
        rows.append(row)
    return rows


table_data = build_recall_rows(gt_pii_sets, system_pii_sets)

# %%
# Rendered output (HTML table) of notebook cell 5 follows.
| \n", + " | Application | \n", + "Phone | \n", + "User Name | \n", + "Person Name | \n", + "Postal Address | \n", + "All PII | \n", + "|
|---|---|---|---|---|---|---|---|
| ID | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| A1 | \n", + "- | \n", + "0.78 | \n", + "0.50 | \n", + "0.68 | \n", + "- | \n", + "0.71 | \n", + "|
| A2 | \n", + "Snapchat | \n", + "1.00 | \n", + "1.00 | \n", + "0.33 | \n", + "1.00 | \n", + "- | \n", + "0.79 | \n", + "
| A3 | \n", + "Telegram | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "
| A4 | \n", + "Google Maps | \n", + "1.00 | \n", + "- | \n", + "1.00 | \n", + "- | \n", + "- | \n", + "1.00 | \n", + "
| A5 | \n", + "Samsung Internet | \n", + "1.00 | \n", + "- | \n", + "0.00 | \n", + "- | \n", + "- | \n", + "0.20 | \n", + "
| I1 | \n", + "WhatsApp (iOS) | \n", + "- | \n", + "- | \n", + "- | \n", + "1.00 | \n", + "1.00 | \n", + "1.00 | \n", + "
| I2 | \n", + "Contacts | \n", + "1.00 | \n", + "0.03 | \n", + "- | \n", + "0.86 | \n", + "- | \n", + "0.42 | \n", + "
| I3 | \n", + "Apple Messages | \n", + "1.00 | \n", + "0.00 | \n", + "0.00 | \n", + "1.00 | \n", + "- | \n", + "0.33 | \n", + "
| I4 | \n", + "Safari | \n", + "- | \n", + "- | \n", + "0.02 | \n", + "- | \n", + "- | \n", + "0.02 | \n", + "
| I5 | \n", + "Calendar | \n", + "1.00 | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "1.00 | \n", + "