mobile-pii-discovery-agent/RQs/RQ3/RQ3_t10.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RQ3: Table 10 - Per-application distinct recall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import glob\n",
    "import pandas as pd\n",
    "from collections import defaultdict\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory (RQs) to the path to find the config file.\n",
    "# The membership test must use the same absolute path that gets inserted:\n",
    "# testing for the literal '..' never matches it, so each re-run of this\n",
    "# cell would add another duplicate entry to sys.path.\n",
    "_rqs_dir = os.path.abspath('..')\n",
    "if _rqs_dir not in sys.path:\n",
    "    sys.path.insert(1, _rqs_dir)\n",
    "import config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def canonicalize(pii, pii_type):\n",
    "    \"\"\"Return the canonical form of a PII value for set-based comparison.\n",
    "\n",
    "    Rules, applied after stripping surrounding whitespace:\n",
    "      * EMAIL and PERSON_NAME are lower-cased.\n",
    "      * PHONE keeps only its digits, plus a leading '+' if one was present.\n",
    "      * Any other type is returned stripped but otherwise unchanged.\n",
    "\n",
    "    Args:\n",
    "        pii: The raw PII value. Non-string values are converted with str().\n",
    "        pii_type: The PII type label, e.g. 'EMAIL', 'PHONE', 'PERSON_NAME'.\n",
    "\n",
    "    Returns:\n",
    "        The canonical string. An empty string is returned when there is no\n",
    "        usable value (None input, blank input, or a phone without digits).\n",
    "    \"\"\"\n",
    "    # A missing value (JSON null) would otherwise be turned into the literal\n",
    "    # string 'None' and end up counted as a distinct PII value.\n",
    "    if pii is None:\n",
    "        return ''\n",
    "    if not isinstance(pii, str):\n",
    "        pii = str(pii)\n",
    "\n",
    "    pii = pii.strip()\n",
    "\n",
    "    if pii_type in ('EMAIL', 'PERSON_NAME'):\n",
    "        return pii.lower()\n",
    "    if pii_type == 'PHONE':\n",
    "        digits = re.sub(r'\\D', '', pii)\n",
    "        if not digits:\n",
    "            # A bare '+' (or punctuation only) is not a phone number.\n",
    "            return ''\n",
    "        return ('+' + digits) if pii.startswith('+') else digits\n",
    "\n",
    "    # Default for USERNAME, POSTAL_ADDRESS etc. is just stripping whitespace.\n",
    "    return pii\n",
    "\n",
    "def parse_filename_app_id(filepath):\n",
    "    \"\"\"Return the app ID embedded in a 'PII_<ID>_...' file name, or None.\n",
    "\n",
    "    Only the base name of `filepath` is inspected, and the ID must consist of\n",
    "    upper-case letters and digits directly after the leading 'PII_' prefix.\n",
    "    \"\"\"\n",
    "    file_name = os.path.basename(filepath)\n",
    "    found = re.match(r'PII_([A-Z0-9]+)_', file_name)\n",
    "    return found.group(1) if found else None\n",
    "\n",
    "def load_pii_data(path):\n",
    "    \"\"\"Loads and aggregates all distinct canonicalized PII from a directory.\n",
    "\n",
    "    Every '*.jsonl' file in `path` whose name yields an app ID is read line by\n",
    "    line. Each line is expected to be a JSON object with a 'PII_type' key and\n",
    "    a 'PII' key holding the values found for that type.\n",
    "\n",
    "    Args:\n",
    "        path: Directory containing the JSONL files.\n",
    "\n",
    "    Returns:\n",
    "        Nested mapping {app_id: {pii_type: {set of pii_strings}}}. Only types\n",
    "        listed in config.PII_TYPES are kept, and empty canonical values are\n",
    "        dropped.\n",
    "    \"\"\"\n",
    "    data = defaultdict(lambda: defaultdict(set))\n",
    "\n",
    "    # sorted() keeps the processing (and warning) order deterministic.\n",
    "    for f_path in sorted(glob.glob(os.path.join(path, '*.jsonl'))):\n",
    "        app_id = parse_filename_app_id(f_path)\n",
    "        if not app_id:\n",
    "            continue\n",
    "        with open(f_path, 'r', encoding='utf-8') as f:\n",
    "            for line in f:\n",
    "                if not line.strip():\n",
    "                    # Blank lines are not records; skip them without a warning.\n",
    "                    continue\n",
    "                try:\n",
    "                    record = json.loads(line)\n",
    "                except json.JSONDecodeError:\n",
    "                    print(f\"Warning: Could not decode JSON from line in {f_path}\")\n",
    "                    continue\n",
    "\n",
    "                # A line that parses but is not an object, or that has no\n",
    "                # string 'PII_type', must not abort the whole load.\n",
    "                pii_type = record.get('PII_type') if isinstance(record, dict) else None\n",
    "                if not isinstance(pii_type, str):\n",
    "                    print(f\"Warning: Skipping malformed record (no usable 'PII_type') in {f_path}\")\n",
    "                    continue\n",
    "                if pii_type not in config.PII_TYPES:\n",
    "                    continue\n",
    "\n",
    "                pii_items = record.get('PII')\n",
    "                if pii_items is None:\n",
    "                    continue\n",
    "                if not isinstance(pii_items, (list, tuple)):\n",
    "                    # A bare scalar would otherwise be iterated character by\n",
    "                    # character (str) or raise (number).\n",
    "                    pii_items = [pii_items]\n",
    "\n",
    "                for pii_item in pii_items:\n",
    "                    canon_pii = canonicalize(pii_item, pii_type)\n",
    "                    if canon_pii:\n",
    "                        data[app_id][pii_type].add(canon_pii)\n",
    "    return data\n",
    "\n",
    "# Both directories are configured relative to the parent folder.\n",
    "ground_truth_dir = os.path.join('..', config.GROUND_TRUTH_DIR)\n",
    "gt_pii_sets = load_pii_data(ground_truth_dir)\n",
    "\n",
    "system_results_dir = os.path.join('..', config.GPT4O_RESULTS_DIR)\n",
    "system_pii_sets = load_pii_data(system_results_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "table_data = []\n",
    "\n",
    "for app_id, app_name in config.APP_MAPPING.items():\n",
    "    gt_by_type = gt_pii_sets.get(app_id, {})\n",
    "    system_by_type = system_pii_sets.get(app_id, {})\n",
    "\n",
    "    row = {'ID': app_id, 'Application': app_name}\n",
    "    gt_union = set()\n",
    "    system_union = set()\n",
    "\n",
    "    # Per-type recall: Ra,t = |Ga,t ∩ Sa,t| / |Ga,t|, or '-' when the ground\n",
    "    # truth has no value of that type for this app.\n",
    "    for pii_type in config.PII_TYPES:\n",
    "        col_name = config.COLUMN_MAPPING[pii_type]\n",
    "        gt_set = gt_by_type.get(pii_type, set())\n",
    "        system_set = system_by_type.get(pii_type, set())\n",
    "\n",
    "        gt_union |= gt_set\n",
    "        system_union |= system_set\n",
    "\n",
    "        row[col_name] = len(gt_set & system_set) / len(gt_set) if gt_set else '-'\n",
    "\n",
    "    # Overall recall: Ra,all = |(U Ga,t) ∩ (U Sa,t)| / |U Ga,t|, computed on the\n",
    "    # unions of the per-type sets; '-' when the app has no ground truth at all.\n",
    "    row['All PII'] = len(gt_union & system_union) / len(gt_union) if gt_union else '-'\n",
    "\n",
    "    table_data.append(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Application</th>\n",
       "      <th>Email</th>\n",
       "      <th>Phone</th>\n",
       "      <th>User Name</th>\n",
       "      <th>Person Name</th>\n",
       "      <th>Postal Address</th>\n",
       "      <th>All PII</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1</th>\n",
       "      <td>WhatsApp</td>\n",
       "      <td>-</td>\n",
       "      <td>0.96</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.68</td>\n",
       "      <td>-</td>\n",
       "      <td>0.79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2</th>\n",
       "      <td>Snapchat</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.33</td>\n",
       "      <td>1.00</td>\n",
       "      <td>-</td>\n",
       "      <td>0.79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A3</th>\n",
       "      <td>Telegram</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A4</th>\n",
       "      <td>Google Maps</td>\n",
       "      <td>1.00</td>\n",
       "      <td>-</td>\n",
       "      <td>1.00</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>1.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A5</th>\n",
       "      <td>Samsung Internet</td>\n",
       "      <td>1.00</td>\n",
       "      <td>-</td>\n",
       "      <td>0.00</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>0.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I1</th>\n",
       "      <td>WhatsApp (iOS)</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I2</th>\n",
       "      <td>Contacts</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.47</td>\n",
       "      <td>-</td>\n",
       "      <td>0.86</td>\n",
       "      <td>-</td>\n",
       "      <td>0.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I3</th>\n",
       "      <td>Apple Messages</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>-</td>\n",
       "      <td>0.33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I4</th>\n",
       "      <td>Safari</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>0.02</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>0.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I5</th>\n",
       "      <td>Calendar</td>\n",
       "      <td>1.00</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>1.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Application Email Phone User Name Person Name Postal Address All PII\n",
       "ID                                                                           \n",
       "A1          WhatsApp     -  0.96      0.50        0.68              -    0.79\n",
       "A2          Snapchat  1.00  1.00      0.33        1.00              -    0.79\n",
       "A3          Telegram     -     -         -           -              -       -\n",
       "A4       Google Maps  1.00     -      1.00           -              -    1.00\n",
       "A5  Samsung Internet  1.00     -      0.00           -              -    0.20\n",
       "I1    WhatsApp (iOS)     -     -         -        1.00           1.00    1.00\n",
       "I2          Contacts  1.00  0.47         -        0.86              -    0.65\n",
       "I3    Apple Messages  1.00  0.00      0.00        1.00              -    0.33\n",
       "I4            Safari     -     -      0.02           -              -    0.02\n",
       "I5          Calendar  1.00     -         -           -              -    1.00"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(table_data)\n",
    "\n",
    "# Column order for Table 10: ID, Application, one column per PII type, All PII.\n",
    "pii_columns = [config.COLUMN_MAPPING[pt] for pt in config.PII_TYPES]\n",
    "final_columns = ['ID', 'Application', *pii_columns, 'All PII']\n",
    "df = df[final_columns].set_index('ID')\n",
    "\n",
    "\n",
    "def _two_decimals(value):\n",
    "    \"\"\"Render floats with two decimals; leave anything else (e.g. '-') untouched.\"\"\"\n",
    "    if isinstance(value, float):\n",
    "        return f\"{value:.2f}\"\n",
    "    return value\n",
    "\n",
    "\n",
    "# Format every column except the application name.\n",
    "for col in df.columns:\n",
    "    if col == 'Application':\n",
    "        continue\n",
    "    df[col] = df[col].apply(_two_decimals)\n",
    "\n",
    "# Display the dataframe\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{table}\n",
      "\\caption{Per-application distinct recall.}\n",
      "\\label{tab:app_level_recall}\n",
      "\\begin{tabular}{llllllll}\n",
      "\\toprule\n",
      " & Application & Email & Phone & User Name & Person Name & Postal Address & All PII \\\\\n",
      "ID &  &  &  &  &  &  &  \\\\\n",
      "\\midrule\n",
      "A1 & WhatsApp & - & 0.96 & 0.50 & 0.68 & - & 0.79 \\\\\n",
      "A2 & Snapchat & 1.00 & 1.00 & 0.33 & 1.00 & - & 0.79 \\\\\n",
      "A3 & Telegram & - & - & - & - & - & - \\\\\n",
      "A4 & Google Maps & 1.00 & - & 1.00 & - & - & 1.00 \\\\\n",
      "A5 & Samsung Internet & 1.00 & - & 0.00 & - & - & 0.20 \\\\\n",
      "I1 & WhatsApp (iOS) & - & - & - & 1.00 & 1.00 & 1.00 \\\\\n",
      "I2 & Contacts & 1.00 & 0.47 & - & 0.86 & - & 0.65 \\\\\n",
      "I3 & Apple Messages & 1.00 & 0.00 & 0.00 & 1.00 & - & 0.33 \\\\\n",
      "I4 & Safari & - & - & 0.02 & - & - & 0.02 \\\\\n",
      "I5 & Calendar & 1.00 & - & - & - & - & 1.00 \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\\end{table}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Optional: render the formatted table as LaTeX source and print it.\n",
    "latex_output = df.to_latex(\n",
    "    index=True,\n",
    "    caption='Per-application distinct recall.',\n",
    "    label='tab:app_level_recall',\n",
    "    na_rep='-',\n",
    ")\n",
    "print(latex_output)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}