mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
reorganize RQs in different folders
This commit is contained in:
@@ -42,21 +42,17 @@
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from pathlib import Path\n",
|
||||
"import sys\n",
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"# IMPORTANT: sys.path needs a DIRECTORY, not the .py file itself\n",
|
||||
"STATS_DIR = Path(r\"I:\\project2026\\llmagent\\RQs\").resolve() # folder containing stats_utils.py\n",
|
||||
"sys.path.insert(0, str(STATS_DIR))\n",
|
||||
"\n",
|
||||
"from stats_utils import normalize_and_slim_record\n",
|
||||
"\n",
|
||||
"IN_DIR = Path(r\"..\\..\\batch_results\")\n",
|
||||
"OUT_DIR = Path(r\"..\\batch_results_normalized\")\n",
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"from typing import Callable, Tuple\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def process_file(in_path: Path, out_path: Path) -> int:\n",
|
||||
"def process_file_jsonl(\n",
|
||||
" in_path: Path,\n",
|
||||
" out_path: Path,\n",
|
||||
" normalize_record_fn: Callable[[dict], dict],\n",
|
||||
") -> int:\n",
|
||||
" n = 0\n",
|
||||
" with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
|
||||
" for line in fin:\n",
|
||||
@@ -66,43 +62,66 @@
|
||||
" obj = json.loads(line)\n",
|
||||
" if not isinstance(obj, dict):\n",
|
||||
" continue\n",
|
||||
" slim = normalize_and_slim_record(obj)\n",
|
||||
" slim = normalize_record_fn(obj)\n",
|
||||
" fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
|
||||
" n += 1\n",
|
||||
" return n\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def main() -> None:\n",
|
||||
" # Delete OUT_DIR if it exists, then recreate it cleanly\n",
|
||||
" if OUT_DIR.exists():\n",
|
||||
" if OUT_DIR.is_dir():\n",
|
||||
" shutil.rmtree(OUT_DIR)\n",
|
||||
"def normalize_jsonl_folder(\n",
|
||||
" in_dir: Path,\n",
|
||||
" out_dir: Path,\n",
|
||||
" normalize_record_fn: Callable[[dict], dict],\n",
|
||||
" *,\n",
|
||||
" delete_out_dir_first: bool = True,\n",
|
||||
") -> Tuple[int, int]:\n",
|
||||
" \"\"\"\n",
|
||||
" Normalize every *.jsonl file in `in_dir` and write outputs (same filenames) to `out_dir`.\n",
|
||||
"\n",
|
||||
" Returns: (num_files_processed, num_records_written)\n",
|
||||
" \"\"\"\n",
|
||||
" if delete_out_dir_first and out_dir.exists():\n",
|
||||
" if out_dir.is_dir():\n",
|
||||
" shutil.rmtree(out_dir)\n",
|
||||
" else:\n",
|
||||
" OUT_DIR.unlink()\n",
|
||||
" out_dir.unlink()\n",
|
||||
"\n",
|
||||
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||||
" out_dir.mkdir(parents=True, exist_ok=True)\n",
|
||||
"\n",
|
||||
" files = sorted(IN_DIR.glob(\"*.jsonl\"))\n",
|
||||
" files = sorted(in_dir.glob(\"*.jsonl\"))\n",
|
||||
" if not files:\n",
|
||||
" print(f\"No .jsonl files found in: {IN_DIR.resolve()}\")\n",
|
||||
" return\n",
|
||||
" print(f\"No .jsonl files found in: {in_dir.resolve()}\")\n",
|
||||
" return (0, 0)\n",
|
||||
"\n",
|
||||
" total_files = 0\n",
|
||||
" total_records = 0\n",
|
||||
"\n",
|
||||
" for fp in files:\n",
|
||||
" out_fp = OUT_DIR / fp.name\n",
|
||||
" n = process_file(fp, out_fp)\n",
|
||||
" out_fp = out_dir / fp.name\n",
|
||||
" n = process_file_jsonl(fp, out_fp, normalize_record_fn)\n",
|
||||
" print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
|
||||
" total_files += 1\n",
|
||||
" total_records += n\n",
|
||||
"\n",
|
||||
" print(f\"Done. Files: {total_files}, Records: {total_records}\")\n",
|
||||
" print(f\"Output folder: {OUT_DIR.resolve()}\")\n",
|
||||
" print(f\"Done. Files: {len(files)}, Records: {total_records}\")\n",
|
||||
" print(f\"Output folder: {out_dir.resolve()}\")\n",
|
||||
" return (len(files), total_records)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# ---- Example usage (your exact paths) ----\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" main()"
|
||||
" STATS_DIR = Path(r\"I:\\project2026\\llmagent\\RQs\").resolve() # folder containing stats_utils.py\n",
|
||||
" sys.path.insert(0, str(STATS_DIR))\n",
|
||||
"\n",
|
||||
" from stats_utils import normalize_and_slim_record\n",
|
||||
"\n",
|
||||
" IN_DIR = Path(r\"..\\..\\batch_results\")\n",
|
||||
" OUT_DIR = Path(r\"..\\batch_results_normalized\")\n",
|
||||
"\n",
|
||||
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
|
||||
" \n",
|
||||
" IN_DIR = Path(r\"..\\..\\ground_truth\")\n",
|
||||
" OUT_DIR = Path(r\"..\\ground_truth_normalized\")\n",
|
||||
"\n",
|
||||
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
1
RQs/ground_truth_normalized/PII_A1_commerce.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A1_commerce.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "commerce.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_A1_msgstore.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A1_msgstore.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "msgstore.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["business_name"], "Num_of_source_columns": 1}
|
||||
8
RQs/ground_truth_normalized/PII_A1_wa.jsonl
Normal file
8
RQs/ground_truth_normalized/PII_A1_wa.jsonl
Normal file
@@ -0,0 +1,8 @@
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["number"], "Num_of_source_columns": 1}
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["given_name+family_name", "sort_name"], "Num_of_source_columns": 2}
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["display_name"], "Num_of_source_columns": 1}
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["number"], "Num_of_source_columns": 1}
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["display_name", "wa_name"], "Num_of_source_columns": 2}
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["given_name+family_name", "sort_name"], "Num_of_source_columns": 2}
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["verified_name"], "Num_of_source_columns": 1}
|
||||
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["creator_name"], "Num_of_source_columns": 1}
|
||||
4
RQs/ground_truth_normalized/PII_A2_core.jsonl
Normal file
4
RQs/ground_truth_normalized/PII_A2_core.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}
|
||||
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}
|
||||
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}
|
||||
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}
|
||||
1
RQs/ground_truth_normalized/PII_A2_journal.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A2_journal.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "journal.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
10
RQs/ground_truth_normalized/PII_A2_main.jsonl
Normal file
10
RQs/ground_truth_normalized/PII_A2_main.jsonl
Normal file
@@ -0,0 +1,10 @@
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["encodedusername", "mutableusername", "originalusername"], "Num_of_source_columns": 3}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["phone", "rawphone"], "Num_of_source_columns": 2}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname"], "Num_of_source_columns": 1}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname", "serverdisplayname", "username"], "Num_of_source_columns": 3}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname"], "Num_of_source_columns": 1}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["captiontextdisplay"], "Num_of_source_columns": 1}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname", "serverdisplayname", "username", "usernameforsorting"], "Num_of_source_columns": 4}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayinteractionuserusername", "frienddisplayusername"], "Num_of_source_columns": 2}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayinteractionuserdisplayname"], "Num_of_source_columns": 1}
|
||||
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["frienddisplayname", "friendusername"], "Num_of_source_columns": 2}
|
||||
1
RQs/ground_truth_normalized/PII_A3_account1cache4.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A3_account1cache4.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "account1cache4.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_A3_account2cache4.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A3_account2cache4.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "account2cache4.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_A3_account3cache4.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A3_account3cache4.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "account3cache4.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_A4_gmm_myplaces.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A4_gmm_myplaces.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "gmm_myplaces.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_A4_gmm_storage.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A4_gmm_storage.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "gmm_storage.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
@@ -0,0 +1,6 @@
|
||||
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
|
||||
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
|
||||
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
|
||||
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c1value"], "Num_of_source_columns": 1}
|
||||
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c1value"], "Num_of_source_columns": 1}
|
||||
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c1value"], "Num_of_source_columns": 1}
|
||||
3
RQs/ground_truth_normalized/PII_A5_SBrowser.jsonl
Normal file
3
RQs/ground_truth_normalized/PII_A5_SBrowser.jsonl
Normal file
@@ -0,0 +1,3 @@
|
||||
{"db_path": "SBrowser.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_name"], "Num_of_source_columns": 1}
|
||||
{"db_path": "SBrowser.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_name"], "Num_of_source_columns": 1}
|
||||
{"db_path": "SBrowser.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_name"], "Num_of_source_columns": 1}
|
||||
1
RQs/ground_truth_normalized/PII_A5_SBrowser2.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A5_SBrowser2.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "SBrowser2.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_A5_searchengine.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_A5_searchengine.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "searchengine.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_I1_CallHistory.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_I1_CallHistory.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "CallHistory.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
4
RQs/ground_truth_normalized/PII_I1_ChatStorage.jsonl
Normal file
4
RQs/ground_truth_normalized/PII_I1_ChatStorage.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpartnername"], "Num_of_source_columns": 1}
|
||||
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpartnername"], "Num_of_source_columns": 1}
|
||||
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpushname"], "Num_of_source_columns": 1}
|
||||
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpushname"], "Num_of_source_columns": 1}
|
||||
2
RQs/ground_truth_normalized/PII_I1_ContactsV2.jsonl
Normal file
2
RQs/ground_truth_normalized/PII_I1_ContactsV2.jsonl
Normal file
@@ -0,0 +1,2 @@
|
||||
{"db_path": "ContactsV2.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zfullname"], "Num_of_source_columns": 1}
|
||||
{"db_path": "ContactsV2.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zphonenumber"], "Num_of_source_columns": 1}
|
||||
5
RQs/ground_truth_normalized/PII_I2_AddressBook.jsonl
Normal file
5
RQs/ground_truth_normalized/PII_I2_AddressBook.jsonl
Normal file
@@ -0,0 +1,5 @@
|
||||
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["first+last"], "Num_of_source_columns": 1}
|
||||
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
|
||||
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
|
||||
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c17email"], "Num_of_source_columns": 1}
|
||||
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c0first+c1last+c2middle"], "Num_of_source_columns": 1}
|
||||
@@ -0,0 +1 @@
|
||||
{"db_path": "AddressBookImages.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
6
RQs/ground_truth_normalized/PII_I3_sms.jsonl
Normal file
6
RQs/ground_truth_normalized/PII_I3_sms.jsonl
Normal file
@@ -0,0 +1,6 @@
|
||||
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_login", "chat_identifier", "guid", "last_addressed_handle"], "Num_of_source_columns": 4}
|
||||
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_login", "chat_identifier", "guid"], "Num_of_source_columns": 3}
|
||||
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account", "destination_caller_id"], "Num_of_source_columns": 2}
|
||||
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account"], "Num_of_source_columns": 1}
|
||||
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["id", "uncanonicalized_id"], "Num_of_source_columns": 2}
|
||||
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["id"], "Num_of_source_columns": 1}
|
||||
1
RQs/ground_truth_normalized/PII_I4_CloudTabs.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_I4_CloudTabs.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "CloudTabs.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
1
RQs/ground_truth_normalized/PII_I4_History.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_I4_History.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "History.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
5
RQs/ground_truth_normalized/PII_I5_Calendar.jsonl
Normal file
5
RQs/ground_truth_normalized/PII_I5_Calendar.jsonl
Normal file
@@ -0,0 +1,5 @@
|
||||
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["last_sync_title", "notes", "owner_identity_email", "self_identity_email", "shared_owner_address", "title"], "Num_of_source_columns": 6}
|
||||
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["shared_owner_name"], "Num_of_source_columns": 1}
|
||||
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["address"], "Num_of_source_columns": 1}
|
||||
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["display_name"], "Num_of_source_columns": 1}
|
||||
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["owner_name"], "Num_of_source_columns": 1}
|
||||
1
RQs/ground_truth_normalized/PII_I5_Extras.jsonl
Normal file
1
RQs/ground_truth_normalized/PII_I5_Extras.jsonl
Normal file
@@ -0,0 +1 @@
|
||||
{"db_path": "Extras.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}
|
||||
Reference in New Issue
Block a user