reorganize RQs in different folders

This commit is contained in:
Frank Xu
2026-01-28 12:56:56 -05:00
parent 19aaf185e2
commit b85e861adb
26 changed files with 118 additions and 31 deletions

View File

@@ -42,21 +42,17 @@
],
"source": [
"import json\n",
"from pathlib import Path\n",
"import sys\n",
"import shutil\n",
"\n",
"# IMPORTANT: sys.path needs a DIRECTORY, not the .py file itself\n",
"STATS_DIR = Path(r\"I:\\project2026\\llmagent\\RQs\").resolve() # folder containing stats_utils.py\n",
"sys.path.insert(0, str(STATS_DIR))\n",
"\n",
"from stats_utils import normalize_and_slim_record\n",
"\n",
"IN_DIR = Path(r\"..\\..\\batch_results\")\n",
"OUT_DIR = Path(r\"..\\batch_results_normalized\")\n",
"import sys\n",
"from pathlib import Path\n",
"from typing import Callable, Tuple\n",
"\n",
"\n",
"def process_file(in_path: Path, out_path: Path) -> int:\n",
"def process_file_jsonl(\n",
" in_path: Path,\n",
" out_path: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
") -> int:\n",
" n = 0\n",
" with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
" for line in fin:\n",
@@ -66,43 +62,66 @@
" obj = json.loads(line)\n",
" if not isinstance(obj, dict):\n",
" continue\n",
" slim = normalize_and_slim_record(obj)\n",
" slim = normalize_record_fn(obj)\n",
" fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
" n += 1\n",
" return n\n",
"\n",
"\n",
"def main() -> None:\n",
" # Delete OUT_DIR if it exists, then recreate it cleanly\n",
" if OUT_DIR.exists():\n",
" if OUT_DIR.is_dir():\n",
" shutil.rmtree(OUT_DIR)\n",
"def normalize_jsonl_folder(\n",
" in_dir: Path,\n",
" out_dir: Path,\n",
" normalize_record_fn: Callable[[dict], dict],\n",
" *,\n",
" delete_out_dir_first: bool = True,\n",
") -> Tuple[int, int]:\n",
" \"\"\"\n",
" Normalize every *.jsonl file in `in_dir` and write outputs (same filenames) to `out_dir`.\n",
"\n",
" Returns: (num_files_processed, num_records_written)\n",
" \"\"\"\n",
" if delete_out_dir_first and out_dir.exists():\n",
" if out_dir.is_dir():\n",
" shutil.rmtree(out_dir)\n",
" else:\n",
" OUT_DIR.unlink()\n",
" out_dir.unlink()\n",
"\n",
" OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
" out_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
" files = sorted(IN_DIR.glob(\"*.jsonl\"))\n",
" files = sorted(in_dir.glob(\"*.jsonl\"))\n",
" if not files:\n",
" print(f\"No .jsonl files found in: {IN_DIR.resolve()}\")\n",
" return\n",
" print(f\"No .jsonl files found in: {in_dir.resolve()}\")\n",
" return (0, 0)\n",
"\n",
" total_files = 0\n",
" total_records = 0\n",
"\n",
" for fp in files:\n",
" out_fp = OUT_DIR / fp.name\n",
" n = process_file(fp, out_fp)\n",
" out_fp = out_dir / fp.name\n",
" n = process_file_jsonl(fp, out_fp, normalize_record_fn)\n",
" print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
" total_files += 1\n",
" total_records += n\n",
"\n",
" print(f\"Done. Files: {total_files}, Records: {total_records}\")\n",
" print(f\"Output folder: {OUT_DIR.resolve()}\")\n",
" print(f\"Done. Files: {len(files)}, Records: {total_records}\")\n",
" print(f\"Output folder: {out_dir.resolve()}\")\n",
" return (len(files), total_records)\n",
"\n",
"\n",
"# ---- Example usage (your exact paths) ----\n",
"if __name__ == \"__main__\":\n",
" main()"
" STATS_DIR = Path(r\"I:\\project2026\\llmagent\\RQs\").resolve() # folder containing stats_utils.py\n",
" sys.path.insert(0, str(STATS_DIR))\n",
"\n",
" from stats_utils import normalize_and_slim_record\n",
"\n",
" IN_DIR = Path(r\"..\\..\\batch_results\")\n",
" OUT_DIR = Path(r\"..\\batch_results_normalized\")\n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" \n",
" IN_DIR = Path(r\"..\\..\\ground_truth\")\n",
" OUT_DIR = Path(r\"..\\ground_truth_normalized\")\n",
"\n",
" normalize_jsonl_folder(IN_DIR, OUT_DIR, normalize_and_slim_record, delete_out_dir_first=True)\n",
" "
]
},
{

View File

@@ -0,0 +1 @@
{"db_path": "commerce.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "msgstore.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["business_name"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1,8 @@
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["number"], "Num_of_source_columns": 1}
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["given_name+family_name", "sort_name"], "Num_of_source_columns": 2}
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["display_name"], "Num_of_source_columns": 1}
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["number"], "Num_of_source_columns": 1}
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["display_name", "wa_name"], "Num_of_source_columns": 2}
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["given_name+family_name", "sort_name"], "Num_of_source_columns": 2}
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["verified_name"], "Num_of_source_columns": 1}
{"db_path": "wa.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["creator_name"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1,4 @@
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}
{"db_path": "core.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["textval"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1 @@
{"db_path": "journal.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,10 @@
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["encodedusername", "mutableusername", "originalusername"], "Num_of_source_columns": 3}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["phone", "rawphone"], "Num_of_source_columns": 2}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname"], "Num_of_source_columns": 1}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname", "serverdisplayname", "username"], "Num_of_source_columns": 3}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname"], "Num_of_source_columns": 1}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["captiontextdisplay"], "Num_of_source_columns": 1}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayname", "serverdisplayname", "username", "usernameforsorting"], "Num_of_source_columns": 4}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayinteractionuserusername", "frienddisplayusername"], "Num_of_source_columns": 2}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["displayinteractionuserdisplayname"], "Num_of_source_columns": 1}
{"db_path": "main.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["frienddisplayname", "friendusername"], "Num_of_source_columns": 2}

View File

@@ -0,0 +1 @@
{"db_path": "account1cache4.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "account2cache4.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "account3cache4.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "gmm_myplaces.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "gmm_storage.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,6 @@
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c1value"], "Num_of_source_columns": 1}
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c1value"], "Num_of_source_columns": 1}
{"db_path": "peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c1value"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1,3 @@
{"db_path": "SBrowser.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_name"], "Num_of_source_columns": 1}
{"db_path": "SBrowser.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_name"], "Num_of_source_columns": 1}
{"db_path": "SBrowser.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_name"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1 @@
{"db_path": "SBrowser2.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "searchengine.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "CallHistory.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,4 @@
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpartnername"], "Num_of_source_columns": 1}
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpartnername"], "Num_of_source_columns": 1}
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpushname"], "Num_of_source_columns": 1}
{"db_path": "ChatStorage.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zpushname"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1,2 @@
{"db_path": "ContactsV2.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zfullname"], "Num_of_source_columns": 1}
{"db_path": "ContactsV2.sqlite", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["zphonenumber"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1,5 @@
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["first+last"], "Num_of_source_columns": 1}
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["value"], "Num_of_source_columns": 1}
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c17email"], "Num_of_source_columns": 1}
{"db_path": "AddressBook.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["c0first+c1last+c2middle"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1 @@
{"db_path": "AddressBookImages.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,6 @@
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_login", "chat_identifier", "guid", "last_addressed_handle"], "Num_of_source_columns": 4}
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account_login", "chat_identifier", "guid"], "Num_of_source_columns": 3}
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account", "destination_caller_id"], "Num_of_source_columns": 2}
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["account"], "Num_of_source_columns": 1}
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["id", "uncanonicalized_id"], "Num_of_source_columns": 2}
{"db_path": "sms.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["id"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1 @@
{"db_path": "CloudTabs.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1 @@
{"db_path": "History.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}

View File

@@ -0,0 +1,5 @@
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["last_sync_title", "notes", "owner_identity_email", "self_identity_email", "shared_owner_address", "title"], "Num_of_source_columns": 6}
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["shared_owner_name"], "Num_of_source_columns": 1}
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["address"], "Num_of_source_columns": 1}
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["display_name"], "Num_of_source_columns": 1}
{"db_path": "Calendar.sqlitedb", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": ["owner_name"], "Num_of_source_columns": 1}

View File

@@ -0,0 +1 @@
{"db_path": "Extras.db", "PII_type": "", "PII": [], "Num_of_PII": 0, "source_columns": [], "Num_of_source_columns": 0}