Mirror of https://github.com/frankwxu/mobile-pii-discovery-agent.git — synced 2026-02-20 13:40:41 +00:00
reorganize RQs into different folders
138  RQs/RQ0/RQ0_batch_results_normalization.ipynb  Normal file
@@ -0,0 +1,138 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2d824a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A1_commerce_20260127T175911Z.jsonl\n",
      "PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
      "PII_A1_wa_20260127T180213Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A1_wa_20260127T180213Z.jsonl\n",
      "PII_A2_core_20260127T180339Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A2_core_20260127T180339Z.jsonl\n",
      "PII_A2_journal_20260127T180440Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A2_journal_20260127T180440Z.jsonl\n",
      "PII_A2_main_20260127T180710Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A2_main_20260127T180710Z.jsonl\n",
      "PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
      "PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
      "PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
      "PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
      "PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
      "PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
      "PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
      "PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
      "PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
      "PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
      "PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
      "PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
      "PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
      "PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
      "PII_I3_sms_20260127T183606Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I3_sms_20260127T183606Z.jsonl\n",
      "PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
      "PII_I4_History_20260127T183727Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I4_History_20260127T183727Z.jsonl\n",
      "PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
      "PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> ..\\batch_results_normalized\\PII_I5_Extras_20260127T183857Z.jsonl\n",
      "Done. Files: 25, Records: 125\n",
      "Output folder: I:\\project2026\\llmagent\\RQs\\batch_results_normalized\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "import sys\n",
    "import shutil\n",
    "\n",
    "# IMPORTANT: sys.path needs a DIRECTORY, not the .py file itself\n",
    "STATS_DIR = Path(r\"I:\\project2026\\llmagent\\RQs\").resolve()  # folder containing stats_utils.py\n",
    "sys.path.insert(0, str(STATS_DIR))\n",
    "\n",
    "from stats_utils import normalize_and_slim_record\n",
    "\n",
    "IN_DIR = Path(r\"..\\..\\batch_results\")\n",
    "OUT_DIR = Path(r\"..\\batch_results_normalized\")\n",
    "\n",
    "\n",
    "def process_file(in_path: Path, out_path: Path) -> int:\n",
    "    n = 0\n",
    "    with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
    "        for line in fin:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue\n",
    "            obj = json.loads(line)\n",
    "            if not isinstance(obj, dict):\n",
    "                continue\n",
    "            slim = normalize_and_slim_record(obj)\n",
    "            fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
    "            n += 1\n",
    "    return n\n",
    "\n",
    "\n",
    "def main() -> None:\n",
    "    # Delete OUT_DIR if it exists, then recreate it cleanly\n",
    "    if OUT_DIR.exists():\n",
    "        if OUT_DIR.is_dir():\n",
    "            shutil.rmtree(OUT_DIR)\n",
    "        else:\n",
    "            OUT_DIR.unlink()\n",
    "\n",
    "    OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    files = sorted(IN_DIR.glob(\"*.jsonl\"))\n",
    "    if not files:\n",
    "        print(f\"No .jsonl files found in: {IN_DIR.resolve()}\")\n",
    "        return\n",
    "\n",
    "    total_files = 0\n",
    "    total_records = 0\n",
    "\n",
    "    for fp in files:\n",
    "        out_fp = OUT_DIR / fp.name\n",
    "        n = process_file(fp, out_fp)\n",
    "        print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
    "        total_files += 1\n",
    "        total_records += n\n",
    "\n",
    "    print(f\"Done. Files: {total_files}, Records: {total_records}\")\n",
    "    print(f\"Output folder: {OUT_DIR.resolve()}\")\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0074eda4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -1,122 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c2d824a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_commerce_20260127T175911Z.jsonl\n",
      "PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_msgstore_20260127T180043Z.jsonl\n",
      "PII_A1_wa_20260127T180213Z.jsonl: 5 records -> batch_results_normalized\\PII_A1_wa_20260127T180213Z.jsonl\n",
      "PII_A2_core_20260127T180339Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_core_20260127T180339Z.jsonl\n",
      "PII_A2_journal_20260127T180440Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_journal_20260127T180440Z.jsonl\n",
      "PII_A2_main_20260127T180710Z.jsonl: 5 records -> batch_results_normalized\\PII_A2_main_20260127T180710Z.jsonl\n",
      "PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account1cache4_20260127T180745Z.jsonl\n",
      "PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account2cache4_20260127T180821Z.jsonl\n",
      "PII_A3_account3cache4_20260127T180857Z.jsonl: 5 records -> batch_results_normalized\\PII_A3_account3cache4_20260127T180857Z.jsonl\n",
      "PII_A4_gmm_myplaces_20260127T180935Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_gmm_myplaces_20260127T180935Z.jsonl\n",
      "PII_A4_gmm_storage_20260127T181014Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_gmm_storage_20260127T181014Z.jsonl\n",
      "PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl: 5 records -> batch_results_normalized\\PII_A4_peopleCache_sharononeil368@gmail.com_com.google_14_20260127T181121Z.jsonl\n",
      "PII_A5_SBrowser2_20260127T181345Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_SBrowser2_20260127T181345Z.jsonl\n",
      "PII_A5_SBrowser_20260127T181239Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_SBrowser_20260127T181239Z.jsonl\n",
      "PII_A5_searchengine_20260127T181446Z.jsonl: 5 records -> batch_results_normalized\\PII_A5_searchengine_20260127T181446Z.jsonl\n",
      "PII_I1_CallHistory_20260127T181557Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_CallHistory_20260127T181557Z.jsonl\n",
      "PII_I1_ChatStorage_20260127T181731Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_ChatStorage_20260127T181731Z.jsonl\n",
      "PII_I1_ContactsV2_20260127T182906Z.jsonl: 5 records -> batch_results_normalized\\PII_I1_ContactsV2_20260127T182906Z.jsonl\n",
      "PII_I2_AddressBook_20260127T183457Z.jsonl: 5 records -> batch_results_normalized\\PII_I2_AddressBook_20260127T183457Z.jsonl\n",
      "PII_I2_AddressBookImages_20260127T183526Z.jsonl: 5 records -> batch_results_normalized\\PII_I2_AddressBookImages_20260127T183526Z.jsonl\n",
      "PII_I3_sms_20260127T183606Z.jsonl: 5 records -> batch_results_normalized\\PII_I3_sms_20260127T183606Z.jsonl\n",
      "PII_I4_CloudTabs_20260127T183643Z.jsonl: 5 records -> batch_results_normalized\\PII_I4_CloudTabs_20260127T183643Z.jsonl\n",
      "PII_I4_History_20260127T183727Z.jsonl: 5 records -> batch_results_normalized\\PII_I4_History_20260127T183727Z.jsonl\n",
      "PII_I5_Calendar_20260127T183815Z.jsonl: 5 records -> batch_results_normalized\\PII_I5_Calendar_20260127T183815Z.jsonl\n",
      "PII_I5_Extras_20260127T183857Z.jsonl: 5 records -> batch_results_normalized\\PII_I5_Extras_20260127T183857Z.jsonl\n",
      "Done. Files: 25, Records: 125\n",
      "Output folder: I:\\project2026\\llmagent\\stats\\batch_results_normalized\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "from stats_utils import normalize_and_slim_record\n",
    "\n",
    "IN_DIR = Path(r\"..\\\\batch_results\")\n",
    "OUT_DIR = Path(r\"batch_results_normalized\")\n",
    "\n",
    "def process_file(in_path: Path, out_path: Path) -> int:\n",
    "    n = 0\n",
    "    with in_path.open(\"r\", encoding=\"utf-8\") as fin, out_path.open(\"w\", encoding=\"utf-8\") as fout:\n",
    "        for line in fin:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue\n",
    "            obj = json.loads(line)\n",
    "            if not isinstance(obj, dict):\n",
    "                continue\n",
    "            slim = normalize_and_slim_record(obj)\n",
    "            fout.write(json.dumps(slim, ensure_ascii=False) + \"\\n\")\n",
    "            n += 1\n",
    "    return n\n",
    "\n",
    "def main() -> None:\n",
    "    OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    files = sorted(IN_DIR.glob(\"*.jsonl\"))\n",
    "    if not files:\n",
    "        print(f\"No .jsonl files found in: {IN_DIR.resolve()}\")\n",
    "        return\n",
    "\n",
    "    total_files = 0\n",
    "    total_records = 0\n",
    "\n",
    "    for fp in files:\n",
    "        out_fp = OUT_DIR / fp.name\n",
    "        n = process_file(fp, out_fp)\n",
    "        print(f\"{fp.name}: {n} records -> {out_fp}\")\n",
    "        total_files += 1\n",
    "        total_records += n\n",
    "\n",
    "    print(f\"Done. Files: {total_files}, Records: {total_records}\")\n",
    "    print(f\"Output folder: {OUT_DIR.resolve()}\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0074eda4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
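(Note: stats_utils.py itself is not part of this diff, so the behavior of normalize_and_slim_record is not shown. For orientation only, a minimal sketch of what such a normalize-and-slim step could look like, assuming the record fields visible in RQ3_t7_corpus_level.ipynb below — PII, Num_of_PII, source_columns, db_path — and its IGNORE_FIELDS set; the real function may differ.)

# Hypothetical stand-in for stats_utils.normalize_and_slim_record -- NOT the
# repo's implementation, just a sketch consistent with the fields that the
# RQ3 notebooks read back out of batch_results_normalized.
import json

IGNORE_FIELDS = {"Raw_rows_first_100", "Exploration_sql", "Extraction_sql", "PII_Prompt"}

def normalize_and_slim_record(obj: dict) -> dict:
    # Drop the bulky prompt/SQL fields, keep everything else.
    slim = {k: v for k, v in obj.items() if k not in IGNORE_FIELDS}
    # Dedupe the PII list while preserving order, mirroring the downstream
    # _dedupe_preserve_order helper, and keep Num_of_PII consistent with it.
    pii = slim.get("PII", [])
    if isinstance(pii, list):
        seen, unique = set(), []
        for x in pii:
            key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x
            if key not in seen:
                seen.add(key)
                unique.append(x)
        slim["PII"] = unique
        slim["Num_of_PII"] = len(unique)
    return slim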
5  RQs/RQ3/RQ3_aggregated_corpus_by_type.jsonl  Normal file
File diff suppressed because one or more lines are too long
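(The suppressed RQ3_aggregated_corpus_by_type.jsonl holds one JSON object per PII type, in the schema produced by aggregate_jsonl_folder_corpus_level in the notebook below. An illustrative record, with invented values:)

# Illustrative only -- the field names come from the aggregator below; the
# values here are made up, not taken from the real corpus.
example_record = {
    "PII_type": "email",
    "PII_all": ["alice@example.com", "alice@example.com"],
    "PII_unique": ["alice@example.com"],
    "Num_of_PII_all": 2,
    "Num_of_PII_unique": 1,
    "source_columns": ["A1_msgstore.db:message.text_data"],
    "Num_of_source_columns": 1,
}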
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "234eed3f",
    "metadata": {},
    "outputs": [
@@ -10,7 +10,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Wrote: I:\\project2026\\llmagent\\stats\\aggregated_by_app_and_type.jsonl\n"
+      "Wrote: I:\\project2026\\llmagent\\stats\\RQ3_aggregated_by_app_and_type.jsonl\n"
      ]
     }
    ],
@@ -180,7 +180,7 @@
     "\n",
     "\n",
     "if __name__ == \"__main__\":\n",
-    "    out = aggregate_jsonl_folder(r\"batch_results_normalized\", \"aggregated_by_app_and_type.jsonl\")\n",
+    "    out = aggregate_jsonl_folder(r\"batch_results_normalized\", \"RQ3_aggregated_by_app_and_type.jsonl\")\n",
     "    print(f\"Wrote: {out.resolve()}\")\n"
     ]
    }
176  RQs/RQ3/RQ3_t7_corpus_level.ipynb  Normal file
@@ -0,0 +1,176 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "234eed3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wrote: I:\\project2026\\llmagent\\stats\\RQ3_aggregated_corpus_by_type.jsonl\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from typing import Any, Dict, Tuple\n",
    "\n",
    "IGNORE_FIELDS = {\"Raw_rows_first_100\", \"Exploration_sql\", \"Extraction_sql\", \"PII_Prompt\"}\n",
    "\n",
    "\n",
    "def _dedupe_preserve_order(items):\n",
    "    \"\"\"\n",
    "    Stable dedupe for lists that may contain scalars, dicts, or lists.\n",
    "    \"\"\"\n",
    "    seen = set()\n",
    "    out = []\n",
    "    for x in items:\n",
    "        key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x\n",
    "        if key in seen:\n",
    "            continue\n",
    "        seen.add(key)\n",
    "        out.append(x)\n",
    "    return out\n",
    "\n",
    "\n",
    "def prefix_source_columns(db_path: str, cols: list) -> list:\n",
    "    \"\"\"\n",
    "    Prefix each source column with the database filename to avoid ambiguity\n",
    "    after aggregating across many DBs.\n",
    "\n",
    "    Example:\n",
    "        db_path = selectedDBs\\\\A1_msgstore.db\n",
    "        col = message.text_data\n",
    "        -> A1_msgstore.db:message.text_data\n",
    "    \"\"\"\n",
    "    db_file = Path(db_path).name\n",
    "    out = []\n",
    "    for c in cols:\n",
    "        if isinstance(c, str) and c:\n",
    "            out.append(f\"{db_file}:{c}\")\n",
    "    return out\n",
    "\n",
    "\n",
    "def aggregate_jsonl_folder_corpus_level(in_dir: str | Path, out_path: str | Path) -> Path:\n",
    "    \"\"\"\n",
    "    Corpus-level aggregation across all *.jsonl files in in_dir, grouped ONLY by PII_type.\n",
    "\n",
    "    Input records are expected to already be normalized (your batch_results_normalized),\n",
    "    but this function still performs dedupe at aggregation time.\n",
    "\n",
    "    Output per PII_type keeps:\n",
    "      - PII_type\n",
    "      - PII_all: concatenated across corpus (with duplicates)\n",
    "      - PII_unique: deduped\n",
    "      - Num_of_PII_all: total count with duplicates (sum of per-record Num_of_PII or len(PII))\n",
    "      - Num_of_PII_unique: len(PII_unique)\n",
    "      - source_columns: deduped, prefixed with db filename\n",
    "      - Num_of_source_columns: len(source_columns)\n",
    "\n",
    "    It ignores IGNORE_FIELDS and discards all other keys.\n",
    "    \"\"\"\n",
    "    in_dir = Path(in_dir)\n",
    "    out_path = Path(out_path)\n",
    "    out_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    grouped: Dict[str, Dict[str, Any]] = {}\n",
    "\n",
    "    for jsonl_file in sorted(in_dir.glob(\"*.jsonl\")):\n",
    "        with jsonl_file.open(\"r\", encoding=\"utf-8\") as f:\n",
    "            for line_no, line in enumerate(f, start=1):\n",
    "                line = line.strip()\n",
    "                if not line:\n",
    "                    continue\n",
    "\n",
    "                try:\n",
    "                    rec = json.loads(line)\n",
    "                except json.JSONDecodeError as e:\n",
    "                    raise ValueError(f\"Bad JSON in {jsonl_file} line {line_no}: {e}\") from e\n",
    "\n",
    "                pii_type = rec.get(\"PII_type\")\n",
    "                if not pii_type:\n",
    "                    continue\n",
    "\n",
    "                if pii_type not in grouped:\n",
    "                    grouped[pii_type] = {\n",
    "                        \"PII_type\": pii_type,\n",
    "                        \"PII_all\": [],\n",
    "                        \"PII_unique\": [],\n",
    "                        \"Num_of_PII_all\": 0,\n",
    "                        \"Num_of_PII_unique\": 0,\n",
    "                        \"source_columns\": [],\n",
    "                        \"Num_of_source_columns\": 0,\n",
    "                    }\n",
    "\n",
    "                agg = grouped[pii_type]\n",
    "\n",
    "                # --- PII + count (with-dup) ---\n",
    "                pii_list = rec.get(\"PII\", [])\n",
    "                if isinstance(pii_list, list):\n",
    "                    agg[\"PII_all\"].extend(pii_list)\n",
    "\n",
    "                n = rec.get(\"Num_of_PII\")\n",
    "                if isinstance(n, (int, float)) and not isinstance(n, bool):\n",
    "                    agg[\"Num_of_PII_all\"] += int(n)\n",
    "                else:\n",
    "                    agg[\"Num_of_PII_all\"] += len(pii_list) if isinstance(pii_list, list) else 0\n",
    "\n",
    "                # --- source_columns (with-dup) ---\n",
    "                dbp = rec.get(\"db_path\", \"\")\n",
    "                cols = rec.get(\"source_columns\", [])\n",
    "                if isinstance(cols, list):\n",
    "                    agg[\"source_columns\"].extend(prefix_source_columns(dbp, cols))\n",
    "\n",
    "                # ignore everything else (and IGNORE_FIELDS)\n",
    "\n",
    "    # --- Finalize: dedupe lists + compute unique counts ---\n",
    "    for agg in grouped.values():\n",
    "        agg[\"PII_unique\"] = _dedupe_preserve_order(agg[\"PII_all\"])\n",
    "        agg[\"Num_of_PII_unique\"] = len(agg[\"PII_unique\"])\n",
    "\n",
    "        agg[\"source_columns\"] = _dedupe_preserve_order(agg[\"source_columns\"])\n",
    "        agg[\"Num_of_source_columns\"] = len(agg[\"source_columns\"])\n",
    "\n",
    "    # --- Write aggregated JSONL ---\n",
    "    with out_path.open(\"w\", encoding=\"utf-8\") as f:\n",
    "        for pii_type in sorted(grouped.keys()):\n",
    "            f.write(json.dumps(grouped[pii_type], ensure_ascii=False) + \"\\n\")\n",
    "\n",
    "    return out_path\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    out = aggregate_jsonl_folder_corpus_level(\n",
    "        r\"batch_results_normalized\",\n",
    "        \"RQ3_aggregated_corpus_by_type.jsonl\",\n",
    "    )\n",
    "    print(f\"Wrote: {out.resolve()}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
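(A quick smoke test of the two helpers defined in the notebook above, runnable in the same kernel. The database path is illustrative and uses a forward slash so Path().name also splits it correctly off Windows:)

# Assumes _dedupe_preserve_order and prefix_source_columns from the cell
# above are already defined in the running kernel.

# Stable dedupe keeps the first occurrence and handles unhashable items
# (dicts, lists) by keying them on their sorted-key JSON encoding.
assert _dedupe_preserve_order(["a", "a", {"x": 1}, {"x": 1}, "b"]) == ["a", {"x": 1}, "b"]

# Source columns get the database filename prepended, so the same column
# name from different DBs stays distinguishable after aggregation.
assert prefix_source_columns("selectedDBs/A1_msgstore.db", ["message.text_data"]) == [
    "A1_msgstore.db:message.text_data"
]
print("helpers behave as documented")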
Binary file not shown.