add automated process (folder level)

This commit is contained in:
Frank Xu
2026-01-19 23:53:39 -05:00
parent 82ecd08ea2
commit c1de9e7872
5 changed files with 752 additions and 256 deletions

File diff suppressed because it is too large Load Diff

8
config.yaml Normal file
View File

@@ -0,0 +1,8 @@
# Settings for the folder-level batch run.
# NOTE(review): presumably parsed by load_config_yaml — confirm against the caller.
# NOTE(review): db_dir / out_dir look like the input and output directories;
# verify how relative paths are resolved.
db_dir: selectedDBs
out_dir: batch_results
# Python file that defines db_files and PII_CONFIG.
config_py: my_run_config.py
# Each entry matches a key of PII_CONFIG in my_run_config.py.
pii_targets:
- EMAIL
- PHONE
- USERNAME
- PERSON_NAME

View File

@@ -1,28 +0,0 @@
# Database file names selected for a run.
# Entries that are commented out are not part of the list.
db_files = [
    "test2.db",
    # "A1_commerce.db",
    # "A1_msgstore.db",
    # "A1_wa.db",
    # "A2_core.db",
    # "A2_journal.db",
    # "A2_main.db",
    # "A3_account1cache4.db",
    # "A3_account2cache4.db",
    # "A3_account3cache4.db",
    # "A4_gmm_myplaces.db",
    # "A4_gmm_storage.db",
    # "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
    # "A5_SBrowser.db",
    # "A5_SBrowser2.db",
    # "A5_searchengine.db",
    # "I1_CallHistory.sqlite",
    # "I1_ChatStorage.sqlite",
    # "I1_ContactsV2.sqlite",
    # "I2_AddressBook.sqlitedb",
    # "I2_AddressBookImages.sqlitedb",
    # "I3_sms.db",
    # "I4_CloudTabs.db",
    # "I4_History.db",
    # "I5_Calendar.sqlitedb",
    # "I5_Extras.db",
]

52
my_run_config.py Normal file
View File

@@ -0,0 +1,52 @@
# Database file names selected for a run.
# Entries that are commented out are not part of the list.
db_files = [
    "test2.db",
    # "A1_commerce.db",
    # "A1_msgstore.db",
    # "A1_wa.db",
    # "A2_core.db",
    # "A2_journal.db",
    # "A2_main.db",
    # "A3_account1cache4.db",
    # "A3_account2cache4.db",
    # "A3_account3cache4.db",
    # "A4_gmm_myplaces.db",
    # "A4_gmm_storage.db",
    # "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
    # "A5_SBrowser.db",
    # "A5_SBrowser2.db",
    # "A5_searchengine.db",
    # "I1_CallHistory.sqlite",
    # "I1_ChatStorage.sqlite",
    # "I1_ContactsV2.sqlite",
    # "I2_AddressBook.sqlitedb",
    # "I2_AddressBookImages.sqlitedb",
    # "I3_sms.db",
    # "I4_CloudTabs.db",
    # "I4_History.db",
    # "I5_Calendar.sqlitedb",
    # "I5_Extras.db",
]
# Per-category PII settings, keyed by category label.
# Each entry holds a human-readable "type", a "regex" pattern string,
# and a "desc" that describes the values the category covers.
PII_CONFIG = {
    "EMAIL": {
        "type": "email",
        "regex": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "desc": "valid email addresses. For example: username@domain.tld",
    },
    "PHONE": {
        "type": "phone number",
        "regex": r"\+?[0-9]{1,4}[- .]?\(?[0-9]{1,3}?\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}",
        "desc": "international or local telephone numbers",
    },
    "USERNAME": {
        "type": "username",
        "regex": r"\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\b",
        "desc": "application-specific login usernames created by users for login purposes",
    },
    "PERSON_NAME": {
        "type": "person name",
        "regex": r"[A-Za-z][A-Za-z\s\.\-]{1,50}",
        "desc": "loosely structured human name-like strings",
    },
}

View File

@@ -3,7 +3,9 @@ import json
import sys import sys
from pathlib import Path from pathlib import Path
from datetime import datetime, timezone from datetime import datetime, timezone
import yaml
import importlib.util
from typing import List, Tuple
def extract_tables_with_aliases(select_sql: str) -> dict[str, str]: def extract_tables_with_aliases(select_sql: str) -> dict[str, str]:
""" """
@@ -280,26 +282,6 @@ def is_sqlite_file(p: Path) -> bool:
return f.read(16) == b"SQLite format 3\x00" return f.read(16) == b"SQLite format 3\x00"
except Exception: except Exception:
return False return False
from pathlib import Path
import importlib.util
from typing import List, Tuple
def load_db_files_list(py_path: Path, var_name: str = "db_files") -> List[str]:
    """Load a list variable (default: ``db_files``) from a ``.py`` file.

    The file is executed as a standalone module, so it must come from a
    trusted source.

    Args:
        py_path: Location of the Python file. A plain string path is
            accepted as well and is converted to ``Path``.
        var_name: Name of the module-level variable to read.

    Returns:
        The list object bound to ``var_name`` in the executed module.

    Raises:
        ValueError: If no import spec or loader can be built for ``py_path``.
        AttributeError: If the module does not define ``var_name``.
        TypeError: If the value bound to ``var_name`` is not a list.
    """
    # Paths read from config files arrive as str; ``.stem`` needs a Path.
    py_path = Path(py_path)

    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load module from {py_path}")

    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)  # type: ignore

    if not hasattr(mod, var_name):
        raise AttributeError(f"{py_path} does not define `{var_name}`")

    value = getattr(mod, var_name)
    if not isinstance(value, list):
        raise TypeError(f"`{var_name}` must be a list, got {type(value)}")
    return value
def build_db_paths( def build_db_paths(
db_dir: Path, db_dir: Path,
@@ -346,4 +328,22 @@ def save_jsonl(all_results, out_dir):
f.write(json.dumps(r, ensure_ascii=False) + "\n") f.write(json.dumps(r, ensure_ascii=False) + "\n")
print(f"Wrote: {out_path.resolve()}") print(f"Wrote: {out_path.resolve()}")
return out_path return out_path
def load_config_yaml(path: Path) -> dict:
    """Read a YAML file and return its top-level mapping.

    Args:
        path: Location of the YAML file, decoded as UTF-8. A plain string
            path is accepted as well.

    Returns:
        The parsed mapping. An empty document yields an empty dict, so
        callers can always treat the result as a dict.

    Raises:
        TypeError: If the document's top level is not a mapping
            (for example a list or a scalar).
    """
    loaded = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    # safe_load returns None for an empty document; honour the dict contract.
    if loaded is None:
        return {}
    if not isinstance(loaded, dict):
        raise TypeError(
            f"{path} must contain a YAML mapping at the top level, got {type(loaded)}"
        )
    return loaded
def load_vars_from_py(py_path: Path, *var_names: str) -> dict[str, object]:
    """Execute a ``.py`` file and return selected module-level variables.

    The file is executed as a standalone module, so it must come from a
    trusted source.

    Args:
        py_path: Location of the Python file. A plain string path (such as
            a value read from a YAML config) is accepted as well.
        *var_names: Names of the module-level variables to collect.

    Returns:
        A dict mapping each requested name to its value, in the order the
        names were given. Empty when no names are requested.

    Raises:
        ValueError: If no import spec or loader can be built for ``py_path``.
        AttributeError: If the module does not define a requested name.
    """
    # Paths read from config files arrive as str; ``.stem`` needs a Path.
    py_path = Path(py_path)

    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load module from {py_path}")

    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)  # type: ignore

    out = {}
    for name in var_names:
        if not hasattr(mod, name):
            raise AttributeError(f"{py_path} does not define `{name}`")
        out[name] = getattr(mod, name)
    return out