Add automated processing at the folder level

This commit is contained in:
Frank Xu
2026-01-19 23:53:39 -05:00
parent 82ecd08ea2
commit c1de9e7872
5 changed files with 752 additions and 256 deletions

File diff suppressed because it is too large Load Diff

8
config.yaml Normal file
View File

@@ -0,0 +1,8 @@
# Settings for a batch run.
# NOTE(review): the meaning of db_dir / out_dir is inferred from the key names
# only -- confirm against the code that reads this file.
# Directory holding the input database files (presumably).
db_dir: selectedDBs
# Directory that receives the generated results (presumably).
out_dir: batch_results
# Python settings file; my_run_config.py defines `db_files` and `PII_CONFIG`.
config_py: my_run_config.py
# PII categories to run. Each name is also a key of PII_CONFIG in
# my_run_config.py.
pii_targets:
- EMAIL
- PHONE
- USERNAME
- PERSON_NAME

View File

@@ -1,28 +0,0 @@
# Database file names selected for a run. Only uncommented entries are part of
# the list; the commented-out names are kept as a catalogue so an entry can be
# enabled by removing its leading "#".
db_files = [
    "test2.db",
    # "A1_commerce.db",
    # "A1_msgstore.db",
    # "A1_wa.db",
    # "A2_core.db",
    # "A2_journal.db",
    # "A2_main.db",
    # "A3_account1cache4.db",
    # "A3_account2cache4.db",
    # "A3_account3cache4.db",
    # "A4_gmm_myplaces.db",
    # "A4_gmm_storage.db",
    # "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
    # "A5_SBrowser.db",
    # "A5_SBrowser2.db",
    # "A5_searchengine.db",
    # "I1_CallHistory.sqlite",
    # "I1_ChatStorage.sqlite",
    # "I1_ContactsV2.sqlite",
    # "I2_AddressBook.sqlitedb",
    # "I2_AddressBookImages.sqlitedb",
    # "I3_sms.db",
    # "I4_CloudTabs.db",
    # "I4_History.db",
    # "I5_Calendar.sqlitedb",
    # "I5_Extras.db",
]

52
my_run_config.py Normal file
View File

@@ -0,0 +1,52 @@
# Database file names selected for a run. Only uncommented entries are part of
# the list; the commented-out names are kept as a catalogue so an entry can be
# enabled by removing its leading "#".
db_files = [
    "test2.db",
    # "A1_commerce.db",
    # "A1_msgstore.db",
    # "A1_wa.db",
    # "A2_core.db",
    # "A2_journal.db",
    # "A2_main.db",
    # "A3_account1cache4.db",
    # "A3_account2cache4.db",
    # "A3_account3cache4.db",
    # "A4_gmm_myplaces.db",
    # "A4_gmm_storage.db",
    # "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
    # "A5_SBrowser.db",
    # "A5_SBrowser2.db",
    # "A5_searchengine.db",
    # "I1_CallHistory.sqlite",
    # "I1_ChatStorage.sqlite",
    # "I1_ContactsV2.sqlite",
    # "I2_AddressBook.sqlitedb",
    # "I2_AddressBookImages.sqlitedb",
    # "I3_sms.db",
    # "I4_CloudTabs.db",
    # "I4_History.db",
    # "I5_Calendar.sqlitedb",
    # "I5_Extras.db",
]
# Definitions of the PII categories, keyed by category name. The keys are the
# same names that config.yaml lists under `pii_targets`.
#
# Every entry carries three string fields:
#   "type"  - short human-readable label for the category
#   "regex" - regular-expression pattern for candidate values
#   "desc"  - free-text description of what counts as a match
PII_CONFIG = {
    "EMAIL": {
        "type": "email",
        "regex": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "desc": "valid email addresses. For example: username@domain.tld",
    },
    "PHONE": {
        "type": "phone number",
        "regex": r"\+?[0-9]{1,4}[- .]?\(?[0-9]{1,3}?\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}",
        "desc": "international or local telephone numbers",
    },
    "USERNAME": {
        "type": "username",
        "regex": r"\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\b",
        "desc": "application-specific login usernames created by users for login purposes",
    },
    "PERSON_NAME": {
        "type": "person name",
        "regex": r"[A-Za-z][A-Za-z\s\.\-]{1,50}",
        "desc": "loosely structured human name-like strings",
    },
}

View File

@@ -3,7 +3,9 @@ import json
import sys
from pathlib import Path
from datetime import datetime, timezone
import yaml
import importlib.util
from typing import List, Tuple
def extract_tables_with_aliases(select_sql: str) -> dict[str, str]:
"""
@@ -281,26 +283,6 @@ def is_sqlite_file(p: Path) -> bool:
except Exception:
return False
from pathlib import Path
import importlib.util
from typing import List, Tuple
def load_db_files_list(py_path: Path, var_name: str = "db_files") -> List[str]:
    """Import the Python file at *py_path* and return its list named *var_name*.

    The file is executed as a module, so any top-level code in it runs.
    Only the container type is checked; the list elements are returned as-is.

    Raises:
        ValueError: no import spec or loader could be created for *py_path*.
        AttributeError: the loaded module has no attribute *var_name*.
        TypeError: the attribute exists but is not a ``list``.
    """
    module_spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
    loader = module_spec.loader if module_spec is not None else None
    if loader is None:
        raise ValueError(f"Cannot load module from {py_path}")

    module = importlib.util.module_from_spec(module_spec)
    loader.exec_module(module)

    # Unique marker so that a variable legitimately set to None is still
    # reported as "wrong type" rather than "not defined".
    absent = object()
    db_list = getattr(module, var_name, absent)
    if db_list is absent:
        raise AttributeError(f"{py_path} does not define `{var_name}`")
    if isinstance(db_list, list):
        return db_list
    raise TypeError(f"`{var_name}` must be a list, got {type(db_list)}")
def build_db_paths(
db_dir: Path,
db_files: List[str],
@@ -347,3 +329,21 @@ def save_jsonl(all_results, out_dir):
print(f"Wrote: {out_path.resolve()}")
return out_path
def load_config_yaml(path: Path) -> dict:
    """Read the UTF-8 YAML file at *path* and return its top-level mapping.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The parsed mapping. An empty (or comment-only) file yields ``{}``
        instead of ``None`` so callers can always treat the result as a dict.

    Raises:
        TypeError: the document's top level is not a mapping (for example a
            bare list or scalar).
        OSError: the file cannot be read.
        yaml.YAMLError: the file is not valid YAML.
    """
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    if data is None:
        # safe_load returns None for an empty document; honour the -> dict contract.
        return {}
    if not isinstance(data, dict):
        raise TypeError(
            f"{path} must contain a YAML mapping at the top level, got {type(data)}"
        )
    return data
def load_vars_from_py(py_path: Path, *var_names: str) -> dict[str, object]:
    """Execute a Python file as a module and return selected top-level variables.

    NOTE: the file is *executed*, so only point this at trusted configuration
    files -- any top-level code in it runs with the caller's privileges.

    Args:
        py_path: Path of the ``.py`` file to load. A plain string is accepted
            as well and converted to a ``Path``.
        *var_names: Names of the module-level variables to extract.

    Returns:
        A dict mapping each requested name to its value, in the order the
        names were given. Empty when no names are requested.

    Raises:
        ValueError: no import spec or loader could be created for *py_path*.
        AttributeError: one or more requested names are not defined in the
            file; the message lists every missing name.
    """
    # Values read from YAML/CLI arrive as str; normalise so `.stem` is available.
    py_path = Path(py_path)

    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load module from {py_path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    # Report all missing names together so a config file can be fixed in one pass.
    missing = [name for name in var_names if not hasattr(mod, name)]
    if missing:
        listed = ", ".join(f"`{name}`" for name in missing)
        raise AttributeError(f"{py_path} does not define {listed}")

    return {name: getattr(mod, name) for name in var_names}