mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
add automated process (folder level)
This commit is contained in:
File diff suppressed because it is too large
Load Diff
8
config.yaml
Normal file
8
config.yaml
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
db_dir: selectedDBs
|
||||||
|
out_dir: batch_results
|
||||||
|
config_py: my_run_config.py
|
||||||
|
pii_targets:
|
||||||
|
- EMAIL
|
||||||
|
- PHONE
|
||||||
|
- USERNAME
|
||||||
|
- PERSON_NAME
|
||||||
28
db_files.py
28
db_files.py
@@ -1,28 +0,0 @@
|
|||||||
db_files = [
|
|
||||||
"test2.db",
|
|
||||||
# "A1_commerce.db",
|
|
||||||
# "A1_msgstore.db",
|
|
||||||
# "A1_wa.db",
|
|
||||||
# "A2_core.db",
|
|
||||||
# "A2_journal.db",
|
|
||||||
# "A2_main.db",
|
|
||||||
# "A3_account1cache4.db",
|
|
||||||
# "A3_account2cache4.db",
|
|
||||||
# "A3_account3cache4.db",
|
|
||||||
# "A4_gmm_myplaces.db",
|
|
||||||
# "A4_gmm_storage.db",
|
|
||||||
# "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
|
|
||||||
# "A5_SBrowser.db",
|
|
||||||
# "A5_SBrowser2.db",
|
|
||||||
# "A5_searchengine.db",
|
|
||||||
# "I1_CallHistory.sqlite",
|
|
||||||
# "I1_ChatStorage.sqlite",
|
|
||||||
# "I1_ContactsV2.sqlite",
|
|
||||||
# "I2_AddressBook.sqlitedb",
|
|
||||||
# "I2_AddressBookImages.sqlitedb",
|
|
||||||
# "I3_sms.db",
|
|
||||||
# "I4_CloudTabs.db",
|
|
||||||
# "I4_History.db",
|
|
||||||
# "I5_Calendar.sqlitedb",
|
|
||||||
# "I5_Extras.db",
|
|
||||||
]
|
|
||||||
52
my_run_config.py
Normal file
52
my_run_config.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
db_files = [
|
||||||
|
"test2.db",
|
||||||
|
# "A1_commerce.db",
|
||||||
|
# "A1_msgstore.db",
|
||||||
|
# "A1_wa.db",
|
||||||
|
# "A2_core.db",
|
||||||
|
# "A2_journal.db",
|
||||||
|
# "A2_main.db",
|
||||||
|
# "A3_account1cache4.db",
|
||||||
|
# "A3_account2cache4.db",
|
||||||
|
# "A3_account3cache4.db",
|
||||||
|
# "A4_gmm_myplaces.db",
|
||||||
|
# "A4_gmm_storage.db",
|
||||||
|
# "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
|
||||||
|
# "A5_SBrowser.db",
|
||||||
|
# "A5_SBrowser2.db",
|
||||||
|
# "A5_searchengine.db",
|
||||||
|
# "I1_CallHistory.sqlite",
|
||||||
|
# "I1_ChatStorage.sqlite",
|
||||||
|
# "I1_ContactsV2.sqlite",
|
||||||
|
# "I2_AddressBook.sqlitedb",
|
||||||
|
# "I2_AddressBookImages.sqlitedb",
|
||||||
|
# "I3_sms.db",
|
||||||
|
# "I4_CloudTabs.db",
|
||||||
|
# "I4_History.db",
|
||||||
|
# "I5_Calendar.sqlitedb",
|
||||||
|
# "I5_Extras.db",
|
||||||
|
]
|
||||||
|
|
||||||
|
PII_CONFIG = {
|
||||||
|
"EMAIL": {
|
||||||
|
"type":"email",
|
||||||
|
"regex": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
|
||||||
|
"desc": "valid email addresses. For example: username@domain.tld"
|
||||||
|
},
|
||||||
|
"PHONE": {
|
||||||
|
"type":"phone number",
|
||||||
|
"regex": r"\+?[0-9]{1,4}[- .]?\(?[0-9]{1,3}?\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}",
|
||||||
|
"desc": "international or local telephone numbers"
|
||||||
|
},
|
||||||
|
"USERNAME": {
|
||||||
|
"type":"username",
|
||||||
|
"regex": r"\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\b",
|
||||||
|
"desc": "application-specific login usernames created by users for login purposes"
|
||||||
|
},
|
||||||
|
"PERSON_NAME": {
|
||||||
|
"type":"person name",
|
||||||
|
"regex": r"[A-Za-z][A-Za-z\s\.\-]{1,50}",
|
||||||
|
"desc": "loosely structured human name-like strings"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
44
sql_utils.py
44
sql_utils.py
@@ -3,7 +3,9 @@ import json
|
|||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
import yaml
|
||||||
|
import importlib.util
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
def extract_tables_with_aliases(select_sql: str) -> dict[str, str]:
|
def extract_tables_with_aliases(select_sql: str) -> dict[str, str]:
|
||||||
"""
|
"""
|
||||||
@@ -280,26 +282,6 @@ def is_sqlite_file(p: Path) -> bool:
|
|||||||
return f.read(16) == b"SQLite format 3\x00"
|
return f.read(16) == b"SQLite format 3\x00"
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
import importlib.util
|
|
||||||
from typing import List, Tuple
|
|
||||||
|
|
||||||
def load_db_files_list(py_path: Path, var_name: str = "db_files") -> List[str]:
|
|
||||||
"""Load a list variable (default: db_files) from a .py file."""
|
|
||||||
spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
|
|
||||||
if spec is None or spec.loader is None:
|
|
||||||
raise ValueError(f"Cannot load module from {py_path}")
|
|
||||||
mod = importlib.util.module_from_spec(spec)
|
|
||||||
spec.loader.exec_module(mod) # type: ignore
|
|
||||||
|
|
||||||
if not hasattr(mod, var_name):
|
|
||||||
raise AttributeError(f"{py_path} does not define `{var_name}`")
|
|
||||||
value = getattr(mod, var_name)
|
|
||||||
if not isinstance(value, list):
|
|
||||||
raise TypeError(f"`{var_name}` must be a list, got {type(value)}")
|
|
||||||
return value
|
|
||||||
|
|
||||||
def build_db_paths(
|
def build_db_paths(
|
||||||
db_dir: Path,
|
db_dir: Path,
|
||||||
@@ -346,4 +328,22 @@ def save_jsonl(all_results, out_dir):
|
|||||||
f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
print(f"Wrote: {out_path.resolve()}")
|
print(f"Wrote: {out_path.resolve()}")
|
||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
|
def load_config_yaml(path: Path) -> dict:
    """Read a YAML configuration file and return its top-level mapping.

    Args:
        path: Location of the YAML file; it is read as UTF-8 text.

    Returns:
        The parsed top-level mapping. An empty (or comment-only) document
        yields an empty dict rather than ``None`` so the declared return
        type always holds.

    Raises:
        TypeError: If the document's top-level node is not a mapping
            (for example a bare list or scalar).
    """
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    if data is None:
        # safe_load returns None for an empty document; callers expect a dict.
        return {}
    if not isinstance(data, dict):
        raise TypeError(
            f"{path} must contain a YAML mapping at the top level, "
            f"got {type(data).__name__}"
        )
    return data
|
||||||
|
|
||||||
|
def load_vars_from_py(py_path: Path, *var_names: str) -> dict:
    """Execute a Python file and return selected module-level variables.

    The file is executed as a module, so any top-level code in it runs in
    the current process; only point this at files you trust.

    Args:
        py_path: Path to the Python source file (a plain string is accepted
            and converted to ``Path``).
        *var_names: Names of module-level variables to fetch.

    Returns:
        A dict mapping each requested name to its value, in the order the
        names were requested. With no names requested, an empty dict.

    Raises:
        ValueError: If no import spec or loader can be built for ``py_path``.
        AttributeError: If one or more requested names are not defined by
            the file; every missing name is listed in the message.

    Exceptions raised while executing the file itself propagate unchanged.
    """
    py_path = Path(py_path)  # tolerate callers that pass a str path
    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load module from {py_path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)  # type: ignore

    # Collect every absent name first so the user can fix the config file in
    # one pass instead of discovering the gaps one run at a time.
    missing = [name for name in var_names if not hasattr(mod, name)]
    if missing:
        listed = ", ".join(f"`{name}`" for name in missing)
        raise AttributeError(f"{py_path} does not define {listed}")
    return {name: getattr(mod, name) for name in var_names}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user