mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
90 lines
3.3 KiB
Python
90 lines
3.3 KiB
Python
# File names of the SQLite databases listed for processing, in order.
#
# Two entries are switched off in the source and kept here as comments so
# they can be re-enabled easily:
#   "test2.db"
#   "users.db"
db_files = (
    # "A"-prefixed databases (presumably Android app data -- confirm).
    [
        "A1_commerce.db",
        "A1_msgstore.db",
        "A1_wa.db",
        "A2_core.db",
        "A2_journal.db",
        "A2_main.db",
        "A3_account1cache4.db",
        "A3_account2cache4.db",
        "A3_account3cache4.db",
        "A4_gmm_myplaces.db",
        "A4_gmm_storage.db",
        "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
        "A5_SBrowser.db",
        "A5_SBrowser2.db",
        "A5_searchengine.db",
    ]
    # "I"-prefixed databases (presumably iOS app data -- confirm).
    + [
        "I1_CallHistory.sqlite",
        "I1_ChatStorage.sqlite",
        "I1_ContactsV2.sqlite",
        "I2_AddressBook.sqlitedb",
        "I2_AddressBookImages.sqlitedb",
        "I3_sms.db",
        "I4_CloudTabs.db",
        "I4_History.db",
        "I5_Calendar.sqlitedb",
        "I5_Extras.db",
    ]
)
|
|
|
|
# Per-category PII settings, keyed by category label. Every entry carries:
#   type  -- short human-readable name of the category
#   regex -- regular-expression pattern string for the category
#   desc  -- free-text definition of the category, with examples
PII_CONFIG = {
    "EMAIL": dict(
        type="email address",
        regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        desc=(
            "a unique identifier for a destination to which electronic mail (email) "
            "can be sent and received over the internet; examples include "
            "jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, "
            "and user.name+label@domain.org"
        ),
    ),
    "PHONE": dict(
        type="US phone number",
        # NOTE(review): the pattern is looser than the "US" label suggests --
        # it accepts an optional "+", a 1-4 digit leading group and up to nine
        # trailing digits. Presumably deliberate for recall; confirm.
        regex=r"\+?[0-9]{1,4}[- .]?\(?[0-9]{1,3}?\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}",
        desc=(
            "a US phone number is a 10-digit NANP number (area code + exchange + line) "
            "that may be written as 2023133725, 202-313-3725, (202) 313-3725, "
            "202.313.3725, +1 202 313 3725, or 1-202-313-3725"
        ),
    ),
    "USERNAME": dict(
        type="username",
        # A letter followed by 2-51 letters, digits, dots, underscores or hyphens.
        regex=r"\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\b",
        # NOTE(review): the leading space (and missing article) in this text is
        # reproduced verbatim from the original string; looks like a typo.
        desc=(
            " unique, whitespace-free alphanumeric string used as a system-internal "
            "identifier or public handle that lacks a domain suffix, distinguishing it "
            "from an email address while still serving as a primary anchor for "
            "account attribution."
        ),
    ),
    "PERSON_NAME": dict(
        type="person name",
        # A letter followed by 1-50 letters, whitespace, dots or hyphens.
        regex=r"[A-Za-z][A-Za-z\s\.\-]{1,50}",
        # NOTE(review): the grammar of this text ("a ... strings") is kept
        # exactly as in the original string.
        desc=(
            "a loosely structured human name-like strings that typically consist of "
            "a first name, a first name and a last name, and may also include middle "
            "names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"
        ),
    ),
    "POSTAL_ADDRESS": dict(
        type="US postal address",
        # High-recall prefilter. The pattern is case-insensitive and matches a
        # single word-bounded cue token, one of:
        #   (1) a PO Box phrase ("P.O. Box", "post office box"),
        #   (2) a street suffix, abbreviated or spelled out ("St", "Avenue", ...),
        #   (3) a compass abbreviation (n, s, e, w, ne, nw, se, sw).
        # It does not itself consume a street number or the neighbouring words.
        regex=(
            r"(?i)\b(?:"
            + "|".join(
                (
                    r"p\.?\s*o\.?\s*box", r"post\s+office\s+box",
                    r"ave\.?", r"avenue",
                    r"st\.?", r"street",
                    r"rd\.?", r"road",
                    r"blvd\.?", r"boulevard",
                    r"dr\.?", r"drive",
                    r"ln\.?", r"lane",
                    r"ct\.?", r"court",
                    r"pl\.?", r"place",
                    r"way",
                    r"pkwy\.?", r"parkway",
                    r"cir\.?", r"circle",
                    r"ter\.?", r"terrace",
                    r"hwy\.?", r"highway",
                    r"trl\.?", r"trail",
                    r"sq\.?", r"square",
                    r"pike",
                    r"loop",
                    r"run",
                    r"walk",
                    r"path",
                    r"byp\.?", r"bypass",
                    r"(?:n|s|e|w|ne|nw|se|sw)\b",
                )
            )
            + r")\b"
        ),
        desc=(
            "a US postal address is a street-level mailing location in the United "
            "States, commonly appearing as a street name and suffix (e.g., 'Market St') "
            "optionally with a street number (e.g., '1500 Market St'), unit, "
            "city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"
        ),
    ),
}
|
|
|