mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
fix sql alias
This commit is contained in:
5
model_PII_results/ground_truth/PII_A1_commerce.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A1_commerce.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A1_commerce.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A1_commerce.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A1_commerce.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A1_commerce.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A1_commerce.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "No user tables are available in the database to query for person names.", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_A1_msgstore.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A1_msgstore.jsonl
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
5
model_PII_results/ground_truth/PII_A1_wa.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A1_wa.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A1_wa.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A1_wa.db", "PII_Type": "PHONE", "PII": ["+19199037779", "8085096467", "5713349815", "+16263678865", "+16106046786", "7034241981", "5715917168", "+12065937224", "5713298742", "8624338328", "+18056377243", "2028177932", "2025692832", "+19735203731", "+81367430271", "+17423794330", "5713298742", "5713349815", "8085096467", "8624338328", "7034241981", "2028177932", "5715917168", "2025692832", "+16106046786", "+16263678865", "+12065937224", "+19199037779", "+18056377243", "+19735203731", "+81367430271", "+17423794330"], "Num_of_PII": 32, "source_columns": ["wa_address_book.number", "wa_contacts.number"], "num_of_source_columns": 2}
|
||||||
|
{"db_path": "selectedDBs\\A1_wa.db", "PII_Type": "USERNAME", "PII": ["CLARKE", "Philips", "a_kalachikova", "Elizabeth Jones", "george", "Morton", "latefire_eu", "Mario.drapela", "Dolgetta", "Jacob Emily", "Thomas", "Alethea", "Peter", "Imogen Holman", "Chloe Rodriguez", "Robert jane", "Mason", "James Matthew Brown", "Jalen", "Official representative", "Gilbert", "Andie", "Connor Haggarty", "kop", "Winona", "laurenncbrown", "Gabriel", "Sarah Cox", "Jesus Delgado", "Milicardis", "jackson", "Dave", "Benjamin", "Elliot Dowell", "Mary Garcia", "Gaia", "renwbw", "Jim Anderson", "Marit Bonthuis", "rola nd", "marcel_juhas", "Kevin Destiny", "Minardo Gaspari", "Stephanna smith", "YTliken", "Thomas Anderson", "pnod", "Paco Almeida", "Oscar Steven", "Lockerbie Do-Vip", "Admin", "Roberto Cadorin", "Brandon Addison", "Matt Galligan", "Christellecamiller", "Carlos", "Marcinmis", "Isabel Abarca", "Adam Taylor", "Antonio", "scrichpower", "Hcibc", "Gabi", "Alethea", "Thomas Arthur Heber Fearn", "Augustin Richard", "Kamila", "Jim Anderson", "Aiden Savannah", "Edwardsaliendra", "Jayden", "Morgan", "harry", "kandicesledge", "Jaroslaw Machek", "Juan Pablo Pesqueira", "Mandy Lauren\ud83e\uddd9\u200d\u2640\ufe0f", "gemmadorney", "zara", "Marcos Amorim", "Amanda White", "Mauro Silvabarbosa", "Felix Davey", "\ud83d\udc99\ud83d\udc99\ud83d\udc99", "AbeRudder", "Isabella", "Olivia", "Felipe Hernandez", "Sophia", "Elizabeth Jones", "Miss you", "Russell Philby", "Stephcoleman", "Brian Goudy", "Liam Thomas", "Shawn Hoxie", "Carolasol_espinoza", "13135550002", "Otto Matik", "Ruby", "Million Marketing", "Million Marketing", "Jason Steven", "\u200eshirley", "Shehriyar Ansari", "nole", "Dungeon Master", "Alvin the Alien", "Perry", "Angie", "Bob the robot", "Leo", "Sally", "Brian", "Liv", "Coco", "Victor", "Tamika", "Becca", "Lorena", "Zach", "Luiz", "Bru", "Thalia", "Lily", "Izzy", "Max", "Scarlett", "Amber", "Dylan", "Jade", "Billie", "Jane Austen", "Meta", "leaura", "faxinezidohne", "Meta", "Meta", "patrickh34", "yauyauyauhen", "nathanmorris", "faxinezidohne", "0jamesf", "italianmatters", "Meta", "robm435", "loususi", "Meta", "lanaire2023", "Meta", "pet_the_bunny", "madmax_mgm", "reidback", "rennymorales", "yjr_fit.inba", "Meta", "Meta", "the_real_flockfam", "Meta", "Meta", "Meta", "Meta", "Meta", "Meta", "Meta", "Meta", "yauyauyauhen", "gibbogram", "almondeyezbitch", "visionz2turnt", "stars_hinemoon", "thisvillage_ijn", "Meta", "patrick_c_doyle", "Meta", "patrickvaxter", "Meta", "Meta", "Meta", "Meta", "patrickh34", "humans_of_data", "airwicksol", "homan.jason", "Meta", "Meta", "Meta", "nadhir_chiu_oficial", "Meta", "Meta", "Meta", "Meta", "Meta", "brandonmcclainl", "geezdagawd", "superflysugar2024", "curia__", "psychicadvisor345", "lemieuxbrands", "Meta", "james_macray_", "Meta", "Meta", "Meta", "Meta", "Meta", "Meta", "Meta"], "Num_of_PII": 208, "source_columns": ["wa_bot_profiles.creator_name", "wa_contacts.wa_name", "wa_vnames.verified_name"], "num_of_source_columns": 3}
|
||||||
|
{"db_path": "selectedDBs\\A1_wa.db", "PII_Type": "PERSON_NAME", "PII": ["Russ Philby", "Don Wayne", "Whalen", "Mary Garcia", "Joey", "Hank", "Goldie Kahn", "Abe Rudder", "Svetlana Chernoff", "Karen Tate", "Brian Reynolds", "Vladamir Stravinsky", "Otto", "Toks", "Bo", "Russ Philby", "Don Wayne", "Whalen", "Mary Garcia", "Joey", "Hank", "Goldie Kahn", "Abe Rudder", "Svetlana Chernoff", "Karen Tate", "Voice Mail", "Brian Reynolds", "Vladamir Stravinsky", "Otto", "Toks", "Bo", "Svetlana Chernoff", "Whalen", "Don Wayne", "Karen Tate", "Hank", "Brian Reynolds", "Goldie Kahn", "Vladamir Stravinsky", "Joey", "Mary Garcia", "Abe Rudder", "Russ Philby", "Voice Mail", "Otto", "Toks", "Bo", "Svetlana Chernoff", "Whalen", "Don Wayne", "Karen Tate", "Hank", "Brian Reynolds", "Goldie Kahn", "Vladamir Stravinsky", "Joey", "Mary Garcia", "Abe Rudder", "Russ Philby", "Otto", "Toks", "Bo", "Russ Philby", "Capt Don Wayne", "LTC Whalen", "Mary Garcia", "Joey", "Hank", "Goldie Kahn", "Abe Rudder", "Svetlana Chernoff", "Karen Tate", "Brian Reynolds", "Vladamir Stravinsky", "Otto", "Toks", "Bo", "Svetlana Chernoff", "Whalen", "Don Wayne", "Karen Tate", "Hank", "Brian Reynolds", "Goldie Kahn", "Vladamir Stravinsky", "Joey", "Mary Garcia", "Abe Rudder", "Russ Philby", "Otto", "Toks", "Bo"], "Num_of_PII": 92, "source_columns": ["wa_address_book.display_name", "wa_address_book.given_name+family_name", "wa_address_book.sort_name", "wa_contacts.given_name+family_name", "wa_contacts.sort_name"], "num_of_source_columns": 5}
|
||||||
|
{"db_path": "selectedDBs\\A1_wa.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["12014946184@s.whatsapp.net"], ["12025692832@s.whatsapp.net"], ["12028177932@s.whatsapp.net"], ["12028275725@s.whatsapp.net"], ["120363098389106519@g.us"], ["12037188989@s.whatsapp.net"], ["12065937224@s.whatsapp.net"], ["12088549831@s.whatsapp.net"], ["12089234440@s.whatsapp.net"], ["12092759604@s.whatsapp.net"]], "Total_raw_rows": 10, "Exploration_sql": "SELECT email FROM wa_biz_profiles WHERE email REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT jid FROM wa_contacts WHERE jid REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT account_id FROM wa_biz_profiles_linked_accounts_table WHERE account_id REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT websites FROM wa_biz_profiles_websites WHERE websites REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "PHONE", "PII": ["5713298742", "5713349815", "8085096467", "8624338328", "7034241981", "2028177932", "5715917168", "2025692832", "6106046786", "6263678865", "2065937224", "9199037779", "8056377243", "9735203731", "8136743027", "7423794330", "9199037779", "8085096467", "5713349815", "6263678865", "6106046786", "7034241981", "5715917168", "2065937224", "5713298742", "8624338328", "8056377243", "2028177932", "2025692832", "9735203731", "8136743027", "7423794330"], "Num_of_PII": 32, "source_columns": ["wa_contacts.number", "wa_address_book.number"], "Raw_rows_first_100": [["5713298742"], ["5713349815"], ["8085096467"], ["8624338328"], ["7034241981"], ["2028177932"], ["5715917168"], ["2025692832"], ["+16106046786"], ["+16263678865"], ["+12065937224"], ["+19199037779"], ["+18056377243"], ["+19735203731"], ["+81367430271"], ["+17423794330"], ["+19199037779"], ["8085096467"], ["5713349815"], ["+16263678865"], ["+16106046786"], ["7034241981"], ["5715917168"], ["+12065937224"], ["5713298742"], ["8624338328"], ["+18056377243"], ["2028177932"], ["2025692832"], ["+19735203731"], ["+81367430271"], ["+17423794330"]], "Total_raw_rows": 32, "Exploration_sql": "SELECT number FROM wa_contacts WHERE number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT number FROM wa_address_book WHERE number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT number FROM wa_biz_profiles WHERE number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": "SELECT number FROM wa_contacts WHERE number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'\nUNION ALL\nSELECT number FROM wa_address_book WHERE number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'\nUNION ALL\nSELECT number FROM wa_biz_profiles WHERE number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["12025692832@s.whatsapp.net"], ["12028177932@s.whatsapp.net"], ["12065937224@s.whatsapp.net"], ["15713298742@s.whatsapp.net"], ["15713349815@s.whatsapp.net"], ["15715917168@s.whatsapp.net"], ["16106046786@s.whatsapp.net"], ["16263678865@s.whatsapp.net"], ["17034241981@s.whatsapp.net"], ["17423794330@s.whatsapp.net"], ["13135550002@s.whatsapp.net"], ["13135550005@s.whatsapp.net"], ["13135550009@s.whatsapp.net"], ["13135550012@s.whatsapp.net"], ["13135550013@s.whatsapp.net"], ["13135550014@s.whatsapp.net"], ["13135550015@s.whatsapp.net"], ["13135550019@s.whatsapp.net"], ["13135550022@s.whatsapp.net"], ["13135550023@s.whatsapp.net"], ["100674318659756@lid"], ["103023648964858@lid"], ["103929853546678@lid"], ["103981426733067@lid"], ["104569854005296@lid"], ["106940592033823@lid"], ["107967072444508@lid"], ["108272048668771@lid"], ["109766730866918@lid"], ["110638642757646@lid"], ["491748600734@s.whatsapp.net"]], "Total_raw_rows": 31, "Exploration_sql": "SELECT jid FROM wa_address_book WHERE jid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT jid FROM wa_biz_profiles WHERE jid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT jid FROM wa_contacts WHERE jid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT creator_jid FROM wa_group_admin_settings WHERE creator_jid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT description_setter_jid FROM wa_group_descriptions WHERE description_setter_jid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "PERSON_NAME", "PII": ["Svetlana Chernoff", "LTC Whalen", "Capt Don Wayne", "Karen Tate", "Brian Reynolds", "Goldie Kahn", "Vladamir Stravinsky", "Mary Garcia", "Abe Rudder", "Russ Philby"], "Num_of_PII": 10, "source_columns": ["wa_contacts.display_name", "wa_contacts.given_name", "wa_contacts.family_name", "wa_contacts.nickname", "wa_address_book.display_name", "wa_address_book.given_name", "wa_address_book.family_name", "wa_address_book.nickname", "wa_biz_profiles.business_description", "wa_biz_profiles.location_name"], "Raw_rows_first_100": [["Svetlana Chernoff"], ["LTC Whalen"], ["Capt Don Wayne"], ["Karen Tate"], ["Hank"], ["Brian Reynolds"], ["Goldie Kahn"], ["Vladamir Stravinsky"], ["6️⃣ Wealth Builders Club"], ["Joey"], ["Mary Garcia"], ["Abe Rudder"], ["Russ Philby"], ["Voice Mail"], ["Otto"], ["Toks"], ["Bo"], ["Svetlana"], ["Don"], ["Karen"], ["Hank"], ["Brian"], ["Goldie"], ["Vladamir"], ["Joey"], ["Mary"], ["Abe"], ["Russ"], ["Voice Mail"], ["Otto"], ["Toks"], ["Bo"], ["Chernoff"], ["Whalen"], ["Wayne"], ["Tate"], ["Reynolds"], ["Kahn"], ["Stravinsky"], ["Garcia"], ["Rudder"], ["Philby"], ["Russ Philby"], ["Capt Don Wayne"], ["LTC Whalen"], ["Mary Garcia"], ["Joey"], ["Hank"], ["Goldie Kahn"], ["Abe Rudder"], ["Svetlana Chernoff"], ["Karen Tate"], ["Voice Mail"], ["Brian Reynolds"], ["Vladamir Stravinsky"], ["Otto"], ["Toks"], ["Bo"], ["Russ"], ["Don"], ["Mary"], ["Joey"], ["Hank"], ["Goldie"], ["Abe"], ["Svetlana"], ["Karen"], ["Voice Mail"], ["Brian"], ["Vladamir"], ["Otto"], ["Toks"], ["Bo"], ["Philby"], ["Wayne"], ["Whalen"], ["Garcia"], ["Kahn"], ["Rudder"], ["Chernoff"], ["Tate"], ["Reynolds"], ["Stravinsky"], ["Dispatch and Sales"], ["Dispatch and Sales"]], "Total_raw_rows": 85, "Exploration_sql": "SELECT display_name FROM wa_contacts WHERE display_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT given_name FROM wa_contacts WHERE given_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT family_name FROM wa_contacts WHERE family_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT nickname FROM wa_contacts WHERE nickname REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT display_name FROM wa_address_book WHERE display_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT given_name FROM wa_address_book WHERE given_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT family_name FROM wa_address_book WHERE family_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT nickname FROM wa_address_book WHERE nickname REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT business_description FROM wa_biz_profiles WHERE business_description REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT location_name FROM wa_biz_profiles WHERE location_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": "SELECT display_name FROM wa_contacts WHERE display_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT given_name FROM wa_contacts WHERE given_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT family_name FROM wa_contacts WHERE family_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT nickname FROM wa_contacts WHERE nickname REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT display_name FROM wa_address_book WHERE display_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT given_name FROM wa_address_book WHERE given_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT family_name FROM wa_address_book WHERE family_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT nickname FROM wa_address_book WHERE nickname REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT business_description FROM wa_biz_profiles WHERE business_description REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT location_name FROM wa_biz_profiles WHERE location_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A1_wa.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT display_name FROM wa_contacts WHERE display_name REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT company FROM wa_contacts WHERE company REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT display_name FROM wa_address_book WHERE display_name REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT company FROM wa_address_book WHERE company REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b';", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_A2_core.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A2_core.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A2_core.db", "PII_Type": "EMAIL", "PII": ["sharononeil368@gmail.com"], "Num_of_PII": 1, "source_columns": ["SnapUserStore.textVal"], "num_of_source_columns": 1}
|
||||||
|
{"db_path": "selectedDBs\\A2_core.db", "PII_Type": "PHONE", "PII": ["18624338329"], "Num_of_PII": 1, "source_columns": ["SnapUserStore.textVal"], "num_of_source_columns": 1}
|
||||||
|
{"db_path": "selectedDBs\\A2_core.db", "PII_Type": "USERNAME", "PII": ["oneil3607", "oneil3607"], "Num_of_PII": 2, "source_columns": ["SnapUserStore.textVal"], "num_of_source_columns": 1}
|
||||||
|
{"db_path": "selectedDBs\\A2_core.db", "PII_Type": "PERSON_NAME", "PII": ["Sharon Oneil"], "Num_of_PII": 1, "source_columns": ["SnapUserStore.textVal"], "num_of_source_columns": 1}
|
||||||
|
{"db_path": "selectedDBs\\A2_core.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
File diff suppressed because one or more lines are too long
5
model_PII_results/ground_truth/PII_A2_journal.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A2_journal.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A2_journal.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A2_journal.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A2_journal.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A2_journal.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A2_journal.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT key FROM journal_entry WHERE key REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT value_count FROM journal_entry WHERE value_count REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT status FROM journal_entry WHERE status REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT last_update_time FROM journal_entry WHERE last_update_time REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT last_read_time FROM journal_entry WHERE last_read_time REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT lock_count FROM journal_entry WHERE lock_count REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT total_size FROM journal_entry WHERE total_size REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT expiration FROM journal_entry WHERE expiration REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT last_consumed_time FROM journal_entry WHERE last_consumed_time REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT metadata FROM journal_entry WHERE metadata REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["9F3F465DC00D96696DDDFE0A946AAB99.khand_medium"], ["C730963C61386A34712C819CA25436C9.media"], ["70177660B739FDDF75DE848B97DC6A6E.edits"], ["03FD66A15523689AD035E1E2B1AD6DAE.chat_wallpaper_media"], ["D41F76126B39D1F7E7EC3D8FA4079D0F.discover_story_streaming_snap"], ["F05AD4876AFE7190FBF88E879238978A.discover_story_streaming_snap_ff"], ["67B685FF2948DC22416716E822D4F5A1.discover_story_streaming_snap_ff"], ["4EDFB389483E360F0CBA63F7A928FD39.discover_story_streaming_snap_ff"], ["629156B858FDF391C0639F2DE6933EEB.discover_story_streaming_snap_ff"], ["5A4F0362F54488AC2542C174F69C9A24.discover_story_streaming_snap_ff"]], "Total_raw_rows": 10, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT key FROM journal_entry WHERE key REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT value_count FROM journal_entry WHERE value_count REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["/data/data/com.snapchat.android/files/file_manager/BLOOPS_STICKER"], ["/data/data/com.snapchat.android/files/file_manager/Bitmoji_Preview"], ["/data/data/com.snapchat.android/files/file_manager/LENS_ASSET_CONTENT_TYPE_NAME"], ["/data/data/com.snapchat.android/files/file_manager/Live_Mirror_Model"], ["/data/data/com.snapchat.android/files/file_manager/Login_Kit_Privacy"], ["/data/data/com.snapchat.android/files/file_manager/MUSIC_GENERIC_ASSET_TYPE"], ["/data/data/com.snapchat.android/files/file_manager/Maps_Kashmir"], ["/data/data/com.snapchat.android/files/file_manager/Maps_WorldEffects"], ["/data/data/com.snapchat.android/files/file_manager/Perception"], ["/data/data/com.snapchat.android/files/file_manager/PerceptionMl"], ["12DB3FD3B46FC8F9DD60F79CB359FBFE.khand_medium"], ["9F3F465DC00D96696DDDFE0A946AAB99.khand_medium"], ["2FC6ABAAFF969A947FAB4E52FE0971FC.thumbnail"], ["C730963C61386A34712C819CA25436C9.media"], ["70177660B739FDDF75DE848B97DC6A6E.edits"], ["03FD66A15523689AD035E1E2B1AD6DAE.chat_wallpaper_media"], ["D41F76126B39D1F7E7EC3D8FA4079D0F.discover_story_streaming_snap"], ["5BBE52CE6D0010CB50CA3221C4741E7D.discover_story_streaming_snap_ff"], ["F05AD4876AFE7190FBF88E879238978A.discover_story_streaming_snap_ff"], ["67B685FF2948DC22416716E822D4F5A1.discover_story_streaming_snap_ff"]], "Total_raw_rows": 20, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL \nSELECT key FROM journal_entry WHERE key REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL \nSELECT value_count FROM journal_entry WHERE value_count REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["/data/data/com.snapchat.android/files/file_manager/BLOOPS_STICKER"], ["/data/data/com.snapchat.android/files/file_manager/Bitmoji_Preview"], ["/data/data/com.snapchat.android/files/file_manager/LENS_ASSET_CONTENT_TYPE_NAME"], ["/data/data/com.snapchat.android/files/file_manager/Live_Mirror_Model"], ["/data/data/com.snapchat.android/files/file_manager/Login_Kit_Privacy"], ["/data/data/com.snapchat.android/files/file_manager/MUSIC_GENERIC_ASSET_TYPE"], ["/data/data/com.snapchat.android/files/file_manager/Maps_Kashmir"], ["/data/data/com.snapchat.android/files/file_manager/Maps_WorldEffects"], ["/data/data/com.snapchat.android/files/file_manager/Perception"], ["/data/data/com.snapchat.android/files/file_manager/PerceptionMl"], ["12DB3FD3B46FC8F9DD60F79CB359FBFE.khand_medium"], ["9F3F465DC00D96696DDDFE0A946AAB99.khand_medium"], ["2FC6ABAAFF969A947FAB4E52FE0971FC.thumbnail"], ["C730963C61386A34712C819CA25436C9.media"], ["70177660B739FDDF75DE848B97DC6A6E.edits"], ["03FD66A15523689AD035E1E2B1AD6DAE.chat_wallpaper_media"], ["D41F76126B39D1F7E7EC3D8FA4079D0F.discover_story_streaming_snap"], ["5BBE52CE6D0010CB50CA3221C4741E7D.discover_story_streaming_snap_ff"], ["F05AD4876AFE7190FBF88E879238978A.discover_story_streaming_snap_ff"], ["67B685FF2948DC22416716E822D4F5A1.discover_story_streaming_snap_ff"]], "Total_raw_rows": 20, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT key FROM journal_entry WHERE key REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT value FROM journal_entry WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [[{"__bytes_b64__": "AAAAAAALmnc="}], [{"__bytes_b64__": "AAAAAAACUwo="}], [{"__bytes_b64__": "AAAAAAAAZbc="}], [{"__bytes_b64__": "AAAAAAAAjm4="}], [{"__bytes_b64__": "AAAAAAAAUwQ="}], [{"__bytes_b64__": "AAAAAAABVzo="}], [{"__bytes_b64__": "AAAAAAAAW04="}], [{"__bytes_b64__": "AAAAAAAA11M="}], [{"__bytes_b64__": "AAAAAAABRSw="}], [{"__bytes_b64__": "AAAAAAAAZYk="}]], "Total_raw_rows": 10, "Exploration_sql": "The existing tables and their columns are as follows:\n\n1. **journal**\n - _id INTEGER\n - path TEXT\n - size INTEGER\n - locked_size INTEGER\n\n2. **journal_entry**\n - _id INTEGER\n - journal_id INTEGER\n - key TEXT\n - sequence_number INTEGER\n - value_count INTEGER\n - status INTEGER\n - last_update_time INTEGER\n - last_read_time INTEGER\n - lock_count INTEGER\n - total_size INTEGER\n - value_sizes BLOB\n - expiration INTEGER\n - last_consumed_time INTEGER\n - metadata BLOB\n\nBased on the available columns, the most relevant column for searching potential US postal addresses would be the `path` column in the `journal` table and the `key` and `value_sizes` columns in the `journal_entry` table. \n\nHere is the SQL query to find possible US postal addresses:\n\n```sql\nSELECT path FROM journal WHERE path REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT key FROM journal_entry WHERE key REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT value_sizes FROM journal_entry WHERE value_sizes REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b';", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_A2_main.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A2_main.jsonl
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT id FROM params WHERE id REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT id FROM params WHERE id REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT id FROM params WHERE id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT id FROM params WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT pbytes FROM params WHERE pbytes REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT pbytes FROM params WHERE pbytes REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT id FROM params WHERE id REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT id FROM params WHERE id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT id FROM params WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT pbytes FROM params WHERE pbytes REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT pbytes FROM params WHERE pbytes REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT pbytes FROM params WHERE pbytes REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT id FROM params WHERE id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT id FROM params WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT pbytes FROM params WHERE pbytes REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_A4_gmm_myplaces.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A4_gmm_myplaces.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT key_string FROM sync_item WHERE key_string REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT last_sync_time FROM sync_corpus WHERE last_sync_time REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT key_string FROM sync_item WHERE key_string REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT corpus FROM sync_corpus WHERE corpus REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT key_string FROM sync_item WHERE key_string REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT key_string FROM sync_item WHERE key_string REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT corpus FROM sync_corpus WHERE corpus REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT key_string FROM sync_item WHERE key_string REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' UNION ALL SELECT corpus FROM sync_corpus WHERE corpus REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b';", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_A4_gmm_storage.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A4_gmm_storage.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_Type": "EMAIL", "PII": ["heather@cellebrite.com", "hmahalik@gmail.com", "heather@cellebrite.com", "hmahalik@gmail.com"], "Num_of_PII": 4, "source_columns": ["Tokens.value", "Tokens_content.c1value"], "num_of_source_columns": 2}
|
||||||
|
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_Type": "PHONE", "PII": ["17423794330", "17423794330"], "Num_of_PII": 2, "source_columns": ["Tokens.value", "Tokens_content.c1value"], "num_of_source_columns": 2}
|
||||||
|
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_Type": "USERNAME", "PII": ["hmahalik", "hmahalik"], "Num_of_PII": 2, "source_columns": ["Tokens.value", "Tokens_content.c1value"], "num_of_source_columns": 2}
|
||||||
|
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_Type": "PERSON_NAME", "PII": ["heather", "Bo", "heather", "Bo"], "Num_of_PII": 4, "source_columns": ["Tokens.value", "Tokens_content.c1value"], "num_of_source_columns": 2}
|
||||||
|
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "EMAIL", "PII": ["heather@cellebrite.com", "hmahalik@gmail.com"], "Num_of_PII": 2, "source_columns": ["Tokens.value", "Tokens_content.c1value", "Tokens_stat.value", "CacheInfo.affinity_response_context"], "Raw_rows_first_100": [["heather@cellebrite.com"], ["hmahalik@gmail.com"], ["heather@cellebrite.com"], ["hmahalik@gmail.com"]], "Total_raw_rows": 4, "Exploration_sql": "SELECT value FROM Tokens WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT value FROM Tokens_stat WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": "SELECT value FROM Tokens WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT value FROM Tokens_stat WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": ["Tokens.value", "Tokens_content.c1value", "Tokens_stat.value"], "Raw_rows_first_100": [["17423794330"], ["7423794330"], ["3794330"], ["17423794330"], ["7423794330"], ["3794330"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT value FROM Tokens WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT value FROM Tokens_stat WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT value FROM Contacts WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": "SELECT value FROM Tokens WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL\nSELECT value FROM Tokens_stat WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL\nSELECT value FROM Contacts WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "USERNAME", "PII": ["heather", "hmahalik"], "Num_of_PII": 2, "source_columns": ["CacheInfo.affinity_response_context", "Contacts.type", "Tokens.value", "Tokens_content.c1value", "Tokens_stat.value"], "Raw_rows_first_100": [["PERSON"], ["PERSON"], ["PERSON"], ["heather@cellebrite.com"], ["heather"], ["hmahalik@gmail.com"], ["hmahalik"], ["heather@cellebrite.com"], ["heather"], ["hmahalik@gmail.com"], ["hmahalik"]], "Total_raw_rows": 11, "Exploration_sql": "SELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT type FROM Contacts WHERE type REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT value FROM Tokens WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT value FROM Tokens_stat WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT identity_hash FROM android_metadata WHERE identity_hash REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": "SELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT type FROM Contacts WHERE type REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT value FROM Tokens WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT value FROM Tokens_stat WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT identity_hash FROM android_metadata WHERE identity_hash REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["Tokens.value", "Tokens_stat.value", "Contacts.id"], "Raw_rows_first_100": [["heather@cellebrite.com"], ["heather"], ["hmahalik@gmail.com"], ["hmahalik"], ["Bo"]], "Total_raw_rows": 5, "Exploration_sql": "SELECT value FROM Tokens WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT value FROM Tokens_content WHERE c1value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT value FROM Tokens_stat WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT id FROM Contacts WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": "SELECT value FROM Tokens WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT value FROM Tokens_content WHERE c1value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT value FROM Tokens_stat WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT id FROM Contacts WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT value FROM Tokens WHERE value REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'\nUNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_A5_SBrowser.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A5_SBrowser.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_Type": "EMAIL", "PII": ["sharononeil368@gmail.com", "sharononeil368@gmail.com", "sharononeil368@gmail.com", "sharononeil368@gmail.com", "sharononeil368@gmail.com", "sharononeil368@gmail.com"], "Num_of_PII": 6, "source_columns": ["BOOKMARKS.ACCOUNT_NAME", "SYNC_STATE.account_name", "TABS.ACCOUNT_NAME"], "num_of_source_columns": 3}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
5
model_PII_results/ground_truth/PII_A5_SBrowser2.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A5_SBrowser2.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT URL FROM BOOKMARKS WHERE URL REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT TAGS FROM BOOKMARKS WHERE TAGS REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT description FROM BOOKMARKS WHERE description REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT URL FROM BOOKMARKS WHERE URL REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT TAGS FROM BOOKMARKS WHERE TAGS REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT description FROM BOOKMARKS WHERE description REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["Sharon's S21"], ["Sharon's S21"], ["Sharon's S21"], ["Sharon's S21"]], "Total_raw_rows": 4, "Exploration_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT ACCOUNT_TYPE FROM BOOKMARKS WHERE ACCOUNT_TYPE REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT DEVICE_ID FROM BOOKMARKS WHERE DEVICE_ID REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT DEVICE_NAME FROM BOOKMARKS WHERE DEVICE_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT TAGS FROM BOOKMARKS WHERE TAGS REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT SOURCEID FROM BOOKMARKS WHERE SOURCEID REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT guid FROM BOOKMARKS WHERE guid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [[{"__bytes_b64__": "BAGBHAIGAQABLGwCAOx/Yz8pWMaZgKzK9gidY1JbWlHtbmZoPVRvHa8L1Qa5v77osj/b9cxf6AihGfAj9/oMzNgzwG4TEa1ocxhMgTw="}], [{"__bytes_b64__": "BAGBHAIGAQABLGwCADg08Y0rLftwtKLewxQylCXdY9YcWUe01qQU3juIWvYiZTepFwA/AGQEV8xammaotcT+LNCNEQBvYK49zZmdniQ="}], [{"__bytes_b64__": "BAGBXgIHAQABLIEsAgAx0od46RvJj3ZMpmE/r++aNuxFfDaTafncAW0XddYSB0WkI7vryB4lplJKYOFVJvriCJ54MlN56/+r6ZnwIIKx3TF4rvHTUNRIc9LKLCfh5hqUwF5ebYY+wMqebfiqMSY="}], [{"__bytes_b64__": "BAGBXgIHAQABLIEsAgAN/bPhb+tihkAfwO8uIxTzh9YWVp1U/kZ3oy/IzxRDnWH35ch+01jfwXwTtCmEj+LyGgSjHzzowXmue9uLyUvfpmVNbt3JCOqCl0EyBAo5+xpVCiij7EFhenIbvW/5cN0="}], [{"__bytes_b64__": "BAGBfgIHAQABLIFMAgAxMZf3KoVtVYHMTrajimnorfRebkfgH/qYhDfOJ2RRzWYOGMxn9xTFfrGxr/Cg60A6HnjLCVgpgftWYAW/bKRQc57iUOY3/35T/FmViNlLg/0T1xmtyNMJL85eTN6Ty8FnisaqUE3+iEK+drFSfnJt"}], [{"__bytes_b64__": "BAGBXgIHAQABLIEsAgDferB3B+clQ8gWXYeFleQ01pIC6VzQ+FxtlyoVYuGZ/sO3K+0Bwv1BqzJe5oM8CyJQtis/qRu/Li9LMYVDQxRCkfClKVoXFf6qVbzv+0XrXKADvEyQD083AV/geGnfwx8="}]], "Total_raw_rows": 6, "Exploration_sql": "SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT TAGS FROM BOOKMARKS WHERE TAGS REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT description FROM BOOKMARKS WHERE description REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [[{"__bytes_b64__": "BAGBHAIGAQABLGwCAFgwiN0XYaRNwN6JFtBRdWfQw5YoTxjvsswMtBRYpQ4bzjZW6jjqVulUtvEdo3wPiKvtT0SzbxZ9i5yPZb27tpI="}], [{"__bytes_b64__": "BAGEPgIHAQABLIQMAgABgxFFetbBD88wsavVtVjy+KN6LQaF52tl9Ztc+fzXT8MhsjknjCsDwrr+9tZQ1uZZNjvrZf13/vt7Nn58Xe1+sdDXmFRvJGfComqfeWyTQONx07TYcFCxWvZNWaecNUuUw+EC87PJColRgnZwE0JLLZ8Zcjsh8AjPicnUGgdkOfN8I1TaXKxMGebpzElMuqtVIhlph5EGeSm7pg0v/ELp/VeGLuxGeKbA6tXvUoiGaT51XBQebcsvu7qUegGuoIrZLf8Od7Q3JpKpH9miAs8O6o+o4zZXEI5LBv1suoFxmU6MRC7lTPGYgU6e+O1Wd9cSg4F2f7U2fAOBGDPYg3BrCZONls5ungK74IoKY48a5A=="}], [{"__bytes_b64__": "BAGBPgIHAQABLIEMAgD443e4SeU/jxbaN7+6lk4CFvovH1ui72l0PE6Mr20DtdnV/jQxlqAi9jhvXfYxTbDjh9dKl6yNgZm/19GPdqzMV0aYY9i3A05cPRL/VKYMpw=="}], [{"__bytes_b64__": "BAGDHgIHAQABLIJsAgCKKvrixPJ+dMqD7ppuoI0MyZaiuUvIgtJ4WHgnxrQGklMS6lrCMckJrtZnmVGBcHWPgbEIH/JKEmczg7KSJqaMfK+BKSyUC+v3ydvl2m/UC0JEA37crOZHBfy5z4B7KF1j7b1u7DDYe4DzyPE0+JyV0dfHonPF0zGA+yDZUAOoReeMJ9yHcuCKnASWELWEEvSukVLIZBmZ2OhTv4KNnbCyLBcA0Xz3SQ32AC3/+ef77BgKgSzNArHW3R5aUBmC7EE="}], [{"__bytes_b64__": "BAGBXgIHAQABLIEsAgAx0od46RvJj3ZMpmE/r++aNuxFfDaTafncAW0XddYSB0WkI7vryB4lplJKYOFVJvriCJ54MlN56/+r6ZnwIIKx3TF4rvHTUNRIc9LKLCfh5hqUwF5ebYY+wMqebfiqMSY="}]], "Total_raw_rows": 5, "Exploration_sql": "SELECT URL FROM BOOKMARKS WHERE URL REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT TAGS FROM BOOKMARKS WHERE TAGS REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT description FROM BOOKMARKS WHERE description REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b';", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "EMAIL", "PII": ["sharononeil368@gmail.com"], "Num_of_PII": 1, "source_columns": ["BOOKMARKS.ACCOUNT_NAME", "INTERNET_SYNC.SYNC_KEY", "SYNC_STATE.account_name", "TABS.TAB_TITLE", "android_metadata.locale"], "Raw_rows_first_100": [["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"]], "Total_raw_rows": 2, "Exploration_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT locale FROM android_metadata WHERE locale REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT locale FROM android_metadata WHERE locale REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["https://www.samsung.com/mobile/?cid=global_ow_app_s-internet_none_none_bookmark_bookmark_202008_none"], ["SBROWSER_TAB1724467631361__BROWSER1724467632107__SBROWSER_SAVEDPAGES1724467635256__QUICKACCESS_SYNC_V21724467633471__SBROWSER_HISTORY1724467634746__"], ["https://www.pinterest.com/pin/410812797236816112/"]], "Total_raw_rows": 3, "Exploration_sql": "SELECT URL FROM BOOKMARKS WHERE URL REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT data FROM SYNC_STATE WHERE data REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT TAB_URL FROM TABS WHERE TAB_URL REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "USERNAME", "PII": ["syncinternetdata", "syncbookmarks", "syncopenpages", "syncsavedpages"], "Num_of_PII": 4, "source_columns": ["BOOKMARKS.ACCOUNT_NAME", "TABS.ACCOUNT_NAME", "SYNC_STATE.account_name", "INTERNET_SYNC.SYNC_KEY"], "Raw_rows_first_100": [["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sync_internet_data"], ["sync_bookmarks"], ["sync_open_pages"], ["sync_saved_pages"]], "Total_raw_rows": 10, "Exploration_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["BOOKMARKS.TITLE", "BOOKMARKS.ACCOUNT_NAME", "BOOKMARKS.ACCOUNT_TYPE", "SYNC_STATE.data", "TABS.TAB_TITLE", "TABS.ACCOUNT_NAME", "TABS.ACCOUNT_TYPE"], "Raw_rows_first_100": [["Bookmarks"], ["Samsung account"], ["how to meditate - Google Search"], ["Google"], ["Galaxy Shop"], ["User guide"], ["sharononeil368@gmail.com"], ["com.osp.app.signin"], ["SBROWSER_TAB1724467631361__BROWSER1724467632107__SBROWSER_SAVEDPAGES1724467635256__QUICKACCESS_SYNC_V21724467633471__SBROWSER_HISTORY1724467634746__"], ["Pin on Simon, God of Hairdos"], ["puck from.glee - Google Search"], ["Midjourney AI - Free Image Generator"], ["billie eilish birds of a feather lyrics - Google Search"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["com.osp.app.signin"], ["com.osp.app.signin"], ["com.osp.app.signin"], ["com.osp.app.signin"]], "Total_raw_rows": 21, "Exploration_sql": "SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT ACCOUNT_TYPE FROM BOOKMARKS WHERE ACCOUNT_TYPE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT data FROM SYNC_STATE WHERE data REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT ACCOUNT_TYPE FROM TABS WHERE ACCOUNT_TYPE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": "SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\nSELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\nSELECT ACCOUNT_TYPE FROM BOOKMARKS WHERE ACCOUNT_TYPE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\nSELECT data FROM SYNC_STATE WHERE data REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\nSELECT ACCOUNT_TYPE FROM TABS WHERE ACCOUNT_TYPE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT URL FROM BOOKMARKS WHERE URL REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT data FROM SYNC_STATE WHERE data REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT TAB_URL FROM TABS WHERE TAB_URL REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b';", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_A5_searchengine.jsonl
Normal file
5
model_PII_results/ground_truth/PII_A5_searchengine.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM searchengine WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT url FROM searchengine WHERE url REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT extra1 FROM searchengine WHERE extra1 REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT extra2 FROM searchengine WHERE extra2 REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT extra3 FROM searchengine WHERE extra3 REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["https://home.imgsmail.ru/resplash/123689/i/meta/favicon.ico"], ["https://search.seznam.cz/re/media/favicon.192a42730e.ico"]], "Total_raw_rows": 2, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT title FROM searchengine WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT url FROM searchengine WHERE url REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT extra1 FROM searchengine WHERE extra1 REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT extra2 FROM searchengine WHERE extra2 REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT extra3 FROM searchengine WHERE extra3 REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"], ["google"], ["DuckDuckGo"], ["yahoo"], ["youtube"], ["bing"], ["so360"], ["qwant"], ["toutiao"], ["StartPage"], ["shenma"], ["https://duckduckgo.com/favicon.ico"], ["https://home.imgsmail.ru/resplash/123689/i/meta/favicon.ico"], ["https://m.toutiao.com/favicon.ico"], ["https://p0.ssl.qhimg.com/d/inn/128c749e/icon.png"], ["https://search.daum.net/favicon.ico"], ["https://search.naver.com/favicon.ico"], ["https://search.seznam.cz/re/media/favicon.192a42730e.ico"], ["https://search.yahoo.com/favicon.ico"], ["https://sm01.alicdn.com/L1/272/1990/favicon/favicon.ico"], ["https://www.baidu.com/favicon.ico"]], "Total_raw_rows": 21, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT title FROM searchengine WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT url FROM searchengine WHERE url REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT image_url FROM searchengine WHERE image_url REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT extra1 FROM searchengine WHERE extra1 REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT extra2 FROM searchengine WHERE extra2 REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT extra3 FROM searchengine WHERE extra3 REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"], ["google"], ["DuckDuckGo"], ["yahoo"], ["youtube"], ["bing"], ["so360"], ["qwant"], ["toutiao"], ["StartPage"], ["shenma"], ["https://duckduckgo.com/favicon.ico"], ["https://home.imgsmail.ru/resplash/123689/i/meta/favicon.ico"], ["https://m.toutiao.com/favicon.ico"], ["https://p0.ssl.qhimg.com/d/inn/128c749e/icon.png"], ["https://search.daum.net/favicon.ico"], ["https://search.naver.com/favicon.ico"], ["https://search.seznam.cz/re/media/favicon.192a42730e.ico"], ["https://search.yahoo.com/favicon.ico"], ["https://sm01.alicdn.com/L1/272/1990/favicon/favicon.ico"], ["https://www.baidu.com/favicon.ico"]], "Total_raw_rows": 21, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT title FROM searchengine WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT url FROM searchengine WHERE url REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT extra1 FROM searchengine WHERE extra1 REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT extra2 FROM searchengine WHERE extra2 REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT extra3 FROM searchengine WHERE extra3 REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["https://www.youtube.com/s/desktop/fadc8afc/img/favicon_48x48.png"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM searchengine WHERE title REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT url FROM searchengine WHERE url REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT extra1 FROM searchengine WHERE extra1 REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT extra2 FROM searchengine WHERE extra2 REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT extra3 FROM searchengine WHERE extra3 REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_I1_CallHistory.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I1_CallHistory.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": ["ZWAAGGREGATECALLEVENT.ZLINKTOKEN", "ZWACDCALLEVENT.ZCALLIDSTRING", "ZWACDCALLEVENT.ZGROUPCALLCREATORUSERJIDSTRING", "ZWACDCALLEVENT.ZGROUPJIDSTRING", "ZWACDCALLEVENTPARTICIPANT.ZJIDSTRING"], "Raw_rows_first_100": [["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"]], "Total_raw_rows": 9, "Exploration_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\nUNION ALL\nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\nUNION ALL\nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\nUNION ALL\nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\nUNION ALL\nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": ["ZWACDCALLEVENT.ZCALLIDSTRING", "ZWACDCALLEVENTPARTICIPANT.ZJIDSTRING", "ZWAAGGREGATECALLEVENT.ZLINKTOKEN"], "Raw_rows_first_100": [["372FA57E129467051E04B3E4DD5A26D5"], ["3A9CEF8B4996D645358B"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": "SELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'\nUNION ALL\nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'\nUNION ALL\nSELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["397466DD-9F9E-4385-BB61-81454ECF0FA4"], ["WAAggregateCallEvent"], ["WACDCallEvent"], ["WACDCallEventParticipant"], ["WAJoinableCallEvent"], ["WAJoinableCallEventParticipant"], ["WAUpcomingCallEvent"]], "Total_raw_rows": 16, "Exploration_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT Z_UUID FROM Z_METADATA WHERE Z_UUID REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT Z_NAME FROM Z_PRIMARYKEY WHERE Z_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["3C399CDDAF11A41F7AFF2892E0A4B10C"], ["3C37CBFE11C261E6CD80C2DE7834D770"], ["372FA57E129467051E04B3E4DD5A26D5"], ["3A6DF670F7121CD6D08B"], ["3A9CEF8B4996D645358B"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"]], "Total_raw_rows": 14, "Exploration_sql": "SELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"]], "Total_raw_rows": 4, "Exploration_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'\nUNION ALL\nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'\nUNION ALL\nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b';", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_I1_ChatStorage.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I1_ChatStorage.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_Type": "PHONE", "PII": ["+1 (971) 678-6701", "19716786701"], "Num_of_PII": 2, "source_columns": ["ZWACHATSESSION.ZPARTNERNAME", "ZWAPROFILEPUSHNAME.ZPUSHNAME"], "num_of_source_columns": 2}
|
||||||
|
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_Type": "USERNAME", "PII": ["skinnyfatfarms", "EmergencyStream", "m.coinol.club", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "Anna_Malina05", "skinnyfatfarms", "Anna_Malina05", "skinnyfatfarms", "skinnyfatfarms", "EmergencyStream"], "Num_of_PII": 36, "source_columns": ["ZWAMEDIAITEM.ZTITLE", "ZWAMESSAGE.ZTEXT", "ZWAMESSAGEDATAITEM.ZSUMMARY", "ZWAMESSAGEDATAITEM.ZTITLE"], "num_of_source_columns": 4}
|
||||||
|
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_Type": "PERSON_NAME", "PII": ["Chad Hunt", "Toni Yu", "Charles Finley", "Ronen Engler", "John Raynolds", "Jonathan Reyes", "Ronen Engler", "Johnny Good", "Russell Philby", "Sharon \ud83d\ude0d", "Abe Rudder", "Finn", "Ronen Engler", "John Reynolds", "Colin DaCopps", "Russell Philby", "Lisena Gocaj", "Lisena Gocaj", "Lisena Gocaj", "Andy Sieg", "Howell", "Yaliweisi", "Andy Sieg", "Abner", "Andy Schweichert", "Lisena Gocaj", "Andy Sieg", "Brian Arseneau", "Virginia", "Christian Justiniano", "Lisena Gocaj", "Jim Wilson", "Lisena Gocaj", "Virginia Benton", "Howell", "Lisena", "Yaliweisi", "Abner", "Abe Rudder", "Virginia", "Jason", "Sharon Oneil", "Job Vizcarra", "Robert Elliott", "Sultan", "Emerick", "Nia Yuniar", "\u200bskol", "David Wilson", "Robechucks Raul", "Ella Bella", "Dick Oscar", "Charlie", "Steven", "Ameya Joshi", "Robechucks Raul", "Robechucks Raul", "Amiel Williamson", "Ashwin Menon", "Ajax Edmiston", "Eleazar Lewden", "Polly Lucas", "Eleazar Lewden", "Robechucks Raul", "Lemuel Glasgow", "Bazel McConnel", "William Stevenson", "Robechucks Raul", "Denice R Allen", "Leif Fox", "William Hopkins", "Robechucks Raul", "Jonas Bradley", "Robechucks Raul", "Amit Sharma", "Brodie", "Carlton", "Andy Sieg", "Ernesto Torres Cant\u00fa", "Andy", "Andy", "Lisena", "Lisena", "Mr. Andy", "Andy", "Andy", "Lisena", "Lisena", "Anna Malina", "Rick", "Otto", "Reynolds", "Otto", "Sharon", "Lisena Gocaj", "Lisena Gocaj", "Andy Sieg", "Lisena", "Lisena Gocaj", "Andy Sieg", "Lisena Gocaj", "Lisena", "Lisena", "Andy Sieg", "Lisena Gocaj", "Andy Sieg", "Lisena", "Monica", "Lisena", "Sharon", "Lisena", "Lisena", "Andy", "Andy", "Lisena", "Andy", "Andy", "Lisena", "Lisena", "Lisena Gocaj", "Lisena Gocaj", "Lisena", "Lisena", "Lisena", "Lisena", "Lisena Gocaj", "Andy Sieg", "Lisena Gocaj", "Stephanie", "Lisena Gocaj", "Abner", "Christian", "Andy", "Lisena Gocaj", "Lisena Gocaj", "Lisena Gocaj", "Mr. Andy", "Lisena", "Jason", "Lisena Gocaj", "Monica", "Lisena Gocaj", "Lisena Gocaj", "Lisena Gocaj", "Lisena", "Lisena", "Lisean", "Lisena", "Lisena Gocaj", "Lisena", "Lisena Gocaj", "Lisena", "Mr. Robechucks", "Mr. Robechucks Raul", "Mr. Robechucks", "Robechucks Raul", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Mr. Robechucks", "Thomas Matthew Crooks", "Donald Trump", "Anna Malina"], "Num_of_PII": 177, "source_columns": ["ZWACHATSESSION.ZPARTNERNAME", "ZWAMEDIAITEM.ZTITLE", "ZWAMESSAGE.ZTEXT", "ZWAMESSAGEDATAITEM.ZSUMMARY", "ZWAMESSAGEDATAITEM.ZTITLE", "ZWAPROFILEPUSHNAME.ZPUSHNAME"], "num_of_source_columns": 6}
|
||||||
|
{"db_path": "selectedDBs\\I1_ChatStorage.sqlite", "PII_Type": "POSTAL_ADDRESS", "PII": ["8607 US-24, Fort Wayne, IN 46804", "16 Marina Bay Financial Centre Tower 2, 12 Marina Boulevard, Singapore 018982", "12503 E Via De Palmas, Chandler, AZ", "8500 Pe\u00f1a Blvd, Denver, CO", "8607 US-24, Fort Wayne, IN 46804"], "Num_of_PII": 5, "source_columns": ["ZWAMEDIAITEM.ZTITLE", "ZWAMESSAGE.ZTEXT", "ZWAMESSAGEDATAITEM.ZTITLE"], "num_of_source_columns": 3}
|
||||||
File diff suppressed because one or more lines are too long
5
model_PII_results/ground_truth/PII_I1_ContactsV2.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I1_ContactsV2.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ZFULLNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZFULLNAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ZUSERNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZUSERNAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ZNOTES FROM ZWAADDRESSBOOKCONTACT WHERE ZNOTES REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": ["ZWAADDRESSBOOKCONTACT.ZPHONENUMBER", "ZWAADDRESSBOOKCONTACT.ZLOCALIZEDPHONENUMBER"], "Raw_rows_first_100": [["(484) 735-3029"], ["(516) 287-9924"], ["(571) 267-9786"], ["(785) 253-3080"], ["(828) 367-7149"], ["(862) 433-8324"], ["(919) 579-6456"], ["(919) 903-7779"], ["(973) 520-3731"], ["+1 (218) 571-5037"], ["+11003163800"], ["+11003236193"], ["+11010361518"], ["+11017911312"], ["+11037878368"], ["+11042223682"], ["+11048948999"], ["+11049272303"], ["+11083209744"], ["+11084335884"], ["+11085591720"], ["+11100705234"], ["+11111844206"], ["+11114456067"], ["+11127810067"], ["+11154642430"], ["+11156064084"], ["+11160925958"], ["+11165224332"], ["+11170259144"], ["+11219230321"], ["+11230876671"], ["+11236195069"], ["+11237220065"], ["+11237697889"], ["+11246793781"], ["+11252341214"], ["+11263010277"], ["+11266958629"], ["+11274676445"], ["+11293350477"], ["+11309658508"], ["+11311142857"], ["+11317473845"], ["+11335889533"], ["+11345908052"], ["+11347504465"], ["+11352788825"], ["+11355886834"], ["+11359112637"], ["+11371651118"], ["+11374373500"], ["+11383511453"], ["+11394174443"], ["+11394974100"], ["+11400289091"], ["+11438414472"], ["+11456738623"], ["+11466660520"], ["+11477095715"], ["+11486535856"], ["+11489236581"], ["+11522593608"], ["+11523301108"], ["+11523732570"], ["+11527995679"], ["+11530023892"], ["+11532451508"], ["+11534800864"], ["+11553731015"], ["+11561833198"], ["+11567697599"], ["+11580127521"], ["+11589953298"], ["+11590233689"], ["+11592618580"], ["+11598803703"], ["+11615188831"], ["+11620904215"], ["+11621272801"], ["+11622622249"], ["+11626585945"], ["+11648180067"], ["+11652416402"], ["+11657127459"], ["+11659501788"], ["+11662243216"], ["+11678178252"], ["+11680737602"], ["+11700968514"], ["+11727060137"], ["+11729502426"], ["+11729630568"], ["+11730812663"], ["+11732642364"], ["+11732997355"], ["+11743515310"], ["+11749448024"], ["+11749883352"], ["+11753137800"]], "Total_raw_rows": 1017, "Exploration_sql": "SELECT ZPHONENUMBER FROM ZWAADDRESSBOOKCONTACT WHERE ZPHONENUMBER REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT ZLOCALIZEDPHONENUMBER FROM ZWAADDRESSBOOKCONTACT WHERE ZLOCALIZEDPHONENUMBER REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": "SELECT ZPHONENUMBER FROM ZWAADDRESSBOOKCONTACT WHERE ZPHONENUMBER REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL\nSELECT ZLOCALIZEDPHONENUMBER FROM ZWAADDRESSBOOKCONTACT WHERE ZLOCALIZEDPHONENUMBER REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["001ACA1F-E806-4F8A-BBC9-07CCDE7D0DA1"], ["00A2DA3D-DDFF-4EC2-A06E-447A68B2D088:ABPerson"], ["01B87AAC-601D-4CAD-AE21-5F00FA2CF474"], ["02AEBD55-5BFC-49CE-B533-C6EC800561D4"], ["0308DA26-C2DB-406D-B586-959FF69ED27A"], ["03161203-7E1B-4104-9FC3-F16CCB2566C0"], ["03273136-9F5B-43CF-8DEF-E1ADF2B5478A"], ["051FA09D-DBDD-4B13-A201-06592403490D"], ["0567CD6A-072D-4D5F-A087-D0E93CAB8A16"], ["057157DA-B5EC-49D1-93F1-D6E0219A2B4C"], ["15728600-E592-4D8B-B6A8-C0D6F37F696C"], ["43B6437F-35BA-4E4E-9D71-FA997FCAABF9"], ["5C8429BF-C54C-4370-9571-151305DB199C"], ["9A0C2AE3-6A86-42D8-A47E-5F53207274AA"], ["B05543F0-D0C7-4AB7-AF58-431BE26C09F6"], ["B8AAFF2B-7077-4B0C-B6D3-F94C062E5829"]], "Total_raw_rows": 16, "Exploration_sql": "SELECT ZUSERNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZUSERNAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZIDENTIFIER FROM ZWAADDRESSBOOKCONTACT WHERE ZIDENTIFIER REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZUNIQUEID FROM ZWAADDRESSBOOKCONTACT WHERE ZUNIQUEID REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["ZWAADDRESSBOOKCONTACT.ZFULLNAME", "ZWAADDRESSBOOKCONTACT.ZGIVENNAME", "ZWAADDRESSBOOKCONTACT.ZLASTNAME", "ZWAADDRESSBOOKCONTACT.ZNOTES"], "Raw_rows_first_100": [["Otto Matik"], ["Ronen Engler"], ["Kathy Fitzpatrick"], ["Daniel White"], ["Alex Weber"], ["Zachary Powell"], ["Amy Chavez"], ["Paul Harris"], ["Amanda Parker"], ["Diane Simmons"], ["Joel Valenzuela"], ["Wendy Estrada"], ["Sydney Hall"], ["Lisa Ritter"], ["Robert Hutchinson"], ["Taylor Garrett"], ["Joshua Cole"], ["Andrea Smith"], ["Nicole Obrien"], ["Jill Carr"], ["Cassandra Barber"], ["Heather Baker"], ["Scott Jacobs"], ["Donna Bautista"], ["Jason Sanders"], ["Cassandra Dickerson"], ["Julie Hodges"], ["Mark Simmons"], ["David Donaldson"], ["Meghan Jarvis"], ["Blake Goodwin"], ["Natalie Lindsey"], ["Joshua Mcdonald"], ["Marilyn Jones"], ["Jacob Taylor"], ["Robert Mccarthy"], ["Randall Taylor"], ["Eric Benson"], ["William Cunningham"], ["Catherine Powers"], ["Charles Waller"], ["Robert Campbell"], ["Anita Morris"], ["Jeremy Armstrong"], ["Katherine Davis"], ["Sarah Lynch"], ["Courtney James"], ["Eugene Farmer"], ["William Acosta"], ["Tony Mederos"], ["Marcus Moore"], ["Ed Venture"], ["Felicia Berg"], ["Tiffany Rocha"], ["Jennifer Ibarra"], ["Tyler Ayala"], ["Melissa Diaz"], ["Alan Clay"], ["Jeanette Nunez"], ["Kaitlyn Anderson"], ["Sue Wilson"], ["David Stanley"], ["Patricia Young"], ["Bryan Cox"], ["Charles Parker"], ["Jeremy Henderson"], ["Tammy Rowe"], ["Chase Collins"], ["Julie Barnes"], ["Madison Hill"], ["Daniel Richardson"], ["Leon Gallegos"], ["Wanda Ramirez"], ["Micheal Fischer"], ["Carol Johnson"], ["Shawn Barnes"], ["Daniel Lang"], ["Jessica Herring"], ["Melvin Estrada"], ["Deanna Roberts"], ["Lindsey Campbell"], ["Mary Khan"], ["Beth Chavez"], ["Jennifer Casey"], ["Frank Flores"], ["Christina Gonzales"], ["Eric Gardner"], ["Alexandria Gilbert"], ["Charles Anderson"], ["Martha Watson"], ["Brian Cross"], ["Cristina Camacho"], ["John Guerrero"], ["Kylie Bradley"], ["William Wang"], ["Jennifer Anderson"], ["Taylor Brock"], ["Joanna Howell"], ["Elizabeth Smith"], ["Felicia Lee"]], "Total_raw_rows": 2032, "Exploration_sql": "SELECT ZFULLNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZFULLNAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZGIVENNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZGIVENNAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZLASTNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZLASTNAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZNOTES FROM ZWAADDRESSBOOKCONTACT WHERE ZNOTES REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": "SELECT ZFULLNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZFULLNAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT ZGIVENNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZGIVENNAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT ZLASTNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZLASTNAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT ZNOTES FROM ZWAADDRESSBOOKCONTACT WHERE ZNOTES REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\I1_ContactsV2.sqlite", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["There’s no place like 127.0.0.1"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT ZFULLNAME FROM ZWAADDRESSBOOKCONTACT WHERE ZFULLNAME REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT ZNOTES FROM ZWAADDRESSBOOKCONTACT WHERE ZNOTES REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_I2_AddressBook.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I2_AddressBook.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I2_AddressBookImages.sqlitedb", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
5
model_PII_results/ground_truth/PII_I3_sms.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I3_sms.jsonl
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
5
model_PII_results/ground_truth/PII_I4_CloudTabs.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I4_CloudTabs.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT system_fields FROM cloud_tab_devices WHERE system_fields REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT title FROM cloud_tabs WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT value FROM metadata WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null, "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["3075F5C2-E558-4E12-B421-6118960D1D2E"], ["ambarrestaurant.com/wp-content/uploads/2024/07/ambar-ch-dinner-07012024.pdf"], ["https://www.bluebite.com/?utm_source=https://www.themtag.com/n10053654&utm_medium=301"], ["https://www.google.com/search?q=snack+with+dairy+kids&client=safari&sca_esv=6eac42e19cef7bdf&hl=en-us&sxsrf=ADLYWIJQfVNS-z6971OqeDnskNzgmBnxVA%3A1723546457188&ei=WTu7Zu6YC9vl5NoPsZT5uAM&oq=snack+with+dairy+kids&gs_lp=EhNtb2JpbGUtZ3dzLXdpei1zZXJwIhVzbmFjayB3aXRoIGRhaXJ5IGtpZHNIkXVQmR9Yym5wEXgBkAEAmAGkAaABthWqAQQzMi4yuAEDyAEA-AEBmAIVoALyC6gCD8ICChAAGLADGNYEGEfCAg0QABiABBiwAxhDGIoFwgIHECMYJxjqAsICChAuGIAEGEMYigXCAgoQABiABBhDGIoFwgIQEC4YgAQY0QMYQxjHARiKBcICDhAuGIAEGLEDGNEDGMcBwgILEC4YgAQY0QMYxwHCAgUQABiABMICDhAAGIAEGLEDGIMBGIoFwgILEC4YgAQYkQIYigXCAgsQABiABBiRAhiKBcICDRAAGIAEGEMY5QQYigXCAggQABiABBixA8ICDRAAGIAEGLEDGEMYigXCAhYQLhiABBixAxjRAxhDGIMBGMcBGIoFwgILEAAYgAQYsQMYgwHCAggQLhiABBixA8ICExAuGIAEGLEDGNEDGIMBGMcBGArCAgoQABiABBgUGIcCwgIHEAAYgAQYCsICCBAAGIAEGMkDwgILEC4YgAQYsQMYgwHCAgYQABgWGB6YAwiIBgGQBhGSBwQxOC4zoAfzbQ&sclient=mobile-gws-wiz-serp"], ["https://www.ravensmanorexperience.com/uploads/b/10bcaef0-743a-11ea-b3f4-31522c100dea/Summer%20PDF_NTkzNT.pdf"], ["https://ambarrestaurant.com/wp-content/uploads/2024/07/ambar-ch-dinner-07012024.pdf"], ["https://support.apple.com/en-us/104959"], ["https://www.bing.com/search?q=figs+tre+wont+ripen+green&form=QBLH&sp=-1&ghc=1&lq=0&pq=figs+tre+wont+ripen+green&sc=10-25&qs=n&sk=&cvid=E8428067064D4726B92401B41C5C64A5&ghsh=0&ghacc=0&ghpl="], ["https://found.apple.com/airtag?pid=5500&b=00&pt=004c&fv=00200240&dg=00&z=00&pi=fb28165432edfeebc48867d872c4acc82721e710cd8a4eede50401ec"]], "Total_raw_rows": 9, "Exploration_sql": "SELECT device_uuid FROM cloud_tab_devices WHERE device_uuid REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT title FROM cloud_tabs WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT url FROM cloud_tabs WHERE url REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT key FROM metadata WHERE key REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT value FROM metadata WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["3075F5C2-E558-4E12-B421-6118960D1D2E"], ["0A10E551-3FB8-47A8-A1A1-EBB67EB1E89C"], ["0D454071-CD2D-42AE-BFC5-26F543FDC97E"], ["67728BBA-DC9E-45D7-9B95-CDC144EC0E1A"], ["727F2438-46FB-41E3-AAB8-97DC413F673A"], ["774BFB84-6C22-4144-BFD6-AA3A8389D8FB"], ["7ADE160B-D5A0-447D-BE3E-93D7CF821492"], ["815589AA-AE4A-43B7-A7E2-542FAFAF4B03"], ["8C37B2F2-9B30-4068-A0F6-5685AC447B47"], ["AC433476-2C87-49F4-B46D-E7B9350225B4"], ["B2235A3B-454A-42FE-9547-9964BBA99F36"], ["server_change_token"], ["use_manatee_container"]], "Total_raw_rows": 13, "Exploration_sql": "SELECT device_uuid FROM cloud_tab_devices WHERE device_uuid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT tab_uuid FROM cloud_tabs WHERE tab_uuid REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT key FROM metadata WHERE key REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["Cellebrite"], ["falafel hardboiled egg israel - Google Search"], ["Dickens Cider is now Dickins Cider Company | Hard Cider Drinks | Dickens Cider"], ["Blue Bite | Connecting Possibility"], ["Home page - Mapat"], ["snack with dairy kids - Google Search"], ["figs growing green not ripe - Google Search"], ["Hours: Amusement Park & Hershey Attractions | Hersheypark"], ["How to Change a Light Bulb"], ["Tolls"], ["ravensmanorexperience.com"], [{"__bytes_b64__": "YnBsaXN0MDDUAQIDBAUGBwpYJHZlcnNpb25ZJGFyY2hpdmVyVCR0b3BYJG9iamVjdHMSAAGGoF8QD05TS2V5ZWRBcmNoaXZlctEICVRyb290gAGkCwwRElUkbnVsbNINDg8QViRjbGFzc18QD0NoYW5nZVRva2VuRGF0YYADgAJPEC8fChASDAAATBJW9UHWAAAAABgBGAAiFgj5m+me89ruvc4BEM2Hk5/Ph+jnpgEoANITFBUWWiRjbGFzc25hbWVYJGNsYXNzZXNfEBNDS1NlcnZlckNoYW5nZVRva2VuohUXWE5TT2JqZWN0CBEaJCkyN0lMUVNYXmNqfH6AsrfCy+HkAAAAAAAAAQEAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAO0="}]], "Total_raw_rows": 12, "Exploration_sql": "SELECT device_name FROM cloud_tab_devices WHERE device_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT title FROM cloud_tabs WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT value FROM metadata WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\I4_CloudTabs.db", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["cool dry place storage - Google Search"], ["Menu | Raven's Manor"]], "Total_raw_rows": 2, "Exploration_sql": "SELECT device_uuid FROM cloud_tab_devices WHERE device_uuid REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT title FROM cloud_tabs WHERE title REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b' \nUNION ALL \nSELECT value FROM metadata WHERE value REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_I4_History.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I4_History.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\I4_History.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_History.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_History.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_History.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I4_History.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
File diff suppressed because one or more lines are too long
5
model_PII_results/ground_truth/PII_I5_Calendar.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I5_Calendar.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_Type": "EMAIL", "PII": ["ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com", "ottomatik1234@gmail.com"], "Num_of_PII": 28, "source_columns": ["Calendar.last_sync_title", "Calendar.notes", "Calendar.owner_identity_email", "Calendar.self_identity_email", "Calendar.shared_owner_address", "Calendar.title", "Identity.address"], "num_of_source_columns": 7}
|
||||||
|
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_Type": "PERSON_NAME", "PII": ["Otto Matik", "Otto Matik", "Otto Matik", "Otto Matik", "Otto Matik", "Otto Matik", "Otto Matik", "Otto Matik", "Otto Matik"], "Num_of_PII": 9, "source_columns": ["Calendar.shared_owner_name", "Identity.display_name", "Store.owner_name"], "num_of_source_columns": 3}
|
||||||
|
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_Type": "POSTAL_ADDRESS", "PII": ["America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles", "America/Los_Angeles"], "Num_of_PII": 28, "source_columns": ["Recurrence.cached_end_date_tz"], "num_of_source_columns": 1}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "EMAIL", "PII": ["ottomatik1234@gmail.com"], "Num_of_PII": 1, "source_columns": ["Alarm.email_address", "Calendar.shared_owner_address", "Calendar.self_identity_email", "Calendar.owner_identity_email"], "Raw_rows_first_100": [["mailto:ottomatik1234@gmail.com"], ["mailto:ottomatik1234@gmail.com"], ["mailto:ottomatik1234@gmail.com"], ["mailto:ottomatik1234@gmail.com"], ["mailto:ottomatik1234@gmail.com"], ["mailto:ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"]], "Total_raw_rows": 18, "Exploration_sql": "SELECT email_address FROM Alarm WHERE email_address REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT shared_owner_address FROM Calendar WHERE shared_owner_address REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT self_identity_email FROM Calendar WHERE self_identity_email REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT owner_identity_email FROM Calendar WHERE owner_identity_email REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": "SELECT email_address FROM Alarm WHERE email_address REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\nUNION ALL\nSELECT shared_owner_address FROM Calendar WHERE shared_owner_address REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\nUNION ALL\nSELECT self_identity_email FROM Calendar WHERE self_identity_email REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\nUNION ALL\nSELECT owner_identity_email FROM Calendar WHERE owner_identity_email REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "PII_Prompt": "a unique identifier for a destination to which electronic mail (email) can be sent and received over the internet; examples include jane.doe@example.com, john.smith@provider.net, dev-team@startup.io, and user.name+label@domain.org"}
|
|
||||||
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT self_identity_phone_number FROM Calendar WHERE self_identity_phone_number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT owner_identity_phone_number FROM Calendar WHERE owner_identity_phone_number REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT description FROM CalendarItem WHERE description REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT summary FROM CalendarItem WHERE summary REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT notes FROM Calendar WHERE notes REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT filename FROM AttachmentFile WHERE filename REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT url FROM AttachmentFile WHERE url REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": null, "PII_Prompt": "a US phone number is a 10-digit NANP number (area code + exchange + line) that may be written as 2023133725, 202-313-3725, (202) 313-3725, 202.313.3725, +1 202 313 3725, or 1-202-313-3725"}
|
|
||||||
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["788AC6F7-EF35-4651-B7F4-0B2947123C1C"], ["/20674123080/calendars/16155228-E388-46B1-81A0-F3038033ACD5/"], ["/20674123080/calendars/510F474F-BA34-457D-9CE6-317C757B78AC/"], ["/20674123080/calendars/inbox/"], ["/20674123080/calendars/notification/"], ["/calendar/dav/ottomatik1234%40gmail.com/events/"], ["/calendar/dav/ottomatik1234%40gmail.com/inbox/"], ["033D6B71-4120-46D2-838E-EFB743F071A1"], ["05404572-7320-4A28-A860-3A1945713CF2"], ["05DB3E21-FB59-4BC7-BAC0-6ABD287CE02E"], ["07D92C83-A95D-48CF-B0E7-751D48E88543"], ["09E00B0C-1043-49A2-8A44-D66919A4D7DD"], ["0AE6C6ED-E1E0-4552-A12E-69058EFD119C"], ["0CB92CB9-0D88-412A-8276-CABCC686E670"], ["0D0FE4FE-9609-4C7C-BA1C-14D8AC3A214A"], ["0DBD1D39-4E69-45B4-B2C9-C6561A4805CD"], ["0DF9BF97-E5D9-4FE4-8F94-56CE3DB7A6EC"], ["com.apple.dataaccessd-3D8B9641-4D39-4E7D-A007-B69A2F855F6B"], ["com.apple.dataaccessd-FB18EA77-54AF-4788-865E-B5C4DF5E2818"], ["com.apple.suggestd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["com.apple.dataaccess.dataaccessd"], ["Otto Matik"], ["Holidays"], ["Default"], ["Other"], ["iCloud"], ["Subscribed Calendars"], ["Gmail"]], "Total_raw_rows": 37, "Exploration_sql": "SELECT external_id FROM Attachment WHERE external_id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT UUID FROM AttachmentFile WHERE UUID REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT external_id FROM Calendar WHERE external_id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT UUID FROM CalendarItem WHERE UUID REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT external_id FROM Category WHERE external_id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT client_identifier FROM ClientCursor WHERE client_identifier REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT client_identifier FROM ClientSequence WHERE client_identifier REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT display_name FROM Identity WHERE display_name REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT unique_identifier FROM Calendar WHERE unique_identifier REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT name FROM Category WHERE name REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT name FROM Store WHERE name REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null, "PII_Prompt": " unique, whitespace-free alphanumeric string used as a system-internal identifier or public handle that lacks a domain suffix, distinguishing it from an email address while still serving as a primary anchor for account attribution."}
|
|
||||||
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["Default"], ["Facebook Birthdays"], ["Found in Mail"], ["Found in Natural Language"], ["Home"], ["ottomatik1234@gmail.com"], ["ottomatik1234@gmail.com"], ["Work"], ["US Holidays"], ["ottomatik1234@gmail.com"], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["The exact date of this holiday is difficult to predict precisely; this is just an approximation."], ["This date is approximate because it is based on a lunar calendar; the beginning of Ramadan is the day after the new moon. "], ["This date is approximate because it is based on a lunar calendar; the beginning of Ramadan is the day after the new moon. "], ["Hanukkah (1st day)"], ["Eid al-Adha"], ["Ashura"], ["Easter"], ["Eid al-Fitr"], ["Ashura"], ["Eid al-Adha"], ["Daylight Saving Time End"], ["Inauguration Day"], ["Eid al-Fitr"]], "Total_raw_rows": 30, "Exploration_sql": "SELECT title FROM Calendar WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT description FROM CalendarItem WHERE description REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT summary FROM CalendarItem WHERE summary REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT contact_name FROM Identity WHERE contact_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT first_name FROM Identity WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT last_name FROM Identity WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": null, "PII_Prompt": "a loosely structured human name-like strings that typically consist of a first name, a first name and a last name, and may also include middle names, initials, prefixes (e.g., Mr., Dr.), and suffixes (e.g., Jr., Sr.)"}
|
|
||||||
{"db_path": "selectedDBs\\I5_Calendar.sqlitedb", "PII_type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT notes FROM Calendar WHERE notes REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'\nUNION ALL\nSELECT description FROM CalendarItem WHERE description REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'\nUNION ALL\nSELECT filename FROM AttachmentFile WHERE filename REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b'\nUNION ALL\nSELECT url FROM Attachment WHERE url REGEXP '(?i)\\b(?:p\\.?\\s*o\\.?\\s*box|post\\s+office\\s+box|ave\\.?|avenue|st\\.?|street|rd\\.?|road|blvd\\.?|boulevard|dr\\.?|drive|ln\\.?|lane|ct\\.?|court|pl\\.?|place|way|pkwy\\.?|parkway|cir\\.?|circle|ter\\.?|terrace|hwy\\.?|highway|trl\\.?|trail|sq\\.?|square|pike|loop|run|walk|path|byp\\.?|bypass|(?:n|s|e|w|ne|nw|se|sw)\\b)\\b';", "Extraction_sql": null, "PII_Prompt": "a US postal address is a street-level mailing location in the United States, commonly appearing as a street name and suffix (e.g., 'Market St') optionally with a street number (e.g., '1500 Market St'), unit, city/state, ZIP, or a PO Box (e.g., 'P.O. Box 123')"}
|
|
||||||
5
model_PII_results/ground_truth/PII_I5_Extras.jsonl
Normal file
5
model_PII_results/ground_truth/PII_I5_Extras.jsonl
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{"db_path": "selectedDBs\\I5_Extras.db", "PII_Type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I5_Extras.db", "PII_Type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I5_Extras.db", "PII_Type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I5_Extras.db", "PII_Type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
|
{"db_path": "selectedDBs\\I5_Extras.db", "PII_Type": "POSTAL_ADDRESS", "PII": [], "Num_of_PII": 0, "source_columns": [], "num_of_source_columns": 0}
|
||||||
File diff suppressed because one or more lines are too long
111
sql_utils.py
111
sql_utils.py
@@ -276,49 +276,96 @@ def split_union_selects(sql: str) -> list[str]:
|
|||||||
parts = re.split(r"\bUNION(?:\s+ALL)?\b", sql, flags=re.IGNORECASE)
|
parts = re.split(r"\bUNION(?:\s+ALL)?\b", sql, flags=re.IGNORECASE)
|
||||||
return [p.strip() for p in parts if p.strip()]
|
return [p.strip() for p in parts if p.strip()]
|
||||||
|
|
||||||
def extract_select_columns(select_sql: str) -> list[str]:
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
def extract_select_columns(select_sql: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Extract column names or column aliases from a single SELECT statement.
|
Extract raw column names from a simple SELECT statement:
|
||||||
|
- No SELECT *
|
||||||
|
- No functions (COUNT, LOWER, etc.)
|
||||||
|
- No expressions (a+b)
|
||||||
|
- No aliases (AS or implicit)
|
||||||
|
- Comma-separated columns only
|
||||||
|
|
||||||
Input:
|
Returns column names in order; strips any table prefix (e.g., u.email -> email).
|
||||||
select_sql (str): A SQL SELECT statement containing an explicit
|
|
||||||
projection list (no SELECT *), such as:
|
|
||||||
"SELECT col, col2 AS alias FROM table".
|
|
||||||
|
|
||||||
Output:
|
|
||||||
list[str]: A list of column names or aliases in the order they appear
|
|
||||||
in the SELECT clause.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
Input:
|
|
||||||
SELECT email, username AS user FROM users
|
|
||||||
|
|
||||||
Output:
|
|
||||||
["email", "user"]
|
|
||||||
"""
|
"""
|
||||||
m = re.search(
|
m = re.search(r"\bSELECT\s+(.*?)\s+\bFROM\b", select_sql, flags=re.IGNORECASE | re.DOTALL)
|
||||||
r"SELECT\s+(.*?)\s+FROM\s",
|
|
||||||
select_sql,
|
|
||||||
flags=re.IGNORECASE | re.DOTALL
|
|
||||||
)
|
|
||||||
if not m:
|
if not m:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
select_list = m.group(1)
|
select_list = m.group(1).strip()
|
||||||
|
if not select_list or select_list == "*":
|
||||||
|
return []
|
||||||
|
|
||||||
columns = []
|
cols: List[str] = []
|
||||||
for item in select_list.split(","):
|
for item in select_list.split(","):
|
||||||
item = item.strip()
|
item = item.strip()
|
||||||
|
|
||||||
# Handle aliases: col AS alias or col alias
|
# remove backticks/quotes around identifiers if present
|
||||||
alias_match = re.search(r"\bAS\s+(\w+)$", item, re.IGNORECASE)
|
item = item.strip("`").strip('"')
|
||||||
if alias_match:
|
|
||||||
columns.append(alias_match.group(1))
|
|
||||||
else:
|
|
||||||
# Take the final identifier
|
|
||||||
columns.append(item.split()[-1])
|
|
||||||
|
|
||||||
return columns
|
# strip table prefix if any (table.col -> col)
|
||||||
|
if "." in item:
|
||||||
|
item = item.split(".")[-1]
|
||||||
|
|
||||||
|
# basic validation: only simple identifiers
|
||||||
|
if re.fullmatch(r"[A-Za-z_]\w*", item):
|
||||||
|
cols.append(item)
|
||||||
|
else:
|
||||||
|
# For "simple SQL" this shouldn't happen; ignore or raise
|
||||||
|
# raise ValueError(f"Non-simple select item: {item}")
|
||||||
|
cols.append(item)
|
||||||
|
|
||||||
|
return cols
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# def extract_select_columns(select_sql: str) -> list[str]:
|
||||||
|
# """
|
||||||
|
# Extract column names or column aliases from a single SELECT statement.
|
||||||
|
|
||||||
|
# Input:
|
||||||
|
# select_sql (str): A SQL SELECT statement containing an explicit
|
||||||
|
# projection list (no SELECT *), such as:
|
||||||
|
# "SELECT col, col2 AS alias FROM table".
|
||||||
|
|
||||||
|
# Output:
|
||||||
|
# list[str]: A list of column names or aliases in the order they appear
|
||||||
|
# in the SELECT clause.
|
||||||
|
|
||||||
|
# Example:
|
||||||
|
# Input:
|
||||||
|
# SELECT email, username AS user FROM users
|
||||||
|
|
||||||
|
# Output:
|
||||||
|
# ["email", "user"]
|
||||||
|
# """
|
||||||
|
# m = re.search(
|
||||||
|
# r"SELECT\s+(.*?)\s+FROM\s",
|
||||||
|
# select_sql,
|
||||||
|
# flags=re.IGNORECASE | re.DOTALL
|
||||||
|
# )
|
||||||
|
# if not m:
|
||||||
|
# return []
|
||||||
|
|
||||||
|
# select_list = m.group(1)
|
||||||
|
|
||||||
|
# columns = []
|
||||||
|
# for item in select_list.split(","):
|
||||||
|
# item = item.strip()
|
||||||
|
|
||||||
|
# # Handle aliases: col AS alias or col alias
|
||||||
|
# alias_match = re.search(r"\bAS\s+(\w+)$", item, re.IGNORECASE)
|
||||||
|
# if alias_match:
|
||||||
|
# columns.append(alias_match.group(1))
|
||||||
|
# else:
|
||||||
|
# # Take the final identifier
|
||||||
|
# columns.append(item.split()[-1])
|
||||||
|
|
||||||
|
# return columns
|
||||||
|
|
||||||
|
|
||||||
def is_sqlite_file(p: Path) -> bool:
|
def is_sqlite_file(p: Path) -> bool:
|
||||||
|
|||||||
Reference in New Issue
Block a user