mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
add empirical study raw results
This commit is contained in:
7426
PII_Discovery.ipynb
7426
PII_Discovery.ipynb
File diff suppressed because one or more lines are too long
4
batch_results/PII_A1_commerce_20260120T155424Z.jsonl
Normal file
4
batch_results/PII_A1_commerce_20260120T155424Z.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A1_commerce.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
4
batch_results/PII_A1_msgstore_20260120T155536Z.jsonl
Normal file
4
batch_results/PII_A1_msgstore_20260120T155536Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
4
batch_results/PII_A1_wa_20260120T154718Z.jsonl
Normal file
4
batch_results/PII_A1_wa_20260120T154718Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
4
batch_results/PII_A2_core_20260120T155636Z.jsonl
Normal file
4
batch_results/PII_A2_core_20260120T155636Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
4
batch_results/PII_A2_journal_20260120T155711Z.jsonl
Normal file
4
batch_results/PII_A2_journal_20260120T155711Z.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT key FROM journal_entry WHERE key REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT value_count FROM journal_entry WHERE value_count REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["9F3F465DC00D96696DDDFE0A946AAB99.khand_medium"], ["C730963C61386A34712C819CA25436C9.media"], ["70177660B739FDDF75DE848B97DC6A6E.edits"], ["03FD66A15523689AD035E1E2B1AD6DAE.chat_wallpaper_media"], ["D41F76126B39D1F7E7EC3D8FA4079D0F.discover_story_streaming_snap"], ["F05AD4876AFE7190FBF88E879238978A.discover_story_streaming_snap_ff"], ["67B685FF2948DC22416716E822D4F5A1.discover_story_streaming_snap_ff"], ["4EDFB389483E360F0CBA63F7A928FD39.discover_story_streaming_snap_ff"], ["629156B858FDF391C0639F2DE6933EEB.discover_story_streaming_snap_ff"], ["5A4F0362F54488AC2542C174F69C9A24.discover_story_streaming_snap_ff"]], "Total_raw_rows": 10, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT key FROM journal_entry WHERE key REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["/data/data/com.snapchat.android/files/file_manager/BLOOPS_STICKER"], ["/data/data/com.snapchat.android/files/file_manager/Bitmoji_Preview"], ["/data/data/com.snapchat.android/files/file_manager/LENS_ASSET_CONTENT_TYPE_NAME"], ["/data/data/com.snapchat.android/files/file_manager/Live_Mirror_Model"], ["/data/data/com.snapchat.android/files/file_manager/Login_Kit_Privacy"], ["/data/data/com.snapchat.android/files/file_manager/MUSIC_GENERIC_ASSET_TYPE"], ["/data/data/com.snapchat.android/files/file_manager/Maps_Kashmir"], ["/data/data/com.snapchat.android/files/file_manager/Maps_WorldEffects"], ["/data/data/com.snapchat.android/files/file_manager/Perception"], ["/data/data/com.snapchat.android/files/file_manager/PerceptionMl"], ["12DB3FD3B46FC8F9DD60F79CB359FBFE.khand_medium"], ["9F3F465DC00D96696DDDFE0A946AAB99.khand_medium"], ["2FC6ABAAFF969A947FAB4E52FE0971FC.thumbnail"], ["C730963C61386A34712C819CA25436C9.media"], ["70177660B739FDDF75DE848B97DC6A6E.edits"], ["03FD66A15523689AD035E1E2B1AD6DAE.chat_wallpaper_media"], ["D41F76126B39D1F7E7EC3D8FA4079D0F.discover_story_streaming_snap"], ["5BBE52CE6D0010CB50CA3221C4741E7D.discover_story_streaming_snap_ff"], ["F05AD4876AFE7190FBF88E879238978A.discover_story_streaming_snap_ff"], ["67B685FF2948DC22416716E822D4F5A1.discover_story_streaming_snap_ff"]], "Total_raw_rows": 20, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT key FROM journal_entry WHERE key REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A2_journal.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["/data/data/com.snapchat.android/files/file_manager/BLOOPS_STICKER"], ["/data/data/com.snapchat.android/files/file_manager/Bitmoji_Preview"], ["/data/data/com.snapchat.android/files/file_manager/LENS_ASSET_CONTENT_TYPE_NAME"], ["/data/data/com.snapchat.android/files/file_manager/Live_Mirror_Model"], ["/data/data/com.snapchat.android/files/file_manager/Login_Kit_Privacy"], ["/data/data/com.snapchat.android/files/file_manager/MUSIC_GENERIC_ASSET_TYPE"], ["/data/data/com.snapchat.android/files/file_manager/Maps_Kashmir"], ["/data/data/com.snapchat.android/files/file_manager/Maps_WorldEffects"], ["/data/data/com.snapchat.android/files/file_manager/Perception"], ["/data/data/com.snapchat.android/files/file_manager/PerceptionMl"], ["12DB3FD3B46FC8F9DD60F79CB359FBFE.khand_medium"], ["9F3F465DC00D96696DDDFE0A946AAB99.khand_medium"], ["2FC6ABAAFF969A947FAB4E52FE0971FC.thumbnail"], ["C730963C61386A34712C819CA25436C9.media"], ["70177660B739FDDF75DE848B97DC6A6E.edits"], ["03FD66A15523689AD035E1E2B1AD6DAE.chat_wallpaper_media"], ["D41F76126B39D1F7E7EC3D8FA4079D0F.discover_story_streaming_snap"], ["5BBE52CE6D0010CB50CA3221C4741E7D.discover_story_streaming_snap_ff"], ["F05AD4876AFE7190FBF88E879238978A.discover_story_streaming_snap_ff"], ["67B685FF2948DC22416716E822D4F5A1.discover_story_streaming_snap_ff"]], "Total_raw_rows": 20, "Exploration_sql": "SELECT path FROM journal WHERE path REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT key FROM journal_entry WHERE key REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT value FROM journal_entry WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
4
batch_results/PII_A2_main_20260120T160648Z.jsonl
Normal file
4
batch_results/PII_A2_main_20260120T160648Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT CAST(pbytes AS TEXT) FROM params WHERE CAST(pbytes AS TEXT) REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT pbytes FROM params WHERE pbytes REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT id FROM params WHERE id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account1cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT id FROM params WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT pbytes FROM params WHERE pbytes REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT id FROM params WHERE id REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT id FROM params WHERE id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account2cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT id FROM params WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT pbytes FROM params WHERE pbytes REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT pbytes FROM params WHERE pbytes REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT id FROM params WHERE id REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A3_account3cache4.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["ALL_CHATS"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT title FROM dialog_filter WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT id FROM params WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
4
batch_results/PII_A4_gmm_myplaces_20260120T160842Z.jsonl
Normal file
4
batch_results/PII_A4_gmm_myplaces_20260120T160842Z.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT key_string FROM sync_item WHERE key_string REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT last_sync_time FROM sync_corpus WHERE last_sync_time REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT key_string FROM sync_item WHERE key_string REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT corpus FROM sync_corpus WHERE corpus REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"]], "Total_raw_rows": 1, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT key_string FROM sync_item WHERE key_string REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A4_gmm_myplaces.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT key_string FROM sync_item WHERE key_string REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT last_sync_time FROM sync_corpus WHERE last_sync_time REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
4
batch_results/PII_A4_gmm_storage_20260120T160915Z.jsonl
Normal file
4
batch_results/PII_A4_gmm_storage_20260120T160915Z.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT _key_pri FROM gmm_storage_table WHERE _key_pri REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT _key_sec FROM gmm_storage_table WHERE _key_sec REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT locale FROM android_metadata WHERE locale REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT _key_pri FROM gmm_storage_table WHERE _key_pri REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT _key_sec FROM gmm_storage_table WHERE _key_sec REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT locale FROM android_metadata WHERE locale REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"], ["CompletedNavigation"], ["GeofenceData"], ["WaypointsHaveChangedInNav"], ["bundled"], ["STORED_GEOFENCE_INDEX_STORAGE_ID"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT _key_pri FROM gmm_storage_table WHERE _key_pri REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' UNION ALL SELECT _key_sec FROM gmm_storage_table WHERE _key_sec REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A4_gmm_storage.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["CompletedNavigation"], ["GeofenceData"], ["WaypointsHaveChangedInNav"], ["bundled"], ["STORED_GEOFENCE_INDEX_STORAGE_ID"], ["en_US"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT _key_pri FROM gmm_storage_table WHERE _key_pri REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT _key_sec FROM gmm_storage_table WHERE _key_sec REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \nSELECT locale FROM android_metadata WHERE locale REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "EMAIL", "PII": ["heather@cellebrite.com", "hmahalik@gmail.com"], "Num_of_PII": 2, "source_columns": ["Tokens.value", "Tokens_content.c1value", "Tokens_stat.value", "CacheInfo.affinity_response_context"], "Raw_rows_first_100": [["heather@cellebrite.com"], ["hmahalik@gmail.com"], ["heather@cellebrite.com"], ["hmahalik@gmail.com"]], "Total_raw_rows": 4, "Exploration_sql": "SELECT value FROM Tokens WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT value FROM Tokens_stat WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": "SELECT value FROM Tokens WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT value FROM Tokens_stat WHERE value REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'"}
|
||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "PHONE", "PII": ["17423794330", "7423794330", "3794330"], "Num_of_PII": 3, "source_columns": ["Tokens.value", "Tokens_content.c1value", "Tokens_stat.value", "CacheInfo.affinity_response_context"], "Raw_rows_first_100": [["17423794330"], ["7423794330"], ["3794330"], ["17423794330"], ["7423794330"], ["3794330"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT value FROM Tokens WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT value FROM Tokens_stat WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": "SELECT value FROM Tokens WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'\nUNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'\nUNION ALL\nSELECT value FROM Tokens_stat WHERE value REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'\nUNION ALL\nSELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';"}
|
||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "USERNAME", "PII": ["heather@cellebrite.com", "heather", "hmahalik@gmail.com", "hmahalik"], "Num_of_PII": 4, "source_columns": ["CacheInfo.affinity_response_context", "Tokens.value", "Tokens_content.c1value", "Tokens_stat.value"], "Raw_rows_first_100": [["heather@cellebrite.com"], ["heather"], ["hmahalik@gmail.com"], ["hmahalik"], ["heather@cellebrite.com"], ["heather"], ["hmahalik@gmail.com"], ["hmahalik"]], "Total_raw_rows": 8, "Exploration_sql": "SELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT value FROM Tokens WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT value FROM Tokens_stat WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT identity_hash FROM android_metadata WHERE identity_hash REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": "SELECT affinity_response_context FROM CacheInfo WHERE affinity_response_context REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT value FROM Tokens WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT value FROM Tokens_stat WHERE value REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT identity_hash FROM android_metadata WHERE identity_hash REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';"}
|
||||
{"db_path": "selectedDBs\\A4_peopleCache_sharononeil368@gmail.com_com.google_14.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["Tokens.value", "Tokens_content.c1value", "Contacts.id", "CacheInfo.num_contacts"], "Raw_rows_first_100": [["heather@cellebrite.com"], ["heather"], ["hmahalik@gmail.com"], ["hmahalik"], ["Bo"], ["heather@cellebrite.com"], ["heather"], ["hmahalik@gmail.com"], ["hmahalik"], ["Bo"]], "Total_raw_rows": 10, "Exploration_sql": "SELECT value FROM Tokens WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT id FROM Contacts WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT num_contacts FROM CacheInfo WHERE num_contacts REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": "SELECT value FROM Tokens WHERE value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT c1value FROM Tokens_content WHERE c1value REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT id FROM Contacts WHERE id REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT num_contacts FROM CacheInfo WHERE num_contacts REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';"}
|
||||
2
batch_results/PII_A5_SBrowser2_20260120T161207Z.jsonl
Normal file
2
batch_results/PII_A5_SBrowser2_20260120T161207Z.jsonl
Normal file
@@ -0,0 +1,2 @@
|
||||
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT URL FROM BOOKMARKS WHERE URL REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT TAGS FROM BOOKMARKS WHERE TAGS REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT ACCOUNT_TYPE FROM BOOKMARKS WHERE ACCOUNT_TYPE REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' \nUNION ALL \nSELECT description FROM BOOKMARKS WHERE description REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A5_SBrowser2.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT URL FROM BOOKMARKS WHERE URL REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT TAGS FROM BOOKMARKS WHERE TAGS REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT SOURCEID FROM BOOKMARKS WHERE SOURCEID REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT ACCOUNT_TYPE FROM BOOKMARKS WHERE ACCOUNT_TYPE REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT DEVICE_ID FROM BOOKMARKS WHERE DEVICE_ID REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT DEVICE_NAME FROM BOOKMARKS WHERE DEVICE_NAME REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL SELECT description FROM BOOKMARKS WHERE description REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": null}
|
||||
4
batch_results/PII_A5_SBrowser_20260120T161115Z.jsonl
Normal file
4
batch_results/PII_A5_SBrowser_20260120T161115Z.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "EMAIL", "PII": ["sharononeil368@gmail.com"], "Num_of_PII": 1, "source_columns": ["BOOKMARKS.ACCOUNT_NAME", "TABS.ACCOUNT_NAME", "SYNC_STATE.account_name", "INTERNET_SYNC.SYNC_KEY"], "Raw_rows_first_100": [["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'"}
|
||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["https://www.samsung.com/mobile/?cid=global_ow_app_s-internet_none_none_bookmark_bookmark_202008_none"], ["SBROWSER_TAB1724467631361__BROWSER1724467632107__SBROWSER_SAVEDPAGES1724467635256__QUICKACCESS_SYNC_V21724467633471__SBROWSER_HISTORY1724467634746__"], ["https://www.pinterest.com/pin/410812797236816112/"]], "Total_raw_rows": 3, "Exploration_sql": "SELECT URL FROM BOOKMARKS WHERE URL REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT data FROM SYNC_STATE WHERE data REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' UNION ALL \nSELECT TAB_URL FROM TABS WHERE TAB_URL REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "USERNAME", "PII": ["sharononeil368@gmail.com"], "Num_of_PII": 1, "source_columns": ["BOOKMARKS.ACCOUNT_NAME", "TABS.ACCOUNT_NAME", "SYNC_STATE.account_name"], "Raw_rows_first_100": [["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"], ["sharononeil368@gmail.com"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": "SELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT ACCOUNT_NAME FROM TABS WHERE ACCOUNT_NAME REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT account_name FROM SYNC_STATE WHERE account_name REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';"}
|
||||
{"db_path": "selectedDBs\\A5_SBrowser.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": ["BOOKMARKS.TITLE", "BOOKMARKS.ACCOUNT_NAME", "BOOKMARKS.ACCOUNT_TYPE", "SYNC_STATE.data", "TABS.TAB_TITLE", "TABS.TAB_GROUP_NAME", "INTERNET_SYNC.SYNC_KEY"], "Raw_rows_first_100": [["Bookmarks"], ["Samsung account"], ["how to meditate - Google Search"], ["Google"], ["Galaxy Shop"], ["User guide"], ["sharononeil368@gmail.com"], ["com.osp.app.signin"], ["SBROWSER_TAB1724467631361__BROWSER1724467632107__SBROWSER_SAVEDPAGES1724467635256__QUICKACCESS_SYNC_V21724467633471__SBROWSER_HISTORY1724467634746__"], ["Pin on Simon, God of Hairdos"], ["puck from.glee - Google Search"], ["Midjourney AI - Free Image Generator"], ["billie eilish birds of a feather lyrics - Google Search"], ["sync_internet_data"], ["sync_bookmarks"], ["sync_open_pages"], ["sync_saved_pages"]], "Total_raw_rows": 17, "Exploration_sql": "SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ACCOUNT_TYPE FROM BOOKMARKS WHERE ACCOUNT_TYPE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT data FROM SYNC_STATE WHERE data REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT TAB_GROUP_NAME FROM TABS WHERE TAB_GROUP_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": "SELECT TITLE FROM BOOKMARKS WHERE TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT ACCOUNT_NAME FROM BOOKMARKS WHERE ACCOUNT_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT ACCOUNT_TYPE FROM BOOKMARKS WHERE ACCOUNT_TYPE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT data FROM SYNC_STATE WHERE data 
REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT TAB_TITLE FROM TABS WHERE TAB_TITLE REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT TAB_GROUP_NAME FROM TABS WHERE TAB_GROUP_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\nUNION ALL\nSELECT SYNC_KEY FROM INTERNET_SYNC WHERE SYNC_KEY REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';"}
|
||||
4
batch_results/PII_A5_searchengine_20260120T163422Z.jsonl
Normal file
4
batch_results/PII_A5_searchengine_20260120T163422Z.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "EMAIL", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [], "Total_raw_rows": 0, "Exploration_sql": "SELECT url FROM searchengine WHERE url REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT extra1 FROM searchengine WHERE extra1 REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT extra2 FROM searchengine WHERE extra2 REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL SELECT extra3 FROM searchengine WHERE extra3 REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}';", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["https://home.imgsmail.ru/resplash/123689/i/meta/favicon.ico"], ["https://search.seznam.cz/re/media/favicon.192a42730e.ico"]], "Total_raw_rows": 2, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT title FROM searchengine WHERE title REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT url FROM searchengine WHERE url REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT extra1 FROM searchengine WHERE extra1 REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT extra2 FROM searchengine WHERE extra2 REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT extra3 FROM searchengine WHERE extra3 REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}'", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "USERNAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"], ["google"], ["DuckDuckGo"], ["yahoo"], ["youtube"], ["bing"], ["so360"], ["qwant"], ["toutiao"], ["StartPage"], ["shenma"], ["https://duckduckgo.com/favicon.ico"], ["https://home.imgsmail.ru/resplash/123689/i/meta/favicon.ico"], ["https://m.toutiao.com/favicon.ico"], ["https://p0.ssl.qhimg.com/d/inn/128c749e/icon.png"], ["https://search.daum.net/favicon.ico"], ["https://search.naver.com/favicon.ico"], ["https://search.seznam.cz/re/media/favicon.192a42730e.ico"], ["https://search.yahoo.com/favicon.ico"], ["https://sm01.alicdn.com/L1/272/1990/favicon/favicon.ico"], ["https://www.baidu.com/favicon.ico"]], "Total_raw_rows": 21, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT title FROM searchengine WHERE title REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT url FROM searchengine WHERE url REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT image_url FROM searchengine WHERE image_url REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT extra1 FROM searchengine WHERE extra1 REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT extra2 FROM searchengine WHERE extra2 REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT extra3 FROM searchengine WHERE extra3 REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\A5_searchengine.db", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["en_US"], ["google"], ["DuckDuckGo"], ["yahoo"], ["youtube"], ["bing"], ["so360"], ["qwant"], ["toutiao"], ["StartPage"], ["shenma"], ["https://duckduckgo.com/favicon.ico"], ["https://home.imgsmail.ru/resplash/123689/i/meta/favicon.ico"], ["https://m.toutiao.com/favicon.ico"], ["https://p0.ssl.qhimg.com/d/inn/128c749e/icon.png"], ["https://search.daum.net/favicon.ico"], ["https://search.naver.com/favicon.ico"], ["https://search.seznam.cz/re/media/favicon.192a42730e.ico"], ["https://search.yahoo.com/favicon.ico"], ["https://sm01.alicdn.com/L1/272/1990/favicon/favicon.ico"], ["https://www.baidu.com/favicon.ico"]], "Total_raw_rows": 21, "Exploration_sql": "SELECT locale FROM android_metadata WHERE locale REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT title FROM searchengine WHERE title REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT url FROM searchengine WHERE url REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT extra1 FROM searchengine WHERE extra1 REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT extra2 FROM searchengine WHERE extra2 REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL SELECT extra3 FROM searchengine WHERE extra3 REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'", "Extraction_sql": null}
|
||||
4
batch_results/PII_I1_CallHistory_20260120T163520Z.jsonl
Normal file
4
batch_results/PII_I1_CallHistory_20260120T163520Z.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "EMAIL", "PII": ["19735203731@s.whatsapp.net", "923402582955@s.whatsapp.net", "14847353029@s.whatsapp.net", "19199037779@s.whatsapp.net"], "Num_of_PII": 4, "source_columns": ["ZWAAGGREGATECALLEVENT.ZLINKTOKEN", "ZWACDCALLEVENT.ZCALLIDSTRING", "ZWACDCALLEVENT.ZGROUPCALLCREATORUSERJIDSTRING", "ZWACDCALLEVENT.ZGROUPJIDSTRING", "ZWACDCALLEVENTPARTICIPANT.ZJIDSTRING"], "Raw_rows_first_100": [["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"]], "Total_raw_rows": 9, "Exploration_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'", "Extraction_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP 
'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}' UNION ALL\nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'"}
|
||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "PHONE", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["372FA57E129467051E04B3E4DD5A26D5"], ["3A9CEF8B4996D645358B"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"]], "Total_raw_rows": 6, "Exploration_sql": "SELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}' \nUNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}';", "Extraction_sql": null}
|
||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "USERNAME", "PII": ["19735203731@s.whatsapp.net", "923402582955@s.whatsapp.net", "14847353029@s.whatsapp.net", "19199037779@s.whatsapp.net"], "Num_of_PII": 4, "source_columns": ["ZWAAGGREGATECALLEVENT.ZLINKTOKEN", "ZWACDCALLEVENT.ZCALLIDSTRING", "ZWACDCALLEVENT.ZGROUPCALLCREATORUSERJIDSTRING", "ZWACDCALLEVENT.ZGROUPJIDSTRING", "ZWACDCALLEVENTPARTICIPANT.ZJIDSTRING"], "Raw_rows_first_100": [["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"]], "Total_raw_rows": 9, "Exploration_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \nUNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';", "Extraction_sql": "SELECT ZLINKTOKEN FROM ZWAAGGREGATECALLEVENT WHERE ZLINKTOKEN REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\nUNION ALL\nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE 
ZJIDSTRING REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';"}
|
||||
{"db_path": "selectedDBs\\I1_CallHistory.sqlite", "PII_type": "PERSON_NAME", "PII": [], "Num_of_PII": 0, "source_columns": [], "Raw_rows_first_100": [["3C399CDDAF11A41F7AFF2892E0A4B10C"], ["3C37CBFE11C261E6CD80C2DE7834D770"], ["372FA57E129467051E04B3E4DD5A26D5"], ["3A6DF670F7121CD6D08B"], ["3A9CEF8B4996D645358B"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["19735203731@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["14847353029@s.whatsapp.net"], ["19199037779@s.whatsapp.net"], ["923402582955@s.whatsapp.net"], ["WAAggregateCallEvent"], ["WACDCallEvent"], ["WACDCallEventParticipant"], ["WAJoinableCallEvent"], ["WAJoinableCallEventParticipant"], ["WAUpcomingCallEvent"]], "Total_raw_rows": 20, "Exploration_sql": "SELECT ZCALLIDSTRING FROM ZWACDCALLEVENT WHERE ZCALLIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZGROUPCALLCREATORUSERJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPCALLCREATORUSERJIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZGROUPJIDSTRING FROM ZWACDCALLEVENT WHERE ZGROUPJIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT ZJIDSTRING FROM ZWACDCALLEVENTPARTICIPANT WHERE ZJIDSTRING REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \nUNION ALL \nSELECT Z_NAME FROM Z_PRIMARYKEY WHERE Z_NAME REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';", "Extraction_sql": null}
|
||||
4
batch_results/PII_I1_ChatStorage_20260120T164439Z.jsonl
Normal file
4
batch_results/PII_I1_ChatStorage_20260120T164439Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
4
batch_results/PII_I1_ContactsV2_20260120T165625Z.jsonl
Normal file
4
batch_results/PII_I1_ContactsV2_20260120T165625Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
4
batch_results/PII_I2_AddressBook_20260120T171307Z.jsonl
Normal file
4
batch_results/PII_I2_AddressBook_20260120T171307Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
4
batch_results/PII_I3_sms_20260120T171457Z.jsonl
Normal file
4
batch_results/PII_I3_sms_20260120T171457Z.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -3,7 +3,7 @@ db_files = [
|
||||
# "users.db",
|
||||
# "A1_commerce.db",
|
||||
# "A1_msgstore.db",
|
||||
"A1_wa.db",
|
||||
# "A1_wa.db",
|
||||
# "A2_core.db",
|
||||
# "A2_journal.db",
|
||||
# "A2_main.db",
|
||||
@@ -15,17 +15,17 @@ db_files = [
|
||||
# "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
|
||||
# "A5_SBrowser.db",
|
||||
# "A5_SBrowser2.db",
|
||||
# "A5_searchengine.db",
|
||||
# "I1_CallHistory.sqlite",
|
||||
# "I1_ChatStorage.sqlite",
|
||||
# "I1_ContactsV2.sqlite",
|
||||
# "I2_AddressBook.sqlitedb",
|
||||
# "I2_AddressBookImages.sqlitedb",
|
||||
# "I3_sms.db",
|
||||
# "I4_CloudTabs.db",
|
||||
# "I4_History.db",
|
||||
# "I5_Calendar.sqlitedb",
|
||||
# "I5_Extras.db",
|
||||
"A5_searchengine.db",
|
||||
"I1_CallHistory.sqlite",
|
||||
"I1_ChatStorage.sqlite",
|
||||
"I1_ContactsV2.sqlite",
|
||||
"I2_AddressBook.sqlitedb",
|
||||
"I2_AddressBookImages.sqlitedb",
|
||||
"I3_sms.db",
|
||||
"I4_CloudTabs.db",
|
||||
"I4_History.db",
|
||||
"I5_Calendar.sqlitedb",
|
||||
"I5_Extras.db",
|
||||
]
|
||||
|
||||
PII_CONFIG = {
|
||||
|
||||
114
sql_utils.py
114
sql_utils.py
@@ -31,6 +31,27 @@ def extract_single_table(select_sql: str) -> str | None:
|
||||
tables = sorted(set(m.values()))
|
||||
return tables[0] if len(tables) == 1 else None
|
||||
|
||||
|
||||
|
||||
|
||||
def _bytes_to_display(b: bytes, max_len: int) -> str:
|
||||
# Try UTF-8 first (common for text stored as BLOB)
|
||||
_PRINTABLE_RE = re.compile(r"^[\x09\x0a\x0d\x20-\x7e]+$") # tabs/newlines/spaces + printable ASCII
|
||||
try:
|
||||
s = b.decode("utf-8", errors="replace")
|
||||
s = s.strip()
|
||||
# If it is mostly printable, keep it
|
||||
if s and _PRINTABLE_RE.match(s[:min(len(s), 200)]):
|
||||
return s[:max_len] + ("..." if len(s) > max_len else "")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Otherwise show hex preview (compact, honest)
|
||||
hx = b.hex()
|
||||
if len(hx) > max_len:
|
||||
return hx[:max_len] + "..."
|
||||
return hx
|
||||
|
||||
def rows_to_text(rows, limit=None, max_chars=500000, cell_max=1000):
|
||||
"""
|
||||
Converts SQL rows to text with safety limits for LLM context.
|
||||
@@ -38,35 +59,47 @@ def rows_to_text(rows, limit=None, max_chars=500000, cell_max=1000):
|
||||
- max_chars: Hard limit for the total string length.
|
||||
- cell_max: Max length for any single column value.
|
||||
"""
|
||||
|
||||
if not rows:
|
||||
return ""
|
||||
|
||||
|
||||
out = []
|
||||
# 1. Row-level limiting
|
||||
target_rows = rows[:limit] if limit else rows
|
||||
|
||||
|
||||
for r in target_rows:
|
||||
# print(f"Test [ROW DATA] {r}")
|
||||
if r is None:
|
||||
continue
|
||||
s = str(r).strip() # trim whitespace first
|
||||
if len(s) == 0:
|
||||
continue
|
||||
if len(s) > cell_max:
|
||||
s = s[:cell_max] + "..."
|
||||
out.append(s)
|
||||
|
||||
|
||||
# Handle tuples/rows cell-by-cell so bytes do not become "b'...'"
|
||||
if isinstance(r, (tuple, list)):
|
||||
cells = []
|
||||
for v in r:
|
||||
if isinstance(v, bytes):
|
||||
cells.append(_bytes_to_display(v, cell_max))
|
||||
else:
|
||||
sv = "" if v is None else str(v).strip()
|
||||
if len(sv) > cell_max:
|
||||
sv = sv[:cell_max] + "..."
|
||||
cells.append(sv)
|
||||
s = "(" + ", ".join(cells) + ")"
|
||||
else:
|
||||
# Non-tuple row
|
||||
if isinstance(r, bytes):
|
||||
s = _bytes_to_display(r, cell_max)
|
||||
else:
|
||||
s = str(r).strip()
|
||||
if len(s) > cell_max:
|
||||
s = s[:cell_max] + "..."
|
||||
|
||||
if s:
|
||||
out.append(s)
|
||||
|
||||
final_text = "\n".join(out)
|
||||
|
||||
# 2. Final global character limit safety check
|
||||
|
||||
if len(final_text) > max_chars:
|
||||
return final_text[:max_chars] + "\n... [DATA TRUNCATED] ..."
|
||||
|
||||
# print(f"[ROWS TO TEXT] Input: {len(rows)} rows | Output: {len(final_text)} chars")
|
||||
# Optional: print only the first 200 characters of the text to keep logs clean
|
||||
# print(f"[PREVIEW]: {final_text[:200]}...")
|
||||
return final_text
|
||||
|
||||
return final_text
|
||||
|
||||
def regexp(expr, item):
|
||||
"""
|
||||
@@ -153,6 +186,18 @@ def normalize_sql(sql: str) -> str:
|
||||
|
||||
return sql
|
||||
|
||||
def upgrade_sql_remove_limit(sql: str) -> str:
    """
    Return ``sql`` with every numeric LIMIT clause removed.

    Handles ``LIMIT n``, ``LIMIT n OFFSET m`` and ``LIMIT m, n`` anywhere in
    the statement (including each branch of a UNION query), case-insensitively.
    The OFFSET / comma part is removed together with the LIMIT so that no
    dangling ``OFFSET m`` or ``, n`` fragment is left behind.

    After removal, whitespace is tidied: trailing whitespace before a newline
    is dropped, whitespace-only lines are removed, runs of two or more
    whitespace characters collapse to a single space, and the result is
    stripped.

    NOTE: this is a textual rewrite, not a SQL parser. A ``LIMIT <number>``
    sequence inside a string literal is removed too, and whitespace runs
    inside string literals are collapsed as well.

    Args:
        sql: The SQL statement to rewrite.

    Returns:
        The statement without LIMIT clauses.
    """
    # Remove LIMIT clauses robustly (including UNION queries), together with
    # an optional ", n" or "OFFSET m" tail belonging to the same clause.
    upgraded = re.sub(
        r"\bLIMIT\s+\d+(?:\s*,\s*\d+|\s+OFFSET\s+\d+)?\b",
        "",
        sql,
        flags=re.IGNORECASE,
    )

    # Clean up extra whitespace
    upgraded = re.sub(r"\s+\n", "\n", upgraded)
    upgraded = re.sub(r"\n\s+\n", "\n", upgraded)
    upgraded = re.sub(r"\s{2,}", " ", upgraded).strip()
    return upgraded
|
||||
|
||||
|
||||
def safe_json_loads(text: str, default):
|
||||
"""
|
||||
Safely parse JSON from LLM-generated text.
|
||||
@@ -319,17 +364,26 @@ def print_db_path_report(db_paths: List[Path], missing: List[str], not_sqlite: L
|
||||
for x in not_sqlite:
|
||||
print(" -", x)
|
||||
|
||||
def save_jsonl(results, out_dir: Path, db_path: str) -> Path:
    """
    Save one JSONL file per database.

    Filename includes database stem + UTC timestamp
    (``PII_<db stem>_<YYYYmmddTHHMMSSZ>.jsonl``).
    Converts bytes/BLOBs to JSON-safe base64 via ``json_safe``.

    Args:
        results: Iterable of result records; each one is written as one
            JSON line.
        out_dir: Directory to write into. It is created, including missing
            parent directories, when it does not exist. A plain string path
            is accepted as well.
        db_path: Path of the database the results belong to; only its stem
            is used for the file name.

    Returns:
        Path of the JSONL file that was written.
    """
    # Accept str as well as Path, and create nested output directories too.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    db_stem = Path(db_path).stem
    out_path = out_dir / f"PII_{db_stem}_{ts}.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        for r in results:
            # json_safe replaces bytes values, which json.dumps cannot encode.
            f.write(json.dumps(json_safe(r), ensure_ascii=False) + "\n")

    print(f"Wrote: {out_path.resolve()}")
    return out_path
|
||||
|
||||
|
||||
def load_config_yaml(path: Path) -> dict:
    """Read the file at ``path`` as UTF-8 text and parse it with ``yaml.safe_load``."""
    raw_text = path.read_text(encoding="utf-8")
    return yaml.safe_load(raw_text)
|
||||
|
||||
@@ -347,3 +401,19 @@ def load_vars_from_py(py_path: Path, *var_names: str):
|
||||
out[name] = getattr(mod, name)
|
||||
return out
|
||||
|
||||
import base64
|
||||
|
||||
# sanitize each result dict before writing JSONL
|
||||
def json_safe(obj):
    """
    Recursively replace bytes values in ``obj`` with a JSON-friendly form.

    - bytes become ``{"__bytes_b64__": <base64 text>}``, which is compact
      and reversible
    - tuples and lists become lists whose elements are converted in turn
    - dicts keep their keys; their values are converted in turn
    - every other value is returned unchanged
    """
    if isinstance(obj, dict):
        return {key: json_safe(value) for key, value in obj.items()}

    if isinstance(obj, (list, tuple)):
        return [json_safe(item) for item in obj]

    if isinstance(obj, bytes):
        encoded = base64.b64encode(obj)
        return {"__bytes_b64__": encoded.decode("ascii")}

    return obj
|
||||
|
||||
Reference in New Issue
Block a user