add automated process (folder level)

This commit is contained in:
Frank Xu
2026-01-19 21:03:32 -05:00
parent 41e720fb95
commit 82ecd08ea2

View File

@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"id": "a10c9a6a", "id": "a10c9a6a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -21,6 +21,7 @@
"from langchain_core.messages import HumanMessage\n", "from langchain_core.messages import HumanMessage\n",
"from sql_utils import *\n", "from sql_utils import *\n",
"from datetime import datetime, timezone\n", "from datetime import datetime, timezone\n",
"from pathlib import Path\n",
"\n", "\n",
"load_dotenv() # This looks for the .env file and loads it into os.environ\n", "load_dotenv() # This looks for the .env file and loads it into os.environ\n",
"\n", "\n",
@@ -593,6 +594,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Will process 1 databases (from db_files list).\n",
"\n", "\n",
"Processing: selectedDBs\\test2.db\n", "Processing: selectedDBs\\test2.db\n",
" Processing: EMAIL\n", " Processing: EMAIL\n",
@@ -1015,7 +1017,7 @@
"extraction_sql : None\n", "extraction_sql : None\n",
"rows_count : 20\n", "rows_count : 20\n",
"rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n",
"classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that are likely application-specific login usernames created by users for login purposes.'}\n", "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n",
"evidence_count : 0\n", "evidence_count : 0\n",
"evidence_sample : []\n", "evidence_sample : []\n",
"source_columns : []\n", "source_columns : []\n",
@@ -1048,7 +1050,7 @@
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"rows_count : 20\n", "rows_count : 20\n",
"rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n",
"classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that are likely application-specific login usernames created by users for login purposes.'}\n", "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n",
"evidence_count : 0\n", "evidence_count : 0\n",
"evidence_sample : []\n", "evidence_sample : []\n",
"source_columns : []\n", "source_columns : []\n",
@@ -1083,39 +1085,6 @@
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"rows_count : 20\n", "rows_count : 20\n",
"rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n",
"classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that are likely application-specific login usernames created by users for login purposes.'}\n",
"evidence_count : 0\n",
"evidence_sample : []\n",
"source_columns : ['users.username', 'users.email']\n",
"\n",
"--- END METADATA ---\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find application-specific login usernames created by users for login purposes in the database\n",
"1: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n",
"UNION ALL \n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"2: AI -> Retrieved 20 rows\n",
"3: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n",
"UNION ALL\n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"4: AI -> Retrieved 20 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 2\n",
"max_attempts : 2\n",
"phase : extraction\n",
"PII type : username\n",
"exploration_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n",
"UNION ALL \n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"extraction_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n",
"UNION ALL\n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"rows_count : 20\n",
"rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n",
"classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n", "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n",
"evidence_count : 0\n", "evidence_count : 0\n",
"evidence_sample : []\n", "evidence_sample : []\n",
@@ -1149,7 +1118,40 @@
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"rows_count : 20\n", "rows_count : 20\n",
"rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n",
"classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n", "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that appear to be application-specific login usernames created by users for login purposes.'}\n",
"evidence_count : 0\n",
"evidence_sample : []\n",
"source_columns : ['users.username', 'users.email']\n",
"\n",
"--- END METADATA ---\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find application-specific login usernames created by users for login purposes in the database\n",
"1: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n",
"UNION ALL \n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"2: AI -> Retrieved 20 rows\n",
"3: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n",
"UNION ALL\n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"4: AI -> Retrieved 20 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 2\n",
"max_attempts : 2\n",
"phase : extraction\n",
"PII type : username\n",
"exploration_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n",
"UNION ALL \n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"extraction_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n",
"UNION ALL\n",
"SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n",
"rows_count : 20\n",
"rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n",
"classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that appear to be application-specific login usernames created by users for login purposes.'}\n",
"evidence_count : 10\n", "evidence_count : 10\n",
"evidence_sample : ['ajohnson', 'bsmith', 'cdavis', 'dmiller', 'ewilson', 'fbrown', 'gtaylor', 'handerson', 'ithomas', 'jmoore']\n", "evidence_sample : ['ajohnson', 'bsmith', 'cdavis', 'dmiller', 'ewilson', 'fbrown', 'gtaylor', 'handerson', 'ithomas', 'jmoore']\n",
"source_columns : ['users.username', 'users.email']\n", "source_columns : ['users.username', 'users.email']\n",
@@ -1161,22 +1163,18 @@
"\n", "\n",
"--- MESSAGES ---\n", "--- MESSAGES ---\n",
"0: HUMAN -> Find loosely structured human name-like strings in the database\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n",
"1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"\n", "\n",
"--- BEGIN METADATA ---\n", "--- BEGIN METADATA ---\n",
"attempt : 2\n", "attempt : 2\n",
"max_attempts : 2\n", "max_attempts : 2\n",
"phase : exploration\n", "phase : exploration\n",
"PII type : person name\n", "PII type : person name\n",
"exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"extraction_sql : None\n", "extraction_sql : None\n",
"rows_count : 0\n", "rows_count : 0\n",
"rows_sample : []\n", "rows_sample : []\n",
@@ -1192,11 +1190,9 @@
"\n", "\n",
"--- MESSAGES ---\n", "--- MESSAGES ---\n",
"0: HUMAN -> Find loosely structured human name-like strings in the database\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n",
"1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"2: AI -> Retrieved 30 rows\n", "2: AI -> Retrieved 30 rows\n",
"\n", "\n",
"--- BEGIN METADATA ---\n", "--- BEGIN METADATA ---\n",
@@ -1204,11 +1200,9 @@
"max_attempts : 2\n", "max_attempts : 2\n",
"phase : exploration\n", "phase : exploration\n",
"PII type : person name\n", "PII type : person name\n",
"exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"extraction_sql : None\n", "extraction_sql : None\n",
"rows_count : 30\n", "rows_count : 30\n",
"rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n",
@@ -1223,11 +1217,9 @@
"\n", "\n",
"--- MESSAGES ---\n", "--- MESSAGES ---\n",
"0: HUMAN -> Find loosely structured human name-like strings in the database\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n",
"1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"2: AI -> Retrieved 30 rows\n", "2: AI -> Retrieved 30 rows\n",
"\n", "\n",
"--- BEGIN METADATA ---\n", "--- BEGIN METADATA ---\n",
@@ -1235,11 +1227,9 @@
"max_attempts : 2\n", "max_attempts : 2\n",
"phase : exploration\n", "phase : exploration\n",
"PII type : person name\n", "PII type : person name\n",
"exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"extraction_sql : None\n", "extraction_sql : None\n",
"rows_count : 30\n", "rows_count : 30\n",
"rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n",
@@ -1255,33 +1245,25 @@
"\n", "\n",
"--- MESSAGES ---\n", "--- MESSAGES ---\n",
"0: HUMAN -> Find loosely structured human name-like strings in the database\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n",
"1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"2: AI -> Retrieved 30 rows\n", "2: AI -> Retrieved 30 rows\n",
"3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"UNION ALL\n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"\n", "\n",
"--- BEGIN METADATA ---\n", "--- BEGIN METADATA ---\n",
"attempt : 2\n", "attempt : 2\n",
"max_attempts : 2\n", "max_attempts : 2\n",
"phase : extraction\n", "phase : extraction\n",
"PII type : person name\n", "PII type : person name\n",
"exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n", "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"rows_count : 30\n", "rows_count : 30\n",
"rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n",
"classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n", "classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n",
@@ -1297,17 +1279,13 @@
"\n", "\n",
"--- MESSAGES ---\n", "--- MESSAGES ---\n",
"0: HUMAN -> Find loosely structured human name-like strings in the database\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n",
"1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"2: AI -> Retrieved 30 rows\n", "2: AI -> Retrieved 30 rows\n",
"3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"UNION ALL\n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"4: AI -> Retrieved 30 rows\n", "4: AI -> Retrieved 30 rows\n",
"\n", "\n",
"--- BEGIN METADATA ---\n", "--- BEGIN METADATA ---\n",
@@ -1315,16 +1293,12 @@
"max_attempts : 2\n", "max_attempts : 2\n",
"phase : extraction\n", "phase : extraction\n",
"PII type : person name\n", "PII type : person name\n",
"exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"UNION ALL \n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL \n", "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n",
"extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"rows_count : 30\n", "rows_count : 30\n",
"rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n",
"classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n", "classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n",
@@ -1332,90 +1306,7 @@
"evidence_sample : []\n", "evidence_sample : []\n",
"source_columns : ['users.first_name', 'users.last_name', 'users.username']\n", "source_columns : ['users.first_name', 'users.last_name', 'users.username']\n",
"\n", "\n",
"--- END METADATA ---\n", "--- END METADATA ---\n"
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find loosely structured human name-like strings in the database\n",
"1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"2: AI -> Retrieved 30 rows\n",
"3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"4: AI -> Retrieved 30 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 2\n",
"max_attempts : 2\n",
"phase : extraction\n",
"PII type : person name\n",
"exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"rows_count : 30\n",
"rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n",
"classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n",
"evidence_count : 0\n",
"evidence_sample : []\n",
"source_columns : ['users.first_name', 'users.last_name', 'users.username']\n",
"\n",
"--- END METADATA ---\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find loosely structured human name-like strings in the database\n",
"1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"2: AI -> Retrieved 30 rows\n",
"3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"4: AI -> Retrieved 30 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 2\n",
"max_attempts : 2\n",
"phase : extraction\n",
"PII type : person name\n",
"exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n",
"UNION ALL \n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n",
"UNION ALL\n",
"SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n",
"rows_count : 30\n",
"rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n",
"classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n",
"evidence_count : 30\n",
"evidence_sample : ['Alice', 'Brian', 'Carol', 'David', 'Emma', 'Frank', 'Grace', 'Henry', 'Irene', 'Jack']\n",
"source_columns : ['users.first_name', 'users.last_name', 'users.username']\n",
"\n",
"--- END METADATA ---\n",
"Wrote: I:\\project2026\\llmagent\\batch_results\\evidence_20260120T014007Z.jsonl\n"
] ]
} }
], ],
@@ -1470,7 +1361,21 @@
"\n", "\n",
" return all_results\n", " return all_results\n",
"\n", "\n",
"def main():\n", "def main(): \n",
" DB_DIR = Path(r\"selectedDBs\") # folder that contains the dbs\n",
" OUT_DIR = Path(\"batch_results\")\n",
" OUT_DIR.mkdir(exist_ok=True)\n",
"\n",
" PII_TARGETS = [\"EMAIL\", \"PHONE\", \"USERNAME\", \"PERSON_NAME\"]\n",
"\n",
" # --- usage ---\n",
" DB_FILES_PY = Path(\"db_files.py\")\n",
" db_files = load_db_files_list(DB_FILES_PY)\n",
"\n",
" db_paths, missing, not_sqlite = build_db_paths(DB_DIR, db_files, is_sqlite_file)\n",
" print_db_path_report(db_paths, missing, not_sqlite)\n",
"\n",
" \n",
" all_results = run_batch(db_paths, PII_TARGETS, PII_CONFIG, app)\n", " all_results = run_batch(db_paths, PII_TARGETS, PII_CONFIG, app)\n",
" save_jsonl(all_results, OUT_DIR)\n", " save_jsonl(all_results, OUT_DIR)\n",
"\n", "\n",