mirror of https://github.com/frankwxu/mobile-pii-discovery-agent.git (synced 2026-02-20 13:40:41 +00:00)
Initial commit: PII Discovery Agent for Mobile Databases
700
agent_evidence_discovery.ipynb
Normal file
@@ -0,0 +1,700 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0be1ee8e",
   "metadata": {},
   "source": [
    "using the bnl environment\n",
    "https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
    "\n",
    "pip uninstall -y langchain langchain-core langchain-openai langgraph langgraph-prebuilt langgraph-checkpoint langgraph-sdk langsmith langchain-community langchain-google-genai langchain-text-splitters\n",
    "\n",
    "pip install langchain==1.2.0 langchain-core==1.2.2 langchain-openai==1.1.4 langgraph==1.0.5 langgraph-prebuilt==1.0.5 langgraph-checkpoint==3.0.1 langgraph-sdk==0.3.0 langsmith==0.5.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2648a1f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# only for finding models\n",
    "# import google.generativeai as genai\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a10c9a6a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OK\n"
     ]
    }
   ],
   "source": [
    "# https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
    "\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain_core.messages import HumanMessage\n",
    "from sql_utils import *\n",
    "\n",
    "\n",
    "load_dotenv()  # This looks for the .env file and loads it into os.environ\n",
    "\n",
    "llm = ChatOpenAI(\n",
    "    model=\"gpt-4o-mini\",  # recommended for tools + cost\n",
    "    api_key=os.environ[\"API_KEY\"],\n",
    "    temperature=0\n",
    ")\n",
    "\n",
    "response = llm.invoke([\n",
    "    HumanMessage(content=\"Reply with exactly: OK\")\n",
    "])\n",
    "\n",
    "print(response.content)\n",
    "\n",
    "# DB_PATH = r\"msgstore.db\"\n",
    "DB_PATH = r\"users4.db\"\n",
    "# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.whatsapp\\databases\\stickers.db\"\n",
    "# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.android.vending\\databases\\localappstate.db\"\n",
    "\n",
    "ENTITY_CONFIG = {\n",
    "    \"EMAIL\": {\n",
    "        \"regex\": r\"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\",\n",
    "        \"desc\": \"valid electronic mail formats used for account registration or contact\"\n",
    "    },\n",
    "    \"PHONE\": {\n",
    "        \"regex\": r\"\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}\",\n",
    "        \"desc\": \"international or local telephone numbers\"\n",
    "    },\n",
    "    \"USERNAME\": {\n",
    "        \"regex\": r\"\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b\",\n",
    "        \"desc\": \"application-specific usernames, handles, or account identifiers\"\n",
    "    },\n",
    "    \"PERSON_NAME\": {\n",
    "        \"regex\": r\"[A-Za-z\\u4e00-\\u9fff][A-Za-z\\u4e00-\\u9fff\\s\\.\\-]{1,50}\",\n",
    "        \"desc\": (\n",
    "            \"loosely structured human name-like strings used only for discovery \"\n",
    "            \"and column pre-filtering; final identification is performed during extraction\"\n",
    "        )\n",
    "    }\n",
    "}\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "48eda3ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Core Python\n",
    "import sqlite3\n",
    "import re\n",
    "import json\n",
    "from typing import TypedDict, Optional, List, Annotated\n",
    "from langgraph.graph.message import add_messages\n",
    "\n",
    "# LangChain / LangGraph\n",
    "from langchain_core.tools import tool\n",
    "from langchain_core.messages import (\n",
    "    HumanMessage,\n",
    "    AIMessage,\n",
    "    SystemMessage\n",
    ")\n",
    "from langchain.agents import create_agent\n",
    "from langgraph.graph import StateGraph, END\n",
    "from langgraph.graph.message import MessagesState\n",
    "\n",
    "\n",
    "@tool\n",
    "def list_tables() -> str:\n",
    "    \"\"\"\n",
    "    List all table names in the SQLite database.\n",
    "    \"\"\"\n",
    "    conn = sqlite3.connect(DB_PATH)\n",
    "    cur = conn.cursor()\n",
    "    cur.execute(\"SELECT name FROM sqlite_master WHERE type='table'\")\n",
    "    tables = [r[0] for r in cur.fetchall()]\n",
    "    conn.close()\n",
    "    return \", \".join(tables)\n",
    "\n",
    "\n",
    "@tool\n",
    "def get_schema(table: str) -> str:\n",
    "    \"\"\"\n",
    "    Return column names and types for a table.\n",
    "    \"\"\"\n",
    "    conn = sqlite3.connect(DB_PATH)\n",
    "    cur = conn.cursor()\n",
    "    cur.execute(f\"PRAGMA table_info('{table}')\")\n",
    "    cols = cur.fetchall()\n",
    "    conn.close()\n",
    "    return \", \".join(f\"{c[1]} {c[2]}\" for c in cols)\n",
    "\n",
    "\n",
    "@tool\n",
    "def exec_sql(query: str) -> dict:\n",
    "    \"\"\"Execute a SQL query. On failure, log the error and return empty rows.\"\"\"\n",
    "    query_text = normalize_sql(query)\n",
    "\n",
    "    # 1. Parse column names from ALL SELECTs\n",
    "    column_names = []\n",
    "    for select_sql in split_union_selects(query_text):\n",
    "        for col in extract_select_columns(select_sql):\n",
    "            if col not in column_names:\n",
    "                column_names.append(col)\n",
    "\n",
    "    # 2. Execute once\n",
    "    conn = sqlite3.connect(DB_PATH)\n",
    "    conn.create_function(\"REGEXP\", 2, regexp)\n",
    "    cur = conn.cursor()\n",
    "\n",
    "    try:\n",
    "        print(\"[EXECUTE] Running query\")\n",
    "        cur.execute(query_text)\n",
    "        rows = cur.fetchall()\n",
    "    except Exception as e:\n",
    "        print(f\"[SQL ERROR]: {e}\")\n",
    "        rows = []\n",
    "    finally:\n",
    "        conn.close()\n",
    "\n",
    "    return {\n",
    "        \"rows\": rows,\n",
    "        \"columns\": column_names\n",
    "    }\n",
    "\n",
    "\n",
"class EmailEvidenceState(TypedDict):\n",
|
||||||
|
" messages: Annotated[list, add_messages]\n",
|
||||||
|
" attempt: int\n",
|
||||||
|
" max_attempts: int\n",
|
||||||
|
" phase: str # \"discovery\" | \"extraction\"\n",
|
||||||
|
" sql: Optional[str] # SQL to execute\n",
|
||||||
|
" rows: Optional[List]\n",
|
||||||
|
" classification: Optional[dict]\n",
|
||||||
|
" evidence: Optional[List[str]]\n",
|
||||||
|
" target_entity: str # <--- ADD THIS LINE \n",
|
||||||
|
" # Add this to track the \"winning\" columns\n",
|
||||||
|
" source_columns: Optional[List[dict]]\n",
|
||||||
|
"\n",
|
||||||
|
" # SQL used during discovery that returned results\n",
|
||||||
|
" discovered_sql: Optional[List[str]]\n",
|
||||||
|
"\n",
|
||||||
|
"def get_discovery_system(target, regex):\n",
|
||||||
|
" return SystemMessage(\n",
|
||||||
|
" content=(\n",
|
||||||
|
" \"You are a SQL planner. You are provided databases that are extracted from Android or iOS devices.\\n\"\n",
|
||||||
|
" f\"Goal: discover if any column contains {target}.\\n\\n\"\n",
|
||||||
|
" \"Rules:\\n\"\n",
|
||||||
|
" \"- Use 'REGEXP' for pattern matching.\\n\"\n",
|
||||||
|
" f\"- Example: SELECT col FROM table WHERE col REGEXP '{regex}' LIMIT 5\\n\"\n",
|
||||||
|
" \"- Validate your SQL and make sure all tables and columns do exist.\\n\"\n",
|
||||||
|
" \"- If multiple SQL statements are provided, combine them using UNION ALL.\\n\"\n",
|
||||||
|
" \"- Return ONLY SQL.\"\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" \n",
|
||||||
|
"def get_sql_upgrade_system(target):\n",
|
||||||
|
" return SystemMessage(\n",
|
||||||
|
" content=(\n",
|
||||||
|
" \"You are a SQL refiner.\\n\"\n",
|
||||||
|
" f\"Goal: modify previously successful SQL to extract ALL {target}.\\n\\n\"\n",
|
||||||
|
" \"Rules:\\n\"\n",
|
||||||
|
" \"- Do NOT invent new tables or columns.\\n\"\n",
|
||||||
|
" \"- Remove LIMIT clauses.\\n\"\n",
|
||||||
|
" \"- Preserve WHERE conditions and REGEXP filters.\\n\"\n",
|
||||||
|
" \"- Return ONLY SQL.\"\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def planner(state: EmailEvidenceState):\n",
|
||||||
|
" # Extraction upgrade path\n",
|
||||||
|
" if state[\"phase\"] == \"extraction\" and state.get(\"discovered_sql\"):\n",
|
||||||
|
" system = get_sql_upgrade_system(state[\"target_entity\"])\n",
|
||||||
|
" joined_sql = \"\\nUNION ALL\\n\".join(state[\"discovered_sql\"])\n",
|
||||||
|
"\n",
|
||||||
|
" result = llm.invoke([\n",
|
||||||
|
" system,\n",
|
||||||
|
" HumanMessage(content=f\"Original SQL:\\n{joined_sql}\")\n",
|
||||||
|
" ])\n",
|
||||||
|
"\n",
|
||||||
|
" sql = normalize_sql(result.content)\n",
|
||||||
|
"\n",
|
||||||
|
" print(\"[PLANNER] Upgraded SQL for extraction\")\n",
|
||||||
|
"\n",
|
||||||
|
" return {\n",
|
||||||
|
" \"messages\": [AIMessage(content=sql)],\n",
|
||||||
|
" \"sql\": sql\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" # Original discovery logic\n",
|
||||||
|
" tables = list_tables.invoke({})\n",
|
||||||
|
" config = ENTITY_CONFIG[state[\"target_entity\"]]\n",
|
||||||
|
"\n",
|
||||||
|
" base_system = get_discovery_system(\n",
|
||||||
|
" state[\"target_entity\"],\n",
|
||||||
|
" config[\"regex\"]\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" grounded_content = (\n",
|
||||||
|
" f\"{base_system.content}\\n\\n\"\n",
|
||||||
|
" f\"EXISTING TABLES: {tables}\\n\"\n",
|
||||||
|
" f\"CURRENT PHASE: {state['phase']}\\n\"\n",
|
||||||
|
" \"CRITICAL: Do not query non-existent tables.\"\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" agent = create_agent(llm, [list_tables, get_schema])\n",
|
||||||
|
" result = agent.invoke({\n",
|
||||||
|
" \"messages\": [SystemMessage(content=grounded_content)] + state[\"messages\"]\n",
|
||||||
|
" })\n",
|
||||||
|
"\n",
|
||||||
|
" sql = normalize_sql(result[\"messages\"][-1].content)\n",
|
||||||
|
"\n",
|
||||||
|
" attempt = state[\"attempt\"] + 1 if state[\"phase\"] == \"discovery\" else state[\"attempt\"]\n",
|
||||||
|
"\n",
|
||||||
|
" return {\n",
|
||||||
|
" \"messages\": [AIMessage(content=sql)],\n",
|
||||||
|
" \"sql\": sql,\n",
|
||||||
|
" \"attempt\": attempt\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def sql_execute(state: EmailEvidenceState):\n",
|
||||||
|
" # Call the tool (it now returns a dict)\n",
|
||||||
|
" result = exec_sql.invoke(state[\"sql\"])\n",
|
||||||
|
" \n",
|
||||||
|
" rows = result.get(\"rows\", [])\n",
|
||||||
|
" cols = result.get(\"columns\", [])\n",
|
||||||
|
"\n",
|
||||||
|
" print(f\"[SQL EXEC] Retrieved {(rows)}\")\n",
|
||||||
|
" updates = {\n",
|
||||||
|
" \"rows\": rows,\n",
|
||||||
|
" \"messages\": [AIMessage(content=f\"Retrieved {len(rows)} rows\")]\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" if state[\"phase\"] == \"discovery\" and rows:\n",
|
||||||
|
" discovered = list(state.get(\"discovered_sql\", []))\n",
|
||||||
|
" discovered.append(state[\"sql\"])\n",
|
||||||
|
" updates[\"discovered_sql\"] = discovered\n",
|
||||||
|
" print(\"[DISCOVERY] Saved successful SQL\")\n",
|
||||||
|
"\n",
|
||||||
|
" # Tracking logic: Save columns to state only during extraction\n",
|
||||||
|
" if state[\"phase\"] == \"extraction\":\n",
|
||||||
|
" updates[\"source_columns\"] = cols\n",
|
||||||
|
" print(f\"[TRACKING] Saved source columns: {cols}\")\n",
|
||||||
|
"\n",
|
||||||
|
" return updates\n",
|
||||||
|
" \n",
|
||||||
|
"\n",
|
||||||
|
"def get_classify_system(target: str):\n",
|
||||||
|
" return SystemMessage(\n",
|
||||||
|
" content=(\n",
|
||||||
|
" f\"Decide whether the text contains {target}.\\n\"\n",
|
||||||
|
" \"Return ONLY a JSON object with these keys:\\n\"\n",
|
||||||
|
" \"{ \\\"found\\\": true/false, \\\"confidence\\\": number, \\\"reason\\\": \\\"string\\\" }\"\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"def classify(state: EmailEvidenceState):\n",
|
||||||
|
" # 1. Prepare the text sample for the LLM\n",
|
||||||
|
" text = rows_to_text(state[\"rows\"], limit=10)\n",
|
||||||
|
" \n",
|
||||||
|
" # 2. Get the target-specific system message\n",
|
||||||
|
" system_message = get_classify_system(state[\"target_entity\"])\n",
|
||||||
|
"\n",
|
||||||
|
" # 3. Invoke the LLM\n",
|
||||||
|
" result = llm.invoke([\n",
|
||||||
|
" system_message,\n",
|
||||||
|
" HumanMessage(content=f\"Data to analyze:\\n{text}\")\n",
|
||||||
|
" ]).content\n",
|
||||||
|
" \n",
|
||||||
|
"# 4. Parse the decision\n",
|
||||||
|
" decision = safe_json_loads(\n",
|
||||||
|
" result,\n",
|
||||||
|
" default={\"found\": False, \"confidence\": 0.0, \"reason\": \"parse failure\"}\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" # print(\"[CLASSIFY]\", decision)\n",
|
||||||
|
" return {\"classification\": decision}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def switch_to_extraction(state: EmailEvidenceState):\n",
|
||||||
|
" print(\"[PHASE] discovery → extraction\")\n",
|
||||||
|
" return {\"phase\": \"extraction\"}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def extract(state: EmailEvidenceState):\n",
|
||||||
|
" text = rows_to_text(state[\"rows\"])\n",
|
||||||
|
" system = f\"Extract and normalize {state['target_entity']} from text. Return ONLY a JSON array of strings.\"\n",
|
||||||
|
" result = llm.invoke([SystemMessage(content=system), HumanMessage(content=text)]).content\n",
|
||||||
|
" return {\"evidence\": safe_json_loads(result, default=[])}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def next_step(state: EmailEvidenceState):\n",
|
||||||
|
" # Once in extraction phase, extract and stop\n",
|
||||||
|
" if state[\"phase\"] == \"extraction\":\n",
|
||||||
|
" return \"do_extract\"\n",
|
||||||
|
"\n",
|
||||||
|
" c = state[\"classification\"]\n",
|
||||||
|
"\n",
|
||||||
|
" if c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
|
||||||
|
" return \"to_extraction\"\n",
|
||||||
|
"\n",
|
||||||
|
" if not c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
|
||||||
|
" return \"stop_none\"\n",
|
||||||
|
"\n",
|
||||||
|
" if state[\"attempt\"] >= state[\"max_attempts\"]:\n",
|
||||||
|
" return \"stop_limit\"\n",
|
||||||
|
"\n",
|
||||||
|
" return \"replan\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "0f5259d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def observe(state: EmailEvidenceState):\n",
    "    \"\"\"\n",
    "    Debug / inspection node.\n",
    "    Does NOT modify state.\n",
    "    \"\"\"\n",
    "\n",
    "    print(\"\\n=== STATE SNAPSHOT ===\")\n",
    "\n",
    "    # Messages\n",
    "    print(\"\\n--- MESSAGES ---\")\n",
    "    for i, m in enumerate(state[\"messages\"]):\n",
    "        print(f\"{i}: {m.type.upper()} -> {m.content}\")\n",
    "\n",
    "    # Metadata\n",
    "    print(\"\\n--- BEGIN METADATA ---\")\n",
    "    print(f\"attempt : {state['attempt']}\")\n",
    "    print(f\"max_attempts : {state['max_attempts']}\")\n",
    "    print(f\"phase : {state['phase']}\")\n",
    "    print(f\"sql : {state['sql']}\")\n",
    "    print(f\"discovered sql : {state['discovered_sql']}\")\n",
    "    print(f\"rows : {state['rows']}\")\n",
    "    print(f\"classification: {state['classification']}\")\n",
    "    print(f\"evidence : {state['evidence']}\")\n",
    "\n",
    "    print(f\"Source Columns: {state.get('source_columns')}\")\n",
    "    print(\"\\n--- END METADATA ---\")\n",
    "\n",
    "    # IMPORTANT: do not return state, return a no-op update\n",
    "    return {}\n",
    "\n",
    "\n",
    "from langgraph.graph import StateGraph, END\n",
    "\n",
    "graph = StateGraph(EmailEvidenceState)\n",
    "\n",
    "# Define nodes (reusing the same 'observe' function under several node names)\n",
    "graph.add_node(\"planner\", planner)\n",
    "graph.add_node(\"observe_plan\", observe)      # Checkpoint 1: the SQL plan\n",
    "graph.add_node(\"execute\", sql_execute)\n",
    "graph.add_node(\"classify\", classify)\n",
    "graph.add_node(\"observe_classify\", observe)  # Checkpoint 2: the logic/discovery\n",
    "graph.add_node(\"switch_phase\", switch_to_extraction)\n",
    "graph.add_node(\"extract\", extract)\n",
    "graph.add_node(\"observe_final\", observe)     # Checkpoint 3: the results\n",
    "\n",
    "graph.set_entry_point(\"planner\")\n",
    "\n",
    "# --- THE FLOW ---\n",
    "graph.add_edge(\"planner\", \"observe_plan\")    # Check SQL before running\n",
    "graph.add_edge(\"observe_plan\", \"execute\")\n",
    "\n",
    "graph.add_edge(\"execute\", \"classify\")\n",
    "graph.add_edge(\"classify\", \"observe_classify\")\n",
    "\n",
    "# The decision logic triggers after the second observation\n",
    "graph.add_conditional_edges(\n",
    "    \"observe_classify\",  # Must match the node name exactly\n",
    "    next_step,\n",
    "    {\n",
    "        \"to_extraction\": \"switch_phase\",\n",
    "        \"do_extract\": \"extract\",\n",
    "        \"replan\": \"planner\",\n",
    "        \"stop_none\": END,\n",
    "        \"stop_limit\": END,\n",
    "    }\n",
    ")\n",
    "\n",
    "graph.add_edge(\"switch_phase\", \"planner\")\n",
    "\n",
    "# Route 'extract' to the final observer instead of END\n",
    "graph.add_edge(\"extract\", \"observe_final\")\n",
    "graph.add_edge(\"observe_final\", END)\n",
    "\n",
    "app = graph.compile()\n"
   ]
  },
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "0b1fce49",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"=== STATE SNAPSHOT ===\n",
|
||||||
|
"\n",
|
||||||
|
"--- MESSAGES ---\n",
|
||||||
|
"0: HUMAN -> Find USERNAME in the database\n",
|
||||||
|
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"\n",
|
||||||
|
"--- BEGIN METADATA ---\n",
|
||||||
|
"attempt : 1\n",
|
||||||
|
"max_attempts : 3\n",
|
||||||
|
"phase : discovery\n",
|
||||||
|
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"discovered sql : []\n",
|
||||||
|
"rows : None\n",
|
||||||
|
"classification: None\n",
|
||||||
|
"evidence : []\n",
|
||||||
|
"Source Columns: []\n",
|
||||||
|
"\n",
|
||||||
|
"--- END METADATA ---\n",
|
||||||
|
"[EXECUTE] Running query\n",
|
||||||
|
"[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
|
||||||
|
"[DISCOVERY] Saved successful SQL\n",
|
||||||
|
"\n",
|
||||||
|
"=== STATE SNAPSHOT ===\n",
|
||||||
|
"\n",
|
||||||
|
"--- MESSAGES ---\n",
|
||||||
|
"0: HUMAN -> Find USERNAME in the database\n",
|
||||||
|
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"2: AI -> Retrieved 11 rows\n",
|
||||||
|
"\n",
|
||||||
|
"--- BEGIN METADATA ---\n",
|
||||||
|
"attempt : 1\n",
|
||||||
|
"max_attempts : 3\n",
|
||||||
|
"phase : discovery\n",
|
||||||
|
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
|
||||||
|
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
|
||||||
|
"classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n",
|
||||||
|
"evidence : []\n",
|
||||||
|
"Source Columns: []\n",
|
||||||
|
"\n",
|
||||||
|
"--- END METADATA ---\n",
|
||||||
|
"[PHASE] discovery → extraction\n",
|
||||||
|
"[PLANNER] Upgraded SQL for extraction\n",
|
||||||
|
"\n",
|
||||||
|
"=== STATE SNAPSHOT ===\n",
|
||||||
|
"\n",
|
||||||
|
"--- MESSAGES ---\n",
|
||||||
|
"0: HUMAN -> Find USERNAME in the database\n",
|
||||||
|
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"2: AI -> Retrieved 11 rows\n",
|
||||||
|
"3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"\n",
|
||||||
|
"--- BEGIN METADATA ---\n",
|
||||||
|
"attempt : 1\n",
|
||||||
|
"max_attempts : 3\n",
|
||||||
|
"phase : extraction\n",
|
||||||
|
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
|
||||||
|
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
|
||||||
|
"classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n",
|
||||||
|
"evidence : []\n",
|
||||||
|
"Source Columns: []\n",
|
||||||
|
"\n",
|
||||||
|
"--- END METADATA ---\n",
|
||||||
|
"[EXECUTE] Running query\n",
|
||||||
|
"[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
|
||||||
|
"[TRACKING] Saved source columns: ['email', 'content']\n",
|
||||||
|
"\n",
|
||||||
|
"=== STATE SNAPSHOT ===\n",
|
||||||
|
"\n",
|
||||||
|
"--- MESSAGES ---\n",
|
||||||
|
"0: HUMAN -> Find USERNAME in the database\n",
|
||||||
|
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"2: AI -> Retrieved 11 rows\n",
|
||||||
|
"3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"4: AI -> Retrieved 11 rows\n",
|
||||||
|
"\n",
|
||||||
|
"--- BEGIN METADATA ---\n",
|
||||||
|
"attempt : 1\n",
|
||||||
|
"max_attempts : 3\n",
|
||||||
|
"phase : extraction\n",
|
||||||
|
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
|
||||||
|
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
|
||||||
|
"classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n",
|
||||||
|
"evidence : []\n",
|
||||||
|
"Source Columns: ['email', 'content']\n",
|
||||||
|
"\n",
|
||||||
|
"--- END METADATA ---\n",
|
||||||
|
"\n",
|
||||||
|
"=== STATE SNAPSHOT ===\n",
|
||||||
|
"\n",
|
||||||
|
"--- MESSAGES ---\n",
|
||||||
|
"0: HUMAN -> Find USERNAME in the database\n",
|
||||||
|
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"2: AI -> Retrieved 11 rows\n",
|
||||||
|
"3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"4: AI -> Retrieved 11 rows\n",
|
||||||
|
"\n",
|
||||||
|
"--- BEGIN METADATA ---\n",
|
||||||
|
"attempt : 1\n",
|
||||||
|
"max_attempts : 3\n",
|
||||||
|
"phase : extraction\n",
|
||||||
|
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
|
||||||
|
"UNION ALL\n",
|
||||||
|
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
|
||||||
|
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
|
||||||
|
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
|
||||||
|
"classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n",
|
||||||
|
"evidence : ['alice.johnson', 'brian.smith', 'carol.davis', 'emma.wilson', 'private_carol']\n",
|
||||||
|
"Source Columns: ['email', 'content']\n",
|
||||||
|
"\n",
|
||||||
|
"--- END METADATA ---\n",
|
||||||
|
"\n",
|
||||||
|
"========================================\n",
|
||||||
|
" 🏁 FORENSIC REPORT: USERNAME \n",
|
||||||
|
"========================================\n",
|
||||||
|
"✅ Success! Found 5 unique USERNAME:\n",
|
||||||
|
" 1. alice.johnson\n",
|
||||||
|
" 2. brian.smith\n",
|
||||||
|
" 3. carol.davis\n",
|
||||||
|
" 4. emma.wilson\n",
|
||||||
|
" 5. private_carol\n",
|
||||||
|
"\n",
|
||||||
|
"Source Columns: email, content\n",
|
||||||
|
"========================================\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
   "source": [
    "# Set your target here once\n",
    "# TARGET = \"EMAIL\"\n",
    "# TARGET = \"PHONE\"\n",
    "TARGET = \"USERNAME\"\n",
    "# TARGET = \"PERSON_NAME\"\n",
    "\n",
    "result = app.invoke({\n",
    "    \"messages\": [HumanMessage(content=f\"Find {TARGET} in the database\")],\n",
    "    \"attempt\": 0,\n",
    "    \"max_attempts\": 3,\n",
    "    \"phase\": \"discovery\",\n",
    "    \"target_entity\": TARGET,  # CRITICAL: This tells the planner what to look for\n",
    "    \"sql\": None,\n",
    "    \"rows\": None,\n",
    "    \"classification\": None,\n",
    "    \"evidence\": [],  # Use the generic evidence key\n",
    "    \"source_columns\": [],\n",
    "    \"discovered_sql\": []\n",
    "})\n",
    "\n",
    "# Use the generic 'evidence' key we defined in the state\n",
    "final_evidence = result.get(\"evidence\", [])\n",
    "target_label = result.get(\"target_entity\", \"items\")\n",
    "\n",
    "print(\"\\n\" + \"=\"*40)\n",
    "print(f\" 🏁 FORENSIC REPORT: {target_label.upper()} \")\n",
    "print(\"=\"*40)\n",
    "\n",
    "if final_evidence:\n",
    "    print(f\"✅ Success! Found {len(final_evidence)} unique {target_label}:\")\n",
    "    for i, item in enumerate(sorted(final_evidence), 1):\n",
    "        print(f\" {i}. {item}\")\n",
    "\n",
    "    # Also print the source columns we tracked\n",
    "    sources = result.get(\"source_columns\")\n",
    "    if sources:\n",
    "        print(f\"\\nSource Columns: {', '.join(sources)}\")\n",
    "else:\n",
    "    print(f\"❌ No {target_label} were extracted.\")\n",
    "    print(f\"Last Phase : {result.get('phase')}\")\n",
    "    print(f\"Attempts : {result.get('attempt')}\")\n",
    "\n",
    "print(\"=\"*40)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d24e126b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
20
readme.md
Normal file
@@ -0,0 +1,20 @@
# LLM-Guided SQL Evidence Extraction

This project implements a lightweight **LLM-assisted pipeline** for discovering and extracting evidentiary artifacts from SQLite databases commonly found in mobile device extractions.

The system separates **discovery** and **extraction** to reduce search space, avoid hallucinated SQL, and preserve explainability.

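The hand-off between the two phases is decided by a single routing rule. The sketch below condenses the `next_step` function from `agent_evidence_discovery.ipynb`; the key names match the notebook's state:

```python
# Condensed sketch of the notebook's next_step() routing rule.
# `state` carries the phase, the classifier's JSON verdict, and attempt counters.
def next_step(state: dict) -> str:
    if state["phase"] == "extraction":
        return "do_extract"          # already confirmed: run the full extraction SQL

    c = state["classification"]      # {"found": bool, "confidence": float, "reason": str}
    if c["found"] and c["confidence"] >= 0.6:
        return "to_extraction"       # promising column found: switch phases
    if not c["found"] and c["confidence"] >= 0.6:
        return "stop_none"           # confidently empty: stop without evidence
    if state["attempt"] >= state["max_attempts"]:
        return "stop_limit"          # give up once the retry budget is spent
    return "replan"                  # ask the planner for a different query
```
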
## Features

- LLM-guided SQL planning with deterministic execution
- Discovery-to-extraction workflow
- Fixed evidence types: `EMAIL`, `PHONE`, `USERNAME`, `PERSON_NAME`
- Safe SQLite execution with REGEXP support
- UNION / UNION ALL–aware column extraction (see the snippet below)
- Transparent, inspectable state machine

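For example, the helpers in `sql_utils.py` recover the projected column names from a combined discovery query (illustrative snippet; the table and column names below are invented for the example):

```python
from sql_utils import split_union_selects, extract_select_columns

# Hypothetical discovery query over two tables (names are illustrative only)
sql = """
SELECT email FROM users WHERE email REGEXP '[a-z0-9._%+-]+@[a-z0-9.-]+'
UNION ALL
SELECT content AS message_text FROM messages
"""

columns = []
for select_sql in split_union_selects(sql):        # -> one SELECT per UNION branch
    for col in extract_select_columns(select_sql):  # -> projected names / aliases
        if col not in columns:
            columns.append(col)

print(columns)  # ['email', 'message_text']
```
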
## Setup

```bash
pip install langchain langchain-openai langgraph python-dotenv
```

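A minimal run mirrors the driver cell of `agent_evidence_discovery.ipynb`. The sketch below assumes a `.env` file providing `API_KEY`, the target SQLite file configured as `DB_PATH`, and the notebook's compiled LangGraph pipeline bound to `app`:

```python
# Sketch of the notebook's driver cell; `app` is the compiled LangGraph pipeline.
from langchain_core.messages import HumanMessage

TARGET = "EMAIL"  # one of: EMAIL, PHONE, USERNAME, PERSON_NAME

result = app.invoke({
    "messages": [HumanMessage(content=f"Find {TARGET} in the database")],
    "attempt": 0,
    "max_attempts": 3,
    "phase": "discovery",
    "target_entity": TARGET,
    "sql": None,
    "rows": None,
    "classification": None,
    "evidence": [],
    "source_columns": [],
    "discovered_sql": []
})

print(result.get("evidence", []))        # normalized artifacts, e.g. email addresses
print(result.get("source_columns", []))  # columns the evidence was pulled from
```
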
232
sql_utils.py
Normal file
@@ -0,0 +1,232 @@
import json
import re


def rows_to_text(rows, limit=None, max_chars=5000, cell_max=1000):
    """
    Converts SQL rows to text with safety limits for LLM context.
    - limit: Max number of rows to process.
    - max_chars: Hard limit for the total string length.
    - cell_max: Max length for any single column value.
    """
    if not rows:
        return ""

    out = []
    # 1. Row-level limiting
    target_rows = rows[:limit] if limit else rows

    for r in target_rows:
        row_str = ",".join(
            (str(c)[:cell_max] + "..." if len(str(c)) > cell_max else str(c))
            for c in r if c is not None
        )
        out.append(row_str)

    final_text = "\n".join(out)

    # 2. Final global character limit safety check
    if len(final_text) > max_chars:
        return final_text[:max_chars] + "\n... [DATA TRUNCATED] ..."

    # print(f"[ROWS TO TEXT] Input: {len(rows)} rows | Output: {len(final_text)} chars")
    # Optional: print only the first 200 characters of the text to keep logs clean
    # print(f"[PREVIEW]: {final_text[:200]}...")
    return final_text


def regexp(expr, item):
    """
    Safe regular expression matcher for SQLite REGEXP queries.

    This function allows SQLite to apply regex matching on arbitrary column
    values without raising exceptions. It safely handles NULL values, bytes
    or BLOB data, and malformed inputs. The match is case-insensitive and
    always fails gracefully instead of crashing the query engine.

    Example:
        # SQL:
        # SELECT * FROM users WHERE email REGEXP '[a-z0-9._%+-]+@[a-z0-9.-]+';

        regexp("[a-z0-9._%+-]+@[a-z0-9.-]+", "john.doe@example.com")
        → True

        regexp("[a-z0-9._%+-]+@[a-z0-9.-]+", None)
        → False
    """
    # 1. Handle NULLs (None in Python)
    if item is None:
        return False

    try:
        # 2. Ensure item is a string (handles BLOBs/Bytes)
        if isinstance(item, bytes):
            item = item.decode('utf-8', errors='ignore')
        else:
            item = str(item)

        # 3. Compile and search
        return re.search(expr, item, re.IGNORECASE) is not None
    except Exception:
        # 4. If anything else goes wrong, don't crash SQLite
        return False


def normalize_sql(sql: str) -> str:
    """
    Normalize LLM-generated SQL into a clean, executable SQL string.

    Input:
        sql (str): A raw SQL string that may include Markdown code fences
                   (``` or ```sql), leading language tokens (e.g. "sql"),
                   or extra whitespace.

    Output:
        str: A cleaned SQL string with all formatting artifacts removed,
             ready to be executed directly by SQLite.

    Example:
        Input:
            ```sql
            SELECT * FROM users;
            ```

        Output:
            SELECT * FROM users;
    """

    if not sql:
        return sql

    sql = sql.strip()

    # Remove ```sql or ``` fences
    sql = re.sub(r"^```(?:sql)?", "", sql, flags=re.IGNORECASE).strip()
    sql = re.sub(r"```$", "", sql).strip()

    # Remove leading 'sql' token if present
    if sql.lower().startswith("sql"):
        sql = sql[3:].strip()

    return sql


def safe_json_loads(text: str, default):
    """
    Safely parse JSON from LLM-generated text.

    Input:
        text (str): A raw string that may contain JSON wrapped in Markdown
                    code fences (```), prefixed with a language token
                    (e.g. "json"), or include extra whitespace.
        default: A fallback value to return if JSON parsing fails.

    Output:
        Any: The parsed Python object if valid JSON is found; otherwise
             the provided default value.

    Example:
        Input:
            ```json
            { "found": true, "confidence": 0.85 }
            ```

        Output:
            { "found": True, "confidence": 0.85 }
    """
    if not text:
        return default

    text = text.strip()

    # Remove markdown fences
    if text.startswith("```"):
        parts = text.split("```")
        if len(parts) >= 2:
            text = parts[1].strip()

    # Remove leading 'json' token
    if text.lower().startswith("json"):
        text = text[4:].strip()

    try:
        return json.loads(text)
    except Exception as e:
        print("[JSON PARSE ERROR]")
        print("RAW:", repr(text))
        print("ERROR:", e)
        return default


def split_union_selects(sql: str) -> list[str]:
    """
    Split a SQL query into individual SELECT statements joined by UNION or UNION ALL.

    Input:
        sql (str): A single SQL query string that may contain multiple SELECT
                   statements combined using UNION or UNION ALL.

    Output:
        list[str]: A list of individual SELECT statement strings, with UNION
                   keywords removed and whitespace normalized.

    Example:
        Input:
            SELECT email FROM users
            UNION ALL
            SELECT handle FROM accounts

        Output:
            [
                "SELECT email FROM users",
                "SELECT handle FROM accounts"
            ]
    """
    # Normalize spacing
    sql = re.sub(r"\s+", " ", sql.strip())

    # Split on UNION or UNION ALL, case-insensitive
    parts = re.split(r"\bUNION(?:\s+ALL)?\b", sql, flags=re.IGNORECASE)
    return [p.strip() for p in parts if p.strip()]


def extract_select_columns(select_sql: str) -> list[str]:
    """
    Extract column names or column aliases from a single SELECT statement.

    Input:
        select_sql (str): A SQL SELECT statement containing an explicit
                          projection list (no SELECT *), such as:
                          "SELECT col, col2 AS alias FROM table".

    Output:
        list[str]: A list of column names or aliases in the order they appear
                   in the SELECT clause.

    Example:
        Input:
            SELECT email, username AS user FROM users

        Output:
            ["email", "user"]
    """
    m = re.search(
        r"SELECT\s+(.*?)\s+FROM\s",
        select_sql,
        flags=re.IGNORECASE | re.DOTALL
    )
    if not m:
        return []

    select_list = m.group(1)

    columns = []
    for item in select_list.split(","):
        item = item.strip()

        # Handle aliases: col AS alias or col alias
        alias_match = re.search(r"\bAS\s+(\w+)$", item, re.IGNORECASE)
        if alias_match:
            columns.append(alias_match.group(1))
        else:
            # Take the final identifier
            columns.append(item.split()[-1])

    return columns