Initial commit: PII Discovery Agent for Mobile Databases

This commit is contained in:
Frank Xu
2025-12-26 13:24:17 -05:00
parent 9b077fcf1f
commit 24353a3fb1
5 changed files with 953 additions and 0 deletions

1
.env Normal file
View File

@@ -0,0 +1 @@
API_KEY=sk-proj-yourkeyhere

View File

@@ -0,0 +1,700 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0be1ee8e",
"metadata": {},
"source": [
"uing bnl environment\n",
"https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
"\n",
"pip uninstall -y langchain langchain-core langchain-openai langgraph langgraph-prebuilt langgraph-checkpoint langgraph-sdk langsmith langchain-community langchain-google-genai langchain-text-splitters\n",
"\n",
"pip install langchain==1.2.0 langchain-core==1.2.2 langchain-openai==1.1.4 langgraph==1.0.5 langgraph-prebuilt==1.0.5 langgraph-checkpoint==3.0.1 langgraph-sdk==0.3.0 langsmith==0.5.0"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2648a1f1",
"metadata": {},
"outputs": [],
"source": [
"# only for find models\n",
"# import google.generativeai as genai\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "a10c9a6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OK\n"
]
}
],
"source": [
"# https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
"\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_core.messages import HumanMessage\n",
"from sql_utils import *\n",
"\n",
"\n",
"load_dotenv() # This looks for the .env file and loads it into os.environ\n",
"\n",
"llm = ChatOpenAI(\n",
" model=\"gpt-4o-mini\", # recommended for tools + cost\n",
" api_key=os.environ[\"API_KEY\"],\n",
" temperature=0\n",
")\n",
"\n",
"response = llm.invoke([\n",
" HumanMessage(content=\"Reply with exactly: OK\")\n",
"])\n",
"\n",
"print(response.content)\n",
"\n",
"# DB_PATH = r\"msgstore.db\"\n",
"DB_PATH = r\"users4.db\"\n",
"# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.whatsapp\\databases\\stickers.db\"\n",
"# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.android.vending\\databases\\localappstate.db\"\n",
"\n",
"ENTITY_CONFIG = {\n",
" \"EMAIL\": {\n",
" \"regex\": r\"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\",\n",
" \"desc\": \"valid electronic mail formats used for account registration or contact\"\n",
" },\n",
" \"PHONE\": {\n",
" \"regex\": r\"\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}\",\n",
" \"desc\": \"international or local telephone numbers\"\n",
" },\n",
" \"USERNAME\": {\n",
" \"regex\": r\"\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b\",\n",
" \"desc\": \"application-specific usernames, handles, or account identifiers\"\n",
" },\n",
" \"PERSON_NAME\": {\n",
" \"regex\": r\"[A-Za-z\\u4e00-\\u9fff][A-Za-z\\u4e00-\\u9fff\\s\\.\\-]{1,50}\",\n",
" \"desc\": (\n",
" \"loosely structured human name-like strings used only for discovery \"\n",
" \"and column pre-filtering; final identification is performed during extraction\"\n",
" )\n",
" }\n",
"}\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "48eda3ec",
"metadata": {},
"outputs": [],
"source": [
"# Core Python\n",
"import sqlite3\n",
"import re\n",
"import json\n",
"from typing import TypedDict, Optional, List, Annotated\n",
"from langgraph.graph.message import add_messages\n",
"\n",
"# LangChain / LangGraph\n",
"from langchain_core.tools import tool\n",
"from langchain_core.messages import (\n",
" HumanMessage,\n",
" AIMessage,\n",
" SystemMessage\n",
")\n",
"from langchain.agents import create_agent\n",
"from langgraph.graph import StateGraph, END\n",
"from langgraph.graph.message import MessagesState\n",
"\n",
"\n",
"@tool\n",
"def list_tables() -> str:\n",
" \"\"\"\n",
" List all table names in the SQLite database.\n",
" \"\"\"\n",
" conn = sqlite3.connect(DB_PATH)\n",
" cur = conn.cursor()\n",
" cur.execute(\"SELECT name FROM sqlite_master WHERE type='table'\")\n",
" tables = [r[0] for r in cur.fetchall()]\n",
" conn.close()\n",
" return \", \".join(tables)\n",
"\n",
"\n",
"@tool\n",
"def get_schema(table: str) -> str:\n",
" \"\"\"\n",
" Return column names and types for a table.\n",
" \"\"\"\n",
" conn = sqlite3.connect(DB_PATH)\n",
" cur = conn.cursor()\n",
" cur.execute(f\"PRAGMA table_info('{table}')\")\n",
" cols = cur.fetchall()\n",
" conn.close()\n",
" return \", \".join(f\"{c[1]} {c[2]}\" for c in cols)\n",
"\n",
"\n",
"\n",
"\n",
"@tool\n",
"def exec_sql(query: str) -> dict:\n",
" \"\"\"Execute SQL statements. If one fails, it is skipped and the next is executed.\"\"\"\n",
" query_text = normalize_sql(query)\n",
"\n",
" # 1. Parse column names from ALL SELECTs\n",
" column_names = []\n",
" for select_sql in split_union_selects(query_text):\n",
" for col in extract_select_columns(select_sql):\n",
" if col not in column_names:\n",
" column_names.append(col)\n",
"\n",
" # 2. Execute once\n",
" conn = sqlite3.connect(DB_PATH)\n",
" conn.create_function(\"REGEXP\", 2, regexp)\n",
" cur = conn.cursor()\n",
"\n",
" try:\n",
" print(f\"[EXECUTE] Running query\")\n",
" cur.execute(query_text)\n",
" rows = cur.fetchall()\n",
" except Exception as e:\n",
" print(f\"[SQL ERROR]: {e}\")\n",
" rows = []\n",
" finally:\n",
" conn.close()\n",
"\n",
" return {\n",
" \"rows\": rows,\n",
" \"columns\": column_names\n",
" }\n",
"\n",
"\n",
"\n",
"\n",
"class EmailEvidenceState(TypedDict):\n",
" messages: Annotated[list, add_messages]\n",
" attempt: int\n",
" max_attempts: int\n",
" phase: str # \"discovery\" | \"extraction\"\n",
" sql: Optional[str] # SQL to execute\n",
" rows: Optional[List]\n",
" classification: Optional[dict]\n",
" evidence: Optional[List[str]]\n",
" target_entity: str # <--- ADD THIS LINE \n",
" # Add this to track the \"winning\" columns\n",
" source_columns: Optional[List[dict]]\n",
"\n",
" # SQL used during discovery that returned results\n",
" discovered_sql: Optional[List[str]]\n",
"\n",
"def get_discovery_system(target, regex):\n",
" return SystemMessage(\n",
" content=(\n",
" \"You are a SQL planner. You are provided databases that are extracted from Android or iOS devices.\\n\"\n",
" f\"Goal: discover if any column contains {target}.\\n\\n\"\n",
" \"Rules:\\n\"\n",
" \"- Use 'REGEXP' for pattern matching.\\n\"\n",
" f\"- Example: SELECT col FROM table WHERE col REGEXP '{regex}' LIMIT 5\\n\"\n",
" \"- Validate your SQL and make sure all tables and columns do exist.\\n\"\n",
" \"- If multiple SQL statements are provided, combine them using UNION ALL.\\n\"\n",
" \"- Return ONLY SQL.\"\n",
" )\n",
" )\n",
"\n",
" \n",
"def get_sql_upgrade_system(target):\n",
" return SystemMessage(\n",
" content=(\n",
" \"You are a SQL refiner.\\n\"\n",
" f\"Goal: modify previously successful SQL to extract ALL {target}.\\n\\n\"\n",
" \"Rules:\\n\"\n",
" \"- Do NOT invent new tables or columns.\\n\"\n",
" \"- Remove LIMIT clauses.\\n\"\n",
" \"- Preserve WHERE conditions and REGEXP filters.\\n\"\n",
" \"- Return ONLY SQL.\"\n",
" )\n",
" )\n",
"\n",
"\n",
"def planner(state: EmailEvidenceState):\n",
" # Extraction upgrade path\n",
" if state[\"phase\"] == \"extraction\" and state.get(\"discovered_sql\"):\n",
" system = get_sql_upgrade_system(state[\"target_entity\"])\n",
" joined_sql = \"\\nUNION ALL\\n\".join(state[\"discovered_sql\"])\n",
"\n",
" result = llm.invoke([\n",
" system,\n",
" HumanMessage(content=f\"Original SQL:\\n{joined_sql}\")\n",
" ])\n",
"\n",
" sql = normalize_sql(result.content)\n",
"\n",
" print(\"[PLANNER] Upgraded SQL for extraction\")\n",
"\n",
" return {\n",
" \"messages\": [AIMessage(content=sql)],\n",
" \"sql\": sql\n",
" }\n",
"\n",
" # Original discovery logic\n",
" tables = list_tables.invoke({})\n",
" config = ENTITY_CONFIG[state[\"target_entity\"]]\n",
"\n",
" base_system = get_discovery_system(\n",
" state[\"target_entity\"],\n",
" config[\"regex\"]\n",
" )\n",
"\n",
" grounded_content = (\n",
" f\"{base_system.content}\\n\\n\"\n",
" f\"EXISTING TABLES: {tables}\\n\"\n",
" f\"CURRENT PHASE: {state['phase']}\\n\"\n",
" \"CRITICAL: Do not query non-existent tables.\"\n",
" )\n",
"\n",
" agent = create_agent(llm, [list_tables, get_schema])\n",
" result = agent.invoke({\n",
" \"messages\": [SystemMessage(content=grounded_content)] + state[\"messages\"]\n",
" })\n",
"\n",
" sql = normalize_sql(result[\"messages\"][-1].content)\n",
"\n",
" attempt = state[\"attempt\"] + 1 if state[\"phase\"] == \"discovery\" else state[\"attempt\"]\n",
"\n",
" return {\n",
" \"messages\": [AIMessage(content=sql)],\n",
" \"sql\": sql,\n",
" \"attempt\": attempt\n",
" }\n",
"\n",
"\n",
"def sql_execute(state: EmailEvidenceState):\n",
" # Call the tool (it now returns a dict)\n",
" result = exec_sql.invoke(state[\"sql\"])\n",
" \n",
" rows = result.get(\"rows\", [])\n",
" cols = result.get(\"columns\", [])\n",
"\n",
" print(f\"[SQL EXEC] Retrieved {(rows)}\")\n",
" updates = {\n",
" \"rows\": rows,\n",
" \"messages\": [AIMessage(content=f\"Retrieved {len(rows)} rows\")]\n",
" }\n",
"\n",
" if state[\"phase\"] == \"discovery\" and rows:\n",
" discovered = list(state.get(\"discovered_sql\", []))\n",
" discovered.append(state[\"sql\"])\n",
" updates[\"discovered_sql\"] = discovered\n",
" print(\"[DISCOVERY] Saved successful SQL\")\n",
"\n",
" # Tracking logic: Save columns to state only during extraction\n",
" if state[\"phase\"] == \"extraction\":\n",
" updates[\"source_columns\"] = cols\n",
" print(f\"[TRACKING] Saved source columns: {cols}\")\n",
"\n",
" return updates\n",
" \n",
"\n",
"def get_classify_system(target: str):\n",
" return SystemMessage(\n",
" content=(\n",
" f\"Decide whether the text contains {target}.\\n\"\n",
" \"Return ONLY a JSON object with these keys:\\n\"\n",
" \"{ \\\"found\\\": true/false, \\\"confidence\\\": number, \\\"reason\\\": \\\"string\\\" }\"\n",
" )\n",
" )\n",
"\n",
"def classify(state: EmailEvidenceState):\n",
" # 1. Prepare the text sample for the LLM\n",
" text = rows_to_text(state[\"rows\"], limit=10)\n",
" \n",
" # 2. Get the target-specific system message\n",
" system_message = get_classify_system(state[\"target_entity\"])\n",
"\n",
" # 3. Invoke the LLM\n",
" result = llm.invoke([\n",
" system_message,\n",
" HumanMessage(content=f\"Data to analyze:\\n{text}\")\n",
" ]).content\n",
" \n",
"# 4. Parse the decision\n",
" decision = safe_json_loads(\n",
" result,\n",
" default={\"found\": False, \"confidence\": 0.0, \"reason\": \"parse failure\"}\n",
" )\n",
"\n",
" # print(\"[CLASSIFY]\", decision)\n",
" return {\"classification\": decision}\n",
"\n",
"\n",
"def switch_to_extraction(state: EmailEvidenceState):\n",
" print(\"[PHASE] discovery → extraction\")\n",
" return {\"phase\": \"extraction\"}\n",
"\n",
"\n",
"\n",
"\n",
"def extract(state: EmailEvidenceState):\n",
" text = rows_to_text(state[\"rows\"])\n",
" system = f\"Extract and normalize {state['target_entity']} from text. Return ONLY a JSON array of strings.\"\n",
" result = llm.invoke([SystemMessage(content=system), HumanMessage(content=text)]).content\n",
" return {\"evidence\": safe_json_loads(result, default=[])}\n",
"\n",
"\n",
"def next_step(state: EmailEvidenceState):\n",
" # Once in extraction phase, extract and stop\n",
" if state[\"phase\"] == \"extraction\":\n",
" return \"do_extract\"\n",
"\n",
" c = state[\"classification\"]\n",
"\n",
" if c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
" return \"to_extraction\"\n",
"\n",
" if not c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
" return \"stop_none\"\n",
"\n",
" if state[\"attempt\"] >= state[\"max_attempts\"]:\n",
" return \"stop_limit\"\n",
"\n",
" return \"replan\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "0f5259d7",
"metadata": {},
"outputs": [],
"source": [
"def observe(state: EmailEvidenceState):\n",
" \"\"\"\n",
" Debug / inspection node.\n",
" Does NOT modify state.\n",
" \"\"\"\n",
"\n",
" print(\"\\n=== STATE SNAPSHOT ===\")\n",
"\n",
" # Messages\n",
" print(\"\\n--- MESSAGES ---\")\n",
" for i, m in enumerate(state[\"messages\"]):\n",
" print(f\"{i}: {m.type.upper()} -> {m.content}\")\n",
"\n",
" # Metadata\n",
" print(\"\\n--- BEGIN METADATA ---\")\n",
" print(f\"attempt : {state['attempt']}\")\n",
" print(f\"max_attempts : {state['max_attempts']}\")\n",
" print(f\"phase : {state['phase']}\")\n",
" print(f\"sql : {state['sql']}\")\n",
" print(f\"discovered sql : {state['discovered_sql']}\")\n",
" print(f\"rows : {state['rows']}\")\n",
" print(f\"classification: {state['classification']}\")\n",
" print(f\"evidence : {state['evidence']}\")\n",
" \n",
" print(f\"Source Columns: {state.get('source_columns')}\")\n",
" print(\"\\n--- END METADATA ---\")\n",
"\n",
" # IMPORTANT: do not return state, return no-op update\n",
" return {}\n",
"\n",
"\n",
"\n",
"from langgraph.graph import StateGraph, END\n",
"\n",
"graph = StateGraph(EmailEvidenceState)\n",
"\n",
"# Define nodes (reusing the same 'observe' function for two different node names)\n",
"graph.add_node(\"planner\", planner)\n",
"graph.add_node(\"observe_plan\", observe) # Checkpoint 1: The SQL Plan\n",
"graph.add_node(\"execute\", sql_execute)\n",
"graph.add_node(\"classify\", classify)\n",
"graph.add_node(\"observe_classify\", observe) # Checkpoint 2: The Logic/Discovery\n",
"graph.add_node(\"switch_phase\", switch_to_extraction)\n",
"graph.add_node(\"extract\", extract)\n",
"graph.add_node(\"observe_final\", observe) # Checkpoint 3: The Results\n",
"\n",
"graph.set_entry_point(\"planner\")\n",
"\n",
"# --- THE FLOW ---\n",
"graph.add_edge(\"planner\", \"observe_plan\") # Check SQL before running\n",
"graph.add_edge(\"observe_plan\", \"execute\")\n",
"\n",
"graph.add_edge(\"execute\", \"classify\")\n",
"graph.add_edge(\"classify\", \"observe_classify\")\n",
"\n",
"# The decision logic now triggers after the second observation\n",
"graph.add_conditional_edges(\n",
" \"observe_classify\", # Must match the new node name exactly\n",
" next_step,\n",
" {\n",
" \"to_extraction\": \"switch_phase\",\n",
" \"do_extract\": \"extract\",\n",
" \"replan\": \"planner\",\n",
" \"stop_none\": END,\n",
" \"stop_limit\": END,\n",
" }\n",
")\n",
"\n",
"graph.add_edge(\"switch_phase\", \"planner\")\n",
"\n",
"# Change this: Route 'extract' to our new observer instead of END\n",
"graph.add_edge(\"extract\", \"observe_final\")\n",
"graph.add_edge(\"observe_final\", END)\n",
"\n",
"app = graph.compile()\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0b1fce49",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find USERNAME in the database\n",
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 1\n",
"max_attempts : 3\n",
"phase : discovery\n",
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"discovered sql : []\n",
"rows : None\n",
"classification: None\n",
"evidence : []\n",
"Source Columns: []\n",
"\n",
"--- END METADATA ---\n",
"[EXECUTE] Running query\n",
"[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
"[DISCOVERY] Saved successful SQL\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find USERNAME in the database\n",
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"2: AI -> Retrieved 11 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 1\n",
"max_attempts : 3\n",
"phase : discovery\n",
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
"classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n",
"evidence : []\n",
"Source Columns: []\n",
"\n",
"--- END METADATA ---\n",
"[PHASE] discovery → extraction\n",
"[PLANNER] Upgraded SQL for extraction\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find USERNAME in the database\n",
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"2: AI -> Retrieved 11 rows\n",
"3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 1\n",
"max_attempts : 3\n",
"phase : extraction\n",
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
"classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n",
"evidence : []\n",
"Source Columns: []\n",
"\n",
"--- END METADATA ---\n",
"[EXECUTE] Running query\n",
"[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
"[TRACKING] Saved source columns: ['email', 'content']\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find USERNAME in the database\n",
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"2: AI -> Retrieved 11 rows\n",
"3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"4: AI -> Retrieved 11 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 1\n",
"max_attempts : 3\n",
"phase : extraction\n",
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
"classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n",
"evidence : []\n",
"Source Columns: ['email', 'content']\n",
"\n",
"--- END METADATA ---\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find USERNAME in the database\n",
"1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"2: AI -> Retrieved 11 rows\n",
"3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"4: AI -> Retrieved 11 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt : 1\n",
"max_attempts : 3\n",
"phase : extraction\n",
"sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n",
"UNION ALL\n",
"SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n",
"discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n",
"rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n",
"classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n",
"evidence : ['alice.johnson', 'brian.smith', 'carol.davis', 'emma.wilson', 'private_carol']\n",
"Source Columns: ['email', 'content']\n",
"\n",
"--- END METADATA ---\n",
"\n",
"========================================\n",
" 🏁 FORENSIC REPORT: USERNAME \n",
"========================================\n",
"✅ Success! Found 5 unique USERNAME:\n",
" 1. alice.johnson\n",
" 2. brian.smith\n",
" 3. carol.davis\n",
" 4. emma.wilson\n",
" 5. private_carol\n",
"\n",
"Source Columns: email, content\n",
"========================================\n"
]
}
],
"source": [
"\n",
"# Set your target here once\n",
"# TARGET = \"EMAIL\" \n",
"# TARGET = \"PHONE\"\n",
"TARGET = \"USERNAME\"\n",
"# TARGET = \"PERSON_NAME\"\n",
"\n",
"result = app.invoke({\n",
" \"messages\": [HumanMessage(content=f\"Find {TARGET} in the database\")],\n",
" \"attempt\": 0,\n",
" \"max_attempts\": 3,\n",
" \"phase\": \"discovery\",\n",
" \"target_entity\": TARGET, # CRITICAL: This tells the planner what to look for\n",
" \"sql\": None,\n",
" \"rows\": None,\n",
" \"classification\": None,\n",
" \"evidence\": [], # Use the new generic key\n",
" \"source_columns\": [],\n",
" \"discovered_sql\": [] \n",
"})\n",
"\n",
"# Use the generic 'evidence' key we defined in the state\n",
"final_evidence = result.get(\"evidence\", [])\n",
"target_label = result.get(\"target_entity\", \"items\")\n",
"\n",
"print(\"\\n\" + \"=\"*40)\n",
"print(f\" 🏁 FORENSIC REPORT: {target_label.upper()} \")\n",
"print(\"=\"*40)\n",
"\n",
"if final_evidence:\n",
" print(f\"✅ Success! Found {len(final_evidence)} unique {target_label}:\")\n",
" for i, item in enumerate(sorted(final_evidence), 1):\n",
" print(f\" {i}. {item}\")\n",
" \n",
" # Also print the source columns we tracked!\n",
" sources = result.get(\"source_columns\")\n",
" if sources:\n",
" print(f\"\\nSource Columns: {', '.join(sources)}\")\n",
"else:\n",
" print(f\"❌ No {target_label} were extracted.\")\n",
" print(f\"Last Phase : {result.get('phase')}\")\n",
" print(f\"Attempts : {result.get('attempt')}\")\n",
"\n",
"print(\"=\"*40)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d24e126b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

20
readme.md Normal file
View File

@@ -0,0 +1,20 @@
# LLM-Guided SQL Evidence Extraction
This project implements a lightweight **LLM-assisted pipeline** for discovering and extracting evidentiary artifacts from SQLite databases commonly found in mobile device extractions.
The system separates **discovery** and **extraction** to reduce search space, avoid hallucinated SQL, and preserve explainability.
## Features
- LLM-guided SQL planning with deterministic execution
- Discovery to extraction workflow
- Fixed evidence types: `EMAIL`, `PHONE`, `USERNAME`, `PERSON_NAME`
- Safe SQLite execution with REGEXP support
- UNION / UNION ALL-aware column extraction
- Transparent, inspectable state machine
## Setup
```bash
pip install langchain langchain-openai langgraph python-dotenv
```

232
sql_utils.py Normal file
View File

@@ -0,0 +1,232 @@
import json
import re
def rows_to_text(rows, limit=None, max_chars=5000, cell_max=1000):
    """
    Render SQL result rows as newline-separated text for LLM context.

    Args:
        rows: Sequence of row tuples as returned by a cursor fetch.
        limit: Maximum number of rows to include (None means all rows).
        max_chars: Hard cap on the total output length.
        cell_max: Maximum length of any single cell value before truncation.

    Returns:
        str: One comma-joined line per row. NULL cells are skipped entirely;
        over-long cells are truncated with a trailing "...". If the whole
        output exceeds max_chars it is cut and marked "[DATA TRUNCATED]".
    """
    if not rows:
        return ""

    # Bug fix: compare against None explicitly so limit=0 means "zero rows"
    # rather than falling through to "all rows" (0 is falsy).
    target_rows = rows if limit is None else rows[:limit]

    lines = []
    for row in target_rows:
        cells = []
        for cell in row:
            if cell is None:
                continue  # NULLs carry no evidentiary text
            text = str(cell)
            if len(text) > cell_max:
                text = text[:cell_max] + "..."
            cells.append(text)
        lines.append(",".join(cells))

    final_text = "\n".join(lines)

    # Global character-budget safety check for the LLM context window.
    if len(final_text) > max_chars:
        return final_text[:max_chars] + "\n... [DATA TRUNCATED] ..."
    return final_text
def regexp(expr, item):
    """
    Case-insensitive, crash-proof REGEXP implementation for SQLite.

    Registered via ``conn.create_function("REGEXP", 2, regexp)`` so that
    queries may use ``col REGEXP 'pattern'``. NULL values, BLOB/bytes cells,
    and malformed patterns all yield False instead of raising, so a single
    bad cell never aborts the whole query.

    Args:
        expr: Regular-expression pattern supplied by the SQL query.
        item: Column value to test (may be None, bytes, or any scalar).

    Returns:
        bool: True when the pattern matches anywhere in the value.

    Example:
        regexp("[a-z0-9._%+-]+@[a-z0-9.-]+", "john.doe@example.com")  -> True
        regexp("[a-z0-9._%+-]+@[a-z0-9.-]+", None)                    -> False
    """
    # SQL NULL arrives as Python None: never a match.
    if item is None:
        return False

    try:
        # Coerce BLOB/bytes to text (ignoring undecodable bytes); everything
        # else is stringified so numeric columns can also be matched.
        if isinstance(item, bytes):
            haystack = item.decode("utf-8", errors="ignore")
        else:
            haystack = str(item)
        return bool(re.search(expr, haystack, re.IGNORECASE))
    except Exception:
        # Bad pattern or any other failure: fail closed, keep SQLite alive.
        return False
def normalize_sql(sql: str) -> str:
    """
    Strip Markdown artifacts from LLM-generated SQL.

    Removes a leading ``` or ```sql fence, a trailing ``` fence, a leading
    bare "sql" language token, and surrounding whitespace, leaving a string
    that SQLite can execute directly.

    Args:
        sql: Raw model output that should contain a SQL statement.

    Returns:
        str: The cleaned SQL text (falsy inputs are returned unchanged).

    Example:
        normalize_sql("```sql\\nSELECT * FROM users;\\n```")
        -> "SELECT * FROM users;"
    """
    if not sql:
        return sql

    body = sql.strip()

    # Opening fence: "```sql" (any case) or a bare "```".
    if body.lower().startswith("```sql"):
        body = body[6:].strip()
    elif body.startswith("```"):
        body = body[3:].strip()

    # Closing fence at the very end.
    if body.endswith("```"):
        body = body[:-3].strip()

    # A stray "sql" language token the fence removal may have exposed.
    if body.lower().startswith("sql"):
        body = body[3:].strip()

    return body
def safe_json_loads(text: str, default):
    """
    Parse JSON from LLM output, tolerating Markdown wrapping.

    Handles text wrapped in ``` fences, a leading "json" language token,
    and stray whitespace. Parsing failures are logged and swallowed so the
    caller always receives a usable value.

    Args:
        text: Raw model output expected to contain a JSON payload.
        default: Fallback value returned when no valid JSON can be parsed.

    Returns:
        Any: The parsed Python object, or ``default`` on failure.

    Example:
        safe_json_loads('```json\\n{"found": true}\\n```', default={})
        -> {"found": True}
    """
    # NOTE: requires the module-level `import json` (the original module
    # only imported `re`, so json.loads raised NameError at runtime).
    if not text:
        return default

    text = text.strip()

    # Keep only the payload between the first pair of ``` fences.
    if text.startswith("```"):
        parts = text.split("```")
        if len(parts) >= 2:
            text = parts[1].strip()

    # Drop a leading 'json' language token left over from the fence.
    if text.lower().startswith("json"):
        text = text[4:].strip()

    try:
        return json.loads(text)
    except Exception as e:
        print("[JSON PARSE ERROR]")
        print("RAW:", repr(text))
        print("ERROR:", e)
        return default
def split_union_selects(sql: str) -> list[str]:
    """
    Break a compound query into its individual SELECT statements.

    Splits on UNION / UNION ALL (case-insensitive, whole-word) after
    collapsing all runs of whitespace to single spaces.

    Args:
        sql: A query that may combine several SELECTs with UNION [ALL].

    Returns:
        list[str]: The component SELECT statements, trimmed, in order.
        Empty fragments are dropped.

    Example:
        split_union_selects("SELECT email FROM users UNION ALL "
                            "SELECT handle FROM accounts")
        -> ["SELECT email FROM users", "SELECT handle FROM accounts"]
    """
    # Collapse newlines/tabs/repeated spaces so the split pattern is simple.
    collapsed = " ".join(sql.split())

    # Whole-word UNION, optionally followed by ALL, in any case.
    fragments = re.split(r"\bUNION(?:\s+ALL)?\b", collapsed, flags=re.IGNORECASE)

    return [fragment.strip() for fragment in fragments if fragment.strip()]
def extract_select_columns(select_sql: str) -> list[str]:
    """
    Pull the projected column names (or aliases) out of one SELECT statement.

    Expects an explicit projection list (not ``SELECT *``). For each
    comma-separated item, an ``AS alias`` wins; otherwise the final
    whitespace-separated token is taken as the column name.

    Args:
        select_sql: A single SELECT statement such as
            "SELECT col, col2 AS alias FROM table".

    Returns:
        list[str]: Column names/aliases in SELECT-clause order, or [] when
        no SELECT ... FROM projection can be found.

    Example:
        extract_select_columns("SELECT email, username AS user FROM users")
        -> ["email", "user"]
    """
    # Grab everything between SELECT and the first FROM (lazy, across lines).
    projection = re.search(
        r"SELECT\s+(.*?)\s+FROM\s",
        select_sql,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if projection is None:
        return []

    names: list[str] = []
    for raw in projection.group(1).split(","):
        expr = raw.strip()
        # Explicit "AS alias" takes precedence over the raw expression.
        alias = re.search(r"\bAS\s+(\w+)$", expr, re.IGNORECASE)
        if alias is not None:
            names.append(alias.group(1))
        else:
            # Fall back to the last identifier (handles implicit aliases
            # and qualified names like t.email).
            names.append(expr.split()[-1])
    return names

0
user4.db Normal file
View File