diff --git a/.env b/.env new file mode 100644 index 0000000..e1953bc --- /dev/null +++ b/.env @@ -0,0 +1 @@ +API_KEY=sk-proj-yourkeyhere \ No newline at end of file diff --git a/agent_evidence_discovery.ipynb b/agent_evidence_discovery.ipynb new file mode 100644 index 0000000..691f5f4 --- /dev/null +++ b/agent_evidence_discovery.ipynb @@ -0,0 +1,700 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0be1ee8e", + "metadata": {}, + "source": [ + "uing bnl environment\n", + "https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n", + "\n", + "pip uninstall -y langchain langchain-core langchain-openai langgraph langgraph-prebuilt langgraph-checkpoint langgraph-sdk langsmith langchain-community langchain-google-genai langchain-text-splitters\n", + "\n", + "pip install langchain==1.2.0 langchain-core==1.2.2 langchain-openai==1.1.4 langgraph==1.0.5 langgraph-prebuilt==1.0.5 langgraph-checkpoint==3.0.1 langgraph-sdk==0.3.0 langsmith==0.5.0" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2648a1f1", + "metadata": {}, + "outputs": [], + "source": [ + "# only for find models\n", + "# import google.generativeai as genai\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a10c9a6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OK\n" + ] + } + ], + "source": [ + "# https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n", + "\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from langchain_openai import ChatOpenAI\n", + "from langchain_core.messages import HumanMessage\n", + "from sql_utils import *\n", + "\n", + "\n", + "load_dotenv() # This looks for the .env file and loads it into os.environ\n", + "\n", + "llm = ChatOpenAI(\n", + " model=\"gpt-4o-mini\", # recommended for tools + cost\n", + " api_key=os.environ[\"API_KEY\"],\n", + " temperature=0\n", + 
")\n", + "\n", + "response = llm.invoke([\n", + " HumanMessage(content=\"Reply with exactly: OK\")\n", + "])\n", + "\n", + "print(response.content)\n", + "\n", + "# DB_PATH = r\"msgstore.db\"\n", + "DB_PATH = r\"users4.db\"\n", + "# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.whatsapp\\databases\\stickers.db\"\n", + "# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.android.vending\\databases\\localappstate.db\"\n", + "\n", + "ENTITY_CONFIG = {\n", + " \"EMAIL\": {\n", + " \"regex\": r\"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\",\n", + " \"desc\": \"valid electronic mail formats used for account registration or contact\"\n", + " },\n", + " \"PHONE\": {\n", + " \"regex\": r\"\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}\",\n", + " \"desc\": \"international or local telephone numbers\"\n", + " },\n", + " \"USERNAME\": {\n", + " \"regex\": r\"\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b\",\n", + " \"desc\": \"application-specific usernames, handles, or account identifiers\"\n", + " },\n", + " \"PERSON_NAME\": {\n", + " \"regex\": r\"[A-Za-z\\u4e00-\\u9fff][A-Za-z\\u4e00-\\u9fff\\s\\.\\-]{1,50}\",\n", + " \"desc\": (\n", + " \"loosely structured human name-like strings used only for discovery \"\n", + " \"and column pre-filtering; final identification is performed during extraction\"\n", + " )\n", + " }\n", + "}\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "48eda3ec", + "metadata": {}, + "outputs": [], + "source": [ + "# Core Python\n", + "import sqlite3\n", + "import re\n", + "import json\n", + "from typing import TypedDict, Optional, List, Annotated\n", + "from langgraph.graph.message import add_messages\n", + "\n", + "# LangChain / LangGraph\n", + "from 
langchain_core.tools import tool\n", + "from langchain_core.messages import (\n", + " HumanMessage,\n", + " AIMessage,\n", + " SystemMessage\n", + ")\n", + "from langchain.agents import create_agent\n", + "from langgraph.graph import StateGraph, END\n", + "from langgraph.graph.message import MessagesState\n", + "\n", + "\n", + "@tool\n", + "def list_tables() -> str:\n", + " \"\"\"\n", + " List all table names in the SQLite database.\n", + " \"\"\"\n", + " conn = sqlite3.connect(DB_PATH)\n", + " cur = conn.cursor()\n", + " cur.execute(\"SELECT name FROM sqlite_master WHERE type='table'\")\n", + " tables = [r[0] for r in cur.fetchall()]\n", + " conn.close()\n", + " return \", \".join(tables)\n", + "\n", + "\n", + "@tool\n", + "def get_schema(table: str) -> str:\n", + " \"\"\"\n", + " Return column names and types for a table.\n", + " \"\"\"\n", + " conn = sqlite3.connect(DB_PATH)\n", + " cur = conn.cursor()\n", + " cur.execute(f\"PRAGMA table_info('{table}')\")\n", + " cols = cur.fetchall()\n", + " conn.close()\n", + " return \", \".join(f\"{c[1]} {c[2]}\" for c in cols)\n", + "\n", + "\n", + "\n", + "\n", + "@tool\n", + "def exec_sql(query: str) -> dict:\n", + " \"\"\"Execute SQL statements. If one fails, it is skipped and the next is executed.\"\"\"\n", + " query_text = normalize_sql(query)\n", + "\n", + " # 1. Parse column names from ALL SELECTs\n", + " column_names = []\n", + " for select_sql in split_union_selects(query_text):\n", + " for col in extract_select_columns(select_sql):\n", + " if col not in column_names:\n", + " column_names.append(col)\n", + "\n", + " # 2. 
Execute once\n", + " conn = sqlite3.connect(DB_PATH)\n", + " conn.create_function(\"REGEXP\", 2, regexp)\n", + " cur = conn.cursor()\n", + "\n", + " try:\n", + " print(f\"[EXECUTE] Running query\")\n", + " cur.execute(query_text)\n", + " rows = cur.fetchall()\n", + " except Exception as e:\n", + " print(f\"[SQL ERROR]: {e}\")\n", + " rows = []\n", + " finally:\n", + " conn.close()\n", + "\n", + " return {\n", + " \"rows\": rows,\n", + " \"columns\": column_names\n", + " }\n", + "\n", + "\n", + "\n", + "\n", + "class EmailEvidenceState(TypedDict):\n", + " messages: Annotated[list, add_messages]\n", + " attempt: int\n", + " max_attempts: int\n", + " phase: str # \"discovery\" | \"extraction\"\n", + " sql: Optional[str] # SQL to execute\n", + " rows: Optional[List]\n", + " classification: Optional[dict]\n", + " evidence: Optional[List[str]]\n", + " target_entity: str # <--- ADD THIS LINE \n", + " # Add this to track the \"winning\" columns\n", + " source_columns: Optional[List[dict]]\n", + "\n", + " # SQL used during discovery that returned results\n", + " discovered_sql: Optional[List[str]]\n", + "\n", + "def get_discovery_system(target, regex):\n", + " return SystemMessage(\n", + " content=(\n", + " \"You are a SQL planner. 
You are provided databases that are extracted from Android or iOS devices.\\n\"\n", + " f\"Goal: discover if any column contains {target}.\\n\\n\"\n", + " \"Rules:\\n\"\n", + " \"- Use 'REGEXP' for pattern matching.\\n\"\n", + " f\"- Example: SELECT col FROM table WHERE col REGEXP '{regex}' LIMIT 5\\n\"\n", + " \"- Validate your SQL and make sure all tables and columns do exist.\\n\"\n", + " \"- If multiple SQL statements are provided, combine them using UNION ALL.\\n\"\n", + " \"- Return ONLY SQL.\"\n", + " )\n", + " )\n", + "\n", + " \n", + "def get_sql_upgrade_system(target):\n", + " return SystemMessage(\n", + " content=(\n", + " \"You are a SQL refiner.\\n\"\n", + " f\"Goal: modify previously successful SQL to extract ALL {target}.\\n\\n\"\n", + " \"Rules:\\n\"\n", + " \"- Do NOT invent new tables or columns.\\n\"\n", + " \"- Remove LIMIT clauses.\\n\"\n", + " \"- Preserve WHERE conditions and REGEXP filters.\\n\"\n", + " \"- Return ONLY SQL.\"\n", + " )\n", + " )\n", + "\n", + "\n", + "def planner(state: EmailEvidenceState):\n", + " # Extraction upgrade path\n", + " if state[\"phase\"] == \"extraction\" and state.get(\"discovered_sql\"):\n", + " system = get_sql_upgrade_system(state[\"target_entity\"])\n", + " joined_sql = \"\\nUNION ALL\\n\".join(state[\"discovered_sql\"])\n", + "\n", + " result = llm.invoke([\n", + " system,\n", + " HumanMessage(content=f\"Original SQL:\\n{joined_sql}\")\n", + " ])\n", + "\n", + " sql = normalize_sql(result.content)\n", + "\n", + " print(\"[PLANNER] Upgraded SQL for extraction\")\n", + "\n", + " return {\n", + " \"messages\": [AIMessage(content=sql)],\n", + " \"sql\": sql\n", + " }\n", + "\n", + " # Original discovery logic\n", + " tables = list_tables.invoke({})\n", + " config = ENTITY_CONFIG[state[\"target_entity\"]]\n", + "\n", + " base_system = get_discovery_system(\n", + " state[\"target_entity\"],\n", + " config[\"regex\"]\n", + " )\n", + "\n", + " grounded_content = (\n", + " f\"{base_system.content}\\n\\n\"\n", + " 
f\"EXISTING TABLES: {tables}\\n\"\n", + " f\"CURRENT PHASE: {state['phase']}\\n\"\n", + " \"CRITICAL: Do not query non-existent tables.\"\n", + " )\n", + "\n", + " agent = create_agent(llm, [list_tables, get_schema])\n", + " result = agent.invoke({\n", + " \"messages\": [SystemMessage(content=grounded_content)] + state[\"messages\"]\n", + " })\n", + "\n", + " sql = normalize_sql(result[\"messages\"][-1].content)\n", + "\n", + " attempt = state[\"attempt\"] + 1 if state[\"phase\"] == \"discovery\" else state[\"attempt\"]\n", + "\n", + " return {\n", + " \"messages\": [AIMessage(content=sql)],\n", + " \"sql\": sql,\n", + " \"attempt\": attempt\n", + " }\n", + "\n", + "\n", + "def sql_execute(state: EmailEvidenceState):\n", + " # Call the tool (it now returns a dict)\n", + " result = exec_sql.invoke(state[\"sql\"])\n", + " \n", + " rows = result.get(\"rows\", [])\n", + " cols = result.get(\"columns\", [])\n", + "\n", + " print(f\"[SQL EXEC] Retrieved {(rows)}\")\n", + " updates = {\n", + " \"rows\": rows,\n", + " \"messages\": [AIMessage(content=f\"Retrieved {len(rows)} rows\")]\n", + " }\n", + "\n", + " if state[\"phase\"] == \"discovery\" and rows:\n", + " discovered = list(state.get(\"discovered_sql\", []))\n", + " discovered.append(state[\"sql\"])\n", + " updates[\"discovered_sql\"] = discovered\n", + " print(\"[DISCOVERY] Saved successful SQL\")\n", + "\n", + " # Tracking logic: Save columns to state only during extraction\n", + " if state[\"phase\"] == \"extraction\":\n", + " updates[\"source_columns\"] = cols\n", + " print(f\"[TRACKING] Saved source columns: {cols}\")\n", + "\n", + " return updates\n", + " \n", + "\n", + "def get_classify_system(target: str):\n", + " return SystemMessage(\n", + " content=(\n", + " f\"Decide whether the text contains {target}.\\n\"\n", + " \"Return ONLY a JSON object with these keys:\\n\"\n", + " \"{ \\\"found\\\": true/false, \\\"confidence\\\": number, \\\"reason\\\": \\\"string\\\" }\"\n", + " )\n", + " )\n", + "\n", + "def 
classify(state: EmailEvidenceState):\n", + " # 1. Prepare the text sample for the LLM\n", + " text = rows_to_text(state[\"rows\"], limit=10)\n", + " \n", + " # 2. Get the target-specific system message\n", + " system_message = get_classify_system(state[\"target_entity\"])\n", + "\n", + " # 3. Invoke the LLM\n", + " result = llm.invoke([\n", + " system_message,\n", + " HumanMessage(content=f\"Data to analyze:\\n{text}\")\n", + " ]).content\n", + " \n", + "# 4. Parse the decision\n", + " decision = safe_json_loads(\n", + " result,\n", + " default={\"found\": False, \"confidence\": 0.0, \"reason\": \"parse failure\"}\n", + " )\n", + "\n", + " # print(\"[CLASSIFY]\", decision)\n", + " return {\"classification\": decision}\n", + "\n", + "\n", + "def switch_to_extraction(state: EmailEvidenceState):\n", + " print(\"[PHASE] discovery β†’ extraction\")\n", + " return {\"phase\": \"extraction\"}\n", + "\n", + "\n", + "\n", + "\n", + "def extract(state: EmailEvidenceState):\n", + " text = rows_to_text(state[\"rows\"])\n", + " system = f\"Extract and normalize {state['target_entity']} from text. 
Return ONLY a JSON array of strings.\"\n", + " result = llm.invoke([SystemMessage(content=system), HumanMessage(content=text)]).content\n", + " return {\"evidence\": safe_json_loads(result, default=[])}\n", + "\n", + "\n", + "def next_step(state: EmailEvidenceState):\n", + " # Once in extraction phase, extract and stop\n", + " if state[\"phase\"] == \"extraction\":\n", + " return \"do_extract\"\n", + "\n", + " c = state[\"classification\"]\n", + "\n", + " if c[\"found\"] and c[\"confidence\"] >= 0.6:\n", + " return \"to_extraction\"\n", + "\n", + " if not c[\"found\"] and c[\"confidence\"] >= 0.6:\n", + " return \"stop_none\"\n", + "\n", + " if state[\"attempt\"] >= state[\"max_attempts\"]:\n", + " return \"stop_limit\"\n", + "\n", + " return \"replan\"" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0f5259d7", + "metadata": {}, + "outputs": [], + "source": [ + "def observe(state: EmailEvidenceState):\n", + " \"\"\"\n", + " Debug / inspection node.\n", + " Does NOT modify state.\n", + " \"\"\"\n", + "\n", + " print(\"\\n=== STATE SNAPSHOT ===\")\n", + "\n", + " # Messages\n", + " print(\"\\n--- MESSAGES ---\")\n", + " for i, m in enumerate(state[\"messages\"]):\n", + " print(f\"{i}: {m.type.upper()} -> {m.content}\")\n", + "\n", + " # Metadata\n", + " print(\"\\n--- BEGIN METADATA ---\")\n", + " print(f\"attempt : {state['attempt']}\")\n", + " print(f\"max_attempts : {state['max_attempts']}\")\n", + " print(f\"phase : {state['phase']}\")\n", + " print(f\"sql : {state['sql']}\")\n", + " print(f\"discovered sql : {state['discovered_sql']}\")\n", + " print(f\"rows : {state['rows']}\")\n", + " print(f\"classification: {state['classification']}\")\n", + " print(f\"evidence : {state['evidence']}\")\n", + " \n", + " print(f\"Source Columns: {state.get('source_columns')}\")\n", + " print(\"\\n--- END METADATA ---\")\n", + "\n", + " # IMPORTANT: do not return state, return no-op update\n", + " return {}\n", + "\n", + "\n", + "\n", + "from 
langgraph.graph import StateGraph, END\n", + "\n", + "graph = StateGraph(EmailEvidenceState)\n", + "\n", + "# Define nodes (reusing the same 'observe' function for two different node names)\n", + "graph.add_node(\"planner\", planner)\n", + "graph.add_node(\"observe_plan\", observe) # Checkpoint 1: The SQL Plan\n", + "graph.add_node(\"execute\", sql_execute)\n", + "graph.add_node(\"classify\", classify)\n", + "graph.add_node(\"observe_classify\", observe) # Checkpoint 2: The Logic/Discovery\n", + "graph.add_node(\"switch_phase\", switch_to_extraction)\n", + "graph.add_node(\"extract\", extract)\n", + "graph.add_node(\"observe_final\", observe) # Checkpoint 3: The Results\n", + "\n", + "graph.set_entry_point(\"planner\")\n", + "\n", + "# --- THE FLOW ---\n", + "graph.add_edge(\"planner\", \"observe_plan\") # Check SQL before running\n", + "graph.add_edge(\"observe_plan\", \"execute\")\n", + "\n", + "graph.add_edge(\"execute\", \"classify\")\n", + "graph.add_edge(\"classify\", \"observe_classify\")\n", + "\n", + "# The decision logic now triggers after the second observation\n", + "graph.add_conditional_edges(\n", + " \"observe_classify\", # Must match the new node name exactly\n", + " next_step,\n", + " {\n", + " \"to_extraction\": \"switch_phase\",\n", + " \"do_extract\": \"extract\",\n", + " \"replan\": \"planner\",\n", + " \"stop_none\": END,\n", + " \"stop_limit\": END,\n", + " }\n", + ")\n", + "\n", + "graph.add_edge(\"switch_phase\", \"planner\")\n", + "\n", + "# Change this: Route 'extract' to our new observer instead of END\n", + "graph.add_edge(\"extract\", \"observe_final\")\n", + "graph.add_edge(\"observe_final\", END)\n", + "\n", + "app = graph.compile()\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "0b1fce49", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== STATE SNAPSHOT ===\n", + "\n", + "--- MESSAGES ---\n", + "0: HUMAN -> Find USERNAME in 
the database\n", + "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "\n", + "--- BEGIN METADATA ---\n", + "attempt : 1\n", + "max_attempts : 3\n", + "phase : discovery\n", + "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "discovered sql : []\n", + "rows : None\n", + "classification: None\n", + "evidence : []\n", + "Source Columns: []\n", + "\n", + "--- END METADATA ---\n", + "[EXECUTE] Running query\n", + "[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! 
Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", + "[DISCOVERY] Saved successful SQL\n", + "\n", + "=== STATE SNAPSHOT ===\n", + "\n", + "--- MESSAGES ---\n", + "0: HUMAN -> Find USERNAME in the database\n", + "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "2: AI -> Retrieved 11 rows\n", + "\n", + "--- BEGIN METADATA ---\n", + "attempt : 1\n", + "max_attempts : 3\n", + "phase : discovery\n", + "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", + "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! 
Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", + "classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n", + "evidence : []\n", + "Source Columns: []\n", + "\n", + "--- END METADATA ---\n", + "[PHASE] discovery β†’ extraction\n", + "[PLANNER] Upgraded SQL for extraction\n", + "\n", + "=== STATE SNAPSHOT ===\n", + "\n", + "--- MESSAGES ---\n", + "0: HUMAN -> Find USERNAME in the database\n", + "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "2: AI -> Retrieved 11 rows\n", + "3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "\n", + "--- BEGIN METADATA ---\n", + "attempt : 1\n", + "max_attempts : 3\n", + "phase : extraction\n", + "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", + "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? 
Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", + "classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n", + "evidence : []\n", + "Source Columns: []\n", + "\n", + "--- END METADATA ---\n", + "[EXECUTE] Running query\n", + "[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", + "[TRACKING] Saved source columns: ['email', 'content']\n", + "\n", + "=== STATE SNAPSHOT ===\n", + "\n", + "--- MESSAGES ---\n", + "0: HUMAN -> Find USERNAME in the database\n", + "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "2: AI -> Retrieved 11 rows\n", + "3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "4: AI -> Retrieved 11 rows\n", + "\n", + "--- BEGIN METADATA ---\n", + "attempt : 1\n", + "max_attempts : 3\n", + "phase : extraction\n", + "sql : SELECT email FROM users WHERE email REGEXP 
'\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", + "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", + "classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n", + "evidence : []\n", + "Source Columns: ['email', 'content']\n", + "\n", + "--- END METADATA ---\n", + "\n", + "=== STATE SNAPSHOT ===\n", + "\n", + "--- MESSAGES ---\n", + "0: HUMAN -> Find USERNAME in the database\n", + "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "2: AI -> Retrieved 11 rows\n", + "3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "4: AI -> Retrieved 11 rows\n", + "\n", + "--- BEGIN METADATA ---\n", + "attempt : 1\n", + "max_attempts : 3\n", + "phase : extraction\n", + "sql : SELECT email FROM users WHERE email 
REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", + "UNION ALL\n", + "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", + "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", + "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", + "classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n", + "evidence : ['alice.johnson', 'brian.smith', 'carol.davis', 'emma.wilson', 'private_carol']\n", + "Source Columns: ['email', 'content']\n", + "\n", + "--- END METADATA ---\n", + "\n", + "========================================\n", + " 🏁 FORENSIC REPORT: USERNAME \n", + "========================================\n", + "βœ… Success! Found 5 unique USERNAME:\n", + " 1. alice.johnson\n", + " 2. brian.smith\n", + " 3. carol.davis\n", + " 4. emma.wilson\n", + " 5. 
private_carol\n", + "\n", + "Source Columns: email, content\n", + "========================================\n" + ] + } + ], + "source": [ + "\n", + "# Set your target here once\n", + "# TARGET = \"EMAIL\" \n", + "# TARGET = \"PHONE\"\n", + "TARGET = \"USERNAME\"\n", + "# TARGET = \"PERSON_NAME\"\n", + "\n", + "result = app.invoke({\n", + " \"messages\": [HumanMessage(content=f\"Find {TARGET} in the database\")],\n", + " \"attempt\": 0,\n", + " \"max_attempts\": 3,\n", + " \"phase\": \"discovery\",\n", + " \"target_entity\": TARGET, # CRITICAL: This tells the planner what to look for\n", + " \"sql\": None,\n", + " \"rows\": None,\n", + " \"classification\": None,\n", + " \"evidence\": [], # Use the new generic key\n", + " \"source_columns\": [],\n", + " \"discovered_sql\": [] \n", + "})\n", + "\n", + "# Use the generic 'evidence' key we defined in the state\n", + "final_evidence = result.get(\"evidence\", [])\n", + "target_label = result.get(\"target_entity\", \"items\")\n", + "\n", + "print(\"\\n\" + \"=\"*40)\n", + "print(f\" 🏁 FORENSIC REPORT: {target_label.upper()} \")\n", + "print(\"=\"*40)\n", + "\n", + "if final_evidence:\n", + " print(f\"βœ… Success! Found {len(final_evidence)} unique {target_label}:\")\n", + " for i, item in enumerate(sorted(final_evidence), 1):\n", + " print(f\" {i}. 
{item}\")\n", + " \n", + " # Also print the source columns we tracked!\n", + " sources = result.get(\"source_columns\")\n", + " if sources:\n", + " print(f\"\\nSource Columns: {', '.join(sources)}\")\n", + "else:\n", + " print(f\"❌ No {target_label} were extracted.\")\n", + " print(f\"Last Phase : {result.get('phase')}\")\n", + " print(f\"Attempts : {result.get('attempt')}\")\n", + "\n", + "print(\"=\"*40)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d24e126b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..d840073 --- /dev/null +++ b/readme.md @@ -0,0 +1,20 @@ +# LLM-Guided SQL Evidence Extraction + +This project implements a lightweight **LLM-assisted pipeline** for discovering and extracting evidentiary artifacts from SQLite databases commonly found in mobile device extractions. + +The system separates **discovery** and **extraction** to reduce search space, avoid hallucinated SQL, and preserve explainability. 
+
+## Features
+
+- LLM-guided SQL planning with deterministic execution
+- Discovery to extraction workflow
+- Fixed evidence types: `EMAIL`, `PHONE`, `USERNAME`, `PERSON_NAME`
+- Safe SQLite execution with REGEXP support
+- UNION / UNION ALL–aware column extraction
+- Transparent, inspectable state machine
+
+## Setup
+
+```bash
+pip install langchain langchain-openai langgraph python-dotenv
+```
diff --git a/sql_utils.py b/sql_utils.py
new file mode 100644
index 0000000..2968b25
--- /dev/null
+++ b/sql_utils.py
@@ -0,0 +1,233 @@
+import json
+import re
+
+
+def rows_to_text(rows, limit=None, max_chars=5000, cell_max=1000):
+    """
+    Converts SQL rows to text with safety limits for LLM context.
+    - limit: Max number of rows to process.
+    - max_chars: Hard limit for the total string length.
+    - cell_max: Max length for any single column value.
+    """
+    if not rows:
+        return ""
+
+    out = []
+    # 1. Row-level limiting
+    target_rows = rows[:limit] if limit else rows
+
+    for r in target_rows:
+        row_str = ",".join(
+            (str(c)[:cell_max] + "..." if len(str(c)) > cell_max else str(c))
+            for c in r if c is not None
+        )
+        out.append(row_str)
+
+    final_text = "\n".join(out)
+
+    # 2. Final global character limit safety check
+    if len(final_text) > max_chars:
+        return final_text[:max_chars] + "\n... [DATA TRUNCATED] ..."
+
+    # print(f"[ROWS TO TEXT] Input: {len(rows)} rows | Output: {len(final_text)} chars")
+    # Optional: print only the first 200 characters of the text to keep logs clean
+    # print(f"[PREVIEW]: {final_text[:200]}...")
+    return final_text
+
+
+def regexp(expr, item):
+    """
+    Safe regular expression matcher for SQLite REGEXP queries.
+
+    This function allows SQLite to apply regex matching on arbitrary column
+    values without raising exceptions. It safely handles NULL values, bytes
+    or BLOB data, and malformed inputs. The match is case-insensitive and
+    always fails gracefully instead of crashing the query engine.
+
+    Example:
+        # SQL:
+        # SELECT * FROM users WHERE email REGEXP '[a-z0-9._%+-]+@[a-z0-9.-]+';
+
+        regexp("[a-z0-9._%+-]+@[a-z0-9.-]+", "john.doe@example.com")
+        → True
+
+        regexp("[a-z0-9._%+-]+@[a-z0-9.-]+", None)
+        → False
+    """
+    # 1. Handle NULLs (None in Python)
+    if item is None:
+        return False
+
+    try:
+        # 2. Ensure item is a string (handles BLOBs/Bytes)
+        if isinstance(item, bytes):
+            item = item.decode('utf-8', errors='ignore')
+        else:
+            item = str(item)
+
+        # 3. Compile and search
+        return re.search(expr, item, re.IGNORECASE) is not None
+    except Exception:
+        # 4. If anything else goes wrong, don't crash SQLite
+        return False
+
+
+def normalize_sql(sql: str) -> str:
+    """
+    Normalize LLM-generated SQL into a clean, executable SQL string.
+
+    Input:
+        sql (str): A raw SQL string that may include Markdown code fences
+                   (``` or ```sql), leading language tokens (e.g. "sql"),
+                   or extra whitespace.
+
+    Output:
+        str: A cleaned SQL string with all formatting artifacts removed,
+             ready to be executed directly by SQLite.
+
+    Example:
+        Input:
+            ```sql
+            SELECT * FROM users;
+            ```
+
+        Output:
+            SELECT * FROM users;
+    """
+
+    if not sql:
+        return sql
+
+    sql = sql.strip()
+
+    # Remove ```sql or ``` fences
+    sql = re.sub(r"^```(?:sql)?", "", sql, flags=re.IGNORECASE).strip()
+    sql = re.sub(r"```$", "", sql).strip()
+
+    # Remove leading 'sql' token if present
+    if sql.lower().startswith("sql"):
+        sql = sql[3:].strip()
+
+    return sql
+
+def safe_json_loads(text: str, default):
+    """
+    Safely parse JSON from LLM-generated text.
+
+    Input:
+        text (str): A raw string that may contain JSON wrapped in Markdown
+                    code fences (```), prefixed with a language token
+                    (e.g. "json"), or include extra whitespace.
+        default: A fallback value to return if JSON parsing fails.
+
+    Output:
+        Any: The parsed Python object if valid JSON is found; otherwise
+             the provided default value.
+
+    Example:
+        Input:
+            ```json
+            { "found": true, "confidence": 0.85 }
+            ```
+
+        Output:
+            { "found": True, "confidence": 0.85 }
+    """
+    if not text:
+        return default
+
+    text = text.strip()
+
+    # Remove markdown fences
+    if text.startswith("```"):
+        parts = text.split("```")
+        if len(parts) >= 2:
+            text = parts[1].strip()
+
+    # Remove leading 'json' token
+    if text.lower().startswith("json"):
+        text = text[4:].strip()
+
+    try:
+        return json.loads(text)
+    except Exception as e:
+        print("[JSON PARSE ERROR]")
+        print("RAW:", repr(text))
+        print("ERROR:", e)
+        return default
+
+
+def split_union_selects(sql: str) -> list[str]:
+    """
+    Split a SQL query into individual SELECT statements joined by UNION or UNION ALL.
+
+    Input:
+        sql (str): A single SQL query string that may contain multiple SELECT
+                   statements combined using UNION or UNION ALL.
+
+    Output:
+        list[str]: A list of individual SELECT statement strings, with UNION
+                   keywords removed and whitespace normalized.
+
+    Example:
+        Input:
+            SELECT email FROM users
+            UNION ALL
+            SELECT handle FROM accounts
+
+        Output:
+            [
+                "SELECT email FROM users",
+                "SELECT handle FROM accounts"
+            ]
+    """
+    # Normalize spacing
+    sql = re.sub(r"\s+", " ", sql.strip())
+
+    # Split on UNION or UNION ALL, case-insensitive
+    parts = re.split(r"\bUNION(?:\s+ALL)?\b", sql, flags=re.IGNORECASE)
+    return [p.strip() for p in parts if p.strip()]
+
+def extract_select_columns(select_sql: str) -> list[str]:
+    """
+    Extract column names or column aliases from a single SELECT statement.
+
+    Input:
+        select_sql (str): A SQL SELECT statement containing an explicit
+                          projection list (no SELECT *), such as:
+                          "SELECT col, col2 AS alias FROM table".
+
+    Output:
+        list[str]: A list of column names or aliases in the order they appear
+                   in the SELECT clause.
+
+    Example:
+        Input:
+            SELECT email, username AS user FROM users
+
+        Output:
+            ["email", "user"]
+    """
+    m = re.search(
+        r"SELECT\s+(.*?)\s+FROM\s",
+        select_sql,
+        flags=re.IGNORECASE | re.DOTALL
+    )
+    if not m:
+        return []
+
+    select_list = m.group(1)
+
+    columns = []
+    for item in select_list.split(","):
+        item = item.strip()
+
+        # Handle aliases: col AS alias or col alias
+        alias_match = re.search(r"\bAS\s+(\w+)$", item, re.IGNORECASE)
+        if alias_match:
+            columns.append(alias_match.group(1))
+        else:
+            # Take the final identifier
+            columns.append(item.split()[-1])
+
+    return columns
diff --git a/user4.db b/user4.db
new file mode 100644
index 0000000..e69de29