{ "cells": [ { "cell_type": "markdown", "id": "0be1ee8e", "metadata": {}, "source": [ "uing bnl environment\n", "https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n", "\n", "pip uninstall -y langchain langchain-core langchain-openai langgraph langgraph-prebuilt langgraph-checkpoint langgraph-sdk langsmith langchain-community langchain-google-genai langchain-text-splitters\n", "\n", "pip install langchain==1.2.0 langchain-core==1.2.2 langchain-openai==1.1.4 langgraph==1.0.5 langgraph-prebuilt==1.0.5 langgraph-checkpoint==3.0.1 langgraph-sdk==0.3.0 langsmith==0.5.0" ] }, { "cell_type": "code", "execution_count": 17, "id": "2648a1f1", "metadata": {}, "outputs": [], "source": [ "# only for find models\n", "# import google.generativeai as genai\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "a10c9a6a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OK\n" ] } ], "source": [ "# https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n", "\n", "import os\n", "from dotenv import load_dotenv\n", "from langchain_openai import ChatOpenAI\n", "from langchain_core.messages import HumanMessage\n", "from sql_utils import *\n", "\n", "\n", "load_dotenv() # This looks for the .env file and loads it into os.environ\n", "\n", "llm = ChatOpenAI(\n", " model=\"gpt-4o-mini\", # recommended for tools + cost\n", " api_key=os.environ[\"API_KEY\"],\n", " temperature=0\n", ")\n", "\n", "response = llm.invoke([\n", " HumanMessage(content=\"Reply with exactly: OK\")\n", "])\n", "\n", "print(response.content)\n", "\n", "# DB_PATH = r\"msgstore.db\"\n", "DB_PATH = r\"users4.db\"\n", "# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.whatsapp\\databases\\stickers.db\"\n", "# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.android.vending\\databases\\localappstate.db\"\n", "\n", "ENTITY_CONFIG = {\n", " \"EMAIL\": {\n", " \"regex\": r\"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\",\n", " \"desc\": \"valid electronic mail formats used for account registration or contact\"\n", " },\n", " \"PHONE\": {\n", " \"regex\": r\"\\+?[0-9]{1,4}[- .]?\\(?[0-9]{1,3}?\\)?[- .]?[0-9]{1,4}[- .]?[0-9]{1,4}[- .]?[0-9]{1,9}\",\n", " \"desc\": \"international or local telephone numbers\"\n", " },\n", " \"USERNAME\": {\n", " \"regex\": r\"\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b\",\n", " \"desc\": \"application-specific usernames, handles, or account identifiers\"\n", " },\n", " \"PERSON_NAME\": {\n", " \"regex\": r\"[A-Za-z\\u4e00-\\u9fff][A-Za-z\\u4e00-\\u9fff\\s\\.\\-]{1,50}\",\n", " \"desc\": (\n", " \"loosely structured human name-like strings used only for discovery \"\n", " \"and column pre-filtering; final identification is performed during extraction\"\n", " )\n", " }\n", "}\n", "\n" ] }, { "cell_type": "code", "execution_count": 19, "id": "48eda3ec", "metadata": {}, "outputs": [], "source": [ "# Core Python\n", "import sqlite3\n", "import re\n", "import json\n", "from typing import TypedDict, Optional, List, Annotated\n", "from langgraph.graph.message import add_messages\n", "\n", "# LangChain / LangGraph\n", "from langchain_core.tools import tool\n", "from langchain_core.messages import (\n", " HumanMessage,\n", " AIMessage,\n", " SystemMessage\n", ")\n", "from langchain.agents import create_agent\n", "from langgraph.graph import StateGraph, END\n", "from langgraph.graph.message import MessagesState\n", "\n", "\n", "@tool\n", "def list_tables() -> str:\n", " \"\"\"\n", " List all table names in the SQLite database.\n", " \"\"\"\n", " conn = sqlite3.connect(DB_PATH)\n", " cur = conn.cursor()\n", " cur.execute(\"SELECT name FROM sqlite_master WHERE type='table'\")\n", " tables = [r[0] for r in cur.fetchall()]\n", " conn.close()\n", " return \", \".join(tables)\n", "\n", "\n", "@tool\n", "def get_schema(table: str) -> str:\n", " \"\"\"\n", " Return column names and types for a table.\n", " \"\"\"\n", " conn = sqlite3.connect(DB_PATH)\n", " cur = conn.cursor()\n", " cur.execute(f\"PRAGMA table_info('{table}')\")\n", " cols = cur.fetchall()\n", " conn.close()\n", " return \", \".join(f\"{c[1]} {c[2]}\" for c in cols)\n", "\n", "\n", "\n", "\n", "@tool\n", "def exec_sql(query: str) -> dict:\n", " \"\"\"Execute SQL statements. If one fails, it is skipped and the next is executed.\"\"\"\n", " query_text = normalize_sql(query)\n", "\n", " # 1. Parse column names from ALL SELECTs\n", " column_names = []\n", " for select_sql in split_union_selects(query_text):\n", " for col in extract_select_columns(select_sql):\n", " if col not in column_names:\n", " column_names.append(col)\n", "\n", " # 2. Execute once\n", " conn = sqlite3.connect(DB_PATH)\n", " conn.create_function(\"REGEXP\", 2, regexp)\n", " cur = conn.cursor()\n", "\n", " try:\n", " print(f\"[EXECUTE] Running query\")\n", " cur.execute(query_text)\n", " rows = cur.fetchall()\n", " except Exception as e:\n", " print(f\"[SQL ERROR]: {e}\")\n", " rows = []\n", " finally:\n", " conn.close()\n", "\n", " return {\n", " \"rows\": rows,\n", " \"columns\": column_names\n", " }\n", "\n", "\n", "\n", "\n", "class EmailEvidenceState(TypedDict):\n", " messages: Annotated[list, add_messages]\n", " attempt: int\n", " max_attempts: int\n", " phase: str # \"discovery\" | \"extraction\"\n", " sql: Optional[str] # SQL to execute\n", " rows: Optional[List]\n", " classification: Optional[dict]\n", " evidence: Optional[List[str]]\n", " target_entity: str # <--- ADD THIS LINE \n", " # Add this to track the \"winning\" columns\n", " source_columns: Optional[List[dict]]\n", "\n", " # SQL used during discovery that returned results\n", " discovered_sql: Optional[List[str]]\n", "\n", "def get_discovery_system(target, regex):\n", " return SystemMessage(\n", " content=(\n", " \"You are a SQL planner. You are provided databases that are extracted from Android or iOS devices.\\n\"\n", " f\"Goal: discover if any column contains {target}.\\n\\n\"\n", " \"Rules:\\n\"\n", " \"- Use 'REGEXP' for pattern matching.\\n\"\n", " f\"- Example: SELECT col FROM table WHERE col REGEXP '{regex}' LIMIT 5\\n\"\n", " \"- Validate your SQL and make sure all tables and columns do exist.\\n\"\n", " \"- If multiple SQL statements are provided, combine them using UNION ALL.\\n\"\n", " \"- Return ONLY SQL.\"\n", " )\n", " )\n", "\n", " \n", "def get_sql_upgrade_system(target):\n", " return SystemMessage(\n", " content=(\n", " \"You are a SQL refiner.\\n\"\n", " f\"Goal: modify previously successful SQL to extract ALL {target}.\\n\\n\"\n", " \"Rules:\\n\"\n", " \"- Do NOT invent new tables or columns.\\n\"\n", " \"- Remove LIMIT clauses.\\n\"\n", " \"- Preserve WHERE conditions and REGEXP filters.\\n\"\n", " \"- Return ONLY SQL.\"\n", " )\n", " )\n", "\n", "\n", "def planner(state: EmailEvidenceState):\n", " # Extraction upgrade path\n", " if state[\"phase\"] == \"extraction\" and state.get(\"discovered_sql\"):\n", " system = get_sql_upgrade_system(state[\"target_entity\"])\n", " joined_sql = \"\\nUNION ALL\\n\".join(state[\"discovered_sql\"])\n", "\n", " result = llm.invoke([\n", " system,\n", " HumanMessage(content=f\"Original SQL:\\n{joined_sql}\")\n", " ])\n", "\n", " sql = normalize_sql(result.content)\n", "\n", " print(\"[PLANNER] Upgraded SQL for extraction\")\n", "\n", " return {\n", " \"messages\": [AIMessage(content=sql)],\n", " \"sql\": sql\n", " }\n", "\n", " # Original discovery logic\n", " tables = list_tables.invoke({})\n", " config = ENTITY_CONFIG[state[\"target_entity\"]]\n", "\n", " base_system = get_discovery_system(\n", " state[\"target_entity\"],\n", " config[\"regex\"]\n", " )\n", "\n", " grounded_content = (\n", " f\"{base_system.content}\\n\\n\"\n", " f\"EXISTING TABLES: {tables}\\n\"\n", " f\"CURRENT PHASE: {state['phase']}\\n\"\n", " \"CRITICAL: Do not query non-existent tables.\"\n", " )\n", "\n", " agent = create_agent(llm, [list_tables, get_schema])\n", " result = agent.invoke({\n", " \"messages\": [SystemMessage(content=grounded_content)] + state[\"messages\"]\n", " })\n", "\n", " sql = normalize_sql(result[\"messages\"][-1].content)\n", "\n", " attempt = state[\"attempt\"] + 1 if state[\"phase\"] == \"discovery\" else state[\"attempt\"]\n", "\n", " return {\n", " \"messages\": [AIMessage(content=sql)],\n", " \"sql\": sql,\n", " \"attempt\": attempt\n", " }\n", "\n", "\n", "def sql_execute(state: EmailEvidenceState):\n", " # Call the tool (it now returns a dict)\n", " result = exec_sql.invoke(state[\"sql\"])\n", " \n", " rows = result.get(\"rows\", [])\n", " cols = result.get(\"columns\", [])\n", "\n", " print(f\"[SQL EXEC] Retrieved {(rows)}\")\n", " updates = {\n", " \"rows\": rows,\n", " \"messages\": [AIMessage(content=f\"Retrieved {len(rows)} rows\")]\n", " }\n", "\n", " if state[\"phase\"] == \"discovery\" and rows:\n", " discovered = list(state.get(\"discovered_sql\", []))\n", " discovered.append(state[\"sql\"])\n", " updates[\"discovered_sql\"] = discovered\n", " print(\"[DISCOVERY] Saved successful SQL\")\n", "\n", " # Tracking logic: Save columns to state only during extraction\n", " if state[\"phase\"] == \"extraction\":\n", " updates[\"source_columns\"] = cols\n", " print(f\"[TRACKING] Saved source columns: {cols}\")\n", "\n", " return updates\n", " \n", "\n", "def get_classify_system(target: str):\n", " return SystemMessage(\n", " content=(\n", " f\"Decide whether the text contains {target}.\\n\"\n", " \"Return ONLY a JSON object with these keys:\\n\"\n", " \"{ \\\"found\\\": true/false, \\\"confidence\\\": number, \\\"reason\\\": \\\"string\\\" }\"\n", " )\n", " )\n", "\n", "def classify(state: EmailEvidenceState):\n", " # 1. Prepare the text sample for the LLM\n", " text = rows_to_text(state[\"rows\"], limit=10)\n", " \n", " # 2. Get the target-specific system message\n", " system_message = get_classify_system(state[\"target_entity\"])\n", "\n", " # 3. Invoke the LLM\n", " result = llm.invoke([\n", " system_message,\n", " HumanMessage(content=f\"Data to analyze:\\n{text}\")\n", " ]).content\n", " \n", "# 4. Parse the decision\n", " decision = safe_json_loads(\n", " result,\n", " default={\"found\": False, \"confidence\": 0.0, \"reason\": \"parse failure\"}\n", " )\n", "\n", " # print(\"[CLASSIFY]\", decision)\n", " return {\"classification\": decision}\n", "\n", "\n", "def switch_to_extraction(state: EmailEvidenceState):\n", " print(\"[PHASE] discovery → extraction\")\n", " return {\"phase\": \"extraction\"}\n", "\n", "\n", "\n", "\n", "def extract(state: EmailEvidenceState):\n", " text = rows_to_text(state[\"rows\"])\n", " system = f\"Extract and normalize {state['target_entity']} from text. Return ONLY a JSON array of strings.\"\n", " result = llm.invoke([SystemMessage(content=system), HumanMessage(content=text)]).content\n", " return {\"evidence\": safe_json_loads(result, default=[])}\n", "\n", "\n", "def next_step(state: EmailEvidenceState):\n", " # Once in extraction phase, extract and stop\n", " if state[\"phase\"] == \"extraction\":\n", " return \"do_extract\"\n", "\n", " c = state[\"classification\"]\n", "\n", " if c[\"found\"] and c[\"confidence\"] >= 0.6:\n", " return \"to_extraction\"\n", "\n", " if not c[\"found\"] and c[\"confidence\"] >= 0.6:\n", " return \"stop_none\"\n", "\n", " if state[\"attempt\"] >= state[\"max_attempts\"]:\n", " return \"stop_limit\"\n", "\n", " return \"replan\"" ] }, { "cell_type": "code", "execution_count": 20, "id": "0f5259d7", "metadata": {}, "outputs": [], "source": [ "def observe(state: EmailEvidenceState):\n", " \"\"\"\n", " Debug / inspection node.\n", " Does NOT modify state.\n", " \"\"\"\n", "\n", " print(\"\\n=== STATE SNAPSHOT ===\")\n", "\n", " # Messages\n", " print(\"\\n--- MESSAGES ---\")\n", " for i, m in enumerate(state[\"messages\"]):\n", " print(f\"{i}: {m.type.upper()} -> {m.content}\")\n", "\n", " # Metadata\n", " print(\"\\n--- BEGIN METADATA ---\")\n", " print(f\"attempt : {state['attempt']}\")\n", " print(f\"max_attempts : {state['max_attempts']}\")\n", " print(f\"phase : {state['phase']}\")\n", " print(f\"sql : {state['sql']}\")\n", " print(f\"discovered sql : {state['discovered_sql']}\")\n", " print(f\"rows : {state['rows']}\")\n", " print(f\"classification: {state['classification']}\")\n", " print(f\"evidence : {state['evidence']}\")\n", " \n", " print(f\"Source Columns: {state.get('source_columns')}\")\n", " print(\"\\n--- END METADATA ---\")\n", "\n", " # IMPORTANT: do not return state, return no-op update\n", " return {}\n", "\n", "\n", "\n", "from langgraph.graph import StateGraph, END\n", "\n", "graph = StateGraph(EmailEvidenceState)\n", "\n", "# Define nodes (reusing the same 'observe' function for two different node names)\n", "graph.add_node(\"planner\", planner)\n", "graph.add_node(\"observe_plan\", observe) # Checkpoint 1: The SQL Plan\n", "graph.add_node(\"execute\", sql_execute)\n", "graph.add_node(\"classify\", classify)\n", "graph.add_node(\"observe_classify\", observe) # Checkpoint 2: The Logic/Discovery\n", "graph.add_node(\"switch_phase\", switch_to_extraction)\n", "graph.add_node(\"extract\", extract)\n", "graph.add_node(\"observe_final\", observe) # Checkpoint 3: The Results\n", "\n", "graph.set_entry_point(\"planner\")\n", "\n", "# --- THE FLOW ---\n", "graph.add_edge(\"planner\", \"observe_plan\") # Check SQL before running\n", "graph.add_edge(\"observe_plan\", \"execute\")\n", "\n", "graph.add_edge(\"execute\", \"classify\")\n", "graph.add_edge(\"classify\", \"observe_classify\")\n", "\n", "# The decision logic now triggers after the second observation\n", "graph.add_conditional_edges(\n", " \"observe_classify\", # Must match the new node name exactly\n", " next_step,\n", " {\n", " \"to_extraction\": \"switch_phase\",\n", " \"do_extract\": \"extract\",\n", " \"replan\": \"planner\",\n", " \"stop_none\": END,\n", " \"stop_limit\": END,\n", " }\n", ")\n", "\n", "graph.add_edge(\"switch_phase\", \"planner\")\n", "\n", "# Change this: Route 'extract' to our new observer instead of END\n", "graph.add_edge(\"extract\", \"observe_final\")\n", "graph.add_edge(\"observe_final\", END)\n", "\n", "app = graph.compile()\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 23, "id": "0b1fce49", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "=== STATE SNAPSHOT ===\n", "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find USERNAME in the database\n", "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "\n", "--- BEGIN METADATA ---\n", "attempt : 1\n", "max_attempts : 3\n", "phase : discovery\n", "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "discovered sql : []\n", "rows : None\n", "classification: None\n", "evidence : []\n", "Source Columns: []\n", "\n", "--- END METADATA ---\n", "[EXECUTE] Running query\n", "[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", "[DISCOVERY] Saved successful SQL\n", "\n", "=== STATE SNAPSHOT ===\n", "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find USERNAME in the database\n", "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "2: AI -> Retrieved 11 rows\n", "\n", "--- BEGIN METADATA ---\n", "attempt : 1\n", "max_attempts : 3\n", "phase : discovery\n", "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", "classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n", "evidence : []\n", "Source Columns: []\n", "\n", "--- END METADATA ---\n", "[PHASE] discovery → extraction\n", "[PLANNER] Upgraded SQL for extraction\n", "\n", "=== STATE SNAPSHOT ===\n", "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find USERNAME in the database\n", "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "2: AI -> Retrieved 11 rows\n", "3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "\n", "--- BEGIN METADATA ---\n", "attempt : 1\n", "max_attempts : 3\n", "phase : extraction\n", "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", "classification: {'found': True, 'confidence': 0.95, 'reason': 'The text contains multiple email addresses, which are typically associated with usernames.'}\n", "evidence : []\n", "Source Columns: []\n", "\n", "--- END METADATA ---\n", "[EXECUTE] Running query\n", "[SQL EXEC] Retrieved [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", "[TRACKING] Saved source columns: ['email', 'content']\n", "\n", "=== STATE SNAPSHOT ===\n", "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find USERNAME in the database\n", "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "2: AI -> Retrieved 11 rows\n", "3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "4: AI -> Retrieved 11 rows\n", "\n", "--- BEGIN METADATA ---\n", "attempt : 1\n", "max_attempts : 3\n", "phase : extraction\n", "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", "classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n", "evidence : []\n", "Source Columns: ['email', 'content']\n", "\n", "--- END METADATA ---\n", "\n", "=== STATE SNAPSHOT ===\n", "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find USERNAME in the database\n", "1: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "2: AI -> Retrieved 11 rows\n", "3: AI -> SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "4: AI -> Retrieved 11 rows\n", "\n", "--- BEGIN METADATA ---\n", "attempt : 1\n", "max_attempts : 3\n", "phase : extraction\n", "sql : SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b'\n", "UNION ALL\n", "SELECT content FROM messages WHERE content REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b';\n", "discovered sql : [\"SELECT email FROM users WHERE email REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b'\\nUNION ALL\\nSELECT content FROM messages WHERE content REGEXP '\\\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\\\b';\"]\n", "rows : [('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('emma.wilson@example.com',), ('Hey Brian, can you send that file to brian.smith@example.com? Thanks!',), ('Meeting confirmed for tomorrow morning.',), ('I lost my login. Please reset for carol.davis@example.com ASAP.',), ('Does anyone have the contact for support? Is it support@company.io?',), ('Great job on the presentation! Send the notes to emma.wilson@example.com.',), ('Standard message with no email content here.',), ('Verify this secondary address: private_carol@gmail.com for my records.',)]\n", "classification: {'found': True, 'confidence': 0.9, 'reason': 'The text contains multiple email addresses, which are often associated with usernames.'}\n", "evidence : ['alice.johnson', 'brian.smith', 'carol.davis', 'emma.wilson', 'private_carol']\n", "Source Columns: ['email', 'content']\n", "\n", "--- END METADATA ---\n", "\n", "========================================\n", " 🏁 FORENSIC REPORT: USERNAME \n", "========================================\n", "✅ Success! Found 5 unique USERNAME:\n", " 1. alice.johnson\n", " 2. brian.smith\n", " 3. carol.davis\n", " 4. emma.wilson\n", " 5. private_carol\n", "\n", "Source Columns: email, content\n", "========================================\n" ] } ], "source": [ "\n", "# Set your target here once\n", "# TARGET = \"EMAIL\" \n", "# TARGET = \"PHONE\"\n", "TARGET = \"USERNAME\"\n", "# TARGET = \"PERSON_NAME\"\n", "\n", "result = app.invoke({\n", " \"messages\": [HumanMessage(content=f\"Find {TARGET} in the database\")],\n", " \"attempt\": 0,\n", " \"max_attempts\": 3,\n", " \"phase\": \"discovery\",\n", " \"target_entity\": TARGET, # CRITICAL: This tells the planner what to look for\n", " \"sql\": None,\n", " \"rows\": None,\n", " \"classification\": None,\n", " \"evidence\": [], # Use the new generic key\n", " \"source_columns\": [],\n", " \"discovered_sql\": [] \n", "})\n", "\n", "# Use the generic 'evidence' key we defined in the state\n", "final_evidence = result.get(\"evidence\", [])\n", "target_label = result.get(\"target_entity\", \"items\")\n", "\n", "print(\"\\n\" + \"=\"*40)\n", "print(f\" 🏁 FORENSIC REPORT: {target_label.upper()} \")\n", "print(\"=\"*40)\n", "\n", "if final_evidence:\n", " print(f\"✅ Success! Found {len(final_evidence)} unique {target_label}:\")\n", " for i, item in enumerate(sorted(final_evidence), 1):\n", " print(f\" {i}. {item}\")\n", " \n", " # Also print the source columns we tracked!\n", " sources = result.get(\"source_columns\")\n", " if sources:\n", " print(f\"\\nSource Columns: {', '.join(sources)}\")\n", "else:\n", " print(f\"❌ No {target_label} were extracted.\")\n", " print(f\"Last Phase : {result.get('phase')}\")\n", " print(f\"Attempts : {result.get('attempt')}\")\n", "\n", "print(\"=\"*40)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d24e126b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 5 }