mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
add automated process (folder level)
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,692 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0be1ee8e",
"metadata": {},
"source": [
"using the bnl environment\n",
"https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
"\n",
"pip uninstall -y langchain langchain-core langchain-openai langgraph langgraph-prebuilt langgraph-checkpoint langgraph-sdk langsmith langchain-community langchain-google-genai langchain-text-splitters\n",
"\n",
"pip install langchain==1.2.0 langchain-core==1.2.2 langchain-openai==1.1.4 langgraph==1.0.5 langgraph-prebuilt==1.0.5 langgraph-checkpoint==3.0.1 langgraph-sdk==0.3.0 langsmith==0.5.0"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2648a1f1",
"metadata": {},
"outputs": [],
"source": [
"# only for finding models\n",
"# import google.generativeai as genai\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a10c9a6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OK\n"
]
}
],
"source": [
"# https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
"\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_core.messages import HumanMessage\n",
"from sql_utils import *\n",
"\n",
"\n",
"load_dotenv()  # looks for the .env file and loads it into os.environ\n",
"\n",
"llm = ChatOpenAI(\n",
"    model=\"gpt-4o-mini\",  # recommended for tool calling + cost\n",
"    api_key=os.environ[\"API_KEY\"],\n",
"    temperature=0\n",
")\n",
"\n",
"response = llm.invoke([\n",
"    HumanMessage(content=\"Reply with exactly: OK\")\n",
"])\n",
"\n",
"print(response.content)\n",
"\n",
"DB_PATH = r\"msgstore.db\"\n",
"# DB_PATH = r\"users4.db\"\n",
"# DB_PATH = r\"test2.db\"\n",
"# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.whatsapp\\databases\\stickers.db\"\n",
"# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.android.vending\\databases\\localappstate.db\"\n",
"\n",
"ENTITY_CONFIG = {\n",
"    \"EMAIL\": {\n",
"        \"regex\": r\"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\",\n",
"        \"desc\": \"valid electronic mail formats used for account registration or contact\"\n",
"    },\n",
"    \"PHONE\": {\n",
"        \"regex\": r\"\\+?\\d[\\d\\s().-]{7,}\\d\",\n",
"        \"desc\": \"phone number–like strings used for discovery; normalization occurs during extraction\"\n",
"    },\n",
"    \"USERNAME\": {\n",
"        \"regex\": r\"\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b\",\n",
"        \"desc\": \"application-specific usernames, handles, or account identifiers\"\n",
"    },\n",
"    \"PERSON_NAME\": {\n",
"        \"regex\": r\"[A-Za-z][A-Za-z\\s\\.\\-]{1,50}\",\n",
"        \"desc\": (\n",
"            \"loosely structured human name-like strings used only for discovery \"\n",
"            \"and column pre-filtering; final identification is performed during extraction\"\n",
"        )\n",
"    }\n",
"}\n",
"\n"
]
},
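{
"cell_type": "code",
"execution_count": null,
"id": "regexp-helper-sketch",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (editor sketch, not part of the original notebook):\n",
"# `from sql_utils import *` above is assumed to supply a `regexp` helper,\n",
"# which exec_sql later registers via conn.create_function(\"REGEXP\", 2, regexp).\n",
"# SQLite ships no REGEXP implementation by default, and `X REGEXP Y` invokes\n",
"# regexp(Y, X), i.e. pattern first. A minimal adapter could look like:\n",
"import re\n",
"\n",
"def regexp(pattern, value):\n",
"    \"\"\"Return 1 if value matches pattern, else 0 (SQLite expects an int).\"\"\"\n",
"    if value is None:\n",
"        return 0\n",
"    return 1 if re.search(pattern, str(value)) else 0\n"
]
},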
{
"cell_type": "code",
"execution_count": null,
"id": "48eda3ec",
"metadata": {},
"outputs": [],
"source": [
"# Core Python\n",
"import sqlite3\n",
"import re\n",
"import json\n",
"from typing import TypedDict, Optional, List, Annotated\n",
"from langgraph.graph.message import add_messages\n",
"\n",
"# LangChain / LangGraph\n",
"from langchain_core.tools import tool\n",
"from langchain_core.messages import (\n",
"    HumanMessage,\n",
"    AIMessage,\n",
"    SystemMessage\n",
")\n",
"from langgraph.graph import StateGraph, END\n",
"\n",
"\n",
"@tool\n",
"def list_tables() -> str:\n",
"    \"\"\"\n",
"    List all table names in the SQLite database.\n",
"    \"\"\"\n",
"    conn = sqlite3.connect(DB_PATH)\n",
"    cur = conn.cursor()\n",
"    cur.execute(\"SELECT name FROM sqlite_master WHERE type='table'\")\n",
"    tables = [r[0] for r in cur.fetchall()]\n",
"    conn.close()\n",
"    return \", \".join(tables)\n",
"\n",
"\n",
"@tool\n",
"def get_schema(table: str) -> str:\n",
"    \"\"\"\n",
"    Return column names and types for a table.\n",
"    \"\"\"\n",
"    conn = sqlite3.connect(DB_PATH)\n",
"    cur = conn.cursor()\n",
"    cur.execute(f\"PRAGMA table_info('{table}')\")\n",
"    cols = cur.fetchall()\n",
"    conn.close()\n",
"    return \", \".join(f\"{c[1]} {c[2]}\" for c in cols)\n",
"\n",
"\n",
"@tool\n",
"def exec_sql(query: str) -> dict:\n",
"    \"\"\"Execute a SQL query (one or more SELECTs combined with UNION ALL) and return rows plus the parsed column names.\"\"\"\n",
"    query_text = normalize_sql(query)\n",
"\n",
"    # 1. Parse column names from ALL SELECTs\n",
"    column_names = []\n",
"    for select_sql in split_union_selects(query_text):\n",
"        for col in extract_select_columns(select_sql):\n",
"            if col not in column_names:\n",
"                column_names.append(col)\n",
"\n",
"    # 2. Execute once\n",
"    conn = sqlite3.connect(DB_PATH)\n",
"    conn.create_function(\"REGEXP\", 2, regexp)\n",
"    cur = conn.cursor()\n",
"\n",
"    try:\n",
"        print(\"[EXECUTE] Running query\")\n",
"        cur.execute(query_text)\n",
"        rows = cur.fetchall()\n",
"    except Exception as e:\n",
"        print(f\"[SQL ERROR]: {e}\")\n",
"        rows = []\n",
"    finally:\n",
"        conn.close()\n",
"\n",
"    return {\n",
"        \"rows\": rows,\n",
"        \"columns\": column_names\n",
"    }\n",
"\n",
"\n",
"class EmailEvidenceState(TypedDict):\n",
"    messages: Annotated[list, add_messages]\n",
"    attempt: int\n",
"    max_attempts: int\n",
"    phase: str  # \"discovery\" | \"extraction\"\n",
"    sql: Optional[str]  # SQL to execute\n",
"    rows: Optional[List]\n",
"    classification: Optional[dict]\n",
"    evidence: Optional[List[str]]\n",
"    target_entity: str  # entity type to search for, e.g. \"EMAIL\" or \"PHONE\"\n",
"    # Columns that produced matches (the \"winning\" columns)\n",
"    source_columns: Optional[List[dict]]\n",
"\n",
"    # SQL used during discovery that returned results\n",
"    discovered_sql: Optional[List[str]]\n",
"\n",
"\n",
"def get_discovery_system(target, regex):\n",
"    return SystemMessage(\n",
"        content=(\n",
"            \"You are a SQL planner. You are provided databases that were extracted from Android or iOS devices.\\n\"\n",
"            f\"Goal: discover whether any column in the databases contains {target}.\\n\\n\"\n",
"            \"Rules:\\n\"\n",
"            \"- Use 'REGEXP' for pattern matching.\\n\"\n",
"            f\"- Example: SELECT col FROM table WHERE col REGEXP '{regex}' LIMIT 10\\n\"\n",
"            f\"- Pay special attention to tables and/or columns related to message/chat/text; {target} may be embedded in that text.\\n\"\n",
"            \"- Validate your SQL and make sure all referenced tables and columns exist.\\n\"\n",
"            \"- If multiple SQL statements are needed, combine them using UNION ALL.\\n\"\n",
"            \"- Return ONLY SQL.\"\n",
"        )\n",
"    )\n",
"\n",
"\n",
"def get_sql_upgrade_system(target):\n",
"    return SystemMessage(\n",
"        content=(\n",
"            \"You are a SQL refiner.\\n\"\n",
"            f\"Goal: modify previously successful SQL to extract ALL {target}.\\n\\n\"\n",
"            \"Rules:\\n\"\n",
"            \"- Do NOT invent new tables or columns.\\n\"\n",
"            \"- Remove LIMIT clauses.\\n\"\n",
"            \"- Preserve WHERE conditions and REGEXP filters.\\n\"\n",
"            \"- Return ONLY SQL.\"\n",
"        )\n",
"    )\n",
"\n",
"\n",
"def select_relevant_tables(llm, all_tables: list[str], target: str) -> list[str]:\n",
"    \"\"\"\n",
"    Ask the LLM to select relevant tables by name only.\n",
"    No schema access, no tools, no loops.\n",
"    \"\"\"\n",
"    system = SystemMessage(\n",
"        content=(\n",
"            \"You are a digital forensics assistant.\\n\"\n",
"            f\"Target evidence type: {target}.\\n\\n\"\n",
"            \"From the list of table names below, select the tables \"\n",
"            \"likely to contain this evidence.\\n\\n\"\n",
"            \"Return ONLY a JSON array of table names.\\n\"\n",
"            \"If unsure, return an empty array.\"\n",
"        )\n",
"    )\n",
"\n",
"    result = llm.invoke([\n",
"        system,\n",
"        HumanMessage(content=\"\\n\".join(all_tables))\n",
"    ]).content\n",
"\n",
"    tables = safe_json_loads(result, default=[])\n",
"\n",
"    # Defensive cleanup\n",
"    if not isinstance(tables, list):\n",
"        return []\n",
"\n",
"    return [t for t in tables if t in all_tables]\n",
"\n",
"\n",
"def planner(state: EmailEvidenceState):\n",
"    # ---------- EXTRACTION PHASE: upgrade SQL ----------\n",
"    if state[\"phase\"] == \"extraction\" and state.get(\"discovered_sql\"):\n",
"        system = get_sql_upgrade_system(state[\"target_entity\"])\n",
"        joined_sql = \"\\nUNION ALL\\n\".join(state[\"discovered_sql\"])\n",
"\n",
"        sql = normalize_sql(\n",
"            llm.invoke([\n",
"                system,\n",
"                HumanMessage(content=f\"Original SQL:\\n{joined_sql}\")\n",
"            ]).content\n",
"        )\n",
"\n",
"        print(\"[PLANNER] Upgraded SQL for extraction\")\n",
"\n",
"        return {\n",
"            \"messages\": [AIMessage(content=sql)],\n",
"            \"sql\": sql\n",
"        }\n",
"\n",
"    # ---------- DISCOVERY PHASE ----------\n",
"    # 1. List tables once\n",
"    all_tables = [t.strip() for t in list_tables.invoke({}).split(\",\")]\n",
"\n",
"    # 2. Agent selects relevant tables by NAME ONLY\n",
"    selected_tables = select_relevant_tables(\n",
"        llm,\n",
"        all_tables,\n",
"        state[\"target_entity\"]\n",
"    )\n",
"\n",
"    # Fallback: ensure coverage\n",
"    if not selected_tables:\n",
"        selected_tables = all_tables[:10]\n",
"\n",
"    # 3. Fetch schema deterministically\n",
"    schemas = {\n",
"        table: get_schema.invoke({\"table\": table})\n",
"        for table in selected_tables\n",
"    }\n",
"\n",
"    # 4. Build grounded prompt\n",
"    config = ENTITY_CONFIG[state[\"target_entity\"]]\n",
"\n",
"    system = get_discovery_system(\n",
"        state[\"target_entity\"],\n",
"        config[\"regex\"]\n",
"    )\n",
"\n",
"    grounded_content = (\n",
"        f\"{system.content}\\n\\n\"\n",
"        f\"ALLOWED TABLES: {', '.join(selected_tables)}\\n\"\n",
"        f\"SCHEMA:\\n{json.dumps(schemas, indent=2)}\\n\"\n",
"        f\"CURRENT PHASE: {state['phase']}\\n\"\n",
"        \"CRITICAL: Use ONLY the tables and columns listed above.\"\n",
"    )\n",
"\n",
"    # 5. Single LLM call to generate SQL\n",
"    sql = normalize_sql(\n",
"        llm.invoke([\n",
"            SystemMessage(content=grounded_content),\n",
"            state[\"messages\"][0]  # original user request\n",
"        ]).content\n",
"    )\n",
"\n",
"    attempt = state[\"attempt\"] + 1\n",
"\n",
"    return {\n",
"        \"messages\": [AIMessage(content=sql)],\n",
"        \"sql\": sql,\n",
"        \"attempt\": attempt\n",
"    }\n",
"\n",
"\n",
"def sql_execute(state: EmailEvidenceState):\n",
"    # Call the tool (it returns a dict)\n",
"    result = exec_sql.invoke(state[\"sql\"])\n",
"\n",
"    rows = result.get(\"rows\", [])\n",
"    cols = result.get(\"columns\", [])\n",
"\n",
"    print(f\"[SQL EXEC] Retrieved {len(rows)} rows\")\n",
"    updates = {\n",
"        \"rows\": rows,\n",
"        \"messages\": [AIMessage(content=f\"Retrieved {len(rows)} rows\")]\n",
"    }\n",
"\n",
"    if state[\"phase\"] == \"discovery\" and rows:\n",
"        discovered = list(state.get(\"discovered_sql\", []))\n",
"        discovered.append(state[\"sql\"])\n",
"        updates[\"discovered_sql\"] = discovered\n",
"        print(\"[DISCOVERY] Saved successful SQL\")\n",
"\n",
"    # Tracking logic: save columns to state only during extraction\n",
"    if state[\"phase\"] == \"extraction\":\n",
"        updates[\"source_columns\"] = cols\n",
"        print(f\"[TRACKING] Saved source columns: {cols}\")\n",
"\n",
"    return updates\n",
"\n",
"\n",
"def get_classify_system(target: str):\n",
"    return SystemMessage(\n",
"        content=(\n",
"            f\"Decide whether the text contains {target}.\\n\"\n",
"            \"Return ONLY a JSON object with these keys:\\n\"\n",
"            \"{ \\\"found\\\": true/false, \\\"confidence\\\": number, \\\"reason\\\": \\\"string\\\" }\"\n",
"        )\n",
"    )\n",
"\n",
"\n",
"def classify(state: EmailEvidenceState):\n",
"    # 1. Prepare the text sample for the LLM\n",
"    text = rows_to_text(state[\"rows\"], limit=10)\n",
"\n",
"    # 2. Get the target-specific system message\n",
"    system_message = get_classify_system(state[\"target_entity\"])\n",
"\n",
"    # 3. Invoke the LLM\n",
"    result = llm.invoke([\n",
"        system_message,\n",
"        HumanMessage(content=f\"Data to analyze:\\n{text}\")\n",
"    ]).content\n",
"\n",
"    # 4. Parse the decision\n",
"    decision = safe_json_loads(\n",
"        result,\n",
"        default={\"found\": False, \"confidence\": 0.0, \"reason\": \"parse failure\"}\n",
"    )\n",
"\n",
"    # print(\"[CLASSIFY]\", decision)\n",
"    return {\"classification\": decision}\n",
"\n",
"\n",
"def switch_to_extraction(state: EmailEvidenceState):\n",
"    print(\"[PHASE] discovery → extraction\")\n",
"    return {\"phase\": \"extraction\"}\n",
"\n",
"\n",
"def extract(state: EmailEvidenceState):\n",
"    text = rows_to_text(state[\"rows\"])\n",
"    system = f\"Extract and normalize {state['target_entity']} from text. Return ONLY a JSON array of strings.\"\n",
"    result = llm.invoke([SystemMessage(content=system), HumanMessage(content=text)]).content\n",
"    return {\"evidence\": safe_json_loads(result, default=[])}\n",
"\n",
"\n",
"def next_step(state: EmailEvidenceState):\n",
"    # Once in extraction phase, extract and stop\n",
"    if state[\"phase\"] == \"extraction\":\n",
"        return \"do_extract\"\n",
"\n",
"    c = state[\"classification\"]\n",
"\n",
"    if c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
"        return \"to_extraction\"\n",
"\n",
"    if not c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
"        return \"stop_none\"\n",
"\n",
"    if state[\"attempt\"] >= state[\"max_attempts\"]:\n",
"        return \"stop_limit\"\n",
"\n",
"    return \"replan\""
]
},
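{
"cell_type": "code",
"execution_count": null,
"id": "sql-utils-helper-sketches",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (editor sketch, not part of the original notebook):\n",
"# The cell above leans on helpers from sql_utils (normalize_sql,\n",
"# split_union_selects, safe_json_loads, rows_to_text) whose implementations\n",
"# sit outside the hunks shown in this diff. Minimal stand-ins consistent\n",
"# with how they are called might look like this:\n",
"import json\n",
"import re\n",
"\n",
"def normalize_sql(text: str) -> str:\n",
"    # Strip markdown code fences and surrounding whitespace from LLM output.\n",
"    return re.sub(r\"```(?:sql)?\", \"\", text).strip()\n",
"\n",
"def split_union_selects(sql: str) -> list[str]:\n",
"    # Split a UNION ALL chain into its individual SELECT statements.\n",
"    return [part.strip() for part in re.split(r\"\\bUNION\\s+ALL\\b\", sql, flags=re.I)]\n",
"\n",
"def safe_json_loads(text: str, default=None):\n",
"    # Parse JSON from LLM output, tolerating code fences; fall back to default.\n",
"    try:\n",
"        return json.loads(re.sub(r\"```(?:json)?\", \"\", text).strip())\n",
"    except Exception:\n",
"        return default\n",
"\n",
"def rows_to_text(rows, limit=None):\n",
"    # Flatten SQL result rows into a newline-joined text sample for the LLM.\n",
"    rows = list(rows or [])\n",
"    if limit:\n",
"        rows = rows[:limit]\n",
"    return \"\\n\".join(\" | \".join(str(v) for v in row) for row in rows)\n"
]
},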
{
"cell_type": "code",
"execution_count": 4,
"id": "0f5259d7",
"metadata": {},
"outputs": [],
"source": [
"def observe(state: EmailEvidenceState):\n",
"    \"\"\"\n",
"    Debug / inspection node.\n",
"    Does NOT modify state.\n",
"    \"\"\"\n",
"\n",
"    print(\"\\n=== STATE SNAPSHOT ===\")\n",
"\n",
"    # Messages\n",
"    print(\"\\n--- MESSAGES ---\")\n",
"    for i, m in enumerate(state[\"messages\"]):\n",
"        print(f\"{i}: {m.type.upper()} -> {m.content}\")\n",
"\n",
"    # Metadata\n",
"    print(\"\\n--- BEGIN METADATA ---\")\n",
"    print(f\"attempt       : {state['attempt']}\")\n",
"    print(f\"max_attempts  : {state['max_attempts']}\")\n",
"    print(f\"phase         : {state['phase']}\")\n",
"    print(f\"sql           : {state['sql']}\")\n",
"    print(f\"discovered sql: {state['discovered_sql']}\")\n",
"    print(f\"rows          : {state['rows']}\")\n",
"    print(f\"classification: {state['classification']}\")\n",
"    print(f\"evidence      : {state['evidence']}\")\n",
"    print(f\"Source Columns: {state.get('source_columns')}\")\n",
"    print(\"\\n--- END METADATA ---\")\n",
"\n",
"    # IMPORTANT: do not return state; return a no-op update\n",
"    return {}\n",
"\n",
"\n",
"from langgraph.graph import StateGraph, END\n",
"\n",
"graph = StateGraph(EmailEvidenceState)\n",
"\n",
"# Define nodes (reusing the same 'observe' function under different node names)\n",
"graph.add_node(\"planner\", planner)\n",
"graph.add_node(\"observe_plan\", observe)      # Checkpoint 1: the SQL plan\n",
"graph.add_node(\"execute\", sql_execute)\n",
"graph.add_node(\"classify\", classify)\n",
"graph.add_node(\"observe_classify\", observe)  # Checkpoint 2: the classification\n",
"graph.add_node(\"switch_phase\", switch_to_extraction)\n",
"graph.add_node(\"extract\", extract)\n",
"graph.add_node(\"observe_final\", observe)     # Checkpoint 3: the results\n",
"\n",
"graph.set_entry_point(\"planner\")\n",
"\n",
"# --- THE FLOW ---\n",
"graph.add_edge(\"planner\", \"observe_plan\")    # Check SQL before running it\n",
"graph.add_edge(\"observe_plan\", \"execute\")\n",
"\n",
"graph.add_edge(\"execute\", \"classify\")\n",
"graph.add_edge(\"classify\", \"observe_classify\")\n",
"\n",
"# The decision logic triggers after the second observation\n",
"graph.add_conditional_edges(\n",
"    \"observe_classify\",  # Must match the node name exactly\n",
"    next_step,\n",
"    {\n",
"        \"to_extraction\": \"switch_phase\",\n",
"        \"do_extract\": \"extract\",\n",
"        \"replan\": \"planner\",\n",
"        \"stop_none\": END,\n",
"        \"stop_limit\": END,\n",
"    }\n",
")\n",
"\n",
"graph.add_edge(\"switch_phase\", \"planner\")\n",
"\n",
"# Route 'extract' to the final observer instead of END\n",
"graph.add_edge(\"extract\", \"observe_final\")\n",
"graph.add_edge(\"observe_final\", END)\n",
"\n",
"app = graph.compile()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0b1fce49",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find PHONE in the database\n",
"1: AI -> SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt       : 1\n",
"max_attempts  : 1\n",
"phase         : discovery\n",
"sql           : SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"discovered sql: []\n",
"rows          : None\n",
"classification: None\n",
"evidence      : []\n",
"Source Columns: []\n",
"\n",
"--- END METADATA ---\n",
"[EXECUTE] Running query\n",
"[SQL ERROR]: SELECTs to the left and right of UNION ALL do not have the same number of result columns\n",
"[SQL EXEC] Retrieved 0 rows\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find PHONE in the database\n",
"1: AI -> SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"2: AI -> Retrieved 0 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt       : 1\n",
"max_attempts  : 1\n",
"phase         : discovery\n",
"sql           : SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"discovered sql: []\n",
"rows          : []\n",
"classification: {'found': False, 'confidence': 0, 'reason': 'No phone-related content provided.'}\n",
"evidence      : []\n",
"Source Columns: []\n",
"\n",
"--- END METADATA ---\n",
"\n",
"========================================\n",
" 🏁 FORENSIC REPORT: PHONE \n",
"========================================\n",
"❌ No PHONE evidence was extracted.\n",
"Last Phase : discovery\n",
"Attempts   : 1\n",
"========================================\n"
]
}
],
"source": [
"# Set your target here once\n",
"# TARGET = \"EMAIL\"\n",
"TARGET = \"PHONE\"\n",
"# TARGET = \"USERNAME\"\n",
"# TARGET = \"PERSON_NAME\"\n",
"\n",
"result = app.invoke({\n",
"    \"messages\": [HumanMessage(content=f\"Find {TARGET} in the database\")],\n",
"    \"attempt\": 0,\n",
"    \"max_attempts\": 1,\n",
"    \"phase\": \"discovery\",\n",
"    \"target_entity\": TARGET,  # CRITICAL: tells the planner what to look for\n",
"    \"sql\": None,\n",
"    \"rows\": None,\n",
"    \"classification\": None,\n",
"    \"evidence\": [],  # the generic evidence key\n",
"    \"source_columns\": [],\n",
"    \"discovered_sql\": []\n",
"})\n",
"\n",
"# Use the generic 'evidence' key defined in the state\n",
"final_evidence = result.get(\"evidence\", [])\n",
"target_label = result.get(\"target_entity\", \"items\")\n",
"\n",
"print(\"\\n\" + \"=\"*40)\n",
"print(f\" 🏁 FORENSIC REPORT: {target_label.upper()} \")\n",
"print(\"=\"*40)\n",
"\n",
"if final_evidence:\n",
"    print(f\"✅ Success! Found {len(final_evidence)} unique {target_label} values:\")\n",
"    for i, item in enumerate(sorted(final_evidence), 1):\n",
"        print(f\"  {i}. {item}\")\n",
"\n",
"    # Also print the source columns we tracked\n",
"    sources = result.get(\"source_columns\")\n",
"    if sources:\n",
"        print(f\"\\nSource Columns: {', '.join(sources)}\")\n",
"else:\n",
"    print(f\"❌ No {target_label} evidence was extracted.\")\n",
"    print(f\"Last Phase : {result.get('phase')}\")\n",
"    print(f\"Attempts   : {result.get('attempt')}\")\n",
"\n",
"print(\"=\"*40)\n"
]
},
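{
"cell_type": "code",
"execution_count": null,
"id": "union-all-arity-note",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (editor sketch, not part of the original notebook):\n",
"# The run above fails with 'SELECTs to the left and right of UNION ALL do not\n",
"# have the same number of result columns' because the planner emitted\n",
"# SELECT * against tables of different widths. SQLite requires every SELECT\n",
"# in a UNION ALL chain to return the same number of columns, so one hedged\n",
"# fix is to have the prompt (or a post-processor) project a fixed shape\n",
"# instead of *, e.g. ('<regex>' stands for the PHONE pattern from\n",
"# ENTITY_CONFIG):\n",
"fixed_sql = \"\"\"\n",
"SELECT 'call_log' AS src, call_id AS value FROM call_log WHERE call_id REGEXP '<regex>'\n",
"UNION ALL\n",
"SELECT 'missed_call_log_participant' AS src, jid AS value FROM missed_call_log_participant WHERE jid REGEXP '<regex>'\n",
"\"\"\"\n"
]
},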
{
"cell_type": "code",
"execution_count": null,
"id": "d24e126b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
28 db_files.py Normal file
@@ -0,0 +1,28 @@
db_files = [
    "test2.db",
    # "A1_commerce.db",
    # "A1_msgstore.db",
    # "A1_wa.db",
    # "A2_core.db",
    # "A2_journal.db",
    # "A2_main.db",
    # "A3_account1cache4.db",
    # "A3_account2cache4.db",
    # "A3_account3cache4.db",
    # "A4_gmm_myplaces.db",
    # "A4_gmm_storage.db",
    # "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
    # "A5_SBrowser.db",
    # "A5_SBrowser2.db",
    # "A5_searchengine.db",
    # "I1_CallHistory.sqlite",
    # "I1_ChatStorage.sqlite",
    # "I1_ContactsV2.sqlite",
    # "I2_AddressBook.sqlitedb",
    # "I2_AddressBookImages.sqlitedb",
    # "I3_sms.db",
    # "I4_CloudTabs.db",
    # "I4_History.db",
    # "I5_Calendar.sqlitedb",
    # "I5_Extras.db",
]
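
Editor note: the commit message says "add automated process (folder level)", and the sql_utils.py helpers below suggest a driver that walks this list. A minimal sketch under those assumptions (the `run_discovery` placeholder and the exact file layout are not shown in this diff):

# drive_folder.py (hypothetical driver, not part of this commit)
from pathlib import Path
from sql_utils import (
    is_sqlite_file, load_db_files_list, build_db_paths,
    print_db_path_report, save_jsonl,
)

db_dir = Path("selectedDBs")
db_files = load_db_files_list(Path("db_files.py"))
db_paths, missing, not_sqlite = build_db_paths(db_dir, db_files, is_sqlite_file)
print_db_path_report(db_paths, missing, not_sqlite)

all_results = []
for db_path in db_paths:
    # run_discovery stands for invoking the notebook's LangGraph app against
    # one database; it is assumed here, not defined in this commit.
    all_results.append({"db": str(db_path)})

save_jsonl(all_results, Path("."))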
BIN selectedDBs/test2.db Normal file
Binary file not shown.
80 sql_utils.py
@@ -1,7 +1,8 @@
import re
import json
import sys

from pathlib import Path
from datetime import datetime, timezone


def extract_tables_with_aliases(select_sql: str) -> dict[str, str]:
@@ -270,4 +271,79 @@ def extract_select_columns(select_sql: str) -> list[str]:
        # Take the final identifier
        columns.append(item.split()[-1])

    return columns


def is_sqlite_file(p: Path) -> bool:
    try:
        with p.open("rb") as f:
            return f.read(16) == b"SQLite format 3\x00"
    except Exception:
        return False


import importlib.util
from typing import List, Tuple


def load_db_files_list(py_path: Path, var_name: str = "db_files") -> List[str]:
    """Load a list variable (default: db_files) from a .py file."""
    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load module from {py_path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)  # type: ignore

    if not hasattr(mod, var_name):
        raise AttributeError(f"{py_path} does not define `{var_name}`")
    value = getattr(mod, var_name)
    if not isinstance(value, list):
        raise TypeError(f"`{var_name}` must be a list, got {type(value)}")
    return value
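
# Example (editor sketch, not part of sql_utils.py): loading the list that
# db_files.py defines above; the relative paths are assumptions.
#
#     from pathlib import Path
#     names = load_db_files_list(Path("db_files.py"))
#     print(names)  # ['test2.db', ...]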

def build_db_paths(
    db_dir: Path,
    db_files: List[str],
    is_sqlite_fn,
) -> Tuple[List[Path], List[str], List[str]]:
    """
    Build ordered paths from filenames, skipping missing and non-sqlite files.
    Returns (db_paths, missing, not_sqlite).
    """
    db_paths: List[Path] = []
    missing: List[str] = []
    not_sqlite: List[str] = []

    for name in db_files:
        p = db_dir / name
        if not p.exists():
            missing.append(str(p))
            continue
        if not is_sqlite_fn(p):
            not_sqlite.append(str(p))
            continue
        db_paths.append(p)

    return db_paths, missing, not_sqlite
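
# Example (editor sketch, not part of sql_utils.py): filtering the configured
# folder and reporting what will be processed; the folder name follows the
# selectedDBs/ directory added in this commit.
#
#     db_paths, missing, not_sqlite = build_db_paths(
#         Path("selectedDBs"), load_db_files_list(Path("db_files.py")), is_sqlite_file
#     )
#     print_db_path_report(db_paths, missing, not_sqlite)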

def print_db_path_report(db_paths: List[Path], missing: List[str], not_sqlite: List[str]) -> None:
    print(f"Will process {len(db_paths)} databases (from the db_files list).")
    if missing:
        print("Missing files:")
        for x in missing:
            print("  -", x)
    if not_sqlite:
        print("Not SQLite (bad header):")
        for x in not_sqlite:
            print("  -", x)


def save_jsonl(all_results, out_dir):
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = out_dir / f"evidence_{ts}.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        for r in all_results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"Wrote: {out_path.resolve()}")
    return out_path
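
# Example (editor sketch, not part of sql_utils.py): each element of
# all_results should be one JSON-serializable record; one line is written
# per record, and the timestamped filename avoids clobbering earlier runs.
#
#     save_jsonl([{"db": "test2.db", "entity": "PHONE", "evidence": []}], Path("."))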