mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-02-20 13:40:41 +00:00
add automated process (folder level)
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,692 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0be1ee8e",
"metadata": {},
"source": [
"using the bnl environment\n",
"https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
"\n",
"pip uninstall -y langchain langchain-core langchain-openai langgraph langgraph-prebuilt langgraph-checkpoint langgraph-sdk langsmith langchain-community langchain-google-genai langchain-text-splitters\n",
"\n",
"pip install langchain==1.2.0 langchain-core==1.2.2 langchain-openai==1.1.4 langgraph==1.0.5 langgraph-prebuilt==1.0.5 langgraph-checkpoint==3.0.1 langgraph-sdk==0.3.0 langsmith==0.5.0"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2648a1f1",
"metadata": {},
"outputs": [],
"source": [
"# only for finding models\n",
"# import google.generativeai as genai\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a10c9a6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OK\n"
]
}
],
"source": [
"# https://medium.com/@ayush4002gupta/building-an-llm-agent-to-directly-interact-with-a-database-0c0dd96b8196\n",
"\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_core.messages import HumanMessage\n",
"from sql_utils import *\n",
"\n",
"\n",
"load_dotenv()  # looks for the .env file and loads it into os.environ\n",
"\n",
"llm = ChatOpenAI(\n",
"    model=\"gpt-4o-mini\",  # recommended for tool calling + cost\n",
"    api_key=os.environ[\"API_KEY\"],\n",
"    temperature=0\n",
")\n",
"\n",
"response = llm.invoke([\n",
"    HumanMessage(content=\"Reply with exactly: OK\")\n",
"])\n",
"\n",
"print(response.content)\n",
"\n",
"DB_PATH = r\"msgstore.db\"\n",
"# DB_PATH = r\"users4.db\"\n",
"# DB_PATH = r\"test2.db\"\n",
"# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.whatsapp\\databases\\stickers.db\"\n",
"# DB_PATH = r\"F:\\mobile_images\\Cellebriate_2024\\Cellebrite_CTF_File1\\CellebriteCTF24_Sharon\\Sharon\\EXTRACTION_FFS 01\\EXTRACTION_FFS\\Dump\\data\\data\\com.android.vending\\databases\\localappstate.db\"\n",
"\n",
"ENTITY_CONFIG = {\n",
"    \"EMAIL\": {\n",
"        \"regex\": r\"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\",\n",
"        \"desc\": \"valid electronic mail formats used for account registration or contact\"\n",
"    },\n",
"    \"PHONE\": {\n",
"        \"regex\": r\"\\+?\\d[\\d\\s().-]{7,}\\d\",\n",
"        \"desc\": \"phone number–like strings used for discovery; normalization occurs during extraction\"\n",
"    },\n",
"    \"USERNAME\": {\n",
"        \"regex\": r\"\\b[a-zA-Z][a-zA-Z0-9._-]{2,31}\\b\",\n",
"        \"desc\": \"application-specific usernames, handles, or account identifiers\"\n",
"    },\n",
"    \"PERSON_NAME\": {\n",
"        \"regex\": r\"[A-Za-z][A-Za-z\\s\\.\\-]{1,50}\",\n",
"        \"desc\": (\n",
"            \"loosely structured human name-like strings used only for discovery \"\n",
"            \"and column pre-filtering; final identification is performed during extraction\"\n",
"        )\n",
"    }\n",
"}\n",
"\n"
]
},
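{
"cell_type": "code",
"execution_count": null,
"id": "regexp-helper-sketch",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (editor sketch, not part of the original notebook):\n",
"# `from sql_utils import *` above is assumed to supply a `regexp` helper,\n",
"# which exec_sql later registers via conn.create_function(\"REGEXP\", 2, regexp).\n",
"# SQLite ships no REGEXP implementation by default, and `X REGEXP Y` invokes\n",
"# regexp(Y, X), i.e. pattern first. A minimal adapter could look like:\n",
"import re\n",
"\n",
"def regexp(pattern, value):\n",
"    \"\"\"Return 1 if value matches pattern, else 0 (SQLite expects an int).\"\"\"\n",
"    if value is None:\n",
"        return 0\n",
"    return 1 if re.search(pattern, str(value)) else 0\n"
]
},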
{
"cell_type": "code",
"execution_count": null,
"id": "48eda3ec",
"metadata": {},
"outputs": [],
"source": [
"# Core Python\n",
"import sqlite3\n",
"import re\n",
"import json\n",
"from typing import TypedDict, Optional, List, Annotated\n",
"from langgraph.graph.message import add_messages\n",
"\n",
"# LangChain / LangGraph\n",
"from langchain_core.tools import tool\n",
"from langchain_core.messages import (\n",
"    HumanMessage,\n",
"    AIMessage,\n",
"    SystemMessage\n",
")\n",
"from langgraph.graph import StateGraph, END\n",
"\n",
"\n",
"@tool\n",
"def list_tables() -> str:\n",
"    \"\"\"\n",
"    List all table names in the SQLite database.\n",
"    \"\"\"\n",
"    conn = sqlite3.connect(DB_PATH)\n",
"    cur = conn.cursor()\n",
"    cur.execute(\"SELECT name FROM sqlite_master WHERE type='table'\")\n",
"    tables = [r[0] for r in cur.fetchall()]\n",
"    conn.close()\n",
"    return \", \".join(tables)\n",
"\n",
"\n",
"@tool\n",
"def get_schema(table: str) -> str:\n",
"    \"\"\"\n",
"    Return column names and types for a table.\n",
"    \"\"\"\n",
"    conn = sqlite3.connect(DB_PATH)\n",
"    cur = conn.cursor()\n",
"    cur.execute(f\"PRAGMA table_info('{table}')\")\n",
"    cols = cur.fetchall()\n",
"    conn.close()\n",
"    return \", \".join(f\"{c[1]} {c[2]}\" for c in cols)\n",
"\n",
"\n",
"@tool\n",
"def exec_sql(query: str) -> dict:\n",
"    \"\"\"Execute a SQL query (one or more SELECTs combined with UNION ALL) and return rows plus the parsed column names.\"\"\"\n",
"    query_text = normalize_sql(query)\n",
"\n",
"    # 1. Parse column names from ALL SELECTs\n",
"    column_names = []\n",
"    for select_sql in split_union_selects(query_text):\n",
"        for col in extract_select_columns(select_sql):\n",
"            if col not in column_names:\n",
"                column_names.append(col)\n",
"\n",
"    # 2. Execute once\n",
"    conn = sqlite3.connect(DB_PATH)\n",
"    conn.create_function(\"REGEXP\", 2, regexp)\n",
"    cur = conn.cursor()\n",
"\n",
"    try:\n",
"        print(\"[EXECUTE] Running query\")\n",
"        cur.execute(query_text)\n",
"        rows = cur.fetchall()\n",
"    except Exception as e:\n",
"        print(f\"[SQL ERROR]: {e}\")\n",
"        rows = []\n",
"    finally:\n",
"        conn.close()\n",
"\n",
"    return {\n",
"        \"rows\": rows,\n",
"        \"columns\": column_names\n",
"    }\n",
"\n",
"\n",
"class EmailEvidenceState(TypedDict):\n",
"    messages: Annotated[list, add_messages]\n",
"    attempt: int\n",
"    max_attempts: int\n",
"    phase: str  # \"discovery\" | \"extraction\"\n",
"    sql: Optional[str]  # SQL to execute\n",
"    rows: Optional[List]\n",
"    classification: Optional[dict]\n",
"    evidence: Optional[List[str]]\n",
"    target_entity: str  # entity type to search for, e.g. \"EMAIL\" or \"PHONE\"\n",
"    # Columns that produced matches (the \"winning\" columns)\n",
"    source_columns: Optional[List[dict]]\n",
"\n",
"    # SQL used during discovery that returned results\n",
"    discovered_sql: Optional[List[str]]\n",
"\n",
"\n",
"def get_discovery_system(target, regex):\n",
"    return SystemMessage(\n",
"        content=(\n",
"            \"You are a SQL planner. You are provided databases that were extracted from Android or iOS devices.\\n\"\n",
"            f\"Goal: discover whether any column in the databases contains {target}.\\n\\n\"\n",
"            \"Rules:\\n\"\n",
"            \"- Use 'REGEXP' for pattern matching.\\n\"\n",
"            f\"- Example: SELECT col FROM table WHERE col REGEXP '{regex}' LIMIT 10\\n\"\n",
"            f\"- Pay special attention to tables and/or columns related to message/chat/text; {target} may be embedded in that text.\\n\"\n",
"            \"- Validate your SQL and make sure all referenced tables and columns exist.\\n\"\n",
"            \"- If multiple SQL statements are needed, combine them using UNION ALL.\\n\"\n",
"            \"- Return ONLY SQL.\"\n",
"        )\n",
"    )\n",
"\n",
"\n",
"def get_sql_upgrade_system(target):\n",
"    return SystemMessage(\n",
"        content=(\n",
"            \"You are a SQL refiner.\\n\"\n",
"            f\"Goal: modify previously successful SQL to extract ALL {target}.\\n\\n\"\n",
"            \"Rules:\\n\"\n",
"            \"- Do NOT invent new tables or columns.\\n\"\n",
"            \"- Remove LIMIT clauses.\\n\"\n",
"            \"- Preserve WHERE conditions and REGEXP filters.\\n\"\n",
"            \"- Return ONLY SQL.\"\n",
"        )\n",
"    )\n",
"\n",
"\n",
"def select_relevant_tables(llm, all_tables: list[str], target: str) -> list[str]:\n",
"    \"\"\"\n",
"    Ask the LLM to select relevant tables by name only.\n",
"    No schema access, no tools, no loops.\n",
"    \"\"\"\n",
"    system = SystemMessage(\n",
"        content=(\n",
"            \"You are a digital forensics assistant.\\n\"\n",
"            f\"Target evidence type: {target}.\\n\\n\"\n",
"            \"From the list of table names below, select the tables \"\n",
"            \"likely to contain this evidence.\\n\\n\"\n",
"            \"Return ONLY a JSON array of table names.\\n\"\n",
"            \"If unsure, return an empty array.\"\n",
"        )\n",
"    )\n",
"\n",
"    result = llm.invoke([\n",
"        system,\n",
"        HumanMessage(content=\"\\n\".join(all_tables))\n",
"    ]).content\n",
"\n",
"    tables = safe_json_loads(result, default=[])\n",
"\n",
"    # Defensive cleanup\n",
"    if not isinstance(tables, list):\n",
"        return []\n",
"\n",
"    return [t for t in tables if t in all_tables]\n",
"\n",
"\n",
"def planner(state: EmailEvidenceState):\n",
"    # ---------- EXTRACTION PHASE: upgrade SQL ----------\n",
"    if state[\"phase\"] == \"extraction\" and state.get(\"discovered_sql\"):\n",
"        system = get_sql_upgrade_system(state[\"target_entity\"])\n",
"        joined_sql = \"\\nUNION ALL\\n\".join(state[\"discovered_sql\"])\n",
"\n",
"        sql = normalize_sql(\n",
"            llm.invoke([\n",
"                system,\n",
"                HumanMessage(content=f\"Original SQL:\\n{joined_sql}\")\n",
"            ]).content\n",
"        )\n",
"\n",
"        print(\"[PLANNER] Upgraded SQL for extraction\")\n",
"\n",
"        return {\n",
"            \"messages\": [AIMessage(content=sql)],\n",
"            \"sql\": sql\n",
"        }\n",
"\n",
"    # ---------- DISCOVERY PHASE ----------\n",
"    # 1. List tables once\n",
"    all_tables = [t.strip() for t in list_tables.invoke({}).split(\",\")]\n",
"\n",
"    # 2. Agent selects relevant tables by NAME ONLY\n",
"    selected_tables = select_relevant_tables(\n",
"        llm,\n",
"        all_tables,\n",
"        state[\"target_entity\"]\n",
"    )\n",
"\n",
"    # Fallback: ensure coverage\n",
"    if not selected_tables:\n",
"        selected_tables = all_tables[:10]\n",
"\n",
"    # 3. Fetch schema deterministically\n",
"    schemas = {\n",
"        table: get_schema.invoke({\"table\": table})\n",
"        for table in selected_tables\n",
"    }\n",
"\n",
"    # 4. Build grounded prompt\n",
"    config = ENTITY_CONFIG[state[\"target_entity\"]]\n",
"\n",
"    system = get_discovery_system(\n",
"        state[\"target_entity\"],\n",
"        config[\"regex\"]\n",
"    )\n",
"\n",
"    grounded_content = (\n",
"        f\"{system.content}\\n\\n\"\n",
"        f\"ALLOWED TABLES: {', '.join(selected_tables)}\\n\"\n",
"        f\"SCHEMA:\\n{json.dumps(schemas, indent=2)}\\n\"\n",
"        f\"CURRENT PHASE: {state['phase']}\\n\"\n",
"        \"CRITICAL: Use ONLY the tables and columns listed above.\"\n",
"    )\n",
"\n",
"    # 5. Single LLM call to generate SQL\n",
"    sql = normalize_sql(\n",
"        llm.invoke([\n",
"            SystemMessage(content=grounded_content),\n",
"            state[\"messages\"][0]  # original user request\n",
"        ]).content\n",
"    )\n",
"\n",
"    attempt = state[\"attempt\"] + 1\n",
"\n",
"    return {\n",
"        \"messages\": [AIMessage(content=sql)],\n",
"        \"sql\": sql,\n",
"        \"attempt\": attempt\n",
"    }\n",
"\n",
"\n",
"def sql_execute(state: EmailEvidenceState):\n",
"    # Call the tool (it returns a dict)\n",
"    result = exec_sql.invoke(state[\"sql\"])\n",
"\n",
"    rows = result.get(\"rows\", [])\n",
"    cols = result.get(\"columns\", [])\n",
"\n",
"    print(f\"[SQL EXEC] Retrieved {len(rows)} rows\")\n",
"    updates = {\n",
"        \"rows\": rows,\n",
"        \"messages\": [AIMessage(content=f\"Retrieved {len(rows)} rows\")]\n",
"    }\n",
"\n",
"    if state[\"phase\"] == \"discovery\" and rows:\n",
"        discovered = list(state.get(\"discovered_sql\", []))\n",
"        discovered.append(state[\"sql\"])\n",
"        updates[\"discovered_sql\"] = discovered\n",
"        print(\"[DISCOVERY] Saved successful SQL\")\n",
"\n",
"    # Tracking logic: save columns to state only during extraction\n",
"    if state[\"phase\"] == \"extraction\":\n",
"        updates[\"source_columns\"] = cols\n",
"        print(f\"[TRACKING] Saved source columns: {cols}\")\n",
"\n",
"    return updates\n",
"\n",
"\n",
"def get_classify_system(target: str):\n",
"    return SystemMessage(\n",
"        content=(\n",
"            f\"Decide whether the text contains {target}.\\n\"\n",
"            \"Return ONLY a JSON object with these keys:\\n\"\n",
"            \"{ \\\"found\\\": true/false, \\\"confidence\\\": number, \\\"reason\\\": \\\"string\\\" }\"\n",
"        )\n",
"    )\n",
"\n",
"\n",
"def classify(state: EmailEvidenceState):\n",
"    # 1. Prepare the text sample for the LLM\n",
"    text = rows_to_text(state[\"rows\"], limit=10)\n",
"\n",
"    # 2. Get the target-specific system message\n",
"    system_message = get_classify_system(state[\"target_entity\"])\n",
"\n",
"    # 3. Invoke the LLM\n",
"    result = llm.invoke([\n",
"        system_message,\n",
"        HumanMessage(content=f\"Data to analyze:\\n{text}\")\n",
"    ]).content\n",
"\n",
"    # 4. Parse the decision\n",
"    decision = safe_json_loads(\n",
"        result,\n",
"        default={\"found\": False, \"confidence\": 0.0, \"reason\": \"parse failure\"}\n",
"    )\n",
"\n",
"    # print(\"[CLASSIFY]\", decision)\n",
"    return {\"classification\": decision}\n",
"\n",
"\n",
"def switch_to_extraction(state: EmailEvidenceState):\n",
"    print(\"[PHASE] discovery → extraction\")\n",
"    return {\"phase\": \"extraction\"}\n",
"\n",
"\n",
"def extract(state: EmailEvidenceState):\n",
"    text = rows_to_text(state[\"rows\"])\n",
"    system = f\"Extract and normalize {state['target_entity']} from text. Return ONLY a JSON array of strings.\"\n",
"    result = llm.invoke([SystemMessage(content=system), HumanMessage(content=text)]).content\n",
"    return {\"evidence\": safe_json_loads(result, default=[])}\n",
"\n",
"\n",
"def next_step(state: EmailEvidenceState):\n",
"    # Once in extraction phase, extract and stop\n",
"    if state[\"phase\"] == \"extraction\":\n",
"        return \"do_extract\"\n",
"\n",
"    c = state[\"classification\"]\n",
"\n",
"    if c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
"        return \"to_extraction\"\n",
"\n",
"    if not c[\"found\"] and c[\"confidence\"] >= 0.6:\n",
"        return \"stop_none\"\n",
"\n",
"    if state[\"attempt\"] >= state[\"max_attempts\"]:\n",
"        return \"stop_limit\"\n",
"\n",
"    return \"replan\""
]
},
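{
"cell_type": "code",
"execution_count": null,
"id": "sql-utils-helper-sketches",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (editor sketch, not part of the original notebook):\n",
"# The cell above leans on helpers from sql_utils (normalize_sql,\n",
"# split_union_selects, safe_json_loads, rows_to_text) whose implementations\n",
"# sit outside the hunks shown in this diff. Minimal stand-ins consistent\n",
"# with how they are called might look like this:\n",
"import json\n",
"import re\n",
"\n",
"def normalize_sql(text: str) -> str:\n",
"    # Strip markdown code fences and surrounding whitespace from LLM output.\n",
"    return re.sub(r\"```(?:sql)?\", \"\", text).strip()\n",
"\n",
"def split_union_selects(sql: str) -> list[str]:\n",
"    # Split a UNION ALL chain into its individual SELECT statements.\n",
"    return [part.strip() for part in re.split(r\"\\bUNION\\s+ALL\\b\", sql, flags=re.I)]\n",
"\n",
"def safe_json_loads(text: str, default=None):\n",
"    # Parse JSON from LLM output, tolerating code fences; fall back to default.\n",
"    try:\n",
"        return json.loads(re.sub(r\"```(?:json)?\", \"\", text).strip())\n",
"    except Exception:\n",
"        return default\n",
"\n",
"def rows_to_text(rows, limit=None):\n",
"    # Flatten SQL result rows into a newline-joined text sample for the LLM.\n",
"    rows = list(rows or [])\n",
"    if limit:\n",
"        rows = rows[:limit]\n",
"    return \"\\n\".join(\" | \".join(str(v) for v in row) for row in rows)\n"
]
},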
{
"cell_type": "code",
"execution_count": 4,
"id": "0f5259d7",
"metadata": {},
"outputs": [],
"source": [
"def observe(state: EmailEvidenceState):\n",
"    \"\"\"\n",
"    Debug / inspection node.\n",
"    Does NOT modify state.\n",
"    \"\"\"\n",
"\n",
"    print(\"\\n=== STATE SNAPSHOT ===\")\n",
"\n",
"    # Messages\n",
"    print(\"\\n--- MESSAGES ---\")\n",
"    for i, m in enumerate(state[\"messages\"]):\n",
"        print(f\"{i}: {m.type.upper()} -> {m.content}\")\n",
"\n",
"    # Metadata\n",
"    print(\"\\n--- BEGIN METADATA ---\")\n",
"    print(f\"attempt       : {state['attempt']}\")\n",
"    print(f\"max_attempts  : {state['max_attempts']}\")\n",
"    print(f\"phase         : {state['phase']}\")\n",
"    print(f\"sql           : {state['sql']}\")\n",
"    print(f\"discovered sql: {state['discovered_sql']}\")\n",
"    print(f\"rows          : {state['rows']}\")\n",
"    print(f\"classification: {state['classification']}\")\n",
"    print(f\"evidence      : {state['evidence']}\")\n",
"    print(f\"Source Columns: {state.get('source_columns')}\")\n",
"    print(\"\\n--- END METADATA ---\")\n",
"\n",
"    # IMPORTANT: do not return state; return a no-op update\n",
"    return {}\n",
"\n",
"\n",
"from langgraph.graph import StateGraph, END\n",
"\n",
"graph = StateGraph(EmailEvidenceState)\n",
"\n",
"# Define nodes (reusing the same 'observe' function under different node names)\n",
"graph.add_node(\"planner\", planner)\n",
"graph.add_node(\"observe_plan\", observe)      # Checkpoint 1: the SQL plan\n",
"graph.add_node(\"execute\", sql_execute)\n",
"graph.add_node(\"classify\", classify)\n",
"graph.add_node(\"observe_classify\", observe)  # Checkpoint 2: the classification\n",
"graph.add_node(\"switch_phase\", switch_to_extraction)\n",
"graph.add_node(\"extract\", extract)\n",
"graph.add_node(\"observe_final\", observe)     # Checkpoint 3: the results\n",
"\n",
"graph.set_entry_point(\"planner\")\n",
"\n",
"# --- THE FLOW ---\n",
"graph.add_edge(\"planner\", \"observe_plan\")    # Check SQL before running it\n",
"graph.add_edge(\"observe_plan\", \"execute\")\n",
"\n",
"graph.add_edge(\"execute\", \"classify\")\n",
"graph.add_edge(\"classify\", \"observe_classify\")\n",
"\n",
"# The decision logic triggers after the second observation\n",
"graph.add_conditional_edges(\n",
"    \"observe_classify\",  # Must match the node name exactly\n",
"    next_step,\n",
"    {\n",
"        \"to_extraction\": \"switch_phase\",\n",
"        \"do_extract\": \"extract\",\n",
"        \"replan\": \"planner\",\n",
"        \"stop_none\": END,\n",
"        \"stop_limit\": END,\n",
"    }\n",
")\n",
"\n",
"graph.add_edge(\"switch_phase\", \"planner\")\n",
"\n",
"# Route 'extract' to the final observer instead of END\n",
"graph.add_edge(\"extract\", \"observe_final\")\n",
"graph.add_edge(\"observe_final\", END)\n",
"\n",
"app = graph.compile()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0b1fce49",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find PHONE in the database\n",
"1: AI -> SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt       : 1\n",
"max_attempts  : 1\n",
"phase         : discovery\n",
"sql           : SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"discovered sql: []\n",
"rows          : None\n",
"classification: None\n",
"evidence      : []\n",
"Source Columns: []\n",
"\n",
"--- END METADATA ---\n",
"[EXECUTE] Running query\n",
"[SQL ERROR]: SELECTs to the left and right of UNION ALL do not have the same number of result columns\n",
"[SQL EXEC] Retrieved 0 rows\n",
"\n",
"=== STATE SNAPSHOT ===\n",
"\n",
"--- MESSAGES ---\n",
"0: HUMAN -> Find PHONE in the database\n",
"1: AI -> SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"2: AI -> Retrieved 0 rows\n",
"\n",
"--- BEGIN METADATA ---\n",
"attempt       : 1\n",
"max_attempts  : 1\n",
"phase         : discovery\n",
"sql           : SELECT * FROM call_log WHERE call_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_logs WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM missed_call_log_participant WHERE jid REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_call_log WHERE call_log_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM bcall_session WHERE caption REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d'\n",
"UNION ALL\n",
"SELECT * FROM message_bcall_session WHERE message_row_id REGEXP '\\\\+?\\\\d[\\\\d\\\\s().-]{7,}\\\\d';\n",
"discovered sql: []\n",
"rows          : []\n",
"classification: {'found': False, 'confidence': 0, 'reason': 'No phone-related content provided.'}\n",
"evidence      : []\n",
"Source Columns: []\n",
"\n",
"--- END METADATA ---\n",
"\n",
"========================================\n",
" 🏁 FORENSIC REPORT: PHONE \n",
"========================================\n",
"❌ No PHONE evidence was extracted.\n",
"Last Phase : discovery\n",
"Attempts   : 1\n",
"========================================\n"
]
}
],
"source": [
"# Set your target here once\n",
"# TARGET = \"EMAIL\"\n",
"TARGET = \"PHONE\"\n",
"# TARGET = \"USERNAME\"\n",
"# TARGET = \"PERSON_NAME\"\n",
"\n",
"result = app.invoke({\n",
"    \"messages\": [HumanMessage(content=f\"Find {TARGET} in the database\")],\n",
"    \"attempt\": 0,\n",
"    \"max_attempts\": 1,\n",
"    \"phase\": \"discovery\",\n",
"    \"target_entity\": TARGET,  # CRITICAL: tells the planner what to look for\n",
"    \"sql\": None,\n",
"    \"rows\": None,\n",
"    \"classification\": None,\n",
"    \"evidence\": [],  # the generic evidence key\n",
"    \"source_columns\": [],\n",
"    \"discovered_sql\": []\n",
"})\n",
"\n",
"# Use the generic 'evidence' key defined in the state\n",
"final_evidence = result.get(\"evidence\", [])\n",
"target_label = result.get(\"target_entity\", \"items\")\n",
"\n",
"print(\"\\n\" + \"=\"*40)\n",
"print(f\" 🏁 FORENSIC REPORT: {target_label.upper()} \")\n",
"print(\"=\"*40)\n",
"\n",
"if final_evidence:\n",
"    print(f\"✅ Success! Found {len(final_evidence)} unique {target_label} values:\")\n",
"    for i, item in enumerate(sorted(final_evidence), 1):\n",
"        print(f\"  {i}. {item}\")\n",
"\n",
"    # Also print the source columns we tracked\n",
"    sources = result.get(\"source_columns\")\n",
"    if sources:\n",
"        print(f\"\\nSource Columns: {', '.join(sources)}\")\n",
"else:\n",
"    print(f\"❌ No {target_label} evidence was extracted.\")\n",
"    print(f\"Last Phase : {result.get('phase')}\")\n",
"    print(f\"Attempts   : {result.get('attempt')}\")\n",
"\n",
"print(\"=\"*40)\n"
]
},
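{
"cell_type": "code",
"execution_count": null,
"id": "union-all-arity-note",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (editor sketch, not part of the original notebook):\n",
"# The run above fails with 'SELECTs to the left and right of UNION ALL do not\n",
"# have the same number of result columns' because the planner emitted\n",
"# SELECT * against tables of different widths. SQLite requires every SELECT\n",
"# in a UNION ALL chain to return the same number of columns, so one hedged\n",
"# fix is to have the prompt (or a post-processor) project a fixed shape\n",
"# instead of *, e.g. ('<regex>' stands for the PHONE pattern from\n",
"# ENTITY_CONFIG):\n",
"fixed_sql = \"\"\"\n",
"SELECT 'call_log' AS src, call_id AS value FROM call_log WHERE call_id REGEXP '<regex>'\n",
"UNION ALL\n",
"SELECT 'missed_call_log_participant' AS src, jid AS value FROM missed_call_log_participant WHERE jid REGEXP '<regex>'\n",
"\"\"\"\n"
]
},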
{
"cell_type": "code",
"execution_count": null,
"id": "d24e126b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
28 db_files.py Normal file
@@ -0,0 +1,28 @@
db_files = [
    "test2.db",
    # "A1_commerce.db",
    # "A1_msgstore.db",
    # "A1_wa.db",
    # "A2_core.db",
    # "A2_journal.db",
    # "A2_main.db",
    # "A3_account1cache4.db",
    # "A3_account2cache4.db",
    # "A3_account3cache4.db",
    # "A4_gmm_myplaces.db",
    # "A4_gmm_storage.db",
    # "A4_peopleCache_sharononeil368@gmail.com_com.google_14.db",
    # "A5_SBrowser.db",
    # "A5_SBrowser2.db",
    # "A5_searchengine.db",
    # "I1_CallHistory.sqlite",
    # "I1_ChatStorage.sqlite",
    # "I1_ContactsV2.sqlite",
    # "I2_AddressBook.sqlitedb",
    # "I2_AddressBookImages.sqlitedb",
    # "I3_sms.db",
    # "I4_CloudTabs.db",
    # "I4_History.db",
    # "I5_Calendar.sqlitedb",
    # "I5_Extras.db",
]
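
Editor note: the commit message says "add automated process (folder level)", and the sql_utils.py helpers below suggest a driver that walks this list. A minimal sketch under those assumptions (the `run_discovery` placeholder and the exact file layout are not shown in this diff):

# drive_folder.py (hypothetical driver, not part of this commit)
from pathlib import Path
from sql_utils import (
    is_sqlite_file, load_db_files_list, build_db_paths,
    print_db_path_report, save_jsonl,
)

db_dir = Path("selectedDBs")
db_files = load_db_files_list(Path("db_files.py"))
db_paths, missing, not_sqlite = build_db_paths(db_dir, db_files, is_sqlite_file)
print_db_path_report(db_paths, missing, not_sqlite)

all_results = []
for db_path in db_paths:
    # run_discovery stands for invoking the notebook's LangGraph app against
    # one database; it is assumed here, not defined in this commit.
    all_results.append({"db": str(db_path)})

save_jsonl(all_results, Path("."))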
BIN selectedDBs/test2.db Normal file
Binary file not shown.
80 sql_utils.py
@@ -1,7 +1,8 @@
import re
import json
import sys

from pathlib import Path
from datetime import datetime, timezone


def extract_tables_with_aliases(select_sql: str) -> dict[str, str]:
@@ -270,4 +271,79 @@ def extract_select_columns(select_sql: str) -> list[str]:
        # Take the final identifier
        columns.append(item.split()[-1])

    return columns


def is_sqlite_file(p: Path) -> bool:
    try:
        with p.open("rb") as f:
            return f.read(16) == b"SQLite format 3\x00"
    except Exception:
        return False


import importlib.util
from typing import List, Tuple


def load_db_files_list(py_path: Path, var_name: str = "db_files") -> List[str]:
    """Load a list variable (default: db_files) from a .py file."""
    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load module from {py_path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)  # type: ignore

    if not hasattr(mod, var_name):
        raise AttributeError(f"{py_path} does not define `{var_name}`")
    value = getattr(mod, var_name)
    if not isinstance(value, list):
        raise TypeError(f"`{var_name}` must be a list, got {type(value)}")
    return value
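
# Example (editor sketch, not part of sql_utils.py): loading the list that
# db_files.py defines above; the relative paths are assumptions.
#
#     from pathlib import Path
#     names = load_db_files_list(Path("db_files.py"))
#     print(names)  # ['test2.db', ...]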

def build_db_paths(
    db_dir: Path,
    db_files: List[str],
    is_sqlite_fn,
) -> Tuple[List[Path], List[str], List[str]]:
    """
    Build ordered paths from filenames, skipping missing and non-sqlite files.
    Returns (db_paths, missing, not_sqlite).
    """
    db_paths: List[Path] = []
    missing: List[str] = []
    not_sqlite: List[str] = []

    for name in db_files:
        p = db_dir / name
        if not p.exists():
            missing.append(str(p))
            continue
        if not is_sqlite_fn(p):
            not_sqlite.append(str(p))
            continue
        db_paths.append(p)

    return db_paths, missing, not_sqlite
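
# Example (editor sketch, not part of sql_utils.py): filtering the configured
# folder and reporting what will be processed; the folder name follows the
# selectedDBs/ directory added in this commit.
#
#     db_paths, missing, not_sqlite = build_db_paths(
#         Path("selectedDBs"), load_db_files_list(Path("db_files.py")), is_sqlite_file
#     )
#     print_db_path_report(db_paths, missing, not_sqlite)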

def print_db_path_report(db_paths: List[Path], missing: List[str], not_sqlite: List[str]) -> None:
    print(f"Will process {len(db_paths)} databases (from the db_files list).")
    if missing:
        print("Missing files:")
        for x in missing:
            print("  -", x)
    if not_sqlite:
        print("Not SQLite (bad header):")
        for x in not_sqlite:
            print("  -", x)


def save_jsonl(all_results, out_dir):
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = out_dir / f"evidence_{ts}.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        for r in all_results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"Wrote: {out_path.resolve()}")
    return out_path
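
# Example (editor sketch, not part of sql_utils.py): each element of
# all_results should be one JSON-serializable record; one line is written
# per record, and the timestamped filename avoids clobbering earlier runs.
#
#     save_jsonl([{"db": "test2.db", "entity": "PHONE", "evidence": []}], Path("."))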