{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "03fbbc9f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CSV path: C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\\RQ2\\app_total_columns.csv\n", "Exists: True\n", "Using BASE_DIR: C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\n", "Using CSV: C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\\RQ2\\app_total_columns.csv\n", "CSV exists: True\n", "\\begin{tabular}{|l|p{1.4cm}|p{1.8cm}|}\n", "\\hline\n", "\\textbf{Method/LLM} &\n", "\\textbf{Avg. Cols Examined} &\n", "\\textbf{Avg. Search Space Reduc.} \\\\\n", "\\hline\n", "bulk\\_extractor-v1.6 (baseline) & NA & 0.0\\% \\\\\n", "\\hline\n", "GPT-4o-mini & 9.1 & 85.0\\% \\\\\n", "\\hline\n", "Gemini-2.5-Pro & 0 & 100.0\\% \\\\\n", "\\hline\n", "Qwen2.5-72B & 31.5 & 41.2\\% \\\\\n", "\\hline\n", "LLaMA-3.1-70B-Instruct & 1 & 97.5\\% \\\\\n", "\\hline\n", "LLaMA-3.1-8B-Instruct & 0.4 & 99.9\\% \\\\\n", "\\hline\n", "Mixtral-8x22B & 3.2 & 90.3\\% \\\\\n", "\\hline\n", "Mixtral-8x7B & 2.6 & 97.2\\% \\\\\n", "\\hline\n", "Mistral-Large & 24.9 & 75.6\\% \\\\\n", "\\hline\n", "GPT-5.1 & 29.9 & 81.0\\% \\\\\n", "\\hline\n", "GPT-4.1 & 20.5 & 69.7\\% \\\\\n", "\\hline\n", "GPT-3.5-turbo & 2.8 & 98.6\\% \\\\\n", "\\hline\n", "\\end{tabular}\n" ] } ], "source": [ "import json\n", "import csv\n", "from pathlib import Path\n", "from statistics import mean\n", "\n", "# -------------------------------------------------\n", "# Auto-detect project root (Agent directory)\n", "# -------------------------------------------------\n", "BASE_DIR = Path(r\"C:\\Users\\cyfij\\OneDrive\\Desktop\\DFRWS 2026\\Agent\\RQs\")\n", "\n", "\n", "CANDIDATE_CSV = BASE_DIR / \"RQ2\" / \"app_total_columns.csv\"\n", "\n", "print(\"CSV path:\", CANDIDATE_CSV)\n", "print(\"Exists:\", CANDIDATE_CSV.exists())\n", "\n", "\n", "print(\"Using BASE_DIR:\", BASE_DIR)\n", "print(\"Using CSV:\", CANDIDATE_CSV)\n", "print(\"CSV exists:\", CANDIDATE_CSV.exists())\n", "\n", "# -------------------------------------------------\n", "# MODELS\n", "# -------------------------------------------------\n", "MODELS = {\n", " \"gpt_4o_mini\": \"GPT-4o-mini\",\n", " \"gemini_2_5_pro\": \"Gemini-2.5-Pro\",\n", " \"qwen_2_5_72b\": \"Qwen2.5-72B\",\n", " \"llama_3_1_70b\": \"LLaMA-3.1-70B-Instruct\",\n", " \"llama_3_1_8b\": \"LLaMA-3.1-8B-Instruct\",\n", " \"mixtral_8x22b\": \"Mixtral-8x22B\",\n", " \"mixtral_8x7b\": \"Mixtral-8x7B\",\n", " \"mistral_large\": \"Mistral-Large\",\n", " \"gpt_5_1\": \"GPT-5.1\",\n", " \"gpt_4_1\": \"GPT-4.1\",\n", " \"gpt_3_5_turbo\": \"GPT-3.5-turbo\",\n", "}\n", "\n", "# -------------------------------------------------\n", "# Load candidate totals per app\n", "# CSV must contain columns:\n", "# app_code,total_columns\n", "# -------------------------------------------------\n", "def load_candidate_totals(csv_path):\n", " totals = {}\n", " with open(csv_path, newline=\"\", encoding=\"utf-8\") as f:\n", " reader = csv.DictReader(f)\n", " for row in reader:\n", " app = row.get(\"app_code\")\n", " total = row.get(\"total_columns\")\n", " if app and total:\n", " totals[app] = int(total)\n", " return totals\n", "\n", "\n", "# -------------------------------------------------\n", "# Load scanned columns per app\n", "# From normalized_results//app_level/app_level.jsonl\n", "# -------------------------------------------------\n", "def load_scanned_cols(app_level_jsonl: Path):\n", " \"\"\"\n", " Reads app_level.jsonl (corpus/app-level format).\n", "\n", " Your format:\n", " {\n", " \"db_path\": \"selectedDBs\\\\A1\",\n", " ...\n", " \"Num_of_source_columns_unique\": 29\n", " }\n", "\n", " Returns:\n", " { \"A1\": scanned_columns }\n", " \"\"\"\n", "\n", " scanned = {}\n", "\n", " with app_level_jsonl.open(\"r\", encoding=\"utf-8\") as f:\n", " for line in f:\n", " line = line.strip()\n", " if not line:\n", " continue\n", "\n", " rec = json.loads(line)\n", "\n", " # ---- Extract app from db_path ----\n", " db_path = rec.get(\"db_path\", \"\")\n", " if not db_path:\n", " continue\n", "\n", " app = Path(db_path).name # \"A1\"\n", "\n", " # ---- Use UNIQUE count for efficiency ----\n", " n = rec.get(\"Num_of_source_columns_unique\")\n", " if n is None:\n", " n = rec.get(\"Num_of_source_columns\", 0)\n", "\n", " try:\n", " n = int(n)\n", " except:\n", " n = 0\n", "\n", " # Keep maximum seen per app\n", " scanned[app] = max(scanned.get(app, 0), n)\n", "\n", " return scanned\n", "\n", "\n", "\n", "\n", "# -------------------------------------------------\n", "# Main computation\n", "# -------------------------------------------------\n", "candidate_totals = load_candidate_totals(CANDIDATE_CSV)\n", "\n", "results = []\n", "\n", "for slug, display_name in MODELS.items():\n", "\n", " app_jsonl = (\n", " BASE_DIR\n", " / \"normalized_results\"\n", " / slug\n", " / \"app_level\"\n", " / \"app_level.jsonl\"\n", " )\n", "\n", " if not app_jsonl.exists():\n", " results.append((display_name, \"xx\", \"xx\"))\n", " continue\n", "\n", " scanned_cols = load_scanned_cols(app_jsonl)\n", "\n", " examined_vals = []\n", " reduction_vals = []\n", "\n", " for app, total_cols in candidate_totals.items():\n", "\n", " scanned = scanned_cols.get(app, 0)\n", " examined_vals.append(scanned)\n", "\n", " if total_cols > 0:\n", " reduction = 1 - (scanned / total_cols)\n", " reduction_vals.append(reduction)\n", "\n", " if examined_vals and reduction_vals:\n", " avg_examined = round(mean(examined_vals), 1)\n", " avg_reduction = round(mean(reduction_vals) * 100, 1)\n", " results.append((display_name, avg_examined, f\"{avg_reduction}\\\\%\"))\n", " else:\n", " results.append((display_name, \"xx\", \"xx\"))\n", "\n", "# -------------------------------------------------\n", "# Print LaTeX table\n", "# -------------------------------------------------\n", "print(r\"\\begin{tabular}{|l|p{1.4cm}|p{1.8cm}|}\")\n", "print(r\"\\hline\")\n", "print(r\"\\textbf{Method/LLM} &\")\n", "print(r\"\\textbf{Avg. Cols Examined} &\")\n", "print(r\"\\textbf{Avg. Search Space Reduc.} \\\\\")\n", "print(r\"\\hline\")\n", "print(r\"bulk\\_extractor-v1.6 (baseline) & NA & 0.0\\% \\\\\")\n", "print(r\"\\hline\")\n", "\n", "for name, cols, reduc in results:\n", " print(f\"{name} & {cols} & {reduc} \\\\\\\\\")\n", " print(r\"\\hline\")\n", "\n", "print(r\"\\end{tabular}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d17221d0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.0" } }, "nbformat": 4, "nbformat_minor": 5 }