mirror of
https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-02-20 13:50:41 +00:00
update huggingface code
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -4,3 +4,9 @@ ML/algorithms/svm/__pycache__/utils.cpython-38.pyc
|
||||
__pycache__/
|
||||
*.pth.tar
|
||||
*.DS_STORE
|
||||
ML/Pytorch/huggingface/train.csv
|
||||
ML/Pytorch/huggingface/validation.csv
|
||||
ML/Pytorch/huggingface/test.csv
|
||||
ML/Pytorch/huggingface/tb_logs/
|
||||
ML/Pytorch/huggingface/checkpoints/
|
||||
ML/Pytorch/huggingface/notebooks/
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,317 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f54ecf0b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"# HuggingFace Tutorial Series\n",
|
||||
"- 1. What is Huggingface?\n",
|
||||
"- 2. Common tasks we can do with HuggingFace & explain the tasks briefly, like what is question answering etc\n",
|
||||
"- 3. Using the HuggingFace Pipeline (High level feature)\n",
|
||||
"- 4. How the pipeline works at a lower level\n",
|
||||
"- 5. HuggingFace Datasets\n",
|
||||
"- 6. HuggingFace Tokenizer\n",
|
||||
"- 7. HuggingFace Evaluate\n",
|
||||
"- 8. HuggingFace Trainer\n",
|
||||
"- 9. Putting it together to finetune a news article summarizer\n",
|
||||
"- 10. Making it more general and robust with Lightning and custom data loading\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ec1aae37",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import warnings\n",
|
||||
"warnings.simplefilter(\"ignore\")\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import datasets \n",
|
||||
"import pytorch_lightning as pl\n",
|
||||
"from datasets import load_dataset, load_metric\n",
|
||||
"\n",
|
||||
"from transformers import (\n",
|
||||
" AutoModel,\n",
|
||||
" AutoModelForSeq2SeqLM,\n",
|
||||
" AutoTokenizer,\n",
|
||||
" DataCollatorForSeq2Seq,\n",
|
||||
" Seq2SeqTrainingArguments,\n",
|
||||
" Seq2SeqTrainer,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"from torch.utils.data import Dataset\n",
|
||||
"import pytorch_lightning as pl\n",
|
||||
"\n",
|
||||
"torch.set_float32_matmul_precision(\"medium\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5fd7cb0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"t5-small\"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "418cb03a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class cnn_dailymail(Dataset):\n",
|
||||
" def __init__(self, csv_file, tokenizer, max_length=512):\n",
|
||||
" self.data = pd.read_csv(csv_file)\n",
|
||||
" self.tokenizer = tokenizer\n",
|
||||
" self.max_length = max_length\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.data)\n",
|
||||
"\n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
" article = self.data.loc[idx, 'article']\n",
|
||||
" highlights = self.data.loc[idx, 'highlights']\n",
|
||||
"\n",
|
||||
" inputs = self.tokenizer(\n",
|
||||
" article,\n",
|
||||
" truncation=True,\n",
|
||||
" padding='max_length',\n",
|
||||
" max_length=self.max_length,\n",
|
||||
" return_tensors='pt'\n",
|
||||
" )\n",
|
||||
" targets = self.tokenizer(\n",
|
||||
" highlights,\n",
|
||||
" truncation=True,\n",
|
||||
" padding='max_length',\n",
|
||||
" max_length=self.max_length,\n",
|
||||
" return_tensors='pt'\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" 'input_ids': inputs['input_ids'].squeeze(),\n",
|
||||
" 'attention_mask': inputs['attention_mask'].squeeze(),\n",
|
||||
" 'labels': targets['input_ids'].squeeze()\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aaa62755",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MyDataModule(pl.LightningDataModule):\n",
|
||||
" def __init__(self, train_csv, val_csv, test_csv, tokenizer, batch_size=16, max_length=512):\n",
|
||||
" super().__init__()\n",
|
||||
" self.train_csv = train_csv\n",
|
||||
" self.val_csv = val_csv\n",
|
||||
" self.test_csv = test_csv\n",
|
||||
" self.tokenizer = tokenizer\n",
|
||||
" self.batch_size = batch_size\n",
|
||||
" self.max_length = max_length\n",
|
||||
"\n",
|
||||
" def setup(self, stage=None):\n",
|
||||
" if stage in ('fit', None):\n",
|
||||
" self.train_dataset = cnn_dailymail(self.train_csv, self.tokenizer, self.max_length)\n",
|
||||
" self.val_dataset = cnn_dailymail(self.val_csv, self.tokenizer, self.max_length)\n",
|
||||
" if stage in ('test', None):\n",
|
||||
" self.test_dataset = cnn_dailymail(self.test_csv, self.tokenizer, self.max_length)\n",
|
||||
"\n",
|
||||
" def train_dataloader(self):\n",
|
||||
" return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)\n",
|
||||
"\n",
|
||||
" def val_dataloader(self):\n",
|
||||
" return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)\n",
|
||||
"\n",
|
||||
" def test_dataloader(self):\n",
|
||||
" return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fbb699e1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MyLightningModule(pl.LightningModule):\n",
|
||||
" def __init__(self, model_name, learning_rate, weight_decay):\n",
|
||||
" super().__init__()\n",
|
||||
" self.model_name = model_name\n",
|
||||
" self.learning_rate = learning_rate\n",
|
||||
" self.weight_decay = weight_decay\n",
|
||||
" \n",
|
||||
" # Load the pre-trained model and tokenizer\n",
|
||||
" self.model = torch.compile(AutoModelForSeq2SeqLM.from_pretrained(self.model_name))\n",
|
||||
" \n",
|
||||
" # Load the ROUGE metric\n",
|
||||
" self.metric = load_metric(\"rouge\")\n",
|
||||
"\n",
|
||||
" def forward(self, input_ids, attention_mask, labels=None):\n",
|
||||
" output = self.model(\n",
|
||||
" input_ids=input_ids,\n",
|
||||
" attention_mask=attention_mask,\n",
|
||||
" labels=labels,\n",
|
||||
" )\n",
|
||||
" return output.loss, output.logits\n",
|
||||
" \n",
|
||||
" def training_step(self, batch, batch_idx):\n",
|
||||
" input_ids = batch[\"input_ids\"]\n",
|
||||
" attention_mask = batch[\"attention_mask\"]\n",
|
||||
" labels = batch[\"labels\"]\n",
|
||||
" loss, logits = self(input_ids, attention_mask, labels)\n",
|
||||
" self.log('train_loss', loss, on_epoch=True, on_step=True, prog_bar=True)\n",
|
||||
" return {'loss': loss, 'logits': logits}\n",
|
||||
" \n",
|
||||
" def validation_step(self, batch, batch_idx):\n",
|
||||
" input_ids = batch[\"input_ids\"]\n",
|
||||
" attention_mask = batch[\"attention_mask\"]\n",
|
||||
" labels = batch[\"labels\"]\n",
|
||||
" loss, logits = self(input_ids, attention_mask, labels)\n",
|
||||
" self.log('val_loss', loss, on_epoch=True, on_step=False)\n",
|
||||
" \n",
|
||||
" # Save logits and labels as instance attributes\n",
|
||||
" if not hasattr(self, \"logits\"):\n",
|
||||
" self.logits = logits\n",
|
||||
" else:\n",
|
||||
" self.logits = torch.cat((self.logits, logits), dim=0)\n",
|
||||
" \n",
|
||||
" if not hasattr(self, \"labels\"):\n",
|
||||
" self.labels = labels\n",
|
||||
" else:\n",
|
||||
" self.labels = torch.cat((self.labels, labels), dim=0)\n",
|
||||
" \n",
|
||||
" return {'loss': loss, 'logits': logits, \"labels\":labels}\n",
|
||||
" \n",
|
||||
" def on_validation_epoch_end(self):\n",
|
||||
" # Convert logits to predicted token IDs\n",
|
||||
" pred_token_ids = self.logits.argmax(dim=-1)\n",
|
||||
"\n",
|
||||
" # Decode predictions and labels using the saved instance attributes\n",
|
||||
" decoded_preds = tokenizer.batch_decode(pred_token_ids, skip_special_tokens=True)\n",
|
||||
" decoded_labels = tokenizer.batch_decode(self.labels, skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
" # Compute ROUGE scores\n",
|
||||
" scores = self.metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
|
||||
"\n",
|
||||
" self.log('rouge1_precision', scores.precision, prog_bar=True)\n",
|
||||
" self.log('rouge1_recall', scores.recall, prog_bar=True)\n",
|
||||
" self.log('rouge1_fmeasure', scores.fmeasure, prog_bar=True)\n",
|
||||
"\n",
|
||||
" # Clear logits and labels instance attributes for the next validation epoch\n",
|
||||
" del self.logits\n",
|
||||
" del self.labels\n",
|
||||
" \n",
|
||||
" def configure_optimizers(self):\n",
|
||||
" optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)\n",
|
||||
" return optimizer\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dd63c628",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# File paths\n",
|
||||
"train_csv = \"train.csv\"\n",
|
||||
"val_csv = \"validation.csv\"\n",
|
||||
"test_csv = \"test.csv\"\n",
|
||||
"\n",
|
||||
"# Create the data module\n",
|
||||
"dm = MyDataModule(train_csv, val_csv, test_csv, tokenizer, batch_size=16)\n",
|
||||
"dm.setup()\n",
|
||||
"\n",
|
||||
"model = MyLightningModule(model_name=\"t5-small\", learning_rate=1e-4, weight_decay=1e-5)\n",
|
||||
"trainer = pl.Trainer(accelerator=\"gpu\", devices=[0], max_epochs=1, precision=16)\n",
|
||||
"trainer.fit(model, datamodule=dm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b5d3d684",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# http://localhost:18888/notebooks/cnndaily_t5_lightning_customdataloading.ipynb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a0494596",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### next steps:\n",
|
||||
"* if article is > 512, because now we are truncating maybe it causes issues if the article is much longer?\n",
|
||||
"\n",
|
||||
"#### what we've done:\n",
|
||||
"* Change the data loading so it's more general, meaning on the fly loading from disk\n",
|
||||
"* add torch.compile\n",
|
||||
"* 1. Clean up the code, make it into scripts instead of notebook -> Train for an epoch (add multi-gpu training?)\n",
|
||||
"* add tensorboard visualization\n",
|
||||
"* not use pretrained weights but from scratch to ensure that training setup works and actually improving\n",
|
||||
"* 2. Create an inference step, send in news article -> get summary, check that it works\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "80a2efab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0f9b71ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,317 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f54ecf0b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"# HuggingFace Tutorial Series\n",
|
||||
"- 1. What is Huggingface?\n",
|
||||
"- 2. Common tasks we can do with HuggingFace & explain the tasks briefly, like what is question answering etc\n",
|
||||
"- 3. Using the HuggingFace Pipeline (High level feature)\n",
|
||||
"- 4. How the pipeline works at a lower level\n",
|
||||
"- 5. HuggingFace Datasets\n",
|
||||
"- 6. HuggingFace Tokenizer\n",
|
||||
"- 7. HuggingFace Evaluate\n",
|
||||
"- 8. HuggingFace Trainer\n",
|
||||
"- 9. Putting it together to finetune a news article summarizer\n",
|
||||
"- 10. Making it more general and robust with Lightning and custom data loading\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ec1aae37",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import warnings\n",
|
||||
"warnings.simplefilter(\"ignore\")\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import datasets \n",
|
||||
"import pytorch_lightning as pl\n",
|
||||
"from datasets import load_dataset, load_metric\n",
|
||||
"\n",
|
||||
"from transformers import (\n",
|
||||
" AutoModel,\n",
|
||||
" AutoModelForSeq2SeqLM,\n",
|
||||
" AutoTokenizer,\n",
|
||||
" DataCollatorForSeq2Seq,\n",
|
||||
" Seq2SeqTrainingArguments,\n",
|
||||
" Seq2SeqTrainer,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"from torch.utils.data import Dataset\n",
|
||||
"import pytorch_lightning as pl\n",
|
||||
"\n",
|
||||
"torch.set_float32_matmul_precision(\"medium\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5fd7cb0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"t5-small\"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "418cb03a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class cnn_dailymail(Dataset):\n",
|
||||
" def __init__(self, csv_file, tokenizer, max_length=512):\n",
|
||||
" self.data = pd.read_csv(csv_file)\n",
|
||||
" self.tokenizer = tokenizer\n",
|
||||
" self.max_length = max_length\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.data)\n",
|
||||
"\n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
" article = self.data.loc[idx, 'article']\n",
|
||||
" highlights = self.data.loc[idx, 'highlights']\n",
|
||||
"\n",
|
||||
" inputs = self.tokenizer(\n",
|
||||
" article,\n",
|
||||
" truncation=True,\n",
|
||||
" padding='max_length',\n",
|
||||
" max_length=self.max_length,\n",
|
||||
" return_tensors='pt'\n",
|
||||
" )\n",
|
||||
" targets = self.tokenizer(\n",
|
||||
" highlights,\n",
|
||||
" truncation=True,\n",
|
||||
" padding='max_length',\n",
|
||||
" max_length=self.max_length,\n",
|
||||
" return_tensors='pt'\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" 'input_ids': inputs['input_ids'].squeeze(),\n",
|
||||
" 'attention_mask': inputs['attention_mask'].squeeze(),\n",
|
||||
" 'labels': targets['input_ids'].squeeze()\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aaa62755",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MyDataModule(pl.LightningDataModule):\n",
|
||||
" def __init__(self, train_csv, val_csv, test_csv, tokenizer, batch_size=16, max_length=512):\n",
|
||||
" super().__init__()\n",
|
||||
" self.train_csv = train_csv\n",
|
||||
" self.val_csv = val_csv\n",
|
||||
" self.test_csv = test_csv\n",
|
||||
" self.tokenizer = tokenizer\n",
|
||||
" self.batch_size = batch_size\n",
|
||||
" self.max_length = max_length\n",
|
||||
"\n",
|
||||
" def setup(self, stage=None):\n",
|
||||
" if stage in ('fit', None):\n",
|
||||
" self.train_dataset = cnn_dailymail(self.train_csv, self.tokenizer, self.max_length)\n",
|
||||
" self.val_dataset = cnn_dailymail(self.val_csv, self.tokenizer, self.max_length)\n",
|
||||
" if stage in ('test', None):\n",
|
||||
" self.test_dataset = cnn_dailymail(self.test_csv, self.tokenizer, self.max_length)\n",
|
||||
"\n",
|
||||
" def train_dataloader(self):\n",
|
||||
" return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)\n",
|
||||
"\n",
|
||||
" def val_dataloader(self):\n",
|
||||
" return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)\n",
|
||||
"\n",
|
||||
" def test_dataloader(self):\n",
|
||||
" return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fbb699e1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MyLightningModule(pl.LightningModule):\n",
|
||||
" def __init__(self, model_name, learning_rate, weight_decay):\n",
|
||||
" super().__init__()\n",
|
||||
" self.model_name = model_name\n",
|
||||
" self.learning_rate = learning_rate\n",
|
||||
" self.weight_decay = weight_decay\n",
|
||||
" \n",
|
||||
" # Load the pre-trained model and tokenizer\n",
|
||||
" self.model = torch.compile(AutoModelForSeq2SeqLM.from_pretrained(self.model_name))\n",
|
||||
" \n",
|
||||
" # Load the ROUGE metric\n",
|
||||
" self.metric = load_metric(\"rouge\")\n",
|
||||
"\n",
|
||||
" def forward(self, input_ids, attention_mask, labels=None):\n",
|
||||
" output = self.model(\n",
|
||||
" input_ids=input_ids,\n",
|
||||
" attention_mask=attention_mask,\n",
|
||||
" labels=labels,\n",
|
||||
" )\n",
|
||||
" return output.loss, output.logits\n",
|
||||
" \n",
|
||||
" def training_step(self, batch, batch_idx):\n",
|
||||
" input_ids = batch[\"input_ids\"]\n",
|
||||
" attention_mask = batch[\"attention_mask\"]\n",
|
||||
" labels = batch[\"labels\"]\n",
|
||||
" loss, logits = self(input_ids, attention_mask, labels)\n",
|
||||
" self.log('train_loss', loss, on_epoch=True, on_step=True, prog_bar=True)\n",
|
||||
" return {'loss': loss, 'logits': logits}\n",
|
||||
" \n",
|
||||
" def validation_step(self, batch, batch_idx):\n",
|
||||
" input_ids = batch[\"input_ids\"]\n",
|
||||
" attention_mask = batch[\"attention_mask\"]\n",
|
||||
" labels = batch[\"labels\"]\n",
|
||||
" loss, logits = self(input_ids, attention_mask, labels)\n",
|
||||
" self.log('val_loss', loss, on_epoch=True, on_step=False)\n",
|
||||
" \n",
|
||||
" # Save logits and labels as instance attributes\n",
|
||||
" if not hasattr(self, \"logits\"):\n",
|
||||
" self.logits = logits\n",
|
||||
" else:\n",
|
||||
" self.logits = torch.cat((self.logits, logits), dim=0)\n",
|
||||
" \n",
|
||||
" if not hasattr(self, \"labels\"):\n",
|
||||
" self.labels = labels\n",
|
||||
" else:\n",
|
||||
" self.labels = torch.cat((self.labels, labels), dim=0)\n",
|
||||
" \n",
|
||||
" return {'loss': loss, 'logits': logits, \"labels\":labels}\n",
|
||||
" \n",
|
||||
" def on_validation_epoch_end(self):\n",
|
||||
" # Convert logits to predicted token IDs\n",
|
||||
" pred_token_ids = self.logits.argmax(dim=-1)\n",
|
||||
"\n",
|
||||
" # Decode predictions and labels using the saved instance attributes\n",
|
||||
" decoded_preds = tokenizer.batch_decode(pred_token_ids, skip_special_tokens=True)\n",
|
||||
" decoded_labels = tokenizer.batch_decode(self.labels, skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
" # Compute ROUGE scores\n",
|
||||
" scores = self.metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
|
||||
"\n",
|
||||
" self.log('rouge1_precision', scores.precision, prog_bar=True)\n",
|
||||
" self.log('rouge1_recall', scores.recall, prog_bar=True)\n",
|
||||
" self.log('rouge1_fmeasure', scores.fmeasure, prog_bar=True)\n",
|
||||
"\n",
|
||||
" # Clear logits and labels instance attributes for the next validation epoch\n",
|
||||
" del self.logits\n",
|
||||
" del self.labels\n",
|
||||
" \n",
|
||||
" def configure_optimizers(self):\n",
|
||||
" optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)\n",
|
||||
" return optimizer\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dd63c628",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# File paths\n",
|
||||
"train_csv = \"train.csv\"\n",
|
||||
"val_csv = \"validation.csv\"\n",
|
||||
"test_csv = \"test.csv\"\n",
|
||||
"\n",
|
||||
"# Create the data module\n",
|
||||
"dm = MyDataModule(train_csv, val_csv, test_csv, tokenizer, batch_size=16)\n",
|
||||
"dm.setup()\n",
|
||||
"\n",
|
||||
"model = MyLightningModule(model_name=\"t5-small\", learning_rate=1e-4, weight_decay=1e-5)\n",
|
||||
"trainer = pl.Trainer(accelerator=\"gpu\", devices=[0], max_epochs=1, precision=16)\n",
|
||||
"trainer.fit(model, datamodule=dm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b5d3d684",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# http://localhost:18888/notebooks/cnndaily_t5_lightning_customdataloading.ipynb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a0494596",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### next steps:\n",
|
||||
"* if article is > 512, because now we are truncating maybe it causes issues if the article is much longer?\n",
|
||||
"\n",
|
||||
"#### what we've done:\n",
|
||||
"* Change the data loading so it's more general, meaning on the fly loading from disk\n",
|
||||
"* add torch.compile\n",
|
||||
"* 1. Clean up the code, make it into scripts instead of notebook -> Train for an epoch (add multi-gpu training?)\n",
|
||||
"* add tensorboard visualization\n",
|
||||
"* not use pretrained weights but from scratch to ensure that training setup works and actually improving\n",
|
||||
"* 2. Create an inference step, send in news article -> get summary, check that it works\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "80a2efab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0f9b71ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
89
ML/Pytorch/huggingface/dataset.py
Normal file
89
ML/Pytorch/huggingface/dataset.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import pandas as pd
|
||||
import pytorch_lightning as pl
|
||||
from torch.utils.data import Dataset
|
||||
import torch
|
||||
|
||||
|
||||
class cnn_dailymail(Dataset):
    """PyTorch ``Dataset`` over a CNN/DailyMail-style summarization CSV.

    Each CSV row must provide an ``article`` column (model input) and a
    ``highlights`` column (target summary). Tokenization happens lazily in
    ``__getitem__`` so the full corpus is never tokenized up front.
    """

    def __init__(self, csv_file, tokenizer, max_length=512):
        """
        Args:
            csv_file: path or file-like object accepted by ``pd.read_csv``.
            tokenizer: HuggingFace-style tokenizer callable supporting
                ``truncation``, ``padding``, ``max_length`` and
                ``return_tensors`` keyword arguments.
            max_length: length both articles and summaries are
                padded/truncated to.
        """
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Dataset indices are positional, so use .iloc: unlike .loc it stays
        # correct even when the DataFrame's index is not the default
        # RangeIndex (e.g. after filtering or sampling without reset_index).
        row = self.data.iloc[idx]
        article = row["article"]
        highlights = row["highlights"]

        inputs = self.tokenizer(
            article,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        targets = self.tokenizer(
            highlights,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer returns tensors of shape (1, max_length); drop the
        # batch dimension so the DataLoader's default collate can stack items.
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze(),
        }
|
||||
|
||||
|
||||
class MyDataModule(pl.LightningDataModule):
    """LightningDataModule wrapping the train/val/test CNN/DailyMail CSVs.

    Holds the split file paths and tokenizer, builds one ``cnn_dailymail``
    dataset per split in ``setup``, and serves the corresponding dataloaders.
    """

    def __init__(self, train_csv, val_csv, test_csv, tokenizer, batch_size=16, max_length=512):
        super().__init__()
        self.train_csv = train_csv
        self.val_csv = val_csv
        self.test_csv = test_csv
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length

    def _build(self, csv_path):
        # Single place to construct a split's dataset with the shared settings.
        return cnn_dailymail(csv_path, self.tokenizer, self.max_length)

    def setup(self, stage=None):
        # Lightning calls setup() per stage; build only what that stage
        # needs (everything when stage is None).
        if stage in ("fit", None):
            self.train_dataset = self._build(self.train_csv)
            self.val_dataset = self._build(self.val_csv)
        if stage in ("test", None):
            self.test_dataset = self._build(self.test_csv)

    def train_dataloader(self):
        # Only the training loader shuffles; pin_memory speeds host->GPU copies.
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            pin_memory=True,
            num_workers=6,
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=2,
        )

    def test_dataloader(self):
        return torch.utils.data.DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=1,
        )
|
||||
130
ML/Pytorch/huggingface/model.py
Normal file
130
ML/Pytorch/huggingface/model.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import torch
|
||||
import pytorch_lightning as pl
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import T5Config, T5ForConditionalGeneration
|
||||
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForSeq2Seq,
|
||||
Seq2SeqTrainingArguments,
|
||||
Seq2SeqTrainer,
|
||||
)
|
||||
|
||||
|
||||
class MyLightningModule(pl.LightningModule):
    """LightningModule training a T5 summarizer from scratch.

    Builds a randomly initialized ``T5ForConditionalGeneration`` from the
    "t5-small" config (wrapped in ``torch.compile``), logs train/val loss,
    accumulates validation predictions across steps, and reports ROUGE-1
    at the end of each validation epoch.
    """

    def __init__(self, model_name, learning_rate, weight_decay):
        """
        Args:
            model_name: HuggingFace model id used to load the tokenizer.
            learning_rate: AdamW learning rate.
            weight_decay: AdamW weight decay.
        """
        super().__init__()
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Load the pre-trained model and tokenizer
        #self.model = torch.compile(
        #    AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        #)

        # Create a T5-small configuration
        # NOTE(review): "t5-small" is hard-coded here, so model_name only
        # selects the tokenizer — confirm that is intentional.
        config = T5Config.from_pretrained("t5-small")

        # Initialize the T5 model with random weights
        self.model = torch.compile(T5ForConditionalGeneration(config))

        # Load the ROUGE metric
        self.metric = load_metric("rouge")
        # Per-epoch accumulators filled by validation_step and drained
        # by on_validation_epoch_end.
        self.logits = []
        self.labels = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the seq2seq model; returns (loss, logits).

        Loss is None when ``labels`` is not provided.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        """One optimization step; logs train_loss per step and per epoch."""
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, logits = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, on_epoch=True, on_step=True, prog_bar=True)
        return {"loss": loss, "logits": logits}

    def validation_step(self, batch, batch_idx):
        """One validation step; stashes greedy predictions for ROUGE."""
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, logits = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, on_epoch=True, on_step=False)

        # add logits and labels to instance attributes, but make sure to detach them
        # from the computational graph first
        # (argmax before storing also keeps only token ids on CPU, not the
        # full vocab-sized logit tensors)
        self.logits.append(logits.argmax(dim=-1).detach().cpu())
        self.labels.append(labels.detach().cpu())
        return {"loss": loss, "logits": logits, "labels": labels}

    def on_validation_epoch_end(self):
        """Decode accumulated predictions/labels and log ROUGE-1."""
        # Concatenate tensors in logits and labels lists
        pred_token_ids = torch.cat(self.logits, dim=0)
        true_labels = torch.cat(self.labels, dim=0)

        # Decode predictions and labels using the saved instance attributes
        decoded_preds = self.tokenizer.batch_decode(
            pred_token_ids, skip_special_tokens=True
        )
        decoded_labels = self.tokenizer.batch_decode(
            true_labels, skip_special_tokens=True
        )

        # Compute ROUGE scores
        scores = self.metric.compute(
            predictions=decoded_preds, references=decoded_labels, rouge_types=["rouge1"]
        )["rouge1"].mid

        self.log("rouge1_precision", scores.precision, prog_bar=True)
        self.log("rouge1_recall", scores.recall, prog_bar=True)
        self.log("rouge1_fmeasure", scores.fmeasure, prog_bar=True)

        # Clear logits and labels instance attributes for the next validation epoch
        self.logits.clear()
        self.labels.clear()

    def predict(self, article: str, max_input_length: int = 512, max_output_length: int = 150) -> str:
        """Summarize a single article string and return the decoded summary."""
        # Set the model to evaluation mode
        self.model.eval()

        # Tokenize the input article
        inputs = self.tokenizer(
            article,
            max_length=max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Move the input tensors to the same device as the model
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Generate summary
        with torch.no_grad():
            output = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_output_length,
                num_return_sequences=1,
            )

        # Decode and return the summary
        summary = self.tokenizer.decode(output[0], skip_special_tokens=True)
        return summary

    def configure_optimizers(self):
        """AdamW over all parameters with the configured LR and weight decay."""
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
        )
        return optimizer
|
||||
|
||||
|
||||
67
ML/Pytorch/huggingface/train.py
Normal file
67
ML/Pytorch/huggingface/train.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from dataset import MyDataModule
|
||||
from model import MyLightningModule
|
||||
import pytorch_lightning as pl
|
||||
from pytorch_lightning import Trainer
|
||||
from pytorch_lightning.callbacks import ModelCheckpoint
|
||||
from pytorch_lightning.loggers import TensorBoardLogger
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForSeq2Seq,
|
||||
Seq2SeqTrainingArguments,
|
||||
Seq2SeqTrainer,
|
||||
)
|
||||
import torch
|
||||
|
||||
torch.set_float32_matmul_precision("medium")
|
||||
|
||||
if __name__ == "__main__":
    # Training entry point: wires together the data module, the Lightning
    # module, TensorBoard logging and checkpointing, then runs fit + validate.

    # Define the checkpoint callback
    # save_top_k=-1 keeps every epoch checkpoint (one per epoch).
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath="checkpoints",
        filename="my_model-{epoch:02d}-{val_loss:.2f}",
        save_top_k=-1,
        every_n_epochs=1,
        verbose=True,
    )
    logger = TensorBoardLogger("tb_logs", name="t5_dailymail")

    model_name = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # File paths
    train_csv = "train.csv"
    val_csv = "validation.csv"
    test_csv = "test.csv"

    # Create the data module
    dm = MyDataModule(train_csv, val_csv, test_csv, tokenizer, batch_size=32)
    dm.setup()

    model = MyLightningModule(
        model_name="t5-small", learning_rate=1e-4, weight_decay=1e-5
    )

    # Optional resume path: load a previous checkpoint's weights before training.
    #checkpoint_path = "checkpoints/curr.ckpt"
    #checkpoint = torch.load(checkpoint_path)
    #model.load_state_dict(checkpoint["state_dict"])

    # Two-GPU mixed-precision training for 10 epochs.
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=[0, 1],
        max_epochs=10,
        precision=16,
        logger=logger,
        callbacks=[checkpoint_callback],
        log_every_n_steps=10,
    )
    trainer.fit(model, dm)
    trainer.validate(model, dm)
|
||||
|
||||
#example = """Former President Donald Trump claims in a social media post that he will be arrested next week. The claim comes while a New York prosecutor considers charging Trump in connection with hush money paid to adult film actress Stormy Daniels but there has been no official announcement of any plans for an indictment. What we know about Trump possibly facing criminal indictment in New York City. Trump has been entangled in several criminal investigations but the case related to Daniels is the longest-running of all of them, reaching back to 2016. On his platform Truth Social on Saturday morning, Trump cited "illegal leaks" that he will be arrested Tuesday and he called for protests. Trump, who is running for president in 2024, also defended himself, saying that he has not committed a crime — though he did not disclose what he expects to be charged with — and he accused the Manhattan District Attorney's Office of being "corrupt & highly political.". 'I'M BACK!' Trump posts on Facebook, YouTube for first time in two years. The Manhattan District Attorney's Office declined to comment on whether it will soon be pursing an arrest warrant for Trump. But the Associated Press reported that law enforcement officials in New York are discussing security preparations in anticipation that Trump may be indicted in coming weeks. If it does occur, Trump would become the first former president to be indicted in U.S. history."""
|
||||
#print(len(tokenizer(example)["input_ids"]))
|
||||
#summary = model.predict(example)
|
||||
#print(summary)
|
||||
Reference in New Issue
Block a user