From 2b8e1fa0a75b1415d830b3d60017e4c4ef5d8922 Mon Sep 17 00:00:00 2001
From: Claudio Scheer
Date: Sun, 10 May 2020 22:37:13 -0300
Subject: [PATCH] Split dataset into train and test

---
 stupid-bot/src/notebooks/dataset.ipynb | 236 +++++++++++--------------
 1 file changed, 104 insertions(+), 132 deletions(-)

diff --git a/stupid-bot/src/notebooks/dataset.ipynb b/stupid-bot/src/notebooks/dataset.ipynb
index 40792e4..a4e93ac 100644
--- a/stupid-bot/src/notebooks/dataset.ipynb
+++ b/stupid-bot/src/notebooks/dataset.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,78 +25,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionanswer
0how are you?good
1how are you?sad
2how are you?upset
3how old are you?23 years old
4how old are you?9 years old
\n", - "
" - ], - "text/plain": [ - " question answer\n", - "0 how are you? good\n", - "1 how are you? sad\n", - "2 how are you? upset\n", - "3 how old are you? 23 years old\n", - "4 how old are you? 9 years old" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dataset_path = \"../../dataset/data.csv\"\n", "data = pd.read_csv(dataset_path, header=0)\n", @@ -112,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -152,23 +83,29 @@ " x = self.text2int(x)\n", " # One-hot encode x.\n", " x = self.one_hot_encode(x)\n", - " x = torch.tensor(x).cuda()\n", + " x = torch.tensor(x)\n", " \n", " y = self.answers[index]\n", " # Map text to int.\n", " y = self.text2int(y)\n", " # One-hot encode y.\n", " y = self.one_hot_encode(y)\n", - " y = torch.tensor(y).cuda()\n", + " y = torch.tensor(y)\n", " return x, y\n", "\n", " def __len__(self):\n", " return self.data_len\n", " \n", " def text2int(self, text):\n", + " \"\"\"\n", + " Convert text to an array of integers.\n", + " \"\"\"\n", " return [self.char2int[c] for c in text]\n", " \n", " def one_hot_encode(self, sequence):\n", + " \"\"\"\n", + " Convert an array of integers to a matrix one-hot encoded.\n", + " \"\"\"\n", " encoded = np.zeros([self.unique_characters_length, len(sequence)], dtype=int)\n", " for i, character in enumerate(sequence):\n", " encoded[character][i] = 1\n", @@ -176,71 +113,81 @@ " \n", " def one_hot_decode(self, sequence):\n", " \"\"\"\n", - " sequence: expected to be a PyTorch tensor.\n", + " sequence: PyTorch tensor.\n", " \"\"\"\n", " return [np.argmax(x) for x in sequence.numpy().T]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell below shows an example of how to use the `StupidBotDataset` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = StupidBotDataset(dataset_path)\n", + "dataset[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Divide dataset into training and testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to divide the dataset into training and testing. To do this, I will use the tools provided by PyTorch.\n", + "\n", + "The dataset will be loaded and shuffled. In large datasets, this can be a problem. However, as this dataset is small, I will use this approach." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data.sampler import SubsetRandomSampler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the dataset and define the parameters used to split and load it:"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n",
-       " tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'))"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "dataset = StupidBotDataset(dataset_path)\n",
-    "dataset[1]"
+    "dataset_size = len(dataset)\n",
+    "dataset_indices = list(range(dataset_size))\n",
+    "\n",
+    "batch_size = 1\n",
+    "test_split = int(np.floor(0.2 * dataset_size))  # 20% of the samples\n",
+    "# Shuffle dataset indices.\n",
+    "np.random.shuffle(dataset_indices)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Split the dataset:"
+   ]
+  },
@@ -248,7 +195,32 @@
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "train_indices, test_indices = (\n",
+    "    dataset_indices[test_split:],\n",
+    "    dataset_indices[:test_split],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the train and test datasets:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_sampler = SubsetRandomSampler(train_indices)\n",
+    "test_sampler = SubsetRandomSampler(test_indices)\n",
+    "\n",
+    "train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)\n",
+    "test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)"
+   ]
+  }
  ],
 "metadata": {
--
GitLab