From 2b8e1fa0a75b1415d830b3d60017e4c4ef5d8922 Mon Sep 17 00:00:00 2001
From: Claudio Scheer
Date: Sun, 10 May 2020 22:37:13 -0300
Subject: [PATCH] Split dataset into train and test

---
 stupid-bot/src/notebooks/dataset.ipynb | 236 +++++++++++--------------
 1 file changed, 104 insertions(+), 132 deletions(-)

diff --git a/stupid-bot/src/notebooks/dataset.ipynb b/stupid-bot/src/notebooks/dataset.ipynb
index 40792e4..a4e93ac 100644
--- a/stupid-bot/src/notebooks/dataset.ipynb
+++ b/stupid-bot/src/notebooks/dataset.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,78 +25,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionanswer
0how are you?good
1how are you?sad
2how are you?upset
3how old are you?23 years old
4how old are you?9 years old
\n", - "
" - ], - "text/plain": [ - " question answer\n", - "0 how are you? good\n", - "1 how are you? sad\n", - "2 how are you? upset\n", - "3 how old are you? 23 years old\n", - "4 how old are you? 9 years old" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dataset_path = \"../../dataset/data.csv\"\n", "data = pd.read_csv(dataset_path, header=0)\n", @@ -112,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -152,23 +83,29 @@ " x = self.text2int(x)\n", " # One-hot encode x.\n", " x = self.one_hot_encode(x)\n", - " x = torch.tensor(x).cuda()\n", + " x = torch.tensor(x)\n", " \n", " y = self.answers[index]\n", " # Map text to int.\n", " y = self.text2int(y)\n", " # One-hot encode y.\n", " y = self.one_hot_encode(y)\n", - " y = torch.tensor(y).cuda()\n", + " y = torch.tensor(y)\n", " return x, y\n", "\n", " def __len__(self):\n", " return self.data_len\n", " \n", " def text2int(self, text):\n", + " \"\"\"\n", + " Convert text to an array of integers.\n", + " \"\"\"\n", " return [self.char2int[c] for c in text]\n", " \n", " def one_hot_encode(self, sequence):\n", + " \"\"\"\n", + " Convert an array of integers to a matrix one-hot encoded.\n", + " \"\"\"\n", " encoded = np.zeros([self.unique_characters_length, len(sequence)], dtype=int)\n", " for i, character in enumerate(sequence):\n", " encoded[character][i] = 1\n", @@ -176,71 +113,81 @@ " \n", " def one_hot_decode(self, sequence):\n", " \"\"\"\n", - " sequence: expected to be a PyTorch tensor.\n", + " sequence: PyTorch tensor.\n", " \"\"\"\n", " return [np.argmax(x) for x in sequence.numpy().T]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell below shows an example of how to use the `StupidBotDataset` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = StupidBotDataset(dataset_path)\n", + "dataset[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Divide dataset into training and testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to divide the dataset into training and testing. To do this, I will use the tools provided by PyTorch.\n", + "\n", + "The dataset will be loaded and shuffled. In large datasets, this can be a problem. However, as this dataset is small, I will use this approach." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data.sampler import SubsetRandomSampler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the dataset and define the parameters used to split and load it:"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n",
-       " tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-       "         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'))"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "dataset = StupidBotDataset(dataset_path)\n",
-    "dataset[1]"
+    "dataset_size = len(dataset)\n",
+    "dataset_indices = list(range(dataset_size))\n",
+    "\n",
+    "batch_size = 1\n",
+    "test_split = int(np.floor(0.2 * dataset_size))  # 20% of the samples\n",
+    "# Shuffle dataset indices.\n",
+    "np.random.shuffle(dataset_indices)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Split the dataset:"
+   ]
+  },
@@ -248,7 +195,32 @@
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "train_indices, test_indices = (\n",
+    "    dataset_indices[test_split:],\n",
+    "    dataset_indices[:test_split],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the train and test datasets:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_sampler = SubsetRandomSampler(train_indices)\n",
+    "test_sampler = SubsetRandomSampler(test_indices)\n",
+    "\n",
+    "train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)\n",
+    "test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)"
+   ]
+  }
  ],
 "metadata": {
--
GitLab