diff --git a/stupid-bot/src/notebooks/dataset.ipynb b/stupid-bot/src/notebooks/dataset.ipynb
index 40792e44cc767c24267ae54d5eb9edf4aabb26c4..a4e93ac3747962c73a4fbc2da020396483fc639d 100644
--- a/stupid-bot/src/notebooks/dataset.ipynb
+++ b/stupid-bot/src/notebooks/dataset.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -25,78 +25,9 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " question | \n",
- " answer | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " how are you? | \n",
- " good | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " how are you? | \n",
- " sad | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " how are you? | \n",
- " upset | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " how old are you? | \n",
- " 23 years old | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " how old are you? | \n",
- " 9 years old | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " question answer\n",
- "0 how are you? good\n",
- "1 how are you? sad\n",
- "2 how are you? upset\n",
- "3 how old are you? 23 years old\n",
- "4 how old are you? 9 years old"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"dataset_path = \"../../dataset/data.csv\"\n",
"data = pd.read_csv(dataset_path, header=0)\n",
@@ -112,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -152,23 +83,29 @@
" x = self.text2int(x)\n",
" # One-hot encode x.\n",
" x = self.one_hot_encode(x)\n",
- " x = torch.tensor(x).cuda()\n",
+ " x = torch.tensor(x)\n",
" \n",
" y = self.answers[index]\n",
" # Map text to int.\n",
" y = self.text2int(y)\n",
" # One-hot encode y.\n",
" y = self.one_hot_encode(y)\n",
- " y = torch.tensor(y).cuda()\n",
+ " y = torch.tensor(y)\n",
" return x, y\n",
"\n",
" def __len__(self):\n",
" return self.data_len\n",
" \n",
" def text2int(self, text):\n",
+ " \"\"\"\n",
+ " Convert text to a list of integers, one per character.\n",
+ " \"\"\"\n",
" return [self.char2int[c] for c in text]\n",
" \n",
" def one_hot_encode(self, sequence):\n",
+ " \"\"\"\n",
+ " Convert a sequence of integers to a one-hot encoded matrix.\n",
+ " \"\"\"\n",
" encoded = np.zeros([self.unique_characters_length, len(sequence)], dtype=int)\n",
" for i, character in enumerate(sequence):\n",
" encoded[character][i] = 1\n",
@@ -176,71 +113,81 @@
" \n",
" def one_hot_decode(self, sequence):\n",
" \"\"\"\n",
- " sequence: expected to be a PyTorch tensor.\n",
+ " sequence: PyTorch tensor.\n",
" \"\"\"\n",
" return [np.argmax(x) for x in sequence.numpy().T]"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The cell below shows an example of how to use the `StupidBotDataset` class."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = StupidBotDataset(dataset_path)\n",
+ "dataset[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Split the dataset into training and test sets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next step is to split the dataset into a training set and a test set. To do this, I will use the tools provided by PyTorch.\n",
+ "\n",
+ "The dataset will be loaded and its indices shuffled before splitting. With large datasets this can be a problem; however, as this dataset is small, I will use this approach."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.utils.data.sampler import SubsetRandomSampler"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Load the dataset and define the parameters used to split and load it:"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 129,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n",
- " tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
- " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'))"
- ]
- },
- "execution_count": 129,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"dataset = StupidBotDataset(dataset_path)\n",
- "dataset[1]"
+ "dataset_size = len(dataset)\n",
+ "dataset_indices = list(range(dataset_size))\n",
+ "\n",
+ "batch_size = 1\n",
+ "test_split = int(np.floor(0.2 * dataset_size)) # 20% of the samples go to the test set\n",
+ "# Shuffle dataset indices with a fixed seed so the train/test split is reproducible.\n",
+ "np.random.RandomState(42).shuffle(dataset_indices)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Split the dataset indices into training and test subsets:"
]
},
{
@@ -248,7 +195,32 @@
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "train_indices, test_indices = (\n",
+ " dataset_indices[test_split:],\n",
+ " dataset_indices[:test_split],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create the data loaders for the training and test sets:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_sampler = SubsetRandomSampler(train_indices)\n",
+ "test_sampler = SubsetRandomSampler(test_indices)\n",
+ "\n",
+ "train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)\n",
+ "test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)"
+ ]
}
],
"metadata": {