diff --git a/generate-sentence/README.md b/README.md
similarity index 100%
rename from generate-sentence/README.md
rename to README.md
diff --git a/generate-sentence/documentation/original-post.pdf b/documentation/original-post.pdf
similarity index 100%
rename from generate-sentence/documentation/original-post.pdf
rename to documentation/original-post.pdf
diff --git a/generate-sentence/documentation/rnn-notes.png b/documentation/rnn-notes.png
similarity index 100%
rename from generate-sentence/documentation/rnn-notes.png
rename to documentation/rnn-notes.png
diff --git a/generate-sentence/documentation/tensor-sizes-through-network.ora b/documentation/tensor-sizes-through-network.ora
similarity index 100%
rename from generate-sentence/documentation/tensor-sizes-through-network.ora
rename to documentation/tensor-sizes-through-network.ora
diff --git a/generate-sentence/documentation/tensor-sizes-through-network.png b/documentation/tensor-sizes-through-network.png
similarity index 100%
rename from generate-sentence/documentation/tensor-sizes-through-network.png
rename to documentation/tensor-sizes-through-network.png
diff --git a/generate-sentence/src/main.py b/src/main.py
similarity index 100%
rename from generate-sentence/src/main.py
rename to src/main.py
diff --git a/generate-sentence/src/notebooks/rnn-network.ipynb b/src/notebooks/rnn-network.ipynb
similarity index 100%
rename from generate-sentence/src/notebooks/rnn-network.ipynb
rename to src/notebooks/rnn-network.ipynb
diff --git a/generate-sentence/src/rnn.py b/src/rnn.py
similarity index 100%
rename from generate-sentence/src/rnn.py
rename to src/rnn.py
diff --git a/stupid-bot/README.md b/stupid-bot/README.md
deleted file mode 100644
index 1f355b7c3a04a371624761907c5e85b8fb72e6d3..0000000000000000000000000000000000000000
--- a/stupid-bot/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-## To do
-
-- [ ] Remove dataset split in train and test
-- [ ] Use a softmax layer in the prediction
\ No newline at end of file
diff --git a/stupid-bot/dataset/data.csv b/stupid-bot/dataset/data.csv
deleted file mode 100644
index bc0bad912236b8ddf7ef8c7d37fc288b8118eb83..0000000000000000000000000000000000000000
--- a/stupid-bot/dataset/data.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-question,answer
-how are you?,sad
-what is your name?,john
-are you good?,no
-how old are you?,47 years old
-do you drink water?,yes a lot
-do you have coronavirus?,yes i am dying
\ No newline at end of file
diff --git a/stupid-bot/notebooks/dataset.ipynb b/stupid-bot/notebooks/dataset.ipynb
deleted file mode 100644
index 54c40174c59dd35fd9a4a08c42e3d0739310e42b..0000000000000000000000000000000000000000
--- a/stupid-bot/notebooks/dataset.ipynb
+++ /dev/null
@@ -1,315 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Dataset"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This section describes the process of loading the dataset used to train and test the model. The dataset I am using in this project is just as stupid as the network. The idea is just to learn more about recurrent neural networks."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>question</th>\n",
-       "      <th>answer</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>how are you?</td>\n",
-       "      <td>good</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>how are you?</td>\n",
-       "      <td>sad</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>how are you?</td>\n",
-       "      <td>upset</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>how old are you?</td>\n",
-       "      <td>23 years old</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>how old are you?</td>\n",
-       "      <td>9 years old</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "           question        answer\n",
-       "0      how are you?          good\n",
-       "1      how are you?           sad\n",
-       "2      how are you?         upset\n",
-       "3  how old are you?  23 years old\n",
-       "4  how old are you?   9 years old"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset_path = \"../dataset/data.csv\"\n",
-    "data = pd.read_csv(dataset_path, header=0)\n",
-    "data.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This dataset is not large enough to justify using the PyTorch `Dataset` utility class. However, I will use it anyway, since the goal is to practice the standard PyTorch workflow."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "from torch.utils.data.dataset import Dataset\n",
-    "import numpy as np\n",
-    "\n",
-    "\n",
-    "class StupidBotDataset(Dataset):\n",
-    "    def __init__(self, csv_path):\n",
-    "        self.data = pd.read_csv(csv_path, header=0)\n",
-    "        self.questions = self.data[\"question\"]\n",
-    "        self.answers = self.data[\"answer\"]\n",
-    "        self.data_len = len(self.data.index)\n",
-    "\n",
-    "        # Unique characters in the dataset.\n",
-    "        self.unique_characters = set(\"\".join(self.questions + self.answers))\n",
-    "        self.unique_characters_length = len(self.unique_characters)\n",
-    "        # Map int to character.\n",
-    "        self.int2char = dict(enumerate(self.unique_characters))\n",
-    "        # Map character to int.\n",
-    "        self.char2int = {char: i for i, char in self.int2char.items()}\n",
-    "\n",
-    "        # Length of the longest question.\n",
-    "        longer_question_length = len(max(self.questions, key=len))\n",
-    "        # Length of the longest answer.\n",
-    "        longer_answer_length = len(max(self.answers, key=len))\n",
-    "\n",
-    "        # Right-pad every string to a fixed length.\n",
-    "        self.questions = self.questions.str.pad(longer_question_length, side=\"right\")\n",
-    "        self.answers = self.answers.str.pad(longer_answer_length, side=\"right\")\n",
-    "\n",
-    "    def __getitem__(self, index):\n",
-    "        x = self.questions[index]\n",
-    "        # Map text to int.\n",
-    "        x = self.text2int(x)\n",
-    "        # One-hot encode x.\n",
-    "        x = self.one_hot_encode(x)\n",
-    "        x = torch.tensor(x)\n",
-    "\n",
-    "        y = self.answers[index]\n",
-    "        # Map text to int.\n",
-    "        y = self.text2int(y)\n",
-    "        # One-hot encode y.\n",
-    "        y = self.one_hot_encode(y)\n",
-    "        y = torch.tensor(y)\n",
-    "        return x, y\n",
-    "\n",
-    "    def __len__(self):\n",
-    "        return self.data_len\n",
-    "\n",
-    "    def text2int(self, text):\n",
-    "        \"\"\"\n",
-    "        Convert text to an array of integers.\n",
-    "        \"\"\"\n",
-    "        return [self.char2int[c] for c in text]\n",
-    "\n",
-    "    def one_hot_encode(self, sequence):\n",
-    "        \"\"\"\n",
-    "        Convert an array of integers to a one-hot encoded matrix.\n",
-    "        \"\"\"\n",
-    "        encoded = np.zeros([self.unique_characters_length, len(sequence)], dtype=int)\n",
-    "        for i, character in enumerate(sequence):\n",
-    "            encoded[character][i] = 1\n",
-    "        return encoded\n",
-    "\n",
-    "    def one_hot_decode(self, sequence):\n",
-    "        \"\"\"\n",
-    "        Convert a one-hot encoded PyTorch tensor back to an array of integers.\n",
-    "        \"\"\"\n",
-    "        return [np.argmax(x) for x in sequence.numpy().T]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The cell below shows an example of how to use the `StupidBotDataset` class."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = StupidBotDataset(dataset_path)\n",
-    "dataset[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Divide the dataset into training and testing sets"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The next step is to divide the dataset into training and testing sets. To do this, I will use the tools provided by PyTorch.\n",
-    "\n",
-    "The full list of indices is shuffled in memory, which can be a problem for large datasets. However, as this dataset is tiny, this approach is fine."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from torch.utils.data.sampler import SubsetRandomSampler"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Load the dataset and define the parameters used to split and load it:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = StupidBotDataset(dataset_path)\n",
-    "dataset_size = len(dataset)\n",
-    "dataset_indices = list(range(dataset_size))\n",
-    "\n",
-    "batch_size = 1\n",
-    "test_split = int(np.floor(0.2 * dataset_size))  # 20% of the data for testing.\n",
-    "# Shuffle the dataset indices.\n",
-    "np.random.shuffle(dataset_indices)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Split the indices:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_indices, test_indices = (\n",
-    "    dataset_indices[test_split:],\n",
-    "    dataset_indices[:test_split],\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create the train and test loaders:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_sampler = SubsetRandomSampler(train_indices)\n",
-    "test_sampler = SubsetRandomSampler(test_indices)\n",
-    "\n",
-    "train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)\n",
-    "test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
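To see what the loaders hand to the network, drawing a single batch is enough. A minimal sketch, assuming the notebook cells above have been run (the names all come from the notebook):

```python
# For the six-question CSV, floor(0.2 * 6) = 1, so the split is 5 train / 1 test.
print(len(train_indices), len(test_indices))

# With batch_size=1, each batch adds a leading batch dimension:
# x -> (1, n_unique_chars, question_length), y -> (1, n_unique_chars, answer_length).
x, y = next(iter(train_loader))
print(x.shape, y.shape)
```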
diff --git a/stupid-bot/notebooks/network.ipynb b/stupid-bot/notebooks/network.ipynb
deleted file mode 100644
index 534223c34840e6f7f12b4bf556b87f4d2f055fb8..0000000000000000000000000000000000000000
--- a/stupid-bot/notebooks/network.ipynb
+++ /dev/null
@@ -1,94 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Network"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This section describes the implemented network in detail. The network uses a single recurrent layer. A plain RNN, unlike an LSTM, cannot handle long-term dependencies. However, the dataset has short questions and answers, and the idea is just to test the performance of the network."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This is not a good approach for a bot. Maybe I should use some kind of NLP tokenization instead."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "\n",
-    "\n",
-    "class RNNModel(nn.Module):\n",
-    "    def __init__(self, input_size, output_size, hidden_dim, n_layers):\n",
-    "        super(RNNModel, self).__init__()\n",
-    "\n",
-    "        # Defining some parameters.\n",
-    "        self.hidden_dim = hidden_dim\n",
-    "        self.n_layers = n_layers\n",
-    "\n",
-    "        # region Defining the layers.\n",
-    "        # RNN layer.\n",
-    "        self.rnn = nn.RNN(\n",
-    "            input_size, hidden_dim, n_layers, batch_first=True, nonlinearity=\"relu\"\n",
-    "        )\n",
-    "        # Fully connected layer.\n",
-    "        self.fc = nn.Linear(hidden_dim, output_size)\n",
-    "        # endregion\n",
-    "\n",
-    "    def forward(self, x):\n",
-    "        batch_size = x.size(0)\n",
-    "\n",
-    "        # Initializing the hidden state for the first input using the method defined below.\n",
-    "        hidden = self.init_hidden(batch_size)  # (1, 3, 12)\n",
-    "\n",
-    "        # Passing the input and hidden state into the model and obtaining outputs.\n",
-    "        out, hidden = self.rnn(x, hidden)  # (3, 14, 12), (1, 3, 12)\n",
-    "\n",
-    "        # Reshaping the outputs so that they can be fed into the fully connected layer.\n",
-    "        out = out.contiguous().view(-1, self.hidden_dim)  # (42, 12)\n",
-    "        out = self.fc(out)  # (42, 17)\n",
-    "\n",
-    "        return out, hidden\n",
-    "\n",
-    "    def init_hidden(self, batch_size):\n",
-    "        # Generates the initial hidden state (all zeros) used in the forward pass.\n",
-    "        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).cuda()\n",
-    "        return hidden\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
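To see how the shapes in the comments above come about, here is a small sketch that traces a dummy batch through the model. The sizes (batch of 3, sequence length 14, 17 unique characters, hidden size 12) are taken from the comments in the notebook; the sketch assumes a CUDA device, since `init_hidden` hard-codes `.cuda()`:

```python
import torch

# Batch of 3 sequences, 14 characters each, one-hot encoded over 17 characters.
model = RNNModel(input_size=17, output_size=17, hidden_dim=12, n_layers=1).cuda()
x = torch.zeros(3, 14, 17).cuda()  # (batch, seq_len, input_size)

out, hidden = model(x)
print(out.shape)     # torch.Size([42, 17]) -> (batch * seq_len, output_size)
print(hidden.shape)  # torch.Size([1, 3, 12]) -> (n_layers, batch, hidden_dim)
```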
diff --git a/stupid-bot/src/data.py b/stupid-bot/src/data.py
deleted file mode 100644
index 6182a0159eb2c64d2dd1022f41c050f51ca747ba..0000000000000000000000000000000000000000
--- a/stupid-bot/src/data.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import torch
-from torch.utils.data.dataset import Dataset
-import pandas as pd
-import numpy as np
-
-
-class StupidBotDataset(Dataset):
-    def __init__(self, csv_path):
-        self.data = pd.read_csv(csv_path, header=0)
-        self.questions = self.data["question"]
-        self.answers = self.data["answer"]
-        self.data_len = len(self.data.index)
-
-        # Unique characters in the dataset.
-        self.unique_characters = set("".join(self.questions + self.answers))
-        self.unique_characters_length = len(self.unique_characters)  # 21
-
-        # Map int to character.
-        self.int2char = dict(enumerate(self.unique_characters))
-        # Map character to int.
-        self.char2int = {char: i for i, char in self.int2char.items()}
-
-        # Length of the longest question.
-        self.longer_question_length = len(max(self.questions, key=len))  # 16
-        # Answers are padded to the same length as the questions.
-        self.longer_answer_length = self.longer_question_length
-        # self.longer_answer_length = len(max(self.answers, key=len))  # 12
-
-        # Right-pad every string to a fixed length.
-        self.questions = self.questions.str.pad(
-            self.longer_question_length, side="right"
-        )
-        self.answers = self.answers.str.pad(self.longer_answer_length, side="right")
-
-    def __getitem__(self, index):
-        x = self.questions[index]
-        # Map text to int.
-        x = self.text2int(x)
-        # One-hot encode x.
-        x = self.one_hot_encode(x)
-        x = torch.tensor(x).float().cuda()
-
-        y = self.answers[index]
-        # Map text to int; y stays as class indices for CrossEntropyLoss.
-        y = self.text2int(y)
-        y = torch.tensor(y).float().cuda()
-        return x, y
-
-    def __len__(self):
-        return self.data_len
-
-    def text2int(self, text):
-        """
-        Convert text to an array of integers.
-        """
-        return [self.char2int[c] for c in text]
-
-    def int2text(self, sequence):
-        """
-        Convert an array of integers to text.
-        """
-        return [self.int2char[c] for c in sequence]
-
-    def one_hot_encode(self, sequence):
-        """
-        Convert an array of integers to a one-hot encoded matrix.
-        """
-        encoded = np.zeros([len(sequence), self.unique_characters_length], dtype=int)
-        for i, character in enumerate(sequence):
-            encoded[i][character] = 1
-        return encoded
-
-    def one_hot_decode(self, sequence):
-        """
-        Convert a one-hot encoded PyTorch tensor back to an array of integers.
-        """
-        return [np.argmax(x) for x in sequence.numpy()]
diff --git a/stupid-bot/src/main.py b/stupid-bot/src/main.py
deleted file mode 100644
index 7a5c3f1929c23efee71d086908667fa7f5170021..0000000000000000000000000000000000000000
--- a/stupid-bot/src/main.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.utils.data.dataset import Dataset
-from torch.utils.data.sampler import SubsetRandomSampler
-import numpy as np
-from rnn import RNNModel
-from data import StupidBotDataset
-
-
-dataset = StupidBotDataset("../dataset/data.csv")
-dataset_size = len(dataset)
-dataset_indices = list(range(dataset_size))
-
-batch_size = 1
-test_split = int(np.floor(0.2 * dataset_size))  # 20%
-# Shuffle the dataset indices.
-np.random.shuffle(dataset_indices)
-
-train_indices, test_indices = (
-    dataset_indices[test_split:],
-    dataset_indices[:test_split],
-)
-
-train_sampler = SubsetRandomSampler(train_indices)
-test_sampler = SubsetRandomSampler(test_indices)
-train_loader = torch.utils.data.DataLoader(
-    dataset, batch_size=batch_size, sampler=train_sampler
-)
-test_loader = torch.utils.data.DataLoader(
-    dataset, batch_size=batch_size, sampler=test_sampler
-)
-
-model = RNNModel(dataset.unique_characters_length, dataset.unique_characters_length)
-model.cuda()
-
-# Define the loss and optimizer functions.
-criterion = nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
-
-# Train the network.
-n_epochs = 100
-for epoch in range(1, n_epochs + 1):
-    for batch_index, (x, y) in enumerate(train_loader):
-        optimizer.zero_grad()
-
-        output, hidden = model(x)  # (16, 21), (1, 1, 32)
-        loss = criterion(output, y.view(-1).long())
-        loss.backward()
-        optimizer.step()
-
-    if epoch % 10 == 0:
-        print("Epoch: {}/{}.............".format(epoch, n_epochs), end=" ")
-        print("Loss: {:.4f}".format(loss.item()))
-
-
-def predict(model, question):
-    """
-    Return the model's answer to the question.
-    """
-    question = question.ljust(dataset.longer_question_length)
-    question = dataset.text2int(question)
-    question = dataset.one_hot_encode(question)
-    question = torch.from_numpy(np.array([question])).float().cuda()
-
-    out, hidden = model(question)
-
-    answer = dataset.one_hot_decode(out.cpu())
-    answer = dataset.int2text(answer)
-
-    return answer
-
-
-model.eval()
-with torch.no_grad():
-    prediction = predict(model, "how are you?")
-    print(prediction)
diff --git a/stupid-bot/src/rnn.py b/stupid-bot/src/rnn.py
deleted file mode 100644
index 241e656a487b69f1ba4feedce19e74d1bef79711..0000000000000000000000000000000000000000
--- a/stupid-bot/src/rnn.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import torch
-import torch.nn as nn
-
-
-class RNNModel(nn.Module):
-    def __init__(self, input_size, output_size):
-        super(RNNModel, self).__init__()
-
-        # Defining some parameters.
-        self.input_size = input_size
-        self.output_size = output_size
-        self.hidden_dim = 32
-        self.n_layers = 1
-
-        # region Defining the layers.
-        # RNN layer.
-        self.rnn = nn.RNN(
-            self.input_size,
-            self.hidden_dim,
-            self.n_layers,
-            batch_first=True,
-            nonlinearity="relu",
-        )
-        # Fully connected layer.
-        self.fc = nn.Linear(self.hidden_dim, self.output_size)
-        # endregion
-
-    def forward(self, x):
-        batch_size = x.size(0)
-
-        # Initializing the hidden state for the first input using the method defined below.
-        hidden = self.init_hidden(batch_size)  # (1, 1, 32)
-
-        # Passing the input and hidden state into the model and obtaining outputs.
-        out, hidden = self.rnn(
-            x, hidden
-        )  # input => (1, 16, 21), (1, 1, 32) | output => (1, 16, 32), (1, 1, 32)
-
-        # Reshaping the outputs so that they can be fed into the fully connected layer.
-        out = out.contiguous().view(-1, self.hidden_dim)  # (16, 32)
-        out = self.fc(out)  # (16, 21)
-
-        return out, hidden
-
-    def init_hidden(self, batch_size):
-        # Generates the initial hidden state (all zeros) used in the forward pass.
-        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).cuda()
-        return hidden