From f1d77b160592cbfb36702ea110a5b73ae9a42784 Mon Sep 17 00:00:00 2001
From: Claudio Scheer
Date: Tue, 2 Jun 2020 01:31:58 -0300
Subject: [PATCH] Remove custom scripts

---
 src/lib/__init__.py |  0
 src/lib/dataset.py  | 61 ---------------------------------------------
 src/main.py         | 23 -----------------
 3 files changed, 84 deletions(-)
 delete mode 100644 src/lib/__init__.py
 delete mode 100644 src/lib/dataset.py
 delete mode 100644 src/main.py

diff --git a/src/lib/__init__.py b/src/lib/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/lib/dataset.py b/src/lib/dataset.py
deleted file mode 100644
index 80d66ec..0000000
--- a/src/lib/dataset.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import json
-from transformers import BertTokenizer
-import os
-
-
-def get_dataset_generator(path):
-    with open(path, "r") as file:
-        data = file.read()
-    data = json.loads(data)
-    data = data["data"]
-    for x in data:
-        for y in x["paragraphs"]:
-            context = y["context"]
-            for z in y["qas"]:
-                question = z["question"]
-                for r in z["answers"]:
-                    answer = r["text"]
-                    answer_start = r["answer_start"]
-                    yield (context, question, answer, answer_start)
-
-
-def tokenize_batch(batch, labels, tokenizer):
-    batch = tokenizer.batch_encode_plus(
-        batch, max_length=512, return_tensors="pt", pad_to_max_length=True
-    )
-    return batch, labels
-
-
-def get_batches(dataset_generator, batch_size, tokenizer):
-    while True:
-        batch = []
-        labels = []
-        for _ in range(batch_size):
-            try:
-                (context, question, answer, answer_start) = next(dataset_generator)
-            except:
-                if len(batch) > 0:
-                    yield tokenize_batch(batch, labels, tokenizer)
-                return
-            batch.append((context, question))
-            labels.append(
-                (
-                    tokenizer.encode(
-                        answer, return_tensors="pt", pad_to_max_length=True
-                    ),
-                    answer_start,
-                )
-            )
-        yield tokenize_batch(batch, labels, tokenizer)
-
-
-if __name__ == "__main__":
-    path = os.path.basename(os.path.dirname(__file__))
-    dataset = get_dataset_generator(
-        os.path.join(path, "../../dataset/squad/dev-v1.1.json")
-    )
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    for x, y in get_batches(dataset, 64, tokenizer):
-        print(x)
-        # print(y)
-        break
diff --git a/src/main.py b/src/main.py
deleted file mode 100644
index 200b2b0..0000000
--- a/src/main.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import os
-from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
-import torch
-from lib.dataset import get_dataset_generator, get_batches
-
-
-current_path = os.path.basename(os.path.dirname(__file__))
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
-model = BertForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)
-model.train()
-
-train_dataset_generator = get_dataset_generator(
-    os.path.join(current_path, "../dataset/squad/train-v1.1.json")
-)
-test_dataset_generator = get_dataset_generator(
-    os.path.join(current_path, "../dataset/squad/dev-v1.1.json")
-)
-
-n_epochs = 10
-for _ in range(n_epochs):
-    for x, y in get_batches(test_dataset_generator, 16, tokenizer):
-        pass
--
GitLab
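
For reference, the deleted src/lib/dataset.py streamed (context, question, answer, answer_start) tuples out of the SQuAD v1.1 JSON and tokenized the (context, question) pairs into fixed-length batches. Below is a minimal re-sketch of that logic against the current transformers API, under stated assumptions: the dataset path is illustrative, the deprecated pad_to_max_length=True is replaced by padding="max_length", the bare except: around next() is replaced by plain iteration, and the label packaging (tokenized answer plus answer_start) that the original carried alongside each batch is omitted.

    import json

    from transformers import BertTokenizer


    def squad_examples(path):
        """Yield (context, question, answer_text, answer_start) from a SQuAD v1.1 file."""
        with open(path, "r") as f:
            data = json.load(f)["data"]
        for article in data:
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    for answer in qa["answers"]:
                        yield context, qa["question"], answer["text"], answer["answer_start"]


    def batches(examples, batch_size, tokenizer):
        """Tokenize (context, question) pairs into fixed-length 512-token batches."""

        def encode(pairs):
            return tokenizer(
                [c for c, _ in pairs],
                [q for _, q in pairs],
                max_length=512,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            )

        batch = []
        for context, question, _answer, _answer_start in examples:
            batch.append((context, question))
            if len(batch) == batch_size:
                yield encode(batch)
                batch = []
        if batch:  # flush the final, possibly smaller, batch
            yield encode(batch)


    if __name__ == "__main__":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        gen = squad_examples("dataset/squad/dev-v1.1.json")  # illustrative path
        for encoded in batches(gen, 64, tokenizer):
            print(encoded["input_ids"].shape)  # torch.Size([64, 512])
            break

Iterating the generator directly also sidesteps a path bug in the deleted __main__ block: os.path.basename(os.path.dirname(__file__)) returns only the directory's name, not its path, so the os.path.join there did not resolve where intended.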
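
The deleted src/main.py set up BertForQuestionAnswering, imported AdamW, and put the model in training mode, but its epoch loop only consumed batches (pass). A hedged sketch of the single optimization step it stubbed out follows. Assumptions: torch.optim.AdamW stands in for the transformers AdamW the script imported, the (context, question) pair and learning rate are illustrative, and the span labels are placeholders; real SQuAD training must map each answer_start character offset to token indices, e.g. with a fast tokenizer's offset mappings.

    import torch
    from transformers import BertForQuestionAnswering, BertTokenizer

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

    # One illustrative (context, question) pair; a real run would iterate batches().
    encoded = tokenizer(
        ["Hamlet is a tragedy written by William Shakespeare."],
        ["Who wrote Hamlet?"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    ).to(device)

    # Placeholder token-level span labels; derive them from answer_start in real code.
    start_positions = torch.tensor([7]).to(device)
    end_positions = torch.tensor([8]).to(device)

    outputs = model(**encoded, start_positions=start_positions, end_positions=end_positions)
    outputs.loss.backward()  # the model returns a loss when span labels are passed
    optimizer.step()
    optimizer.zero_grad()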