How to use KoBERT (PyTorch model) on Colab

2020. 2. 2. 00:19 · Development/AI&ML

To use KoBERT, you should install these libraries first:


!pip install mxnet-cu101
!pip install gluonnlp pandas tqdm
!pip install sentencepiece==0.1.85
!pip install transformers==2.1.1
!pip install torch==1.3.1
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
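An optional sanity check (my own addition) that the pinned versions were picked up; if the imports below fail, restart the Colab runtime once after installing:

import torch, transformers, gluonnlp
print(torch.__version__, transformers.__version__, gluonnlp.__version__)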

Imports:

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import WarmupLinearSchedule

Mount Google Drive:

from google.colab import drive
drive.mount('/content/drive')

CUDA device:

# when using a GPU
device = torch.device("cuda:0")
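A safer variant (my own addition) falls back to CPU when no GPU runtime is attached:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")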

Get the pretrained model and vocab:

bertmodel, vocab = get_pytorch_kobert_model()
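A quick sanity check that the download worked; vocab is a gluonnlp BERTVocab and bertmodel is an ordinary torch nn.Module:

print(len(vocab))                                      # vocabulary size (8002 for KoBERT)
print(vocab.idx_to_token[:5])                          # first few tokens
print(sum(p.numel() for p in bertmodel.parameters()))  # parameter count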

Setup:

# Import all libs
import os
import numpy as np
import pandas as pd
import re
import csv
from collections import namedtuple
from sklearn.model_selection import train_test_split
import nltk
from sklearn.pipeline import Pipeline
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
import sys

# Set paths and global variables
BASE_DIR = '/content/drive/My Drive'
DATA_DIR = os.path.join(BASE_DIR,'Allganize_data/cremafactory_review_analysis/crema_sejin')

DATA = os.path.join(DATA_DIR, 'datayouhave1.tsv')
DATA = os.path.join(DATA_DIR, 'datayouhave2.tsv')  # note: this overwrites the previous DATA
DATA_TEST = os.path.join(DATA_DIR, 'datayouhave3.tsv')
DATA_TRAIN = os.path.join(DATA_DIR, 'datayouhave4.tsv')

LABELS = [-1, 0, 1]

CATEGS = []
CATEGS_CNT = len(CATEGS)

Data preprocessing:

Preprocess the data yourself; the exact steps depend on your dataset.
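As a minimal sketch (assuming tab-separated files with 'category', 'review', and 'score' columns, which is what the Dataset class below expects), loading could look like:

train_data = pd.read_csv(DATA_TRAIN, sep='\t')  # hypothetical: adjust to your own files
test_data = pd.read_csv(DATA_TEST, sep='\t')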

Get the KoBERT tokenizer:

tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
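A quick check that the SentencePiece tokenizer works (my own example sentence; your output may differ slightly):

print(tok("한국어 모델을 공유합니다."))
# e.g. ['▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.']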


Set the hyperparameters:

max_len = 128
batch_size = 8
warmup_ratio = 0.1
num_epochs = 15
max_grad_norm = 1
log_interval = 200
learning_rate =  2e-5


For batch generation we used a batch generator in Keras; in PyTorch we use a 'Dataset'. Note that in BERTSentenceTransform you should set pair=False to feed a single sentence, and pair=True to feed a pair of sentences, as illustrated below.
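A minimal illustration with toy sentences of my own; both calls return a (token_ids, valid_length, segment_ids) tuple:

transform_pair = nlp.data.BERTSentenceTransform(tok, max_seq_length=64, pad=True, pair=True)
ids, length, segments = transform_pair(['카테고리 문장', '리뷰 문장'])    # two sentences
transform_single = nlp.data.BERTSentenceTransform(tok, max_seq_length=64, pad=True, pair=False)
ids, length, segments = transform_single(['리뷰 문장 하나'])              # one sentence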

class BertDataset(Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data=train_data, bert_tokenizer=tok, max_len=max_len, pad=True, pair=True):
        'Initialization'
        self.data = data
        self.review_counts = len(data)
        self.transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        'Generates one sample of data'
        # score == '2' marks a review that is irrelevant to the category,
        # so the relevance target is 0; otherwise it is 1.
        if self.data.iloc[index]['score'] == '2':
            related = np.int32(0)
        else:
            related = np.int32(1)
        # Encode the (category, review) pair into (token_ids, valid_length, segment_ids).
        sentence = self.transform([self.data.iloc[index]['category'], self.data.iloc[index]['review']])
        # Shift scores {-1, 0, 1, 2} to class indices {0, 1, 2, 3}.
        score = np.int32(self.data.iloc[index]['score']) + np.int32(1)
        return sentence + (related, score)
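A quick look at one sample (assuming train_data from the preprocessing step): the transform's 3-tuple plus the two targets gives a 5-tuple, which is exactly what the training loop below unpacks:

token_ids, valid_length, segment_ids, related, score = BertDataset(data=train_data)[0]
print(token_ids.shape, valid_length, segment_ids.shape, related, score)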


In the DataLoader, setting shuffle=True makes PyTorch reshuffle the data at every epoch:

data_train = BertDataset(data=train_data)
data_test = BertDataset(data=test_data)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
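Peeking at one batch (my own check) shows that the first dimension of each tensor is batch_size; with shuffle=True the contents change every epoch:

token_ids, valid_length, segment_ids, related, score = next(iter(train_dataloader))
print(token_ids.shape, valid_length.shape, related.shape, score.shape)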


Similar to Keras, we can define the model as below. It has two heads on top of BERT's pooled output: a 2-way relevance classifier and a 4-way score classifier:

class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.layer1 = nn.Linear(hidden_size, 192)
        self.special1 = nn.Dropout(p=0.5)
        self.special2 = nn.Linear(hidden_size, 2)    # relevance head (2 classes)
        self.special3 = nn.Dropout(p=0.3)
        self.special4 = nn.Linear(192, 4)            # score head (4 classes)

    def gen_attention_mask(self, token_ids, valid_length, batch_mode=True):
        # 1 for real tokens (up to valid_length), 0 for padding.
        attention_mask = torch.zeros_like(token_ids)
        if batch_mode:
            for i, v in enumerate(valid_length):
                attention_mask[i][:v] = 1
            return attention_mask.float()
        else:
            attention_mask[:valid_length] = 1
            return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids, batch_mode=True):
        attention_mask = self.gen_attention_mask(token_ids, valid_length, batch_mode)
        # The pooled [CLS] output feeds both heads.
        encoded_layer, pooler = self.bert(input_ids=token_ids.to(device),
                                          token_type_ids=segment_ids.long().to(device),
                                          attention_mask=attention_mask.float().to(device))
        hidden1 = self.special1(pooler)
        out1 = self.special2(hidden1)    # relevance logits
        hidden2 = self.special3(pooler)
        hidden2 = self.layer1(hidden2)
        out2 = self.special4(hidden2)    # score logits
        return out1, out2


model = BERTClassifier(bertmodel).to(device)
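A quick shape check with a dummy batch (arbitrary values, my own sketch) confirms the two heads:

dummy_ids = torch.randint(0, len(vocab), (2, max_len)).to(device)
dummy_seg = torch.zeros(2, max_len, dtype=torch.long).to(device)
dummy_len = torch.tensor([10, 20])
out1, out2 = model(dummy_ids, dummy_len, dummy_seg)
print(out1.shape, out2.shape)  # torch.Size([2, 2]) torch.Size([2, 4])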


Set up the optimizer, loss functions, and learning-rate scheduler:

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn1 = nn.BCEWithLogitsLoss()  # defined but unused; CrossEntropyLoss below handles both heads
loss_fn2 = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)
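For intuition, the schedule multiplies learning_rate by a factor that ramps linearly from 0 to 1 over warmup_step steps, then decays linearly back to 0 at t_total; a minimal sketch of that shape:

def linear_warmup_lr(step):
    if step < warmup_step:
        return learning_rate * step / max(1, warmup_step)
    return learning_rate * max(0.0, (t_total - step) / max(1.0, t_total - warmup_step))

for s in (0, warmup_step, t_total // 2, t_total):
    print(s, linear_warmup_lr(s))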


You have to write the accuracy function yourself:

def calc_accuracy(X, Y):
    # Fraction of samples whose argmax logit matches the label.
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc
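A toy check: the first row's argmax matches its label, the second does not, so the result is 0.5:

logits = torch.tensor([[2.0, 1.0], [0.1, 3.0]])
labels = torch.tensor([0, 0])
print(calc_accuracy(logits, labels))  # 0.5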


Training:

Be sure to call model.train() before training and model.eval() before evaluation!

for e in range(num_epochs):
    train_acc_relevant = 0.0
    train_acc_score = 0.0
    test_acc_relevant = 0.0
    test_acc_score = 0.0
    model.train()
    loss_list = []
    for batch_id, (token_ids, valid_length, segment_ids, related, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        related = related.long().to(device)
        out1, out2 = model(token_ids, valid_length, segment_ids)
        loss1 = loss_fn2(out1, related)
        loss2 = loss_fn2(out2, label)
        loss = loss1 + loss2
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        if batch_id % 10 == 0:
          loss_list.append(loss.item())
        train_acc_relevant += calc_accuracy(out1, related)
        train_acc_score += calc_accuracy(out2, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc relevant {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc_relevant / (batch_id+1)))
            print("epoch {} batch id {} loss {} train acc score {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc_score / (batch_id+1)))
    #print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    print("mean loss = ", sum(loss_list) / len(loss_list))
    model.eval()
    with torch.no_grad():  # gradients are not needed for evaluation
        for batch_id, (token_ids, valid_length, segment_ids, related, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            related = related.long().to(device)
            out1, out2 = model(token_ids, valid_length, segment_ids)
            test_acc_relevant += calc_accuracy(out1, related)
            test_acc_score += calc_accuracy(out2, label)
    print("epoch {} test acc relevant {}".format(e+1, test_acc_relevant / (batch_id+1)))
    print("epoch {} test acc score {}".format(e+1, test_acc_score / (batch_id+1)))

Saving Model:

If you want to load the model later, you must define the class with the same name and structure as above before loading, because torch.save(model, PATH) pickles the whole module. (The same goes for the tokenizer.)

PATH = 'pathyouwant'
torch.save(model, PATH)
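A minimal loading sketch; BERTClassifier (and its imports) must already be defined in the loading script:

model = torch.load(PATH, map_location=device)
model.eval()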


Confusion matrix:

RELEVANT_LABELS = [0,1]
SCORE_LABELS = [-1,0,1,2]
model.eval()
relevant_true = []
relevant_pred = []

sentiment_true = []
sentiment_pred = []
for batch_id, (token_ids, valid_length, segment_ids, related, label) in enumerate(tqdm_notebook(test_dataloader)):
  token_ids = token_ids.long().to(device)
  segment_ids = segment_ids.long().to(device)
  label = label.long().to(device)
  related = related.long().to(device)
  out1, out2 = model(token_ids, valid_length, segment_ids)
  _, related_index = torch.max(out1, 1)
  _, label_index = torch.max(out2, 1)
  for idx in range(len(label)):
    sentiment_true.append(SCORE_LABELS[int(label[idx])])
    relevant_true.append(RELEVANT_LABELS[int(related[idx])])
    sentiment_pred.append(SCORE_LABELS[int(label_index[idx])])
    relevant_pred.append(RELEVANT_LABELS[int(related_index[idx])])
print(sentiment_true)
print(sentiment_pred)
print(relevant_true)
print(relevant_pred)

from sklearn.metrics import confusion_matrix, classification_report



print('Sentiment confusion matrix')
print('Labels: [-1, 0, 1, 2] (2 means the review was not labeled because it is irrelevant)')
print(confusion_matrix(sentiment_true, sentiment_pred, labels=[-1,0,1,2]))
print(classification_report(sentiment_true, sentiment_pred, digits=3))



print('Relevance confusion matrix')
print(confusion_matrix(relevant_true, relevant_pred, labels=[0,1]))
print(classification_report(relevant_true, relevant_pred, digits=3))


To run inference, you should define a new DataLoader. (I don't know the exact reason, but even with model.eval() set, running inference on individual samples one at a time gave terrible outputs.)

class InferenceBertDataset(Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data=train_data, bert_tokenizer=tok, category='review', max_len=max_len, pad=True, pair=True):
        'Initialization'
        self.data = data
        self.review_counts = len(data)
        self.transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.category = category

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Pair the fixed category string with each review; no labels at inference time.
        sentence = self.transform([self.category, self.data.iloc[index]['review']])
        return sentence

# Inference data preprocessing
PATHtoData = os.path.join('somethingelse', 'something')
PathToSave = os.path.join('som', 'daf')
inference_data = pd.read_csv(PATHtoData, sep='\t')

# The TSV has no header row, so pandas swallowed the first review as the
# column name; recover it and put it back.
first_line = inference_data.columns[0]
inference_data.columns = ['review']
inference_data = inference_data.append({'review': first_line}, ignore_index=True)
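An equivalent and simpler fix, assuming the TSV truly has no header line, is to tell pandas that up front:

inference_data = pd.read_csv(PATHtoData, sep='\t', header=None, names=['review'])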

Run inference and save the results:

from collections import defaultdict

model.eval()
inference_dict = defaultdict(list)

# Keep the raw reviews so the final DataFrame has a 'review' column.
for i in range(len(inference_data)):
    inference_dict['review'].append(inference_data.iloc[i]['review'])

with torch.no_grad():
    for category in CATEGS:  # CATEGS holds the category strings defined above
        data_inference = InferenceBertDataset(data=inference_data, category=category)
        inference_dataloader = torch.utils.data.DataLoader(data_inference, batch_size=batch_size, num_workers=5)
        for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(inference_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out1, out2 = model(token_ids, valid_length, segment_ids)
            max_vals, max_indices = torch.max(out1, 1)
            score_max_vals, score_max_indices = torch.max(out2, 1)
            for idx in range(len(token_ids)):
                # Only keep a score when the review is relevant and actually labeled.
                if RELEVANT_LABELS[int(max_indices[idx])] == 1 and SCORE_LABELS[int(score_max_indices[idx])] != 2:
                    result = str(SCORE_LABELS[int(score_max_indices[idx])])
                else:
                    result = 'NaN'
                inference_dict[category].append(result)

inference_df = pd.DataFrame(inference_dict)
inference_df.to_csv(PathToSave)
