QINGCHE committed
Commit fe7fef5
1 Parent(s): fdffdf0

add BERT train code

Files changed (1)
  1. BERT_inference.py +217 -0
BERT_inference.py CHANGED
@@ -0,0 +1,217 @@
# %%
import csv
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import BertTokenizer, BertConfig, AdamW
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# %%

class MyDataSet(Dataset):
    def __init__(self, loaded_data):
        self.data = loaded_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

Data_path = "/kaggle/input/inference/train.csv"
Totle_data = pd.read_csv(Data_path)
Totle_data = Totle_data.sample(frac=0.1)
Totle_data = Totle_data.dropna(axis=0, subset=["2"])
custom_dataset = MyDataSet(Totle_data)
# Split into train/validation/test by ratio.
train_size = int(len(custom_dataset) * 0.6)
validate_size = int(len(custom_dataset) * 0.1)
test_size = len(custom_dataset) - validate_size - train_size
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(
    custom_dataset, [train_size, validate_size, test_size])

# Output paths for the splits.
train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

# random_split only shuffles indices; materialize each split from the DataFrame.
train_dataset = Totle_data.iloc[train_dataset.indices]
validate_dataset = Totle_data.iloc[validate_dataset.indices]
test_dataset = Totle_data.iloc[test_dataset.indices]

# index=False drops the row index; header=True keeps the column names.
train_dataset.to_csv(train_data_path, index=False, header=True)
validate_dataset.to_csv(dev_data_path, index=False, header=True)
test_dataset.to_csv(test_data_path, index=False, header=True)
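
# %%
# Quick check (not part of the original script): the three splits should
# partition the sampled DataFrame exactly. Safe to delete.
assert len(train_dataset) + len(validate_dataset) + len(test_dataset) == len(Totle_data)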

# %%
data = pd.read_csv(train_data_path)
data.head()

# %%

class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        # Load the pretrained encoder.
        pretrained_weights = "bert-base-chinese"
        self.bert = transformers.BertModel.from_pretrained(pretrained_weights)
        # Fine-tune all BERT parameters rather than freezing them.
        for param in self.bert.parameters():
            param.requires_grad = True
        # Linear classification head: 768-dim pooled output -> 3 classes.
        self.dense = nn.Linear(768, 3)

    def forward(self, input_ids, token_type_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)
        # bert_output[1] is the pooled [CLS] representation.
        bert_cls_hidden_state = bert_output[1]
        # Map the 768-dim vector to the three class logits.
        linear_output = self.dense(bert_cls_hidden_state)
        return linear_output

# %%

def encoder(max_len, vocab_path, text_list):
    # Turn text_list into the three tensors BERT expects.
    # Note: vocab_path is unused; the tokenizer is loaded by name instead.
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    tokenizer = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt'  # return PyTorch tensors
    )
    input_ids = tokenizer['input_ids']
    token_type_ids = tokenizer['token_type_ids']
    attention_mask = tokenizer['attention_mask']
    return input_ids, token_type_ids, attention_mask
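
# %%
# Usage sketch (not part of the original script): encode two placeholder
# sentences and push them through an untrained classifier to confirm the
# shapes that train() below relies on. Safe to delete.
demo_ids, demo_types, demo_mask = encoder(max_len=32, vocab_path=None,
                                          text_list=["今天天气很好。", "今天天气不好。"])
print(demo_ids.shape)  # torch.Size([2, seq_len]), padded to the longer sentence
demo_model = BertClassificationModel()
print(demo_model(demo_ids, demo_types, demo_mask).shape)  # torch.Size([2, 3])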

# %%
labels2dict = {"neutral": 0, "entailment": 1, "contradiction": 2}

def load_data(path):
    text_list = []
    labels = []
    with open(path) as csvFileObj:
        readerObj = csv.reader(csvFileObj)
        for row in readerObj:
            # Skip the header row.
            if readerObj.line_num == 1:
                continue
            # Adjust these indices to wherever the label/text columns sit.
            label = int(labels2dict[row[0]])
            text = row[1]
            text_list.append(text)
            labels.append(label)
    # Call encoder to get the three input tensors for the pretrained model.
    input_ids, token_type_ids, attention_mask = encoder(
        max_len=150, vocab_path="/root/Bert/bert-base-chinese/vocab.txt",
        text_list=text_list)
    labels = torch.tensor(labels)
    # Bundle the encoder outputs and the labels into one TensorDataset.
    data = TensorDataset(input_ids, token_type_ids, attention_mask, labels)
    return data

# %%
# Batch size.
batch_size = 16
# Paths of the splits written above.
train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"
# Load each split as a TensorDataset via load_data.
train_data = load_data(train_data_path)
dev_data = load_data(dev_data_path)
test_data = load_data(test_data_path)
# Wrap the datasets in DataLoaders.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
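
# %%
# Sanity check (not part of the original script): inspect one training batch
# to see the tensors the model receives. Safe to delete.
batch_ids, batch_types, batch_mask, batch_labels = next(iter(train_loader))
print(batch_ids.shape, batch_labels.shape)  # e.g. torch.Size([16, 150]) torch.Size([16])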

# %%
def dev(model, dev_loader):
    # Evaluate accuracy on the dev set.
    model.to(device)
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for step, (input_ids, token_type_ids, attention_mask, labels) in tqdm(
                enumerate(dev_loader), desc='Dev Iteration:'):
            input_ids, token_type_ids, attention_mask, labels = (
                input_ids.to(device), token_type_ids.to(device),
                attention_mask.to(device), labels.to(device))
            out_put = model(input_ids, token_type_ids, attention_mask)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
        res = correct / total
        return res

# %%

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def train(model, train_loader, dev_loader):
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()
    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
    # Halve the learning rate when the best dev accuracy stops improving.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, min_lr=1e-7,
                                  patience=5, verbose=True, threshold=0.0001, eps=1e-08)
    t_total = len(train_loader)

    total_epochs = 10
    bestAcc = 0
    # Running counts across all epochs, so train_acc is a cumulative accuracy.
    correct = 0
    total = 0
    print('Training and validation begin!')
    for epoch in range(total_epochs):
        for step, (input_ids, token_type_ids, attention_mask, labels) in enumerate(train_loader):

            optimizer.zero_grad()
            input_ids, token_type_ids, attention_mask, labels = (
                input_ids.to(device), token_type_ids.to(device),
                attention_mask.to(device), labels.to(device))
            out_put = model(input_ids, token_type_ids, attention_mask)
            loss = criterion(out_put, labels)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()
            # Print progress every 10 steps.
            if (step + 1) % 10 == 0:
                train_acc = correct / total
                print("Train Epoch[{}/{}], step[{}/{}], tra_acc {:.6f} %, loss: {:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader), train_acc * 100, loss.item()))
            # Validate every 200 steps; save the model whenever dev accuracy improves.
            if (step + 1) % 200 == 0:
                train_acc = correct / total
                acc = dev(model, dev_loader)
                # dev() switched to eval mode; switch back before resuming training.
                model.train()
                if bestAcc < acc:
                    bestAcc = acc
                    # Checkpoint path.
                    path = 'bert_model.pkl'
                    torch.save(model, path)
                print("DEV Epoch[{}/{}], step[{}/{}], tra_acc {:.6f} %, bestAcc {:.6f} %, dev_acc {:.6f} %, loss: {:.6f}".format(
                    epoch + 1, total_epochs, step + 1, len(train_loader), train_acc * 100, bestAcc * 100, acc * 100, loss.item()))
        scheduler.step(bestAcc)

# %%

path = '/kaggle/input/inference/bert_model.pkl'
# model = torch.load(path)
# Instantiate a fresh model.
model = BertClassificationModel()
# Run training and validation.
train(model, train_loader, dev_loader)
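
# %%
# Inference sketch (not part of the original script): it assumes train() above
# saved the whole model object with torch.save(model, path), so torch.load
# restores a ready-to-use BertClassificationModel. id2label just inverts
# labels2dict; the example sentence is a placeholder.
id2label = {v: k for k, v in labels2dict.items()}

def predict(model, text):
    model.to(device)
    model.eval()
    input_ids, token_type_ids, attention_mask = encoder(
        max_len=150, vocab_path=None, text_list=[text])
    with torch.no_grad():
        logits = model(input_ids.to(device),
                       token_type_ids.to(device),
                       attention_mask.to(device))
    return id2label[int(torch.argmax(logits, dim=1))]

# Example (uncomment once 'bert_model.pkl' exists):
# trained_model = torch.load('bert_model.pkl')
# print(predict(trained_model, "一个占位示例句子"))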