# John Makely
# Fine-tune language models (BERTweet, RoBERTa, DistilBERT) for multi-label toxic comment classification
# ./jigsaw-toxic-comment-classification-challenge/train.csv
# "id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"  [6 label columns]
# 1. Extract text from csv
# 2. Tokenize text (BERTweet, RoBERTa, DistilBERT)
# 3. Pass the tokenized text to a model with a 6-label classification head
# 4. Train each model
# 5. Save each model
import os

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

torch.cuda.empty_cache()


# Dataset wrapping tokenizer encodings and a (num_samples, 6) label array
class MultiLabelClassifierDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Float labels make the sequence classification models infer
        # multi-label classification (BCEWithLogitsLoss)
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)


# Set up directories
work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'

# Set up labels
classifiers = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load train.csv and shuffle it
print("Loading data...")
df = pd.read_csv(dataset_dir + 'train.csv')
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle

# Use the first 10% of the shuffled rows as the training subset
# (no validation/test split is created in this script)
train_df = df[:int(len(df) * 0.1)]

# Extract the 6 label columns into a numpy array
train_labels = train_df[classifiers].to_numpy()

# Report the device; the Trainer moves the model to the GPU automatically
device = torch.device('cuda')
print("Using device: ", device)

# # # # # # # # # # # # # # # #
# BERTweet
# # # # # # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)

print("BERTweet")
bert_dir = work_dir + 'bert/'

print("Tokenizing")
print("Model base: ", "vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", model_max_length=128)

print("Creating train encodings...")
train_encodings = tokenizer(train_df['comment_text'].tolist(), truncation=True, padding=True)

# TODO: factor the repeated tokenize/train/save block into a helper such as
# bert_train_model(model_name, num_labels, training_args, train_encodings, model_dir);
# a sketch of that idea follows this section.
print("Training model to be stored in " + bert_dir)

# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)

# Load model
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained('vinai/bertweet-base', num_labels=6)

# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train
print("Training...")
trainer.train()

# Save model
print("Saving model to " + bert_dir + '_bert_model')
trainer.save_model(bert_dir + '_bert_model')
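
# # # # # # # # # # # # # # # #
# Optional refactor sketch
# # # # # # # # # # # # # # # #
# The BERTweet block above and the RoBERTa/DistilBERT blocks below repeat the
# same dataset/model/Trainer steps. This is a minimal sketch of how they could
# be folded into one helper; the name train_and_save and its signature are
# illustrative only, and the function is not called anywhere in this script.
def train_and_save(model_name, model_tokenizer, args, texts, labels, save_dir):
    # Tokenize the raw comments with the model-specific tokenizer
    encodings = model_tokenizer(texts, truncation=True, padding=True)
    dataset = MultiLabelClassifierDataset(encodings, labels)
    # Fresh sequence classification head with 6 output labels
    clf = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
    helper_trainer = Trainer(model=clf, args=args, train_dataset=dataset)
    helper_trainer.train()
    helper_trainer.save_model(save_dir)
    return helper_trainer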

# # # # # # # # # # # # # # # #
# RoBERTa
# # # # # # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)

print("RoBERTa")
roberta_dir = work_dir + 'roberta/'

print("Tokenizing")
print("Model base: ", 'roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', model_max_length=128)

print("Creating train encodings...")
train_encodings = tokenizer(train_df['comment_text'].tolist(), truncation=True, padding=True)

# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)

# Load model
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=6)

# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train
print("Training...")
trainer.train()

# Save model
print("Saving model to " + roberta_dir + '_roberta_model')
trainer.save_model(roberta_dir + '_roberta_model')

# # # # # # # # # # # # # # # #
# DistilBERT
# # # # # # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)

print("DistilBERT")
distilbert_dir = work_dir + 'distilbert/'

print("Tokenizing")
print("Model base: ", 'distilbert-base-cased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased', model_max_length=128)

print("Creating train encodings...")
train_encodings = tokenizer(train_df['comment_text'].tolist(), truncation=True, padding=True)

print("Training model to be stored in " + distilbert_dir)

# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)

# Load model
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=6)

# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train
print("Training...")
trainer.train()

# Save model
print("Saving model to " + distilbert_dir + '_distilbert_model')
trainer.save_model(distilbert_dir + '_distilbert_model')
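
# # # # # # # # # # # # # # # #
# Loading a saved model (sketch)
# # # # # # # # # # # # # # # #
# A minimal sketch of how one of the checkpoints saved above could be reloaded
# for prediction. The path matches the save_model() call in the DistilBERT
# section; the example text and the 0.5 decision threshold are illustrative
# assumptions, not values tuned by this script.
loaded_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased', model_max_length=128)
loaded_model = AutoModelForSequenceClassification.from_pretrained(distilbert_dir + '_distilbert_model')
loaded_model.eval()

sample_inputs = loaded_tokenizer(["example comment"], truncation=True, padding=True, return_tensors='pt')
with torch.no_grad():
    sample_logits = loaded_model(**sample_inputs).logits
# Multi-label output: one sigmoid probability per classifier, thresholded at 0.5
sample_probs = torch.sigmoid(sample_logits)[0]
predicted_labels = [name for name, p in zip(classifiers, sample_probs) if p > 0.5]
print("Predicted labels for the example comment:", predicted_labels)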