import argparse
import json
import logging
import os
import pathlib
import random
import shutil
import sys
import time
from ast import literal_eval
from shutil import rmtree
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
import ray
from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer, TfidfVectorizer)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from lr.hyperparameters import (SEARCH_SPACE, HyperparameterSearch,
                                RandomSearch)
from lr.util import jackknife, replace_bool, stratified_sample

# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def train_lr(train,
             dev,
             test,
             search_space):
    """Sample one hyperparameter assignment, fit a bag-of-words logistic
    regression on `train`, and evaluate it on `dev` (and `test`, if given)."""
    master = pd.concat([train, dev], axis=0)
    space = HyperparameterSearch(**search_space)
    sample = space.sample()

    # Map the sampled hyperparameters onto vectorizer settings.
    if sample.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = sample.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = sample.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=tuple(ngram_range))
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words,
                                 lowercase=True,
                                 ngram_range=tuple(ngram_range))
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=tuple(ngram_range))

    # Fit the vectorizer on train + dev, then transform each split.
    start = time.time()
    vect.fit(tqdm(master.text, desc="fitting data", leave=False))
    X_train = vect.transform(tqdm(train.text, desc="transforming training data", leave=False))
    X_dev = vect.transform(tqdm(dev.text, desc="transforming dev data", leave=False))
    if test is not None:
        X_test = vect.transform(tqdm(test.text, desc="transforming test data", leave=False))

    sample['C'] = float(sample['C'])
    sample['tol'] = float(sample['tol'])
    classifier = LogisticRegression(**sample, verbose=True)
    classifier.fit(X_train, train.label)
    end = time.time()

    # Record the hyperparameter assignment and metrics as a one-row DataFrame.
    for k, v in sample.items():
        if not v:
            v = str(v)
        sample[k] = [v]
    res = pd.DataFrame(sample)
    preds = classifier.predict(X_dev)
    if test is not None:
        test_preds = classifier.predict(X_test)
    res['dev_f1'] = f1_score(dev.label, preds, average='macro')
    if test is not None:
        res['test_f1'] = f1_score(test.label, test_preds, average='macro')
    res['dev_accuracy'] = classifier.score(X_dev, dev.label)
    if test is not None:
        res['test_accuracy'] = classifier.score(X_test, test.label)
    res['training_duration'] = end - start
    res['ngram_range'] = str(ngram_range)
    res['weight'] = weight
    res['stopwords'] = stop_words
    return classifier, vect, res


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file', type=str)
    parser.add_argument('--dev_file', type=str, required=False)
    parser.add_argument('--test_file', type=str, required=False)
    parser.add_argument('--search_trials', type=int, default=5)
    parser.add_argument('--train_subsample', type=int, required=False)
    parser.add_argument('--stratified', action='store_true')
    parser.add_argument('--jackknife_partitions', type=int, default=5, required=False)
    parser.add_argument('--save_jackknife_partitions', action='store_true')
    parser.add_argument('--serialization_dir', '-s', type=str)
    parser.add_argument('--override', '-o', action='store_true')
    parser.add_argument('--evaluate_on_test', '-t', action='store_true')
    args = parser.parse_args()
    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    else:
        if args.override:
            rmtree(args.serialization_dir)
            os.makedirs(args.serialization_dir)
        else:
            print(f"serialization directory {args.serialization_dir} exists. Aborting!")
            sys.exit(1)
print(f"reading training data at {args.train_file}...") | |
train = pd.read_json(args.train_file, lines=True) | |
if args.train_subsample: | |
if args.stratified: | |
train = stratified_sample(train, "label", args.train_subsample) | |
else: | |
train = train.sample(n=args.train_subsample) | |
if args.dev_file: | |
print(f"reading dev data at {args.dev_file}...") | |
dev = pd.read_json(args.dev_file, lines=True) | |
else: | |
print("Dev file not provided, will jackknife training data...") | |
if args.evaluate_on_test: | |
if args.test_file: | |
print(f"reading test data at {args.test_file}...") | |
test = pd.read_json(args.test_file, lines=True) | |
else: | |
print("Test file not provided.") | |
sys.exit(1) | |
else: | |
test = None | |
    num_assignments = args.search_trials
    num_partitions = args.jackknife_partitions
    df = pd.DataFrame()
    current_f1 = 0.0
    best_classifier = None
    best_vect = None

    if args.dev_file:
        # Random search over hyperparameter assignments using the provided dev set.
        pbar = tqdm(range(num_assignments), desc="search trials", leave=False)
        for i in pbar:
            try:
                classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
                df = pd.concat([df, res], axis=0, sort=True)
                best_f1 = df.dev_f1.max()
                if res.dev_f1[0] > current_f1:
                    current_f1 = res.dev_f1[0]
                    best_classifier = classifier
                    best_vect = vect
                pbar.set_description(f"mean +- std dev F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}, max F1: {df.dev_f1.max()}")
            except KeyboardInterrupt:
                break
    else:
        # No dev set: jackknife the training data into partitions and run the
        # search on each train/dev split.
        if args.save_jackknife_partitions:
            if not os.path.isdir(os.path.join(args.serialization_dir, "jackknife")):
                os.mkdir(os.path.join(args.serialization_dir, "jackknife"))
        for ix, (train, dev) in tqdm(enumerate(jackknife(train, num_partitions=num_partitions)),
                                     total=num_partitions,
                                     leave=False,
                                     desc="jackknife partitions"):
            for i in tqdm(range(num_assignments), desc="search trials", leave=False):
                classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
                df = pd.concat([df, res], axis=0, sort=True)
                best_f1 = df.dev_f1.max()
                if res.dev_f1[0] > current_f1:
                    current_f1 = res.dev_f1[0]
                    best_classifier = classifier
                    best_vect = vect
            df['dataset_reader.sample'] = train.shape[0]
            df['model.encoder.architecture.type'] = 'logistic regression'
            if args.save_jackknife_partitions:
                train.to_json(os.path.join(args.serialization_dir,
                                           "jackknife",
                                           f"train.{ix}"),
                              lines=True,
                              orient="records")
                dev.to_json(os.path.join(args.serialization_dir,
                                         "jackknife",
                                         f"dev.{ix}"),
                            lines=True,
                            orient='records')
print("DEV STATISTICS") | |
print("================") | |
print(f"mean +- std F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}") | |
print(f"max F1: {df.dev_f1.max()}") | |
print(f"min F1: {df.dev_f1.min()}") | |
print(f"mean +- std accuracy: {df.dev_accuracy.mean()} +- {df.dev_accuracy.std()}") | |
print(f"max accuracy: {df.dev_accuracy.max()}") | |
print(f"min accuracy: {df.dev_accuracy.min()}") | |
print("") | |
print("BEST HYPERPARAMETERS") | |
print(f"=====================") | |
best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict() | |
print(df.reset_index().iloc[df.reset_index().dev_f1.idxmax()]) | |
    if test is not None:
        print("TEST STATISTICS")
        print("================")
        print(f"mean +- std F1: {df.test_f1.mean()} +- {df.test_f1.std()}")
        print(f"max F1: {df.test_f1.max()}")
        print(f"min F1: {df.test_f1.min()}")
        print(f"mean +- std accuracy: {df.test_accuracy.mean()} +- {df.test_accuracy.std()}")
        print(f"max accuracy: {df.test_accuracy.max()}")
        print(f"min accuracy: {df.test_accuracy.min()}")
df.to_json(os.path.join(args.serialization_dir, "results.jsonl"), lines=True, orient='records') | |
with open(os.path.join(args.serialization_dir, "best_hyperparameters.json"), "w+") as f: | |
best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict() | |
for k,v in best_hp.items(): | |
if isinstance(v, np.int64): | |
best_hp[k] = int(v) | |
if isinstance(v, str) and "[" in v: | |
v = literal_eval(v) | |
best_hp[k] = f"{v[0]} {v[1]}" | |
best_hp.pop("index") | |
best_hp.pop("dev_accuracy") | |
best_hp.pop("dev_f1") | |
if test is not None: | |
best_hp.pop("test_accuracy") | |
best_hp.pop("test_f1") | |
best_hp.pop("training_duration") | |
json.dump(best_hp, f) | |
with open(os.path.join(args.serialization_dir, "vocab.json"), 'w+') as f: | |
for k,v in best_vect.__dict__['vocabulary_'].items(): | |
best_vect.__dict__['vocabulary_'][k] = int(v) | |
json.dump(best_vect.__dict__['vocabulary_'], f) | |
os.mkdir(os.path.join(args.serialization_dir, "archive")) | |
try: | |
np.save(os.path.join(args.serialization_dir, "archive", "idf.npy"), best_vect.idf_) | |
except: | |
pass | |
np.save(os.path.join(args.serialization_dir, "archive", "classes.npy"),best_classifier.classes_) | |
np.save(os.path.join(args.serialization_dir, "archive", "coef.npy"),best_classifier.coef_) | |
np.save(os.path.join(args.serialization_dir, "archive", "intercept.npy"), best_classifier.intercept_) |