kernelmachine committed on
Commit
2c5347a
1 Parent(s): 842e849
lr/__init__.py ADDED
File without changes
lr/__init__.pyc ADDED
Binary file (101 Bytes)

lr/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (142 Bytes)

lr/__pycache__/eval.cpython-38.pyc ADDED
Binary file (3.22 kB)

lr/__pycache__/hyperparameters.cpython-38.pyc ADDED
Binary file (4.73 kB)

lr/__pycache__/plot.cpython-38.pyc ADDED
Binary file (2.3 kB)

lr/__pycache__/train.cpython-38.pyc ADDED
Binary file (6.78 kB)

lr/__pycache__/util.cpython-38.pyc ADDED
Binary file (2.45 kB)
lr/eval.py ADDED
@@ -0,0 +1,105 @@
+ import argparse
+ import json
+ import logging
+ import os
+ import pathlib
+ import random
+ import shutil
+ import time
+ from typing import Any, Dict, List, Union
+
+ import numpy as np
+ import pandas as pd
+ import ray
+ from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, HashingVectorizer,
+                                               TfidfVectorizer)
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import f1_score
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+ from lr.hyperparameters import SEARCH_SPACE, RandomSearch, HyperparameterSearch
+ from shutil import rmtree
+
+
+ # Create a custom logger
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+
+ def load_model(serialization_dir):
+     with open(os.path.join(serialization_dir, "best_hyperparameters.json"), 'r') as f:
+         hyperparameters = json.load(f)
+     if hyperparameters.pop('stopwords') == 1:
+         stop_words = 'english'
+     else:
+         stop_words = None
+     weight = hyperparameters.pop('weight')
+     if weight == 'binary':
+         binary = True
+     else:
+         binary = False
+     ngram_range = hyperparameters.pop('ngram_range')
+     ngram_range = sorted([int(x) for x in ngram_range.split()])
+     if weight == 'tf-idf':
+         vect = TfidfVectorizer(stop_words=stop_words,
+                                lowercase=True,
+                                ngram_range=ngram_range)
+     elif weight == 'hash':
+         vect = HashingVectorizer(stop_words=stop_words, lowercase=True, ngram_range=ngram_range)
+     else:
+         vect = CountVectorizer(binary=binary,
+                                stop_words=stop_words,
+                                lowercase=True,
+                                ngram_range=ngram_range)
+     if weight != "hash":
+         with open(os.path.join(serialization_dir, "vocab.json"), 'r') as f:
+             vocab = json.load(f)
+         vect.vocabulary_ = vocab
+     hyperparameters['C'] = float(hyperparameters['C'])
+     hyperparameters['tol'] = float(hyperparameters['tol'])
+     classifier = LogisticRegression(**hyperparameters)
+     if os.path.exists(os.path.join(serialization_dir, "archive", "idf.npy")):
+         vect.idf_ = np.load(os.path.join(serialization_dir, "archive", "idf.npy"))
+     classifier.coef_ = np.load(os.path.join(serialization_dir, "archive", "coef.npy"))
+     classifier.intercept_ = np.load(os.path.join(serialization_dir, "archive", "intercept.npy"))
+     classifier.classes_ = np.load(os.path.join(serialization_dir, "archive", "classes.npy"))
+     return classifier, vect
+
+
+ def eval_lr(test,
+             classifier,
+             vect):
+     start = time.time()
+     X_test = vect.transform(tqdm(test.text, desc="fitting and transforming data"))
+     end = time.time()
+     preds = classifier.predict(X_test)
+     scores = classifier.predict_proba(X_test)
+     return f1_score(test.label, preds, average='macro'), classifier.score(X_test, test.label), scores
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--eval_file', type=str)
+     parser.add_argument('--model', '-m', type=str)
+     parser.add_argument('--output', '-o', type=str)
+
+
+
+     args = parser.parse_args()
+
+     if not os.path.isdir(args.model):
+         print(f"model {args.model} does not exist. Aborting!")
+     else:
+         clf, vect = load_model(args.model)
+
+         print(f"reading evaluation data at {args.eval_file}...")
+         test = pd.read_json(args.eval_file, lines=True)
+
+         f1, acc, scores = eval_lr(test, clf, vect)
+         if args.output:
+             out = pd.DataFrame({'id': test['id'], 'score': scores.tolist()})
+             out.to_json(args.output, lines=True, orient='records')
+
+         print("================")
+         print(f"F1: {f1}")
+         print(f"accuracy: {acc}")
lr/hyperparameters.py ADDED
@@ -0,0 +1,124 @@
+ from typing import Any, Dict, List, Union
+ import numpy as np
+ import logging
+ import os
+
+ # Create a custom logger
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+
+
+ class RandomSearch:
+
+     @staticmethod
+     def random_choice(args: List[Any], n: int = 1):
+         """
+         pick a random element from a set.
+
+         Example:
+             >> sampler = RandomSearch.random_choice([1, 2, 3])
+             >> sampler()
+             2
+         """
+         choices = []
+         for arg in args:
+             choices.append(arg)
+         if n == 1:
+             return lambda: np.random.choice(choices, replace=False)
+         else:
+             return lambda: np.random.choice(choices, n, replace=False)
+
+     @staticmethod
+     def random_integer(low: Union[int, float], high: Union[int, float]):
+         """
+         pick a random integer between two bounds
+
+         Example:
+             >> sampler = RandomSearch.random_integer(1, 10)
+             >> sampler()
+             9
+         """
+         return lambda: int(np.random.randint(low, high))
+
+     @staticmethod
+     def random_loguniform(low: Union[float, int], high: Union[float, int]):
+         """
+         pick a random float between two bounds, using loguniform distribution
+
+         Example:
+             >> sampler = RandomSearch.random_loguniform(1e-5, 1e-2)
+             >> sampler()
+             0.0004
+         """
+         return lambda: np.exp(np.random.uniform(np.log(low), np.log(high)))
+
+     @staticmethod
+     def random_uniform(low: Union[float, int], high: Union[float, int]):
+         """
+         pick a random float between two bounds, using uniform distribution
+
+         Example:
+             >> sampler = RandomSearch.random_uniform(0, 1)
+             >> sampler()
+             0.01
+         """
+         return lambda: np.random.uniform(low, high)
+
+
+ class HyperparameterSearch:
+
+     def __init__(self, **kwargs):
+         self.search_space = {}
+         self.lambda_ = lambda: 0
+         for key, val in kwargs.items():
+             self.search_space[key] = val
+
+     def parse(self, val: Any):
+
+         if isinstance(val, (int, np.integer)):
+             return int(val)
+         elif isinstance(val, (float, np.floating)):
+             return val
+         elif isinstance(val, (np.ndarray, list)):
+             return " ".join(val)
+         elif val is None:
+             return None
+         if isinstance(val, str):
+             return val
+         else:
+             val = val()
+             if isinstance(val, (int, np.integer)):
+                 return int(val)
+             elif isinstance(val, (np.ndarray, list)):
+                 return " ".join(val)
+             else:
+                 return val
+
+
+     def sample(self) -> Dict:
+         res = {}
+         for key, val in self.search_space.items():
+             try:
+                 res[key] = self.parse(val)
+             except (TypeError, ValueError) as error:
+                 logger.error(f"Could not parse key {key} with value {val}. {error}")
+
+         return res
+
+     def update_environment(self, sample) -> None:
+         for key, val in sample.items():
+             os.environ[key] = str(val)
+
+
+ SEARCH_SPACE = {
+     "penalty": RandomSearch.random_choice(["l1", "l2"]),
+     "C": RandomSearch.random_uniform(0, 1),
+     "solver": "liblinear",
+     "multi_class": "auto",
+     "tol": RandomSearch.random_loguniform(10e-5, 10e-3),
+     "stopwords": RandomSearch.random_choice([0, 1]),
+     "weight": RandomSearch.random_choice(["hash"]),
+     "ngram_range": RandomSearch.random_choice(["1 2", "2 3", "1 3"]),
+     "random_state": RandomSearch.random_integer(0, 100000)
+ }
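For orientation, a minimal sketch of how this search space is consumed (illustrative only; the sampled values shown in the comment are made up): HyperparameterSearch.sample() calls each sampler and returns a flat dict, which lr/train.py then splits into vectorizer options (stopwords, weight, ngram_range) and LogisticRegression arguments.

    from lr.hyperparameters import SEARCH_SPACE, HyperparameterSearch

    space = HyperparameterSearch(**SEARCH_SPACE)
    sample = space.sample()
    # e.g. {'penalty': 'l2', 'C': 0.41, 'solver': 'liblinear', 'multi_class': 'auto',
    #       'tol': 0.0004, 'stopwords': 1, 'weight': 'hash', 'ngram_range': '1 2',
    #       'random_state': 53091}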
lr/merge.py ADDED
@@ -0,0 +1,29 @@
+ import argparse
+ import json
+ import logging
+ import os
+ import pathlib
+ from typing import Any, Dict, List, Union
+ import sys
+ import pandas as pd
+
+ # Create a custom logger
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--experiments', nargs="+", type=str)
+     parser.add_argument('--output', type=str)
+
+     args = parser.parse_args()
+     dfs = []
+     for experiment in args.experiments:
+         if not os.path.isdir(experiment):
+             print(f"experiment {experiment} does not exist. Aborting!")
+             sys.exit(1)
+         else:
+             dfs.append(pd.read_json(os.path.join(experiment, "results.jsonl"), lines=True))
+     master = pd.concat(dfs, axis=0)
+     master.to_json(args.output, lines=True, orient='records')
lr/plot.py ADDED
@@ -0,0 +1,84 @@
+ import argparse
+ import json
+ import logging
+ import os
+ import pathlib
+ import random
+ import shutil
+ import time
+ from typing import Any, Dict, List, Union
+ import seaborn as sns
+ import sys
+ import matplotlib.pyplot as plt
+
+ import numpy as np
+ import pandas as pd
+ # imports needed by load_model / eval_lr below
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import f1_score
+ from tqdm import tqdm
+
+
+ # Create a custom logger
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+
+ def load_model(hyperparameters):
+
+     if hyperparameters.pop('stopwords') == 1:
+         stop_words = 'english'
+     else:
+         stop_words = None
+     weight = hyperparameters.pop('weight')
+     if weight == 'binary':
+         binary = True
+     else:
+         binary = False
+     ngram_range = hyperparameters.pop('ngram_range')
+     ngram_range = sorted([int(x) for x in ngram_range.split()])
+     if weight == 'tf-idf':
+         vect = TfidfVectorizer(stop_words=stop_words,
+                                lowercase=True,
+                                ngram_range=ngram_range)
+     else:
+         vect = CountVectorizer(binary=binary,
+                                stop_words=stop_words,
+                                lowercase=True,
+                                ngram_range=ngram_range)
+     hyperparameters['C'] = float(hyperparameters['C'])
+     hyperparameters['tol'] = float(hyperparameters['tol'])
+     classifier = LogisticRegression(**hyperparameters)
+     return classifier, vect
+
+
+ def eval_lr(test,
+             classifier,
+             vect):
+     start = time.time()
+     X_test = vect.fit_transform(tqdm(test.text, desc="fitting and transforming data"))
+     end = time.time()
+     preds = classifier.predict(X_test)
+     return f1_score(test.label, preds, average='macro'), classifier.score(X_test, test.label)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--results_file', '-m', type=str)
+     parser.add_argument('--performance_metric', '-p', type=str)
+     parser.add_argument('--hyperparameter', '-x', type=str)
+     parser.add_argument('--logx', action='store_true')
+     parser.add_argument('--boxplot', action='store_true')
+
+     args = parser.parse_args()
+
+     if not os.path.exists(args.results_file):
+         print(f"Results file {args.results_file} does not exist. Aborting!")
+         sys.exit(1)
+     else:
+         df = pd.read_json(args.results_file, lines=True)
+         if args.boxplot:
+             ax = sns.boxplot(x=df[args.hyperparameter], y=df[args.performance_metric])
+         else:
+             ax = sns.scatterplot(x=df[args.hyperparameter], y=df[args.performance_metric])
+         if args.logx:
+             ax.set_xscale("log")
+         plt.show()
lr/train.py ADDED
@@ -0,0 +1,254 @@
+ import argparse
+ import json
+ import logging
+ import os
+ import pathlib
+ import random
+ import shutil
+ import sys
+ import time
+ from ast import literal_eval
+ from shutil import rmtree
+ from typing import Any, Dict, List, Union
+
+ import numpy as np
+ import pandas as pd
+ import ray
+ from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer, TfidfVectorizer)
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import f1_score
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from lr.hyperparameters import (SEARCH_SPACE, HyperparameterSearch,
+                                 RandomSearch)
+ from lr.util import jackknife, replace_bool, stratified_sample
+
+ # Create a custom logger
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+
+ def train_lr(train,
+              dev,
+              test,
+              search_space):
+     master = pd.concat([train, dev], axis=0)
+     space = HyperparameterSearch(**search_space)
+     sample = space.sample()
+     if sample.pop('stopwords') == 1:
+         stop_words = 'english'
+     else:
+         stop_words = None
+     weight = sample.pop('weight')
+     if weight == 'binary':
+         binary = True
+     else:
+         binary = False
+     ngram_range = sample.pop('ngram_range')
+     ngram_range = sorted([int(x) for x in ngram_range.split()])
+     if weight == 'tf-idf':
+         vect = TfidfVectorizer(stop_words=stop_words,
+                                lowercase=True,
+                                ngram_range=ngram_range,
+                                )
+     elif weight == 'hash':
+         vect = HashingVectorizer(stop_words=stop_words,
+                                  lowercase=True,
+                                  ngram_range=ngram_range,
+                                  )
+     else:
+         vect = CountVectorizer(binary=binary,
+                                stop_words=stop_words,
+                                lowercase=True,
+                                ngram_range=ngram_range,
+                                )
+     start = time.time()
+     vect.fit(tqdm(master.text, desc="fitting data", leave=False))
+     X_train = vect.transform(tqdm(train.text, desc="transforming training data", leave=False))
+     X_dev = vect.transform(tqdm(dev.text, desc="transforming dev data", leave=False))
+     if test is not None:
+         X_test = vect.transform(tqdm(test.text, desc="transforming test data", leave=False))
+
+     sample['C'] = float(sample['C'])
+     sample['tol'] = float(sample['tol'])
+     classifier = LogisticRegression(**sample, verbose=True)
+     classifier.fit(X_train, train.label)
+     end = time.time()
+     for k, v in sample.items():
+         if not v:
+             v = str(v)
+         sample[k] = [v]
+     res = pd.DataFrame(sample)
+     preds = classifier.predict(X_dev)
+     if test is not None:
+         test_preds = classifier.predict(X_test)
+     res['dev_f1'] = f1_score(dev.label, preds, average='macro')
+     if test is not None:
+         res['test_f1'] = f1_score(test.label, test_preds, average='macro')
+     res['dev_accuracy'] = classifier.score(X_dev, dev.label)
+     if test is not None:
+         res['test_accuracy'] = classifier.score(X_test, test.label)
+     res['training_duration'] = end - start
+     res['ngram_range'] = str(ngram_range)
+     res['weight'] = weight
+     res['stopwords'] = stop_words
+     return classifier, vect, res
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--train_file', type=str)
+     parser.add_argument('--dev_file', type=str, required=False)
+     parser.add_argument('--test_file', type=str, required=False)
+     parser.add_argument('--search_trials', type=int, default=5)
+     parser.add_argument('--train_subsample', type=int, required=False)
+     parser.add_argument('--stratified', action='store_true')
+     parser.add_argument('--jackknife_partitions', type=int, default=5, required=False)
+     parser.add_argument('--save_jackknife_partitions', action='store_true')
+     parser.add_argument('--serialization_dir', '-s', type=str)
+     parser.add_argument('--override', '-o', action='store_true')
+     parser.add_argument('--evaluate_on_test', '-t', action='store_true')
+
+     args = parser.parse_args()
+
+     if not os.path.isdir(args.serialization_dir):
+         os.makedirs(args.serialization_dir)
+     else:
+         if args.override:
+             rmtree(args.serialization_dir)
+             os.makedirs(args.serialization_dir)
+         else:
+             print(f"serialization directory {args.serialization_dir} exists. Aborting!")
+             sys.exit(1)
+
+     print(f"reading training data at {args.train_file}...")
+     train = pd.read_json(args.train_file, lines=True)
+     if args.train_subsample:
+         if args.stratified:
+             train = stratified_sample(train, "label", args.train_subsample)
+         else:
+             train = train.sample(n=args.train_subsample)
+
+     if args.dev_file:
+         print(f"reading dev data at {args.dev_file}...")
+         dev = pd.read_json(args.dev_file, lines=True)
+     else:
+         print("Dev file not provided, will jackknife training data...")
+
+     if args.evaluate_on_test:
+         if args.test_file:
+             print(f"reading test data at {args.test_file}...")
+             test = pd.read_json(args.test_file, lines=True)
+         else:
+             print("Test file not provided.")
+             sys.exit(1)
+     else:
+         test = None
+
+     num_assignments = args.search_trials
+     num_partitions = args.jackknife_partitions
+     df = pd.DataFrame()
+     current_f1 = 0.0
+     best_classifier = None
+     best_vect = None
+     if args.dev_file:
+         pbar = tqdm(range(num_assignments), desc="search trials", leave=False)
+         for i in pbar:
+             try:
+                 classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
+                 df = pd.concat([df, res], axis=0, sort=True)
+                 best_f1 = df.dev_f1.max()
+                 if res.dev_f1[0] > current_f1:
+                     current_f1 = res.dev_f1[0]
+                     best_classifier = classifier
+                     best_vect = vect
+                 pbar.set_description(f"mean +- std dev F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}, max F1: {df.dev_f1.max()}")
+             except KeyboardInterrupt:
+                 break
+     else:
+         if args.save_jackknife_partitions:
+             if not os.path.isdir(os.path.join(args.serialization_dir, "jackknife")):
+                 os.mkdir(os.path.join(args.serialization_dir, "jackknife"))
+         for ix, (train, dev) in tqdm(enumerate(jackknife(train, num_partitions=num_partitions)),
+                                      total=num_partitions,
+                                      leave=False,
+                                      desc="jackknife partitions"):
+             for i in tqdm(range(num_assignments), desc="search trials", leave=False):
+                 classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
+                 df = pd.concat([df, res], axis=0, sort=True)
+                 best_f1 = df.dev_f1.max()
+                 if res.dev_f1[0] > current_f1:
+                     current_f1 = res.dev_f1[0]
+                     best_classifier = classifier
+                     best_vect = vect
+             df['dataset_reader.sample'] = train.shape[0]
+             df['model.encoder.architecture.type'] = 'logistic regression'
+             if args.save_jackknife_partitions:
+                 train.to_json(
+                     os.path.join(args.serialization_dir,
+                                  "jackknife",
+                                  f"train.{ix}"),
+                     lines=True,
+                     orient="records")
+                 dev.to_json(os.path.join(args.serialization_dir,
+                                          "jackknife",
+                                          f"dev.{ix}"),
+                             lines=True,
+                             orient='records')
+
+     print("DEV STATISTICS")
+     print("================")
+     print(f"mean +- std F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}")
+     print(f"max F1: {df.dev_f1.max()}")
+     print(f"min F1: {df.dev_f1.min()}")
+     print(f"mean +- std accuracy: {df.dev_accuracy.mean()} +- {df.dev_accuracy.std()}")
+     print(f"max accuracy: {df.dev_accuracy.max()}")
+     print(f"min accuracy: {df.dev_accuracy.min()}")
+     print("")
+     print("BEST HYPERPARAMETERS")
+     print("=====================")
+     best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict()
+     print(df.reset_index().iloc[df.reset_index().dev_f1.idxmax()])
+
+     if test is not None:
+         print("TEST STATISTICS")
+         print("================")
+         print(f"mean +- std F1: {df.test_f1.mean()} +- {df.test_f1.std()}")
+         print(f"max F1: {df.test_f1.max()}")
+         print(f"min F1: {df.test_f1.min()}")
+         print(f"mean +- std accuracy: {df.test_accuracy.mean()} +- {df.test_accuracy.std()}")
+         print(f"max accuracy: {df.test_accuracy.max()}")
+         print(f"min accuracy: {df.test_accuracy.min()}")
+
+     df.to_json(os.path.join(args.serialization_dir, "results.jsonl"), lines=True, orient='records')
+     with open(os.path.join(args.serialization_dir, "best_hyperparameters.json"), "w+") as f:
+         best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict()
+         for k, v in best_hp.items():
+             if isinstance(v, np.int64):
+                 best_hp[k] = int(v)
+             if isinstance(v, str) and "[" in v:
+                 v = literal_eval(v)
+                 best_hp[k] = f"{v[0]} {v[1]}"
+         best_hp.pop("index")
+         best_hp.pop("dev_accuracy")
+         best_hp.pop("dev_f1")
+         if test is not None:
+             best_hp.pop("test_accuracy")
+             best_hp.pop("test_f1")
+         best_hp.pop("training_duration")
+         json.dump(best_hp, f)
+     # HashingVectorizer has no vocabulary_ to serialize, so only write vocab.json when one exists
+     if hasattr(best_vect, 'vocabulary_'):
+         with open(os.path.join(args.serialization_dir, "vocab.json"), 'w+') as f:
+             for k, v in best_vect.__dict__['vocabulary_'].items():
+                 best_vect.__dict__['vocabulary_'][k] = int(v)
+             json.dump(best_vect.__dict__['vocabulary_'], f)
+
+     os.mkdir(os.path.join(args.serialization_dir, "archive"))
+     try:
+         np.save(os.path.join(args.serialization_dir, "archive", "idf.npy"), best_vect.idf_)
+     except AttributeError:
+         pass
+     np.save(os.path.join(args.serialization_dir, "archive", "classes.npy"), best_classifier.classes_)
+     np.save(os.path.join(args.serialization_dir, "archive", "coef.npy"), best_classifier.coef_)
+     np.save(os.path.join(args.serialization_dir, "archive", "intercept.npy"), best_classifier.intercept_)
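For orientation, train_lr above runs a single random-search trial; the __main__ block wires it into a CLI (--train_file, --dev_file, --serialization_dir, --search_trials, etc.) and writes results.jsonl, best_hyperparameters.json, vocab.json (for non-hashing vectorizers), and an archive/ of numpy arrays that lr/eval.py reloads. A minimal sketch of calling the function directly (illustrative only; the file names are placeholders and the data needs text and label columns):

    import pandas as pd
    from lr.hyperparameters import SEARCH_SPACE
    from lr.train import train_lr  # safe to import: the CLI is guarded by __main__

    train = pd.read_json("train.jsonl", lines=True)   # placeholder path
    dev = pd.read_json("dev.jsonl", lines=True)       # placeholder path
    classifier, vect, res = train_lr(train, dev, None, SEARCH_SPACE)
    print(res[["dev_f1", "dev_accuracy", "weight", "ngram_range"]])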
lr/util.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ import json
+ import numpy as np
+ import pandas as pd
+
+ def load_huggingface_tokenizer(tokenizer_path: str):
+     with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
+         config = json.load(f)
+     tokenizer_type = config['tokenizer_type']
+     # NOTE: the tokenizer classes below are not imported anywhere in this module; they are
+     # presumably meant to come from the huggingface `tokenizers` package (e.g.
+     # `from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer`).
+     tokenizer = {'BPE': BPETokenizer,
+                  'BBPE': ByteLevelBPETokenizer,
+                  'BERT': BertWordPieceTokenizer}[tokenizer_type]
+     if tokenizer_type in ['BPE', 'BBPE']:
+         vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
+         merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
+         tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
+                               merges_file=os.path.join(tokenizer_path, merges_file))
+     else:
+         vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
+         tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
+     return tokenizer
+
+
+ def jackknife(data, num_partitions=5):
+     data = data.sample(frac=1)
+     splits = np.split(data, range(0, data.shape[0], int(data.shape[0] / num_partitions))[1:])
+     for i, split in enumerate(splits):
+         train_parts = list(range(0, num_partitions))
+         try:
+             train_parts.remove(i)
+             yield pd.concat([splits[ix] for ix in train_parts], axis=0), split
+         except ValueError:
+             continue
+
+
+ def stratified_sample(df, col, n_samples):
+     n = min(n_samples, df[col].value_counts().min())
+     rand_int = np.random.randint(1, 10000)
+     df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=rand_int))
+     df_.index = df_.index.droplevel(0)
+     return df_
+
+
+ def replace_bool(x):
+     if x == 'true':
+         return 1
+     elif x == 'false':
+         return 0
+     else:
+         return x
requirements.txt ADDED
@@ -0,0 +1 @@
+ scikit-learn