kernelmachine committed
Commit • 2c5347a
1 Parent(s): 842e849
update

Files changed:
- lr/__init__.py +0 -0
- lr/__init__.pyc +0 -0
- lr/__pycache__/__init__.cpython-38.pyc +0 -0
- lr/__pycache__/eval.cpython-38.pyc +0 -0
- lr/__pycache__/hyperparameters.cpython-38.pyc +0 -0
- lr/__pycache__/plot.cpython-38.pyc +0 -0
- lr/__pycache__/train.cpython-38.pyc +0 -0
- lr/__pycache__/util.cpython-38.pyc +0 -0
- lr/eval.py +105 -0
- lr/hyperparameters.py +124 -0
- lr/merge.py +29 -0
- lr/plot.py +84 -0
- lr/train.py +254 -0
- lr/util.py +50 -0
- requirements.txt +1 -0
lr/__init__.py
ADDED
File without changes
lr/__init__.pyc
ADDED
Binary file (101 Bytes)

lr/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (142 Bytes)

lr/__pycache__/eval.cpython-38.pyc
ADDED
Binary file (3.22 kB)

lr/__pycache__/hyperparameters.cpython-38.pyc
ADDED
Binary file (4.73 kB)

lr/__pycache__/plot.cpython-38.pyc
ADDED
Binary file (2.3 kB)

lr/__pycache__/train.cpython-38.pyc
ADDED
Binary file (6.78 kB)

lr/__pycache__/util.cpython-38.pyc
ADDED
Binary file (2.45 kB)
lr/eval.py
ADDED
@@ -0,0 +1,105 @@
import argparse
import json
import logging
import os
import pathlib
import random
import shutil
import time
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
import ray
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, HashingVectorizer,
                                              TfidfVectorizer)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from lr.hyperparameters import SEARCH_SPACE, RandomSearch, HyperparameterSearch
from shutil import rmtree


# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_model(serialization_dir):
    # Rebuild the vectorizer and LogisticRegression classifier from the
    # hyperparameters, vocabulary, and numpy arrays saved by lr/train.py.
    with open(os.path.join(serialization_dir, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)
    if hyperparameters.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = hyperparameters.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = hyperparameters.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words, lowercase=True, ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    if weight != "hash":
        with open(os.path.join(serialization_dir, "vocab.json"), 'r') as f:
            vocab = json.load(f)
        vect.vocabulary_ = vocab
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)
    if os.path.exists(os.path.join(serialization_dir, "archive", "idf.npy")):
        vect.idf_ = np.load(os.path.join(serialization_dir, "archive", "idf.npy"))
    classifier.coef_ = np.load(os.path.join(serialization_dir, "archive", "coef.npy"))
    classifier.intercept_ = np.load(os.path.join(serialization_dir, "archive", "intercept.npy"))
    classifier.classes_ = np.load(os.path.join(serialization_dir, "archive", "classes.npy"))
    return classifier, vect


def eval_lr(test,
            classifier,
            vect):
    # Vectorize the evaluation text and return macro F1, accuracy,
    # and per-class probability scores.
    start = time.time()
    X_test = vect.transform(tqdm(test.text, desc="fitting and transforming data"))
    end = time.time()
    preds = classifier.predict(X_test)
    scores = classifier.predict_proba(X_test)
    return f1_score(test.label, preds, average='macro'), classifier.score(X_test, test.label), scores


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--eval_file', type=str)
    parser.add_argument('--model', '-m', type=str)
    parser.add_argument('--output', '-o', type=str)

    args = parser.parse_args()

    if not os.path.isdir(args.model):
        print(f"model {args.model} does not exist. Aborting! ")
    else:
        clf, vect = load_model(args.model)

        print(f"reading evaluation data at {args.eval_file}...")
        test = pd.read_json(args.eval_file, lines=True)

        f1, acc, scores = eval_lr(test, clf, vect)
        if args.output:
            out = pd.DataFrame({'id': test['id'], 'score': scores.tolist()})
            out.to_json(args.output, lines=True, orient='records')

        print("================")
        print(f"F1: {f1}")
        print(f"accuracy: {acc}")
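A minimal usage sketch for this evaluation module (illustrative, not part of the commit; the paths are placeholders and assume a model directory written by lr/train.py plus a JSONL file with "text" and "label" fields):

import pandas as pd
from lr.eval import load_model, eval_lr

clf, vect = load_model("serialization/lr_run")              # hypothetical model dir
test = pd.read_json("data/dev.jsonl", lines=True)           # hypothetical eval file
f1, acc, scores = eval_lr(test, clf, vect)
print(f"F1: {f1:.3f}, accuracy: {acc:.3f}")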
lr/hyperparameters.py
ADDED
@@ -0,0 +1,124 @@
from typing import Any, Dict, List, Union
import numpy as np
import logging
import os

# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class RandomSearch:

    @staticmethod
    def random_choice(args: List[Any], n: int = 1):
        """
        pick a random element from a set.

        Example:
        >> sampler = RandomSearch.random_choice(1,2,3)
        >> sampler()
        2
        """
        choices = []
        for arg in args:
            choices.append(arg)
        if n == 1:
            return lambda: np.random.choice(choices, replace=False)
        else:
            return lambda: np.random.choice(choices, n, replace=False)

    @staticmethod
    def random_integer(low: Union[int, float], high: Union[int, float]):
        """
        pick a random integer between two bounds

        Example:
        >> sampler = RandomSearch.random_integer(1, 10)
        >> sampler()
        9
        """
        return lambda: int(np.random.randint(low, high))

    @staticmethod
    def random_loguniform(low: Union[float, int], high: Union[float, int]):
        """
        pick a random float between two bounds, using loguniform distribution

        Example:
        >> sampler = RandomSearch.random_loguniform(1e-5, 1e-2)
        >> sampler()
        0.0004
        """
        return lambda: np.exp(np.random.uniform(np.log(low), np.log(high)))

    @staticmethod
    def random_uniform(low: Union[float, int], high: Union[float, int]):
        """
        pick a random float between two bounds, using uniform distribution

        Example:
        >> sampler = RandomSearch.random_uniform(0, 1)
        >> sampler()
        0.01
        """
        return lambda: np.random.uniform(low, high)


class HyperparameterSearch:

    def __init__(self, **kwargs):
        self.search_space = {}
        self.lambda_ = lambda: 0
        for key, val in kwargs.items():
            self.search_space[key] = val

    def parse(self, val: Any):
        if isinstance(val, (int, np.int)):
            return int(val)
        elif isinstance(val, (float, np.float)):
            return val
        elif isinstance(val, (np.ndarray, list)):
            return " ".join(val)
        elif val is None:
            return None
        if isinstance(val, str):
            return val
        else:
            val = val()
            if isinstance(val, (int, np.int)):
                return int(val)
            elif isinstance(val, (np.ndarray, list)):
                return " ".join(val)
            else:
                return val

    def sample(self) -> Dict:
        res = {}
        for key, val in self.search_space.items():
            try:
                res[key] = self.parse(val)
            except (TypeError, ValueError) as error:
                logger.error(f"Could not parse key {key} with value {val}. {error}")
        return res

    def update_environment(self, sample) -> None:
        for key, val in sample.items():
            os.environ[key] = str(val)


SEARCH_SPACE = {
    "penalty": RandomSearch.random_choice(["l1", "l2"]),
    "C": RandomSearch.random_uniform(0, 1),
    "solver": "liblinear",
    "multi_class": "auto",
    "tol": RandomSearch.random_loguniform(10e-5, 10e-3),
    "stopwords": RandomSearch.random_choice([0, 1]),
    "weight": RandomSearch.random_choice(["hash"]),
    "ngram_range": RandomSearch.random_choice(["1 2", "2 3", "1 3"]),
    "random_state": RandomSearch.random_integer(0, 100000)
}
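A small sketch of how this search space is consumed (it mirrors the calls made in lr/train.py; the printed values below are illustrative, not actual output):

from lr.hyperparameters import SEARCH_SPACE, HyperparameterSearch

space = HyperparameterSearch(**SEARCH_SPACE)
sample = space.sample()
# e.g. {'penalty': 'l2', 'C': 0.41, 'solver': 'liblinear', 'multi_class': 'auto',
#       'tol': 0.0007, 'stopwords': 0, 'weight': 'hash', 'ngram_range': '1 3',
#       'random_state': 52341}
print(sample)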
lr/merge.py
ADDED
@@ -0,0 +1,29 @@
import argparse
import json
import logging
import os
import pathlib
from typing import Any, Dict, List, Union
import sys
import pandas as pd

# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--experiments', nargs="+", type=str)
    parser.add_argument('--output', type=str)

    args = parser.parse_args()
    dfs = []
    for experiment in args.experiments:
        if not os.path.isdir(experiment):
            print(f"experiment {experiment} does not exist. Aborting! ")
            sys.exit(1)
        else:
            dfs.append(pd.read_json(os.path.join(experiment, "results.jsonl"), lines=True))
    master = pd.concat(dfs, 0)
    master.to_json(args.output, lines=True, orient='records')
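An equivalent inline sketch of what this merge script does (illustrative; the experiment directories are placeholders):

import pandas as pd

experiments = ["runs/lr_run_0", "runs/lr_run_1"]   # hypothetical experiment dirs
dfs = [pd.read_json(f"{e}/results.jsonl", lines=True) for e in experiments]
pd.concat(dfs).to_json("runs/merged_results.jsonl", lines=True, orient="records")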
lr/plot.py
ADDED
@@ -0,0 +1,84 @@
import argparse
import json
import logging
import os
import pathlib
import random
import shutil
import time
from typing import Any, Dict, List, Union
import seaborn as sns
import sys
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

# Needed by load_model/eval_lr below; these imports are missing in the
# committed file, so the helpers cannot run without them.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from tqdm import tqdm


# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_model(hyperparameters):
    # Build an (unfitted) vectorizer and classifier from a hyperparameter dict.
    if hyperparameters.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = hyperparameters.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = hyperparameters.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)
    return classifier, vect


def eval_lr(test,
            classifier,
            vect):
    start = time.time()
    X_test = vect.fit_transform(tqdm(test.text, desc="fitting and transforming data"))
    end = time.time()
    preds = classifier.predict(X_test)
    return f1_score(test.label, preds, average='macro'), classifier.score(X_test, test.label)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--results_file', '-m', type=str)
    parser.add_argument('--performance_metric', '-p', type=str)
    parser.add_argument('--hyperparameter', '-x', type=str)
    parser.add_argument('--logx', action='store_true')
    parser.add_argument('--boxplot', action='store_true')

    args = parser.parse_args()

    if not os.path.exists(args.results_file):
        print(f"Results file {args.results_file} does not exist. Aborting! ")
        sys.exit(1)
    else:
        df = pd.read_json(args.results_file, lines=True)
        if args.boxplot:
            ax = sns.boxplot(df[args.hyperparameter], df[args.performance_metric])
        else:
            ax = sns.scatterplot(df[args.hyperparameter], df[args.performance_metric])
        if args.logx:
            ax.set_xscale("log")
        plt.show()
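An inline sketch of the same kind of plot (illustrative; the results path is a placeholder): dev F1 scattered against the sampled regularization strength C from a results.jsonl produced by lr/train.py, with a log-scaled x axis.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_json("runs/lr_run_0/results.jsonl", lines=True)  # hypothetical path
ax = sns.scatterplot(x=df["C"], y=df["dev_f1"])
ax.set_xscale("log")
plt.show()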
lr/train.py
ADDED
@@ -0,0 +1,254 @@
import argparse
import json
import logging
import os
import pathlib
import random
import shutil
import sys
import time
from ast import literal_eval
from shutil import rmtree
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
import ray
from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer, TfidfVectorizer)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from lr.hyperparameters import (SEARCH_SPACE, HyperparameterSearch,
                                RandomSearch)
from lr.util import jackknife, replace_bool, stratified_sample

# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def train_lr(train,
             dev,
             test,
             search_space):
    # Sample one hyperparameter assignment, build the corresponding vectorizer,
    # fit a logistic regression on the training split, and return the fitted
    # classifier, the vectorizer, and a one-row results DataFrame.
    master = pd.concat([train, dev], 0)
    space = HyperparameterSearch(**search_space)
    sample = space.sample()
    if sample.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = sample.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = sample.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range,
                               )
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words,
                                 lowercase=True,
                                 ngram_range=ngram_range,
                                 )
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range,
                               )
    start = time.time()
    vect.fit(tqdm(master.text, desc="fitting data", leave=False))
    X_train = vect.transform(tqdm(train.text, desc="transforming training data", leave=False))
    X_dev = vect.transform(tqdm(dev.text, desc="transforming dev data", leave=False))
    if test is not None:
        X_test = vect.transform(tqdm(test.text, desc="transforming test data", leave=False))

    sample['C'] = float(sample['C'])
    sample['tol'] = float(sample['tol'])
    classifier = LogisticRegression(**sample, verbose=True)
    classifier.fit(X_train, train.label)
    end = time.time()
    for k, v in sample.items():
        if not v:
            v = str(v)
        sample[k] = [v]
    res = pd.DataFrame(sample)
    preds = classifier.predict(X_dev)
    if test is not None:
        test_preds = classifier.predict(X_test)
    res['dev_f1'] = f1_score(dev.label, preds, average='macro')
    if test is not None:
        res['test_f1'] = f1_score(test.label, test_preds, average='macro')
    res['dev_accuracy'] = classifier.score(X_dev, dev.label)
    if test is not None:
        res['test_accuracy'] = classifier.score(X_test, test.label)
    res['training_duration'] = end - start
    res['ngram_range'] = str(ngram_range)
    res['weight'] = weight
    res['stopwords'] = stop_words
    return classifier, vect, res


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file', type=str)
    parser.add_argument('--dev_file', type=str, required=False)
    parser.add_argument('--test_file', type=str, required=False)
    parser.add_argument('--search_trials', type=int, default=5)
    parser.add_argument('--train_subsample', type=int, required=False)
    parser.add_argument('--stratified', action='store_true')
    parser.add_argument('--jackknife_partitions', type=int, default=5, required=False)
    parser.add_argument('--save_jackknife_partitions', action='store_true')
    parser.add_argument('--serialization_dir', '-s', type=str)
    parser.add_argument('--override', '-o', action='store_true')
    parser.add_argument('--evaluate_on_test', '-t', action='store_true')

    args = parser.parse_args()

    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    else:
        if args.override:
            rmtree(args.serialization_dir)
            os.makedirs(args.serialization_dir)
        else:
            print(f"serialization directory {args.serialization_dir} exists. Aborting! ")
            sys.exit(1)  # exit added here; the committed script only printed the message and kept running

    print(f"reading training data at {args.train_file}...")
    train = pd.read_json(args.train_file, lines=True)
    if args.train_subsample:
        if args.stratified:
            train = stratified_sample(train, "label", args.train_subsample)
        else:
            train = train.sample(n=args.train_subsample)

    if args.dev_file:
        print(f"reading dev data at {args.dev_file}...")
        dev = pd.read_json(args.dev_file, lines=True)
    else:
        print("Dev file not provided, will jackknife training data...")

    if args.evaluate_on_test:
        if args.test_file:
            print(f"reading test data at {args.test_file}...")
            test = pd.read_json(args.test_file, lines=True)
        else:
            print("Test file not provided.")
            sys.exit(1)
    else:
        test = None

    num_assignments = args.search_trials
    num_partitions = args.jackknife_partitions
    df = pd.DataFrame()
    current_f1 = 0.0
    best_classifier = None
    best_vect = None
    if args.dev_file:
        pbar = tqdm(range(num_assignments), desc="search trials", leave=False)
        for i in pbar:
            try:
                classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
                df = pd.concat([df, res], 0, sort=True)
                best_f1 = df.dev_f1.max()
                if res.dev_f1[0] > current_f1:
                    current_f1 = res.dev_f1[0]
                    best_classifier = classifier
                    best_vect = vect
                pbar.set_description(f"mean +- std dev F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}, max F1: {df.dev_f1.max()}")
            except KeyboardInterrupt:
                break
    else:
        if args.save_jackknife_partitions:
            if not os.path.isdir(os.path.join(args.serialization_dir, "jackknife")):
                os.mkdir(os.path.join(args.serialization_dir, "jackknife"))
        for ix, (train, dev) in tqdm(enumerate(jackknife(train, num_partitions=num_partitions)),
                                     total=num_partitions,
                                     leave=False,
                                     desc="jackknife partitions"):
            for i in tqdm(range(num_assignments), desc="search trials", leave=False):
                classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
                df = pd.concat([df, res], 0, sort=True)
                best_f1 = df.dev_f1.max()
                if res.dev_f1[0] > current_f1:
                    current_f1 = res.dev_f1[0]
                    best_classifier = classifier
                    best_vect = vect
            df['dataset_reader.sample'] = train.shape[0]
            df['model.encoder.architecture.type'] = 'logistic regression'
            if args.save_jackknife_partitions:
                train.to_json(
                    os.path.join(args.serialization_dir,
                                 "jackknife",
                                 f"train.{ix}"),
                    lines=True,
                    orient="records")
                dev.to_json(os.path.join(args.serialization_dir,
                                         "jackknife",
                                         f"dev.{ix}"),
                            lines=True,
                            orient='records')

    print("DEV STATISTICS")
    print("================")
    print(f"mean +- std F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}")
    print(f"max F1: {df.dev_f1.max()}")
    print(f"min F1: {df.dev_f1.min()}")
    print(f"mean +- std accuracy: {df.dev_accuracy.mean()} +- {df.dev_accuracy.std()}")
    print(f"max accuracy: {df.dev_accuracy.max()}")
    print(f"min accuracy: {df.dev_accuracy.min()}")
    print("")
    print("BEST HYPERPARAMETERS")
    print(f"=====================")
    best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict()
    print(df.reset_index().iloc[df.reset_index().dev_f1.idxmax()])

    if test is not None:
        print("TEST STATISTICS")
        print("================")
        print(f"mean +- std F1: {df.test_f1.mean()} +- {df.test_f1.std()}")
        print(f"max F1: {df.test_f1.max()}")
        print(f"min F1: {df.test_f1.min()}")
        print(f"mean +- std accuracy: {df.test_accuracy.mean()} +- {df.test_accuracy.std()}")
        print(f"max accuracy: {df.test_accuracy.max()}")
        print(f"min accuracy: {df.test_accuracy.min()}")

    df.to_json(os.path.join(args.serialization_dir, "results.jsonl"), lines=True, orient='records')
    with open(os.path.join(args.serialization_dir, "best_hyperparameters.json"), "w+") as f:
        best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict()
        for k, v in best_hp.items():
            if isinstance(v, np.int64):
                best_hp[k] = int(v)
            if isinstance(v, str) and "[" in v:
                v = literal_eval(v)
                best_hp[k] = f"{v[0]} {v[1]}"
        best_hp.pop("index")
        best_hp.pop("dev_accuracy")
        best_hp.pop("dev_f1")
        if test is not None:
            best_hp.pop("test_accuracy")
            best_hp.pop("test_f1")
        best_hp.pop("training_duration")
        json.dump(best_hp, f)
    with open(os.path.join(args.serialization_dir, "vocab.json"), 'w+') as f:
        for k, v in best_vect.__dict__['vocabulary_'].items():
            best_vect.__dict__['vocabulary_'][k] = int(v)
        json.dump(best_vect.__dict__['vocabulary_'], f)

    os.mkdir(os.path.join(args.serialization_dir, "archive"))
    try:
        np.save(os.path.join(args.serialization_dir, "archive", "idf.npy"), best_vect.idf_)
    except:
        pass
    np.save(os.path.join(args.serialization_dir, "archive", "classes.npy"), best_classifier.classes_)
    np.save(os.path.join(args.serialization_dir, "archive", "coef.npy"), best_classifier.coef_)
    np.save(os.path.join(args.serialization_dir, "archive", "intercept.npy"), best_classifier.intercept_)
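A minimal sketch of calling train_lr directly, outside the command-line entry point (illustrative; the data paths are placeholders and each DataFrame needs "text" and "label" columns):

import pandas as pd
from lr.hyperparameters import SEARCH_SPACE
from lr.train import train_lr

train = pd.read_json("data/train.jsonl", lines=True)   # hypothetical paths
dev = pd.read_json("data/dev.jsonl", lines=True)
classifier, vect, res = train_lr(train, dev, None, SEARCH_SPACE)
print(res[["dev_f1", "dev_accuracy", "training_duration"]])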
lr/util.py
ADDED
@@ -0,0 +1,50 @@
import os
import json
import numpy as np
import pandas as pd


def load_huggingface_tokenizer(tokenizer_path: str):
    # NOTE: BPETokenizer, ByteLevelBPETokenizer, and BertWordPieceTokenizer are
    # referenced here but never imported in this commit; they come from the
    # huggingface `tokenizers` package and would need to be imported for this
    # function to run.
    with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
        config = json.load(f)
    tokenizer_type = config['tokenizer_type']
    tokenizer = {'BPE': BPETokenizer,
                 'BBPE': ByteLevelBPETokenizer,
                 'BERT': BertWordPieceTokenizer}[tokenizer_type]
    if tokenizer_type in ['BPE', 'BBPE']:
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
        merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
        tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                              merges_file=os.path.join(tokenizer_path, merges_file))
    else:
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
        tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
    return tokenizer


def jackknife(data, num_partitions=5):
    # Shuffle the data, split it into num_partitions folds, and yield
    # (train, heldout) pairs, one per fold.
    data = data.sample(frac=1)
    splits = np.split(data, range(0, data.shape[0], int(data.shape[0] / num_partitions))[1:])
    for i, split in enumerate(splits):
        train_parts = list(range(0, num_partitions))
        try:
            train_parts.remove(i)
            yield pd.concat([splits[ix] for ix in train_parts], 0), split
        except ValueError:
            continue


def stratified_sample(df, col, n_samples):
    # Sample the same number of rows from each value of `col`.
    n = min(n_samples, df[col].value_counts().min())
    rand_int = np.random.randint(1, 10000)
    df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=rand_int))
    df_.index = df_.index.droplevel(0)
    return df_


def replace_bool(x):
    if x == 'true':
        return 1
    elif x == 'false':
        return 0
    else:
        return x
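A short sketch of the data helpers in use (illustrative; the input path is a placeholder):

import pandas as pd
from lr.util import jackknife, stratified_sample

data = pd.read_json("data/train.jsonl", lines=True)         # hypothetical path
balanced = stratified_sample(data, "label", n_samples=500)  # equal rows per label
for ix, (tr, heldout) in enumerate(jackknife(balanced, num_partitions=5)):
    print(ix, tr.shape[0], heldout.shape[0])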
requirements.txt
ADDED
@@ -0,0 +1 @@
scikit-learn