import os
import json

import numpy as np
import pandas as pd

# Tokenizer classes from Hugging Face's `tokenizers` package. Note that newer
# releases renamed `BPETokenizer` to `CharBPETokenizer`; this import matches
# the older API that the mapping below was written against.
from tokenizers import BPETokenizer, ByteLevelBPETokenizer, BertWordPieceTokenizer


def load_huggingface_tokenizer(tokenizer_path: str):
    """Instantiate a tokenizer from a directory holding a config.json plus
    the vocab (and, for BPE variants, merges) files."""
    with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
        config = json.load(f)
    tokenizer_type = config['tokenizer_type']
    tokenizer_cls = {
        'BPE': BPETokenizer,
        'BBPE': ByteLevelBPETokenizer,
        'BERT': BertWordPieceTokenizer,
    }[tokenizer_type]
    if tokenizer_type in ('BPE', 'BBPE'):
        # BPE-style tokenizers need both a vocab.json and a merges.txt.
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
        merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
        tokenizer = tokenizer_cls(
            vocab_file=os.path.join(tokenizer_path, vocab_file),
            merges_file=os.path.join(tokenizer_path, merges_file),
        )
    else:
        # WordPiece tokenizers only need a vocab.txt.
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
        tokenizer = tokenizer_cls(vocab_file=os.path.join(tokenizer_path, vocab_file))
    return tokenizer


def jackknife(data, num_partitions=5):
    """Yield (train, holdout) DataFrame pairs, leaving out one partition per
    iteration.

    Rows are shuffled and split into `num_partitions` near-equal folds; each
    fold is held out once while the remaining folds form the training set.
    """
    data = data.sample(frac=1)
    # Split positional indices with np.array_split, which yields exactly
    # `num_partitions` near-equal folds even when len(data) is not evenly
    # divisible by num_partitions, so no rows are ever dropped.
    indices = np.array_split(np.arange(len(data)), num_partitions)
    splits = [data.iloc[idx] for idx in indices]
    for i, holdout in enumerate(splits):
        train = pd.concat([s for j, s in enumerate(splits) if j != i], axis=0)
        yield train, holdout


def stratified_sample(df, col, n_samples):
    """Sample the same number of rows from each class in `col`, capped at the
    size of the smallest class so no group is oversampled."""
    n = min(n_samples, df[col].value_counts().min())
    # One shared seed so every group is sampled reproducibly within a call.
    rand_int = np.random.randint(1, 10000)
    df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=rand_int))
    # groupby().apply prepends the group key as an outer index level; drop it
    # so the result keeps the original row index.
    df_.index = df_.index.droplevel(0)
    return df_


def replace_bool(x):
    """Map the strings 'true'/'false' to 1/0 and pass other values through."""
    if x == 'true':
        return 1
    elif x == 'false':
        return 0
    return x
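

# Minimal usage sketch for the helpers above (illustrative only: the column
# names and sizes here are made up, not part of any real dataset).
# load_huggingface_tokenizer is omitted because it needs a tokenizer
# directory on disk, e.g. load_huggingface_tokenizer('path/to/tokenizer').
if __name__ == '__main__':
    df = pd.DataFrame({
        'text': [f'example {i}' for i in range(100)],
        'label': ['true', 'false'] * 50,
    })
    # Normalize the string booleans to 0/1 labels.
    df['label'] = df['label'].apply(replace_bool)

    # Balanced subsample: at most 30 rows per label value.
    balanced = stratified_sample(df, 'label', n_samples=30)
    print(balanced['label'].value_counts())

    # Leave-one-partition-out splits over the balanced subsample.
    for fold, (train, holdout) in enumerate(jackknife(balanced, num_partitions=5)):
        print(f'fold {fold}: train={len(train)} holdout={len(holdout)}')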