import os
import json

import numpy as np
import pandas as pd
# The three tokenizer classes used below come from the Hugging Face `tokenizers`
# package. BPETokenizer is the name exported by older releases; newer releases
# expose the same character-level BPE tokenizer as CharBPETokenizer.
from tokenizers import BPETokenizer, ByteLevelBPETokenizer, BertWordPieceTokenizer
def load_huggingface_tokenizer(tokenizer_path: str):
    """Rebuild a `tokenizers`-library tokenizer from a directory that holds a
    config.json (with a 'tokenizer_type' key) plus its vocab/merges files."""
    with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
        config = json.load(f)
    tokenizer_type = config['tokenizer_type']
    # Map the type recorded in config.json to the matching tokenizer class.
    tokenizer_cls = {'BPE': BPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
    if tokenizer_type in ['BPE', 'BBPE']:
        # BPE-style tokenizers need both a vocab.json and a merges.txt file.
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
        merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
        tokenizer = tokenizer_cls(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
    else:
        # WordPiece (BERT) tokenizers only need a vocab.txt file.
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
        tokenizer = tokenizer_cls(vocab_file=os.path.join(tokenizer_path, vocab_file))
    return tokenizer
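

# Illustrative usage sketch only: './my_tokenizer' is a hypothetical directory
# that would have to contain config.json plus the vocab/merges files described
# above; the isdir guard keeps the demo from failing when those files are absent.
if __name__ == '__main__':
    demo_dir = './my_tokenizer'  # hypothetical path, not part of this module
    if os.path.isdir(demo_dir):
        tok = load_huggingface_tokenizer(demo_dir)
        print(tok.encode('hello world').tokens)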
def jackknife(data, num_partitions=5):
    """Shuffle a DataFrame and yield (train, holdout) pairs, leaving one
    partition out at a time."""
    data = data.sample(frac=1)  # shuffle rows
    # Cut the shuffled frame into chunks of len(data) // num_partitions rows;
    # any remainder lands in an extra trailing chunk that is skipped below.
    splits = np.split(data, range(0, data.shape[0], int(data.shape[0] / num_partitions))[1:])
    for i, split in enumerate(splits):
        train_parts = list(range(0, num_partitions))
        try:
            train_parts.remove(i)
            yield pd.concat([splits[ix] for ix in train_parts], axis=0), split
        except ValueError:
            # i >= num_partitions: this is the remainder chunk, not a real fold.
            continue
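

# Minimal sketch of the jackknife helper on a small made-up DataFrame; the
# column names and values here are invented purely for the demo.
if __name__ == '__main__':
    demo = pd.DataFrame({'text': list('abcdefghij'), 'label': [0, 1] * 5})
    for fold, (train_df, holdout_df) in enumerate(jackknife(demo, num_partitions=5)):
        print(f'fold {fold}: {train_df.shape[0]} train rows, {holdout_df.shape[0]} holdout rows')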
def stratified_sample(df, col, n_samples):
    """Sample the same number of rows from every value of `col`, capped at the
    size of the smallest group so the result stays balanced."""
    n = min(n_samples, df[col].value_counts().min())
    rand_int = np.random.randint(1, 10000)
    df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=rand_int))
    # groupby().apply adds the group key as an extra index level; drop it.
    df_.index = df_.index.droplevel(0)
    return df_
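

# Minimal sketch of stratified_sample on a made-up imbalanced frame: the cap is
# the smallest class, so both classes come back with 3 rows here.
if __name__ == '__main__':
    demo = pd.DataFrame({'label': ['a'] * 6 + ['b'] * 3, 'value': range(9)})
    balanced = stratified_sample(demo, 'label', n_samples=4)
    print(balanced['label'].value_counts())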
def replace_bool(x):
    """Map the strings 'true'/'false' to 1/0 and pass every other value through."""
    if x == 'true':
        return 1
    elif x == 'false':
        return 0
    else:
        return x
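

# Minimal sketch of replace_bool applied element-wise to a hypothetical column
# of string booleans; non-boolean values pass through unchanged.
if __name__ == '__main__':
    demo = pd.DataFrame({'flag': ['true', 'false', 'unknown']})
    print(demo['flag'].apply(replace_bool).tolist())  # [1, 0, 'unknown']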