# Split the dumped semantic table and phoneme dictionary into dev and train
# subsets: `dev_num` randomly chosen rows become the dev set and every
# remaining row becomes the train set.  The phoneme dictionary is split by the
# item names that landed in the dev set.
import numpy
import pandas

semantic_path = 'dump/semantic.tsv'
phoneme_path = 'dump/phoneme.npy'
train_semantic_path = 'dump/semantic_train.tsv'
train_phoneme_path = 'dump/phoneme_train.npy'
dev_semantic_path = 'dump/semantic_dev.tsv'
dev_phoneme_path = 'dump/phoneme_dev.npy'

# Read dump/semantic.tsv (tab-separated).  The script relies on an
# "item_name" column; the original author's note also mentions a
# "semantic_audio" column -- TODO confirm against the dump step.
semantic_df = pandas.read_csv(semantic_path, sep='\t')

# Read dump/phoneme.npy: a pickled 0-d object array wrapping a dict, hence
# `.item()` to unwrap it.
# NOTE(review): allow_pickle=True executes arbitrary pickle payloads; only
# load files produced by a trusted pipeline.
phoneme_dict = numpy.load(phoneme_path, allow_pickle=True).item()

dev_num = 20

# Randomly pick `dev_num` rows for the dev set.  No random_state is given, so
# the split differs between runs; pandas raises ValueError when the table has
# fewer than `dev_num` rows.
dev_df = semantic_df.sample(n=dev_num)
# Everything that was not sampled is the train set.
train_df = semantic_df.drop(dev_df.index)

# Save both semantic splits.
dev_df.to_csv(dev_semantic_path, sep='\t', index=False)
train_df.to_csv(train_semantic_path, sep='\t', index=False)

# The item names of the dev rows are the keys of the dev phoneme dictionary.
dev_item_names = dev_df['item_name'].tolist()
# Build the set once so the membership test below is O(1) per key instead of
# a linear scan of the list for every phoneme entry.
dev_item_name_set = set(dev_item_names)

# Dev entries: only names that actually have a phoneme entry are kept.
dev_phoneme_dict = {
    name: phoneme_dict[name]
    for name in dev_item_names
    if name in phoneme_dict
}
# Train entries: every phoneme entry whose key is not a dev item name.
train_phoneme_dict = {
    name: phonemes
    for name, phonemes in phoneme_dict.items()
    if name not in dev_item_name_set
}

numpy.save(dev_phoneme_path, dev_phoneme_dict)
numpy.save(train_phoneme_path, train_phoneme_dict)