# Attach kana readings scraped from amitaro.htm to the entries of
# amitaro.json, save the enriched JSON, and write "path|10|kana" line lists
# to amitaro_train.txt and amitaro_val.txt.
#
# Setup (shell command, not Python -- run it once in a terminal):
#   pip --cert /etc/pki/ca-trust/source/anchors/tri-ace-CA-2015.cer install \
#       --trusted-host pypi.org --trusted-host files.pythonhosted.org beautifulsoup4

import json
import random

HTML_PATH = "./amitaro.htm"
JSON_PATH = "amitaro.json"
OUT_JSON_PATH = "./amitaro_with_kana.json"
DATA_DIR = "./data_amitaro22k/"
TRAIN_LIST_PATH = "amitaro_train.txt"
VAL_LIST_PATH = "amitaro_val.txt"

# Pin the parser so results do not depend on which optional parsers happen
# to be installed (and to avoid bs4's "no parser specified" warning).
HTML_PARSER = "html.parser"


def attach_kana(soup, file_list):
    """Store the kana reading for every known key into ``file_list`` in place.

    For each ``<td>`` whose first child is a key of ``file_list``, the first
    child of the preceding ``<td>`` is re-parsed and the first child of its
    first ``<a>`` is stored (as ``str``) under ``file_list[key]["kana"]``.
    Cells without a usable preceding anchor are skipped.
    """
    from bs4 import BeautifulSoup  # local import: only this step needs bs4

    cells = soup.find_all("td")
    for index, cell in enumerate(cells):
        if not cell.contents:
            continue
        key = cell.contents[0]
        if key not in file_list:
            continue
        # index 0 has no preceding cell; cells[-1] would silently wrap
        # around to the last cell of the document.
        if index == 0 or not cells[index - 1].contents:
            continue
        fragment = BeautifulSoup(str(cells[index - 1].contents[0]), HTML_PARSER)
        link = fragment.find("a")
        if link is None or not link.contents:
            continue
        kana = str(link.contents[0])
        print(kana)
        file_list[key]["kana"] = kana


def prefix_paths(file_list, data_dir=DATA_DIR):
    """Prepend ``data_dir`` to the ``"path"`` of every entry, in place."""
    for entry in file_list.values():
        entry["path"] = data_dir + entry["path"]


def manifest_lines(file_list):
    """Return one ``path|10|kana`` line per entry that has a usable reading.

    Entries with a missing or empty ``"kana"``, or whose kana contains
    ``"("``, are left out.
    """
    lines = []
    for entry in file_list.values():
        kana = entry.get("kana", "")  # entries never matched have no key
        if not kana or "(" in kana:
            continue
        # NOTE(review): the constant middle field "10" is presumably a
        # speaker/label id expected by the consumer -- confirm.
        lines.append(f"{entry['path']}|10|{kana}")
    return lines


def sample_validation(lines, rng=random):
    """Return ``ceil(len(lines) / 10)`` distinct-position lines, randomly."""
    count = (len(lines) + 9) // 10
    return rng.sample(lines, count)


def write_lines(path, lines):
    """Write ``lines`` to ``path`` as UTF-8, one per line."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(f"{line}\n" for line in lines)


def main():
    """Run the whole pipeline against the fixed input/output paths."""
    from bs4 import BeautifulSoup

    with open(HTML_PATH, "r", encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file.read(), HTML_PARSER)
    print(soup.prettify())

    with open(JSON_PATH, encoding="utf-8") as json_file:
        file_list = json.load(json_file)

    attach_kana(soup, file_list)
    prefix_paths(file_list)

    # Written once, after the paths are prefixed: an earlier dump to the same
    # file would be overwritten by this one anyway.
    with open(OUT_JSON_PATH, "w", encoding="utf-8") as outfile:
        outfile.write(json.dumps(file_list, indent=4, ensure_ascii=False))

    lines = manifest_lines(file_list)
    # NOTE(review): the training list keeps every line, including the ones
    # sampled for validation, so the two sets overlap -- confirm intended.
    amitaro_train = list(lines)
    amitaro_val = sample_validation(lines)

    write_lines(TRAIN_LIST_PATH, amitaro_train)
    write_lines(VAL_LIST_PATH, amitaro_val)


if __name__ == "__main__":
    main()