|
# Shell step (not Python): install beautifulsoup4 with pip, passing a custom
# CA bundle via --cert and marking the PyPI hosts as trusted.
pip --cert /etc/pki/ca-trust/source/anchors/tri-ace-CA-2015.cer install --trusted-host pypi.org --trusted-host files.pythonhosted.org beautifulsoup4 |
|
|
|
from bs4 import BeautifulSoup |
|
|
|
# Read the saved HTML page.  The ``with`` block guarantees the handle is
# closed even if reading fails (it was previously left open).
# NOTE(review): no encoding is passed, so the locale default is used --
# confirm it matches the encoding of amitaro.htm.
with open("./amitaro.htm", "r") as f:
    txt = f.read()

# No parser is named here, so BeautifulSoup chooses among the installed ones.
soup = BeautifulSoup(txt)

# Dump the parsed tree for visual inspection.
print(soup.prettify())
|
|
|
import json |
|
# Load the metadata mapping that the rest of the script looks up by key and
# updates entry by entry.  JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the locale default, and close the handle
# as soon as parsing is done.
with open('amitaro.json', encoding="utf-8") as f:
    file_list = json.load(f)
|
|
|
# Walk every table cell.  When a cell's first child is a key of ``file_list``,
# look in the cell immediately before it for an <a> element and store that
# link's first child, as a string, under the entry's "kana" field.
td = soup.find_all('td')
for i, val in enumerate(td):
    if not val.contents:
        continue
    key = val.contents[0]
    if key not in file_list:
        continue
    # The very first cell has no predecessor; td[i-1] would otherwise wrap
    # around silently to the last cell of the document.
    if i == 0:
        continue
    prev_contents = td[i-1].contents
    if not prev_contents:
        continue

    # Re-parse the first child of the previous cell and collect its links.
    temp = BeautifulSoup(str(prev_contents[0]))
    a = temp.find_all('a')
    # Skip cells without a usable link instead of raising IndexError.
    if not a or not a[0].contents:
        continue
    print(a[0].contents[0])
    file_list[key]["kana"] = str(a[0].contents[0])
|
|
|
# Persist the metadata, now including any scraped "kana" fields.
# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding to UTF-8 rather than depending on the locale default.
with open("./amitaro_with_kana.json", "w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(file_list, indent=4, ensure_ascii=False))
|
|
|
# Prefix every entry's path with the dataset directory.
# NOTE: this mutates ``file_list`` in place, so executing it twice would
# prepend the directory twice.
for val in file_list.values():
    val["path"] = "./data_amitaro22k/" + val["path"]

# Rewrite the JSON file with the updated paths.  ensure_ascii=False writes
# non-ASCII characters verbatim, so the encoding is pinned to UTF-8 instead
# of following the locale default.
with open("./amitaro_with_kana.json", "w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(file_list, indent=4, ensure_ascii=False))
|
|
|
# Build one "path|10|kana" line per usable entry.  The middle field is the
# constant 10 (presumably a speaker id -- confirm against the consumer).
file = []
for val in file_list.values():
    # Entries that never received a reading may lack the "kana" key; treat
    # them as empty instead of raising KeyError.
    kana = val.get('kana', '')
    # Skip entries with no reading and readings containing "(".
    if not kana or "(" in kana:
        continue
    file.append(f"{val['path']}|10|{kana}")
|
|
|
# The validation list starts empty; the training list is a copy of every
# generated line.
# NOTE(review): validation lines are later copied out of the same ``file``
# list without being removed from the training list, so the two sets overlap.
amitaro_val = []
amitaro_train = list(file)
|
|
|
import random |
|
|
|
# Pick ceil(len(file) / 10) distinct random indices and copy the matching
# lines into the validation list.  random.sample draws without replacement,
# replacing the retry loop that re-rolled on duplicates and scanned a list
# for every membership check.
val_count = (len(file) + 9) // 10  # integer ceiling of len(file) / 10
rands = random.sample(range(len(file)), val_count)
amitaro_val.extend(file[idx] for idx in rands)
|
|
|
# Write the training list, one entry per line.  ``with`` closes the file even
# if a write fails; the encoding is pinned to UTF-8 so the result does not
# depend on the locale default.
with open("amitaro_train.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in amitaro_train)
|
|
|
# Write the validation list, one entry per line.  ``with`` closes the file
# even if a write fails; the encoding is pinned to UTF-8 so the result does
# not depend on the locale default.
with open("amitaro_val.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in amitaro_val)