File size: 916 Bytes
a38611e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import io
import gzip
import json
import sys
import requests
from tqdm import tqdm
# URL template for a single gzip-compressed training shard. Placeholders:
# ``config`` (sampling configuration name), ``index`` and ``n_shards``
# (both rendered zero-padded to four digits).
_DATA_URL_TRAIN = (
    "https://huggingface.co/datasets/bertin-project/mc4-es-sampled/resolve/main/"
    "mc4-es-train-50M-{config}-shard-{index:04d}-of-{n_shards:04d}.json.gz"
)
def main(config="stepwise"):
    """Download all training shards of one config into a single JSONL file.

    Builds the 1024 shard URLs from ``_DATA_URL_TRAIN`` (shard indices run
    from 1 to 1024), downloads each gzip-compressed shard, and appends its
    records, one JSON document per line, to
    ``mc4-es-train-50M-{config}.jsonl`` in the current working directory.
    An existing file with that name is overwritten.

    Args:
        config: Value substituted for ``{config}`` in the shard URLs and in
            the output file name.

    Raises:
        requests.HTTPError: If a shard request returns a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        json.JSONDecodeError: If a line of a shard is not valid JSON.
    """
    n_shards = 1024
    data_urls = [
        _DATA_URL_TRAIN.format(config=config, index=index, n_shards=n_shards)
        for index in range(1, n_shards + 1)
    ]
    with open(f"mc4-es-train-50M-{config}.jsonl", "w", encoding="utf-8") as f:
        for data_url in tqdm(data_urls):
            # A timeout keeps a stalled connection from hanging the whole run.
            response = requests.get(data_url, timeout=60)
            # Fail loudly on HTTP errors instead of handing an error page to
            # gzip, which would surface as a confusing BadGzipFile.
            response.raise_for_status()
            bio = io.BytesIO(response.content)
            with gzip.open(bio, "rt", encoding="utf8") as g:
                for line in g:
                    # Round-trip through json to validate each record and
                    # normalize it to exactly one line of output.
                    json_line = json.loads(line.strip())
                    f.write(json.dumps(json_line) + "\n")
if __name__ == "__main__":
    # Forward the optional first command-line argument as the config name.
    # Slicing (rather than indexing) means that with no argument main() uses
    # its own default instead of the script crashing with IndexError.
    main(*sys.argv[1:2])
|