jupyterlab_mitmproxy / test_crawl.py
narugo1992, duplicated from narugo/jupyterlab_crawl_nikke (commit 5ff92c3)
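# Crawl image datasets for Azur Lane characters (ranked by pixiv popularity) and
# publish each one as a 'CyberHarem/<character>_<game>' dataset repository on Hugging Face.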
import re
from concurrent.futures import ThreadPoolExecutor

from ditk import logging
from gchar.games.azurlane import Character
from gchar.resources.pixiv import get_pixiv_posts
from tqdm.auto import tqdm
from waifuc.utils import task_ctx

from cyberharem.dataset import crawl_dataset_to_huggingface
from cyberharem.utils import get_hf_fs


def _cht(ch: Character):
    # Build a repo-safe identifier of the form '<short_name>_<game>' from the
    # character's English/Chinese/Japanese name.
    name = str(ch.enname or ch.cnname or ch.jpname)
    short_name = re.sub(r'[\W_]+', '_', name).strip('_')
    return f'{short_name}_{ch.__game_name__}'


def _get_pixiv_posts(ch: Character):
    # Number of pixiv posts for this character, used for popularity ranking;
    # fall back to 0 when no data is available.
    v = get_pixiv_posts(ch)
    return 0 if v is None else v[0]


hf_fs = get_hf_fs()

# all_chs = [ch for ch in Character.all(contains_extra=False) if not ch.is_extra and ch.accessible and ch.index >= 153]
all_chs = Character.all(contains_extra=False)
# Sort by pixiv post count (most popular first) and take every third character
# starting at offset 2, presumably to split the workload across parallel runs.
all_chs = sorted(all_chs, key=lambda x: -_get_pixiv_posts(x))[2::3]

pg = tqdm(total=len(all_chs))
crawled_ids = set()


def _crawl(char_):
    try:
        repo = f'CyberHarem/{_cht(char_)}'
        if char_.gender == 'female':
            if char_.index not in crawled_ids and \
                    not hf_fs.exists(f'datasets/{repo}/dataset-raw.zip'):
                # Crawl up to 200 images and publish the packaged dataset to the HF repo.
                with task_ctx(repo):
                    crawl_dataset_to_huggingface(
                        char_, repository=repo, limit=200
                    )
                crawled_ids.add(char_.index)
                pg.update()
            else:
                logging.info(f'{char_!r} already crawled, skipped.')
                pg.update()
        else:
            logging.info(f'{char_!r} is not female, skipped!')
            pg.update()
    except Exception as err:
        logging.error(repr(err))
        raise


if __name__ == '__main__':
    logging.try_init_root(logging.INFO)
    # Single-worker pool: submissions return immediately, but crawls run one at a time.
    tp = ThreadPoolExecutor(max_workers=1)
    for ch in all_chs:
        # _crawl(ch)
        tp.submit(_crawl, ch)
    tp.shutdown()
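    # A minimal variant (an assumption, not part of the original script): keep the
    # submitted futures so that exceptions re-raised inside _crawl can be surfaced
    # after all jobs finish, e.g.:
    #     futures = [tp.submit(_crawl, ch) for ch in all_chs]
    #     tp.shutdown()
    #     for f in futures:
    #         f.result()  # re-raises any exception captured by the worker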