# Spaces: Sleeping / Sleeping  -- appears to be web-page residue captured along with the script; not part of the code.
import re
from concurrent.futures import ThreadPoolExecutor

from ditk import logging
from gchar.games.azurlane import Character
from gchar.resources.pixiv import get_pixiv_posts
from tqdm.auto import tqdm
from waifuc.utils import task_ctx

from cyberharem.dataset import crawl_dataset_to_huggingface
from cyberharem.utils import get_hf_fs
def _cht(ch: Character): | |
name = str(ch.enname or ch.cnname or ch.jpname) | |
short_name = re.sub(r'[\W_]+', '_', name).strip('_') | |
return f'{short_name}_{ch.__game_name__}' | |
def _get_pixiv_posts(ch: 'Character'):
    """Return the first element of ``get_pixiv_posts(ch)``, or 0 if it is None.

    The value is used as the sort key when ordering characters, so a
    character for which ``get_pixiv_posts`` returns ``None`` sorts as 0.

    Args:
        ch: Character passed straight through to ``get_pixiv_posts``.

    Returns:
        ``0`` when ``get_pixiv_posts(ch)`` is ``None``; otherwise element 0
        of its result (presumably a post count -- confirm against gchar).
    """
    posts = get_pixiv_posts(ch)
    return 0 if posts is None else posts[0]
# Filesystem handle; `_crawl` uses `hf_fs.exists(...)` to detect datasets
# that are already present.
hf_fs = get_hf_fs()

# Earlier selection, kept for reference:
# all_chs = [ch for ch in Character.all(contains_extra=False) if not ch.is_extra and ch.accessible and ch.index >= 153]
all_chs = Character.all(contains_extra=False)
# Order by `_get_pixiv_posts` from highest to lowest (ties keep their
# original relative order), then keep every third entry starting at
# position 2 (presumably one of three shards of the work -- confirm).
all_chs = sorted(all_chs, key=_get_pixiv_posts, reverse=True)[2::3]

# Progress bar with one step per selected character; advanced by `_crawl`.
pg = tqdm(total=len(all_chs))
# `index` values of the characters crawled during this run.
crawled_ids = set()
def _crawl(char_):
    """Crawl one character's dataset unless it should be skipped.

    The target repository is ``CyberHarem/<_cht(char_)>``. The character is
    skipped, with an info log line, when ``char_.gender`` is not
    ``'female'``, when ``char_.index`` is already in ``crawled_ids``, or
    when ``datasets/<repo>/dataset-raw.zip`` already exists according to
    ``hf_fs``. Otherwise ``crawl_dataset_to_huggingface`` is called and the
    index is recorded in ``crawled_ids``.

    The progress bar ``pg`` advances by one step for every call that
    finishes without an exception, whether it crawled or skipped.

    NOTE(review): the original indentation was lost. The bookkeeping that
    follows the crawl is placed after the ``with task_ctx(repo)`` block
    here -- confirm it was not meant to run inside it.

    Args:
        char_: Character to process; must expose ``gender`` and ``index``
            and be accepted by ``_cht`` and
            ``crawl_dataset_to_huggingface``.

    Raises:
        Exception: Any error raised while processing is logged with
            ``logging.error`` and re-raised unchanged.
    """
    try:
        repo = f'CyberHarem/{_cht(char_)}'
        if char_.gender != 'female':
            logging.info(f'{char_!r} is not female, skipped!')
        elif char_.index in crawled_ids or \
                hf_fs.exists(f'datasets/{repo}/dataset-raw.zip'):
            # The membership test comes first so the filesystem is not
            # queried for characters already handled in this run.
            logging.info(f'{char_!r} already crawled, skipped.')
        else:
            # task_ctx is an opaque helper from waifuc.utils; presumably it
            # labels the work done inside it with the repo name -- confirm.
            with task_ctx(repo):
                crawl_dataset_to_huggingface(
                    char_, repository=repo, limit=200
                )
            crawled_ids.add(char_.index)
    except Exception as err:
        logging.error(repr(err))
        raise
    else:
        # Reached only when nothing was raised, so a failed character does
        # not advance the progress bar.
        pg.update()
if __name__ == '__main__':
    logging.try_init_root(logging.INFO)
    # A single worker processes the submitted characters one at a time, in
    # submission order. Leaving the `with` block shuts the executor down and
    # waits for every queued task, even if submitting fails part-way.
    with ThreadPoolExecutor(max_workers=1) as tp:
        for ch in all_chs:
            # Direct alternative without the executor: _crawl(ch)
            # NOTE: the returned Future is discarded, so an exception
            # re-raised by `_crawl` is only visible through its log line.
            tp.submit(_crawl, ch)