"""Run Foldseek over a directory of PDB files and collect the results in a CSV.

Each ``*.pdb`` file found in ``--pdb_dir`` is passed to
``foldseek_util.get_struc_seq`` in a pool of worker processes; the chain ``"A"``
entry of every result is written as one row of ``foldseek_result.csv`` inside
``--output_dir``.
"""

import argparse
import glob
import multiprocessing as mp
import os
import random

import pandas as pd

from foldseek_util import get_struc_seq

# Path to the Foldseek executable handed to ``get_struc_seq``.
FOLDSEEK_BIN = "bin/foldseek"


def parse_args():
    """Parse the command-line options of the script.

    Returns:
        argparse.Namespace: with attributes ``pdb_dir``, ``num_processes``
        and ``output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pdb_dir",
        type=str,
        default="./pdb_files",
        help="Directory containing PDB files.",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=2,
        help="Number of processes to use for multiprocessing. Default is 2.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./data",
        help="Output directory.",
    )
    return parser.parse_args()


def get_foldseek_seq(pdb_path):
    """Return the ``get_struc_seq`` result for chain ``"A"`` of one PDB file.

    Args:
        pdb_path: Path of the PDB file to process.

    Returns:
        The value stored under key ``"A"`` in the mapping returned by
        ``get_struc_seq``. The caller unpacks it as three values
        (``aa``, ``foldseek``, ``aa_foldseek``).

    Raises:
        KeyError: If the result has no ``"A"`` entry.
    """
    parsed_seqs = get_struc_seq(
        FOLDSEEK_BIN,
        pdb_path,
        ["A"],
        # NOTE(review): presumably used by get_struc_seq to keep concurrent
        # workers' temporary files apart -- confirm against foldseek_util.
        process_id=random.randint(0, 10000000),
    )["A"]
    return parsed_seqs


def main():
    """Process every PDB file in ``--pdb_dir`` and write the result CSV."""
    config = parse_args()

    # Sort so that the row order of the CSV is reproducible between runs;
    # glob itself returns files in arbitrary (filesystem) order.
    pdb_files = sorted(glob.glob(os.path.join(config.pdb_dir, "*.pdb")))
    if not pdb_files:
        # Without this guard ``zip(*output)`` below fails with an opaque
        # "not enough values to unpack" ValueError.
        raise SystemExit(f"No .pdb files found in {config.pdb_dir!r}.")

    with mp.Pool(config.num_processes) as pool:
        # pool.map preserves input order, so results line up with pdb_files.
        output = pool.map(get_foldseek_seq, pdb_files)

    # Transpose the per-file triples into three parallel columns.
    aa, foldseek, aa_foldseek = zip(*output)

    result = {
        "file": pdb_files,
        "aa": aa,
        "foldseek": foldseek,
        "aa_foldseek": aa_foldseek,
    }
    df = pd.DataFrame(result)

    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(config.output_dir, exist_ok=True)
    df.to_csv(os.path.join(config.output_dir, "foldseek_result.csv"), index=False)


if __name__ == "__main__":
    main()