File size: 3,492 Bytes
4321e7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import argparse
import functools
import multiprocessing as mp
import os
import random

import pandas as pd

from foldseek_util import get_struc_seq


def parse_args(argv=None):
    """Parse command-line options for the foldseek sequence-extraction script.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case ``sys.argv[1:]`` is used (same behavior as before);
            passing an explicit list makes the parser testable.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--file_path",
        type=str,
        required=True,
        help="Path to the file containing uniprotid information.",
    )
    parser.add_argument(
        "--sheet_name",
        type=str,
        default="Sheet1",
        help="Name of the sheet to read (for Excel files). Default is 'Sheet1'.",
    )
    parser.add_argument(
        "--pdb_dir",
        type=str,
        default="pdb_files/UP000000589_10090_MOUSE_v4",
        help="Directory containing PDB files.",
    )
    parser.add_argument(
        "--uniprotid_column",
        type=str,
        help="Name of the column containing UniprotID information.",
    )
    parser.add_argument(
        "--uniprotids_column",
        type=str,
        help="Name of the column containing multiple UniprotIDs (separated by semicolons). The first ID will be used.",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=2,
        help="Number of processes to use for multiprocessing. Default is 2.",
    )
    return parser.parse_args(argv)


def validate_columns(cfg, df):
    """Resolve which column holds the UniProt ID and drop rows lacking one.

    When ``cfg.uniprotids_column`` is set, a new ``uniprotid`` column is
    derived from the first semicolon-separated entry (isoform suffix after
    '-' stripped) and ``cfg.uniprotid_column`` is repointed at it — callers
    read ``cfg.uniprotid_column`` afterwards, so this mutation is part of
    the contract.

    Raises:
        ValueError: if neither ID column option was provided.
    """
    if cfg.uniprotid_column is None and cfg.uniprotids_column is None:
        raise ValueError("Either --uniprotid_column or --uniprotids_column must be provided.")
    if cfg.uniprotids_column:
        trimmed = df.dropna(subset=[cfg.uniprotids_column]).reset_index(drop=True)
        # Keep only the first ID of each semicolon list, without isoform suffix.
        trimmed["uniprotid"] = trimmed[cfg.uniprotids_column].map(
            lambda entry: entry.split(";")[0].split("-")[0]
        )
        cfg.uniprotid_column = "uniprotid"
        df = trimmed
    return df.dropna(subset=[cfg.uniprotid_column]).reset_index(drop=True)


def find_pdb_files(pdb_dir, uniprot_ids):
    """Find one PDB filename per UniProt ID by substring match.

    Args:
        pdb_dir: Directory whose entries are searched.
        uniprot_ids: IDs to look up.

    Returns:
        A list aligned with ``uniprot_ids``: the alphabetically-first
        filename containing the ID as a substring, or ``None`` when
        nothing matches.
    """
    # Sort once, outside the loop — the original re-sorted the directory
    # listing for every ID. Also fixes the pdf/pdb naming typo.
    pdb_files = sorted(os.listdir(pdb_dir))
    pdb_paths = []
    for uniprot_id in uniprot_ids:
        # next() short-circuits at the first (alphabetically smallest) match.
        match = next((name for name in pdb_files if uniprot_id in name), None)
        pdb_paths.append(match)
    return pdb_paths


def get_foldseek_seq(pdb_path, cfg):
    """Run foldseek on a single PDB file and return the parsed chain-A sequences."""
    full_path = os.path.join(cfg.pdb_dir, pdb_path)
    # A random process id keeps foldseek's temporary files unique across workers.
    parsed = get_struc_seq(
        "bin/foldseek",
        full_path,
        ["A"],
        process_id=random.randint(0, 10000000),
    )
    return parsed["A"]


if __name__ == "__main__":
    config = parse_args()

    # Both Excel and CSV inputs are supported; sheet_name only applies to Excel.
    if config.file_path.endswith((".xls", ".xlsx")):
        df = pd.read_excel(config.file_path, sheet_name=config.sheet_name)
    else:
        df = pd.read_csv(config.file_path)

    # Resolves which column holds the ID and drops rows without one
    # (also repoints config.uniprotid_column when --uniprotids_column is used).
    df = validate_columns(config, df)

    # Map each row to a PDB file; drop rows with no structure, then dedupe
    # so each UniProt ID is processed exactly once.
    uniprot_ids = df[config.uniprotid_column].tolist()
    df["pdb_path"] = find_pdb_files(config.pdb_dir, uniprot_ids)
    df = df.dropna(subset=["pdb_path"]).reset_index(drop=True)
    df = df.drop_duplicates(subset=[config.uniprotid_column]).reset_index(drop=True)

    # BUG FIX: the original passed a lambda to Pool.map, which raises
    # "Can't pickle <lambda>" — multiprocessing worker callables must be
    # picklable. functools.partial over the module-level function is.
    worker = functools.partial(get_foldseek_seq, cfg=config)
    with mp.Pool(config.num_processes) as pool:
        output = pool.map(worker, df["pdb_path"].tolist())

    # get_foldseek_seq yields (aa, foldseek, aa_foldseek) per structure.
    aa, foldseek, aa_foldseek = zip(*output)
    df["aa"] = aa
    df["foldseek"] = foldseek
    df["aa_foldseek"] = aa_foldseek

    # os.path.splitext strips only the extension; the original split('.')[0]
    # truncated any path containing a dot in a directory name.
    out_path = f"{os.path.splitext(config.file_path)[0]}_foldseek.csv"
    df.to_csv(out_path, index=False)