DockFormerPP

Running

File size: 6,722 Bytes

0fdcb79

# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

from env_consts import TEST_INPUT_DIR, TEST_OUTPUT_DIR, CKPT_PATH
import json
import logging
import numpy as np
import os
import pickle

from dockformerpp.data.data_modules import OpenFoldSingleDataset

# Script-wide logger: attach the default root handler and report INFO and above.
logging.basicConfig()
logger = logging.getLogger(__file__)
logger.setLevel(level=logging.INFO)

import torch
torch_versions = torch.__version__.split(".")
torch_major_version, torch_minor_version = int(torch_versions[0]), int(torch_versions[1])
# The matmul precision switch is only applied on torch >= 1.12.
if (torch_major_version, torch_minor_version) >= (1, 12):
    # Gives a large speedup on Ampere-class GPUs
    torch.set_float32_matmul_precision("high")

# This script only runs inference, so gradient tracking is turned off globally.
torch.set_grad_enabled(False)

from dockformerpp.config import model_config
from dockformerpp.utils.script_utils import (load_models_from_command_line, run_model, save_output_structure,
                                             get_latest_checkpoint)
from dockformerpp.utils.tensor_utils import tensor_tree_map


def list_files_with_extensions(dir, extensions):
    """Return the names of entries in ``dir`` that end with one of ``extensions``.

    Args:
        dir: Directory to list (handed to ``os.listdir``).
        extensions: A single suffix string, or any iterable of suffix strings
            (tuple, list, set, ...).

    Returns:
        A list of entry names (not full paths), in ``os.listdir`` order.
    """
    # str.endswith accepts only a str or a tuple of str; a list would raise
    # TypeError, so normalise every non-string iterable to a tuple first.
    if not isinstance(extensions, str):
        extensions = tuple(extensions)
    return [f for f in os.listdir(dir) if f.endswith(extensions)]


def override_config(base_config, overriding_config):
    """Recursively apply ``overriding_config`` on top of ``base_config``, in place.

    Values in ``overriding_config`` that are ``dict`` instances are merged
    key-by-key into the matching section of ``base_config``; every other value
    replaces the existing entry (or creates it).

    Args:
        base_config: Mapping-like object supporting ``in``, item access and
            item assignment. It is mutated.
        overriding_config: Mapping (with ``.items()``) of overrides.

    Returns:
        ``base_config`` itself, after the overrides were applied.
    """
    for k, v in overriding_config.items():
        if isinstance(v, dict):
            # A nested override for a section the base does not have used to
            # raise KeyError, while scalar overrides for missing keys worked.
            # Merge into an empty dict instead, which also gives the base its
            # own copy rather than a reference into the overrides.
            target = base_config[k] if k in base_config else {}
            base_config[k] = override_config(target, v)
        else:
            base_config[k] = v
    return base_config


def run_on_folder(input_dir: str, output_dir: str, run_config_path: str, skip_relaxation=True,
                  long_sequence_inference=False, skip_exists=False):
    """Run the model on every input of a folder and write one joined PDB per input.

    Args:
        input_dir: Directory handed to ``OpenFoldSingleDataset`` as ``data_dir``.
        output_dir: Directory handed to ``load_models_from_command_line`` and ``run_model``.
        run_config_path: Path to a JSON run config. ``"override_conf"`` (optional) is merged into
            the model config; ``"train_output_dir"`` is read only when ``CKPT_PATH`` is ``None``,
            to locate the latest checkpoint.
        skip_relaxation: Currently unused; the relaxation step below is commented out.
        long_sequence_inference: Forwarded to ``model_config``.
        skip_exists: When true, inputs whose ``<tag>_predicted_joined.pdb`` already exists in the
            output directory are skipped.
    """
    config_preset = "initial_training"
    save_outputs = False
    device_name = "cuda" if torch.cuda.is_available() else "cpu"

    # Use a context manager so the config file handle is closed deterministically.
    with open(run_config_path) as run_config_file:
        run_config = json.load(run_config_file)

    ckpt_path = CKPT_PATH
    if ckpt_path is None:
        ckpt_path = get_latest_checkpoint(os.path.join(run_config["train_output_dir"], "checkpoint"))
    print("Using checkpoint: ", ckpt_path)

    config = model_config(config_preset, long_sequence_inference=long_sequence_inference)
    config = override_config(config, run_config.get("override_conf", {}))

    model_generator = load_models_from_command_line(
        config,
        model_device=device_name,
        model_checkpoint_path=ckpt_path,
        output_dir=output_dir)
    print("Model loaded")
    # Only the first (model, output directory) pair produced by the generator is used.
    model, output_directory = next(model_generator)

    dataset = OpenFoldSingleDataset(data_dir=input_dir, config=config.data, mode="predict")
    for i, processed_feature_dict in enumerate(dataset):
        tag = dataset.get_metadata_for_idx(i)["input_name"]
        print("Processing", tag)
        output_name = f"{tag}_predicted"
        output_path = os.path.join(output_directory, f'{output_name}_joined.pdb')
        # Test the cheap flag first so the filesystem is only touched when skipping is enabled.
        if skip_exists and os.path.exists(output_path):
            print("skipping exists", output_name)
            continue

        # turn into a batch of size 1
        processed_feature_dict = {key: value.unsqueeze(0).to(device_name)
                                  for key, value in processed_feature_dict.items()}

        out = run_model(model, processed_feature_dict, tag, output_dir)

        # Toss out the recycling dimensions --- we don't need them anymore
        processed_feature_dict = tensor_tree_map(
            lambda x: np.array(x[..., -1].cpu()),
            processed_feature_dict
        )
        out = tensor_tree_map(lambda x: np.array(x.cpu()), out)

        protein_mask = processed_feature_dict["structural_mask"][0].astype(bool)

        # Receptor ("_r") residues come first and get chain 0; ligand-side ("_l") residues get chain 1.
        # NOTE(review): these indices are not filtered by protein_mask, unlike aatype / plddt /
        # positions below -- confirm the lengths always agree.
        in_chain_residue_index = np.concatenate([processed_feature_dict["in_chain_residue_index_r"][0],
                                                 processed_feature_dict["in_chain_residue_index_l"][0]])
        chain_index = [0] * len(processed_feature_dict["in_chain_residue_index_r"][0])
        chain_index += [1] * len(processed_feature_dict["in_chain_residue_index_l"][0])
        chain_index = np.array(chain_index)

        save_output_structure(
            aatype=processed_feature_dict["aatype"][0][protein_mask],
            residue_index=in_chain_residue_index,
            chain_index=chain_index,
            plddt=out["plddt"][0][protein_mask],
            final_atom_protein_positions=out["final_atom_positions"][0][protein_mask],
            final_atom_mask=out["final_atom_mask"][0][protein_mask],
            output_path=output_path,
        )

        logger.info(f"Output written to {output_path}...")

        # TODO: fix relaxation
        # if not skip_relaxation:
        #     # Relax the prediction.
        #     logger.info(f"Running relaxation on {output_path}...")
        #     from dockformerpp.utils.relax import relax_complex
        #     try:
        #         relax_complex(output_path,
        #                       ligand_output_path,
        #                       os.path.join(output_directory, f'{output_name}_protein_relaxed.pdb'),
        #                       os.path.join(output_directory, f'{output_name}_ligand_relaxed.sdf'))
        #     except Exception as e:
        #         logger.error(f"Failed to relax {protein_output_path} due to {e}...")

        # save_outputs is hard-coded to False above, so the raw output dict is not pickled by default.
        if save_outputs:
            output_dict_path = os.path.join(
                output_directory, f'{output_name}_output_dict.pkl'
            )
            with open(output_dict_path, "wb") as fp:
                pickle.dump(out, fp, protocol=pickle.HIGHEST_PROTOCOL)

            logger.info(f"Model output written to {output_dict_path}...")


if __name__ == "__main__":
    # Usage: <script> [run_config.json [input_dir output_dir]] [--relax] [--long] [--allow-skip]
    #
    # Flags are separated from positional arguments up front. Previously flags were only
    # honoured when more than three argv entries were present, and a call such as
    # "<script> cfg.json --relax --long" treated the two flags as input/output directories.
    cli_args = sys.argv[1:]
    flags = {arg for arg in cli_args if arg.startswith("--")}
    positionals = [arg for arg in cli_args if not arg.startswith("--")]

    if positionals:
        config_path = positionals[0]
    else:
        config_path = os.path.join(os.path.dirname(__file__), "run_config.json")

    # The directories are overridden only when both are supplied; otherwise fall back to the
    # defaults from env_consts.
    input_dir, output_dir = TEST_INPUT_DIR, TEST_OUTPUT_DIR
    if len(positionals) >= 3:
        input_dir, output_dir = positionals[1], positionals[2]
    elif len(positionals) == 2:
        logger.warning("Both input_dir and output_dir are required to override the defaults; "
                       "ignoring %r", positionals[1])

    options = {
        "skip_relaxation": "--relax" not in flags,
        "long_sequence_inference": "--long" in flags,
        "skip_exists": "--allow-skip" in flags,
    }

    run_on_folder(input_dir, output_dir, config_path, **options)