ipd committed on
Commit
85ec4af
1 Parent(s): cd61b42
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. app.py +712 -0
  2. data/.DS_Store +0 -0
  3. data/bace/test.csv +0 -0
  4. data/bace/train.csv +0 -0
  5. data/bace/valid.csv +0 -0
  6. data/esol/test.csv +109 -0
  7. data/esol/train.csv +0 -0
  8. log.csv +1 -0
  9. models/.DS_Store +0 -0
  10. models/__pycache__/fm4m.cpython-310.pyc +0 -0
  11. models/fm4m.py +876 -0
  12. models/mhg_model/.DS_Store +0 -0
  13. models/mhg_model/README.md +75 -0
  14. models/mhg_model/__init__.py +5 -0
  15. models/mhg_model/__pycache__/__init__.cpython-310.pyc +0 -0
  16. models/mhg_model/__pycache__/load.cpython-310.pyc +0 -0
  17. models/mhg_model/graph_grammar/__init__.py +19 -0
  18. models/mhg_model/graph_grammar/__pycache__/__init__.cpython-310.pyc +0 -0
  19. models/mhg_model/graph_grammar/__pycache__/hypergraph.cpython-310.pyc +0 -0
  20. models/mhg_model/graph_grammar/algo/__init__.py +20 -0
  21. models/mhg_model/graph_grammar/algo/__pycache__/__init__.cpython-310.pyc +0 -0
  22. models/mhg_model/graph_grammar/algo/__pycache__/tree_decomposition.cpython-310.pyc +0 -0
  23. models/mhg_model/graph_grammar/algo/tree_decomposition.py +821 -0
  24. models/mhg_model/graph_grammar/graph_grammar/__init__.py +20 -0
  25. models/mhg_model/graph_grammar/graph_grammar/__pycache__/__init__.cpython-310.pyc +0 -0
  26. models/mhg_model/graph_grammar/graph_grammar/__pycache__/base.cpython-310.pyc +0 -0
  27. models/mhg_model/graph_grammar/graph_grammar/__pycache__/corpus.cpython-310.pyc +0 -0
  28. models/mhg_model/graph_grammar/graph_grammar/__pycache__/hrg.cpython-310.pyc +0 -0
  29. models/mhg_model/graph_grammar/graph_grammar/__pycache__/symbols.cpython-310.pyc +0 -0
  30. models/mhg_model/graph_grammar/graph_grammar/__pycache__/utils.cpython-310.pyc +0 -0
  31. models/mhg_model/graph_grammar/graph_grammar/base.py +30 -0
  32. models/mhg_model/graph_grammar/graph_grammar/corpus.py +152 -0
  33. models/mhg_model/graph_grammar/graph_grammar/hrg.py +1065 -0
  34. models/mhg_model/graph_grammar/graph_grammar/symbols.py +180 -0
  35. models/mhg_model/graph_grammar/graph_grammar/utils.py +130 -0
  36. models/mhg_model/graph_grammar/hypergraph.py +544 -0
  37. models/mhg_model/graph_grammar/io/__init__.py +20 -0
  38. models/mhg_model/graph_grammar/io/__pycache__/__init__.cpython-310.pyc +0 -0
  39. models/mhg_model/graph_grammar/io/__pycache__/smi.cpython-310.pyc +0 -0
  40. models/mhg_model/graph_grammar/io/smi.py +559 -0
  41. models/mhg_model/graph_grammar/nn/__init__.py +11 -0
  42. models/mhg_model/graph_grammar/nn/__pycache__/__init__.cpython-310.pyc +0 -0
  43. models/mhg_model/graph_grammar/nn/__pycache__/decoder.cpython-310.pyc +0 -0
  44. models/mhg_model/graph_grammar/nn/__pycache__/encoder.cpython-310.pyc +0 -0
  45. models/mhg_model/graph_grammar/nn/dataset.py +121 -0
  46. models/mhg_model/graph_grammar/nn/decoder.py +158 -0
  47. models/mhg_model/graph_grammar/nn/encoder.py +199 -0
  48. models/mhg_model/graph_grammar/nn/graph.py +313 -0
  49. models/mhg_model/images/mhg_example.png +0 -0
  50. models/mhg_model/images/mhg_example1.png +0 -0
app.py ADDED
@@ -0,0 +1,712 @@
+ import gradio as gr
+ from huggingface_hub import InferenceClient
+ import matplotlib.pyplot as plt
+ from PIL import Image
+ from rdkit.Chem import Descriptors, QED, Draw
+ from rdkit.Chem.Crippen import MolLogP
+ import pandas as pd
+ from rdkit.Contrib.SA_Score import sascorer
+ from rdkit.Chem import DataStructs, AllChem
+ from transformers import BartForConditionalGeneration, AutoTokenizer, AutoModel
+ from transformers.modeling_outputs import BaseModelOutput
+ import selfies as sf
+ from rdkit import Chem
+ import torch
+ import numpy as np
+ import umap
+ import pickle
+ import xgboost as xgb
+ from sklearn.svm import SVR
+ from sklearn.linear_model import LinearRegression
+ from sklearn.kernel_ridge import KernelRidge
+ import json
+
+ import os
+
+ os.environ["OMP_MAX_ACTIVE_LEVELS"] = "1"
+
+ # my_theme = gr.Theme.from_hub("ysharma/steampunk")
+ # my_theme = gr.themes.Glass()
+
+ """
+ # Custom theme settings
+ theme = gr.themes.Default().set(
+     body_background_fill="#000000",  # set the background color to black
+     text_color="#FFFFFF",  # set the text color to white
+ )
+ """
+
+ import sys
+ sys.path.append("models")
+ sys.path.append("../models")
+ sys.path.append("../")
+
+ import models.fm4m as fm4m
+
+
+ # Function to display a molecule image from a SMILES string
+ def smiles_to_image(smiles):
+     mol = Chem.MolFromSmiles(smiles)
+     if mol:
+         img = Draw.MolToImage(mol)
+         return img
+     return None
+
+
+ # Function to get canonical SMILES
+ def get_canonical_smiles(smiles):
+     mol = Chem.MolFromSmiles(smiles)
+     if mol:
+         return Chem.MolToSmiles(mol, canonical=True)
+     return None
+
+
+ # Dictionary of sample SMILES strings and their corresponding images (replace with your actual image paths)
+ smiles_image_mapping = {
+     "Mol 1": {"smiles": "C=C(C)CC(=O)NC[C@H](CO)NC(=O)C=Cc1ccc(C)c(Cl)c1", "image": "img/img1.png"},
+     "Mol 2": {"smiles": "C=CC1(CC(=O)NC[C@@H](CCCC)NC(=O)c2cc(Cl)cc(Br)c2)CC1", "image": "img/img2.png"},
+     "Mol 3": {"smiles": "C=C(C)C[C@H](NC(C)=O)C(=O)N1CC[C@H](NC(=O)[C@H]2C[C@@]2(C)Br)C(C)(C)C1",
+               "image": "img/img3.png"},
+     "Mol 4": {"smiles": "C=C1CC(CC(=O)N[C@H]2CCN(C(=O)c3ncccc3SC)C23CC3)C1", "image": "img/img4.png"},
+     "Mol 5": {"smiles": "C=CCS[C@@H](C)CC(=O)OCC", "image": "img/img5.png"}
+ }
+
+ datasets = ["BACE", "ESOL", "Custom Dataset"]
+
+ models_enabled = ["SELFIES-TED", "MHG-GED", "MolFormer", "SMI-TED"]
+
+ fusion_available = ["Concat"]
+
+ global log_df
+ log_df = pd.DataFrame(columns=["Selected Models", "Dataset", "Task", "Result"])
+
+
+ def log_selection(models, dataset, task_type, result, log_df):
+     # Append the new entry to the DataFrame (DataFrame.append was removed in pandas 2.x)
+     new_entry = {"Selected Models": str(models), "Dataset": dataset, "Task": task_type, "Result": result}
+     updated_log_df = pd.concat([log_df, pd.DataFrame([new_entry])], ignore_index=True)
+     return updated_log_df
+
+
+ def save_rep(models, dataset, task_type, eval_output):
+     return
+
+
+ # Function to handle evaluation and logging
+ def evaluate_and_log(models, dataset, task_type, eval_output):
+     task_dic = {'Classification': 'CLS', 'Regression': 'RGR'}
+     result = f"{eval_output}"  # display_eval(models, dataset, task_type, fusion_type=None)
+     result = result.replace(" Score", "")
+
+     new_entry = {"Selected Models": str(models), "Dataset": dataset, "Task": task_dic[task_type], "Result": result}
+     new_entry_df = pd.DataFrame([new_entry])
+
+     log_df = pd.read_csv('log.csv', index_col=0)
+     log_df = pd.concat([new_entry_df, log_df])
+
+     log_df.to_csv('log.csv')
+
+     return log_df
+
+
+ log_df = pd.read_csv('log.csv', index_col=0)
+
+
+ # Load images for selection
+ def load_image(path):
+     return Image.open(smiles_image_mapping[path]["image"])
+
+
+ # Function to handle image selection
+ def handle_image_selection(image_key):
+     smiles = smiles_image_mapping[image_key]["smiles"]
+     mol_image = smiles_to_image(smiles)
+     return smiles, mol_image
+
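Note: evaluate_and_log above prepends the newest entry to log.csv and returns the updated frame. A minimal usage sketch (the argument values here are illustrative, not from this commit):

    updated = evaluate_and_log(models=["SMI-TED"], dataset="bace",
                               task_type="Classification", eval_output="0.82 ROC-AUC Score")
    print(updated.head(1))  # newest entry first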
+
+ def calculate_properties(smiles):
+     mol = Chem.MolFromSmiles(smiles)
+     if mol:
+         qed = QED.qed(mol)
+         logp = MolLogP(mol)
+         sa = sascorer.calculateScore(mol)
+         wt = Descriptors.MolWt(mol)
+         return qed, sa, logp, wt
+     return None, None, None, None
+
+
+ # Function to calculate Tanimoto similarity
+ def calculate_tanimoto(smiles1, smiles2):
+     mol1 = Chem.MolFromSmiles(smiles1)
+     mol2 = Chem.MolFromSmiles(smiles2)
+     if mol1 and mol2:
+         # fp1 = FingerprintMols.FingerprintMol(mol1)
+         # fp2 = FingerprintMols.FingerprintMol(mol2)
+         fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
+         fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
+         return round(DataStructs.FingerprintSimilarity(fp1, fp2), 2)
+     return None
+
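AllChem.GetMorganFingerprintAsBitVect works but is deprecated in recent RDKit releases. A minimal sketch of the same Tanimoto computation via the fingerprint-generator API (assuming RDKit >= 2023.03; this helper is not part of the commit):

    from rdkit import Chem, DataStructs
    from rdkit.Chem import rdFingerprintGenerator

    _morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

    def calculate_tanimoto_v2(smiles1, smiles2):
        mol1, mol2 = Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2)
        if mol1 and mol2:
            fp1, fp2 = _morgan_gen.GetFingerprint(mol1), _morgan_gen.GetFingerprint(mol2)
            return round(DataStructs.TanimotoSimilarity(fp1, fp2), 2)
        return None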
+
+ # with open("models/selfies_model/bart-2908.pickle", "rb") as input_file:
+ #     gen_model, gen_tokenizer = pickle.load(input_file)
+
+ gen_tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
+ gen_model = BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")
+
+
+ def generate(latent_vector, mask):
+     encoder_outputs = BaseModelOutput(latent_vector)
+     decoder_output = gen_model.generate(encoder_outputs=encoder_outputs, attention_mask=mask,
+                                         max_new_tokens=64, do_sample=True, top_k=5, top_p=0.95,
+                                         num_return_sequences=1)
+     selfies = gen_tokenizer.batch_decode(decoder_output, skip_special_tokens=True)
+     outs = []
+     for i in selfies:
+         outs.append(sf.decoder(i.replace("] [", "][")))
+     return outs
+
+
+ def perturb_latent(latent_vecs, noise_scale=0.5):
+     modified_vec = torch.tensor(np.random.uniform(0, 1, latent_vecs.shape) * noise_scale,
+                                 dtype=torch.float32) + latent_vecs
+     return modified_vec
+
+
+ def encode(selfies):
+     encoding = gen_tokenizer(selfies, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
+     input_ids = encoding['input_ids']
+     attention_mask = encoding['attention_mask']
+     outputs = gen_model.model.encoder(input_ids=input_ids, attention_mask=attention_mask)
+     model_output = outputs.last_hidden_state
+
+     """input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
+     sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
+     sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+     model_output = sum_embeddings / sum_mask"""
+     return model_output, attention_mask
+
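Together these helpers form the encode -> perturb -> decode loop used by generate_canonical below; a minimal usage sketch ("CCO" is just an illustrative input):

    selfie = sf.encoder("CCO").replace("][", "] [")  # space-separated tokens, as the tokenizer expects
    latent, mask = encode([selfie])                  # (1, 128, hidden) encoder states
    candidates = generate(perturb_latent(latent, noise_scale=1.0), mask)
    print(candidates)  # e.g. ['CCO'] or a structurally nearby molecule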
+
+ # Function to generate a canonical SMILES string and molecule image
+ def generate_canonical(smiles):
+     s = sf.encoder(smiles)
+     selfie = s.replace("][", "] [")
+     latent_vec, mask = encode([selfie])
+     gen_mol = None
+     for i in range(5, 51):
+         noise = i / 10
+         perturbed_latent = perturb_latent(latent_vec, noise_scale=noise)
+         gen = generate(perturbed_latent, mask)
+         mol = Chem.MolFromSmiles(gen[0])
+         if mol is None:
+             continue  # skip invalid generations instead of crashing on MolToSmiles(None)
+         gen_mol = Chem.MolToSmiles(mol)
+         if gen_mol != Chem.MolToSmiles(Chem.MolFromSmiles(smiles)):
+             break
+
+     if gen_mol:
+         # Calculate properties for the reference and generated molecules
+         ref_properties = calculate_properties(smiles)
+         gen_properties = calculate_properties(gen_mol)
+         tanimoto_similarity = calculate_tanimoto(smiles, gen_mol)
+
+         # Prepare the comparison table for the reference and generated molecules
+         data = {
+             "Property": ["QED", "SA", "LogP", "Mol Wt", "Tanimoto Similarity"],
+             "Reference Mol": [ref_properties[0], ref_properties[1], ref_properties[2], ref_properties[3],
+                               tanimoto_similarity],
+             "Generated Mol": [gen_properties[0], gen_properties[1], gen_properties[2], gen_properties[3], ""]
+         }
+         df = pd.DataFrame(data)
+
+         # Display the molecule image of the generated canonical SMILES
+         mol_image = smiles_to_image(gen_mol)
+
+         return df, gen_mol, mol_image
+     return "Invalid SMILES", None, None
+
+
+ # Function to display the evaluation score
+ def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
+     result = None
+
+     # The downstream model arrives encoded as "name * {params}" (see create_model below)
+     try:
+         downstream_model = downstream.split("*")[0].strip()
+         hyp_param = downstream.split("*")[-1].strip()
+         hyp_param = hyp_param.replace("nan", "float('nan')")
+         params = eval(hyp_param)
+     except Exception:
+         downstream_model = downstream.split("*")[0].strip()
+         params = None
+
+     try:
+         if not selected_models:
+             return "Please select at least one enabled model."
+
+         if task_type == "Classification":
+             global roc_auc, fpr, tpr, x_batch, y_batch
+         elif task_type == "Regression":
+             global RMSE, y_batch_test, y_prob
+
+         if len(selected_models) > 1:
+             if task_type == "Classification":
+                 if downstream_model == "Default Settings":
+                     downstream_model = "DefaultClassifier"
+                     params = None
+                 result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
+                                                                                downstream_model=downstream_model,
+                                                                                params=params,
+                                                                                dataset=dataset)
+
+             elif task_type == "Regression":
+                 if downstream_model == "Default Settings":
+                     downstream_model = "DefaultRegressor"
+                     params = None
+                 result, RMSE, y_batch_test, y_prob, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
+                                                                                         downstream_model=downstream_model,
+                                                                                         params=params,
+                                                                                         dataset=dataset)
+
+         else:
+             if task_type == "Classification":
+                 if downstream_model == "Default Settings":
+                     downstream_model = "DefaultClassifier"
+                     params = None
+                 result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.single_modal(model=selected_models[0],
+                                                                                 downstream_model=downstream_model,
+                                                                                 params=params,
+                                                                                 dataset=dataset)
+
+             elif task_type == "Regression":
+                 if downstream_model == "Default Settings":
+                     downstream_model = "DefaultRegressor"
+                     params = None
+                 result, RMSE, y_batch_test, y_prob, x_batch, y_batch = fm4m.single_modal(model=selected_models[0],
+                                                                                          downstream_model=downstream_model,
+                                                                                          params=params,
+                                                                                          dataset=dataset)
+
+         if result is None:
+             result = "Data & Model Setting is incorrect"
+     except Exception as e:
+         return f"An error occurred: {e}"
+     return f"{result}"
+
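For reference, the single-model branch above reduces to a direct fm4m call; a sketch that mirrors it outside the UI (model and dataset names are ones this app itself uses; the returned result format is indicative only):

    result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.single_modal(
        model="SMI-TED",
        downstream_model="DefaultClassifier",
        params=None,
        dataset="bace",
    )
    print(result)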
+
+ # Function to handle plot display
+ def display_plot(plot_type):
+     fig, ax = plt.subplots()
+
+     if plot_type == "Latent Space":
+         global x_batch, y_batch
+         ax.set_title("T-SNE Plot")
+         # reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
+         # features_umap = reducer.fit_transform(x_batch[:500])
+         # x = y_batch.values[:500]
+         # index_0 = [index for index in range(len(x)) if x[index] == 0]
+         # index_1 = [index for index in range(len(x)) if x[index] == 1]
+         class_0 = x_batch  # features_umap[index_0]
+         class_1 = y_batch  # features_umap[index_1]
+
+         """with open("latent_multi_bace.pkl", "rb") as f:
+             class_0, class_1 = pickle.load(f)
+         """
+         plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
+         plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
+
+         ax.set_xlabel('Feature 1')
+         ax.set_ylabel('Feature 2')
+         ax.set_title('Dataset Distribution')
+
+     elif plot_type == "ROC-AUC":
+         global roc_auc, fpr, tpr
+         ax.set_title("ROC-AUC Curve")
+         try:
+             ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
+             ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
+             ax.set_xlim([0.0, 1.0])
+             ax.set_ylim([0.0, 1.05])
+         except Exception:
+             pass
+         ax.set_xlabel('False Positive Rate')
+         ax.set_ylabel('True Positive Rate')
+         ax.set_title('Receiver Operating Characteristic')
+         ax.legend(loc='lower right')
+
+     elif plot_type == "Parity Plot":
+         global RMSE, y_batch_test, y_prob
+         ax.set_title("Parity plot")
+
+         # Coerce to float arrays before plotting
+         try:
+             print(y_batch_test)
+             print(y_prob)
+             y_batch_test = np.array(y_batch_test, dtype=float)
+             y_prob = np.array(y_prob, dtype=float)
+             ax.scatter(y_batch_test, y_prob, color="blue", label=f"Predicted vs Actual (RMSE: {RMSE:.4f})")
+             min_val = min(min(y_batch_test), min(y_prob))
+             max_val = max(max(y_batch_test), max(y_prob))
+             ax.plot([min_val, max_val], [min_val, max_val], 'r-')
+         except Exception:
+             y_batch_test = []
+             y_prob = []
+             RMSE = None
+             print(y_batch_test)
+             print(y_prob)
+
+         ax.set_xlabel('Actual Values')
+         ax.set_ylabel('Predicted Values')
+         ax.legend(loc='lower right')
+     return fig
+
+
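Because display_plot reads globals that display_eval populates as a side effect, a plot is only meaningful after a training run; a sketch of that ordering (argument values are illustrative):

    display_eval(["SMI-TED"], "bace", "Classification", "Default - Auto", None)
    fig = display_plot("ROC-AUC")
    fig.savefig("roc_auc.png")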
+ # Predefined dataset paths (adjust these to your file paths)
+ predefined_datasets = {
+     "Bace": "data/bace/train.csv, data/bace/test.csv, smiles, Class",
+     "ESOL": "data/esol/train.csv, data/esol/test.csv, smiles, prop",
+ }
+
+
+ # Function to load a predefined dataset from the local path
+ def load_predefined_dataset(dataset_name):
+     val = predefined_datasets.get(dataset_name)
+     try:
+         file_path = val.split(",")[0]
+     except Exception:
+         file_path = False
+
+     if file_path:
+         df = pd.read_csv(file_path)
+         return df.head(), gr.update(choices=list(df.columns)), gr.update(choices=list(df.columns)), f"{dataset_name.lower()}"
+     return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[]), "Dataset not found"
+
+
+ # Function to display the head of the uploaded CSV file
+ def display_csv_head(file):
+     if file is not None:
+         # Load the CSV file into a DataFrame
+         df = pd.read_csv(file.name)
+         return df.head(), gr.update(choices=list(df.columns)), gr.update(choices=list(df.columns))
+     return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[])
+
+
+ # Function to handle dataset selection (predefined or custom)
+ def handle_dataset_selection(selected_dataset):
+     # Outputs: [dataset_name, train_file, train_display, test_file, test_display,
+     #           predefined_display, input_column_selector, output_column_selector]
+     if selected_dataset == "Custom Dataset":
+         # Show the upload fields for train and test datasets when "Custom Dataset" is selected
+         return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(
+             visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
+     else:
+         # Hide the custom-dataset widgets; the predefined dataset is loaded from its local path
+         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(
+             visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+
+
+ # Function to select input and output columns and display a message (see the sketch below)
+ def select_columns(input_column, output_column, train_data, test_data, dataset_name):
+     if input_column and output_column:
+         return f"{train_data.name},{test_data.name},{input_column},{output_column},{dataset_name}"
+     return "Please select both input and output columns."
+
+
+ def set_dataname(dataset_name, dataset_selector):
+     if dataset_selector == "Custom Dataset":
+         return f"{dataset_name}"
+     return f"{dataset_selector}"
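The string select_columns returns doubles as the dataset descriptor that display_eval forwards to fm4m; a sketch of its shape (SimpleNamespace stands in for the uploaded gr.File values, and the paths are hypothetical):

    from types import SimpleNamespace

    train = SimpleNamespace(name="data/custom/train.csv")  # stand-in for an uploaded file
    test = SimpleNamespace(name="data/custom/test.csv")
    print(select_columns("smiles", "Class", train, test, "my_dataset"))
    # -> data/custom/train.csv,data/custom/test.csv,smiles,Class,my_dataset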
+ # Function to create a model based on user input
+ def create_model(model_name, max_depth=None, n_estimators=None, alpha=None, degree=None, kernel=None):
+     if model_name == "XGBClassifier":
+         model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', max_depth=max_depth,
+                                   n_estimators=n_estimators, alpha=alpha)
+     elif model_name == "SVR":
+         model = SVR(degree=degree, kernel=kernel)
+     elif model_name == "Kernel Ridge":
+         model = KernelRidge(alpha=alpha, degree=degree, kernel=kernel)
+     elif model_name == "Linear Regression":
+         model = LinearRegression()
+     elif model_name == "Default - Auto":
+         model = "Default Settings"
+         return f"{model}"
+     else:
+         return "Model not supported."
+
+     return f"{model_name} * {model.get_params()}"
+
+
+ def model_selector(model_name):
+     # Dynamically return the appropriate hyperparameter components based on the selected model
+     if model_name == "XGBClassifier":
+         return (
+             gr.Slider(1, 10, label="max_depth"),
+             gr.Slider(50, 500, label="n_estimators"),
+             gr.Slider(0.1, 10.0, step=0.1, label="alpha")
+         )
+     elif model_name == "SVR":
+         return (
+             gr.Slider(1, 5, label="degree"),
+             gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
+         )
+     elif model_name == "Kernel Ridge":
+         return (
+             gr.Slider(0.1, 10.0, step=0.1, label="alpha"),
+             gr.Slider(1, 5, label="degree"),
+             gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
+         )
+     elif model_name == "Linear Regression":
+         return ()  # No hyperparameters for Linear Regression
+     else:
+         return ()
+
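create_model encodes the chosen estimator as a "name * params" string, which display_eval later splits on "*" and evaluates back into a dict; a sketch of the round trip:

    spec = create_model("Kernel Ridge", alpha=1.0, degree=3, kernel="rbf")
    # spec looks like: "Kernel Ridge * {'alpha': 1.0, ..., 'degree': 3, 'kernel': 'rbf'}"
    name = spec.split("*")[0].strip()
    params = eval(spec.split("*")[-1].strip())  # the same parsing display_eval performs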
+
+ # Define the Gradio layout
+ # with gr.Blocks(theme=my_theme) as demo:
+ with gr.Blocks() as demo:
+     with gr.Row():
+         # Left Column
+         with gr.Column():
+             gr.HTML('''
+             <div style="background-color: #6A8EAE; color: #FFFFFF; padding: 10px;">
+                 <h3 style="color: #FFFFFF; margin: 0; font-size: 20px;">Data & Model Setting</h3>
+             </div>
+             ''')
+
+             # Dropdown menu for predefined datasets, including a "Custom Dataset" option
+             dataset_selector = gr.Dropdown(label="Select Dataset",
+                                            choices=list(predefined_datasets.keys()) + ["Custom Dataset"])
+             # Display the message for selected columns
+             selected_columns_message = gr.Textbox(label="Selected Columns Info", visible=False)
+
+             with gr.Accordion("Dataset Settings", open=True):
+                 # File upload options for the custom dataset (train and test)
+                 dataset_name = gr.Textbox(label="Dataset Name", visible=False)
+                 train_file = gr.File(label="Upload Custom Train Dataset", file_types=[".csv"], visible=False)
+                 train_display = gr.Dataframe(label="Train Dataset Preview (First 5 Rows)", visible=False,
+                                              interactive=False)
+
+                 test_file = gr.File(label="Upload Custom Test Dataset", file_types=[".csv"], visible=False)
+                 test_display = gr.Dataframe(label="Test Dataset Preview (First 5 Rows)", visible=False,
+                                             interactive=False)
+
+                 # Predefined dataset preview
+                 predefined_display = gr.Dataframe(label="Predefined Dataset Preview (First 5 Rows)", visible=False,
+                                                   interactive=False)
+
+                 # Dropdowns for selecting input and output columns for the custom dataset
+                 input_column_selector = gr.Dropdown(label="Select Input Column", choices=[], visible=False)
+                 output_column_selector = gr.Dropdown(label="Select Output Column", choices=[], visible=False)
+
+             # When a dataset is selected, show either the upload fields (custom) or load the predefined dataset
+             dataset_selector.change(handle_dataset_selection,
+                                     inputs=dataset_selector,
+                                     outputs=[dataset_name, train_file, train_display, test_file, test_display,
+                                              predefined_display, input_column_selector, output_column_selector])
+
+             # When a predefined dataset is selected, load its head and update the column selectors
+             dataset_selector.change(load_predefined_dataset,
+                                     inputs=dataset_selector,
+                                     outputs=[predefined_display, input_column_selector, output_column_selector,
+                                              selected_columns_message])
+
+             # When a custom train file is uploaded, display its head and update the column selectors
+             train_file.change(display_csv_head, inputs=train_file,
+                               outputs=[train_display, input_column_selector, output_column_selector])
+
+             # When a custom test file is uploaded, display its head
+             test_file.change(display_csv_head, inputs=test_file,
+                              outputs=[test_display, input_column_selector, output_column_selector])
+
+             dataset_selector.change(set_dataname,
+                                     inputs=[dataset_name, dataset_selector],
+                                     outputs=dataset_name)
+
+             # Update the selected-columns information when dropdown values change
+             input_column_selector.change(select_columns,
+                                          inputs=[input_column_selector, output_column_selector, train_file,
+                                                  test_file, dataset_name],
+                                          outputs=selected_columns_message)
+
+             output_column_selector.change(select_columns,
+                                           inputs=[input_column_selector, output_column_selector, train_file,
+                                                   test_file, dataset_name],
+                                           outputs=selected_columns_message)
+
+             model_checkbox = gr.CheckboxGroup(choices=models_enabled, label="Select Model")
+
+             task_radiobutton = gr.Radio(choices=["Classification", "Regression"], label="Task Type")
+
+             ####### Downstream hyperparameter tuning ###########
+             model_name = gr.Dropdown(["Default - Auto", "XGBClassifier", "SVR", "Kernel Ridge", "Linear Regression"],
+                                      label="Select Downstream Model")
+             with gr.Accordion("Downstream Hyperparameter Settings", open=True):
+                 # Create placeholders for the hyperparameter components
+                 max_depth = gr.Slider(1, 20, step=1, visible=False, label="max_depth")
+                 n_estimators = gr.Slider(100, 5000, step=100, visible=False, label="n_estimators")
+                 alpha = gr.Slider(0.1, 10.0, step=0.1, visible=False, label="alpha")
+                 degree = gr.Slider(1, 20, step=1, visible=False, label="degree")
+                 kernel = gr.Dropdown(choices=["rbf", "poly", "linear"], visible=False, label="kernel")
+
+                 # Output textbox
+                 output = gr.Textbox(label="Loaded Parameters")
+
+             # Dynamically show the relevant hyperparameters for the selected model
+             def update_hyperparameters(model_name):
+                 if model_name == "XGBClassifier":
+                     return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(
+                         visible=False), gr.update(visible=False)
+                 elif model_name == "SVR":
+                     return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
+                         visible=True), gr.update(visible=True)
+                 elif model_name == "Kernel Ridge":
+                     return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(
+                         visible=True), gr.update(visible=True)
+                 elif model_name in ("Linear Regression", "Default - Auto"):
+                     return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
+                         visible=False), gr.update(visible=False)
+
+             # When a model is selected, update which hyperparameters are visible
+             model_name.change(update_hyperparameters, inputs=[model_name],
+                               outputs=[max_depth, n_estimators, alpha, degree, kernel])
+
+             # Submit button to create the model with the selected hyperparameters
+             submit_button = gr.Button("Create Downstream Model")
+
+             # Function to handle model creation based on the input parameters
+             def on_submit(model_name, max_depth, n_estimators, alpha, degree, kernel):
+                 if model_name == "XGBClassifier":
+                     return create_model(model_name, max_depth=max_depth, n_estimators=n_estimators, alpha=alpha)
+                 elif model_name == "SVR":
+                     return create_model(model_name, degree=degree, kernel=kernel)
+                 elif model_name == "Kernel Ridge":
+                     return create_model(model_name, alpha=alpha, degree=degree, kernel=kernel)
+                 elif model_name in ("Linear Regression", "Default - Auto"):
+                     return create_model(model_name)
+
+             # When the submit button is clicked, run the on_submit function
+             submit_button.click(on_submit, inputs=[model_name, max_depth, n_estimators, alpha, degree, kernel],
+                                 outputs=output)
+             ###### End of hyperparameter tuning #########
+
+             fusion_radiobutton = gr.Radio(choices=fusion_available, label="Fusion Type")
+
+             eval_button = gr.Button("Train downstream model")
+
+         # Middle Column
+         with gr.Column():
+             gr.HTML('''
+             <div style="background-color: #8F9779; color: #FFFFFF; padding: 10px;">
+                 <h3 style="color: #FFFFFF; margin: 0; font-size: 20px;">Downstream Task 1: Property Prediction</h3>
+             </div>
+             ''')
+             eval_output = gr.Textbox(label="Train downstream model")
+
+             plot_radio = gr.Radio(choices=["ROC-AUC", "Parity Plot", "Latent Space"], label="Select Plot Type")
+             plot_output = gr.Plot(label="Visualization")
+
+             create_log = gr.Button("Store log")
+
+             log_table = gr.Dataframe(value=log_df, label="Log of Selections and Results", interactive=False)
+
+             eval_button.click(display_eval,
+                               inputs=[model_checkbox, selected_columns_message, task_radiobutton, output,
+                                       fusion_radiobutton],
+                               outputs=eval_output)
+
+             plot_radio.change(display_plot, inputs=plot_radio, outputs=plot_output)
+
+             # Function to gather selected models
+             def gather_selected_models(*models):
+                 selected = [model for model in models if model]
+                 return selected
+
+             create_log.click(evaluate_and_log,
+                              inputs=[model_checkbox, dataset_name, task_radiobutton, eval_output],
+                              outputs=log_table)
+
+         # Right Column
+         with gr.Column():
+             gr.HTML('''
+             <div style="background-color: #D2B48C; color: #FFFFFF; padding: 10px;">
+                 <h3 style="color: #FFFFFF; margin: 0; font-size: 20px;">Downstream Task 2: Molecule Generation</h3>
+             </div>
+             ''')
+             smiles_input = gr.Textbox(label="Input SMILES String")
+             image_display = gr.Image(label="Molecule Image", height=250, width=250)
+             # Show sample molecules for selection
+             with gr.Accordion("Select from sample molecules", open=False):
+                 image_selector = gr.Radio(
+                     choices=list(smiles_image_mapping.keys()),
+                     label="Select from sample molecules",
+                     value=None,
+                 )
+                 image_selector.change(load_image, image_selector, image_display)
+             generate_button = gr.Button("Generate")
+             gen_image_display = gr.Image(label="Generated Molecule Image", height=250, width=250)
+             generated_output = gr.Textbox(label="Generated Output")
+             property_table = gr.Dataframe(label="Molecular Properties Comparison")
+
+             # Handle image selection
+             image_selector.change(handle_image_selection, inputs=image_selector,
+                                   outputs=[smiles_input, image_display])
+             smiles_input.change(smiles_to_image, inputs=smiles_input, outputs=image_display)
+
+             # Generate button to display the canonical SMILES and molecule image
+             generate_button.click(generate_canonical, inputs=smiles_input,
+                                   outputs=[property_table, generated_output, gen_image_display])
+
+
+ if __name__ == "__main__":
+     demo.launch()
data/.DS_Store ADDED
Binary file (6.15 kB).
 
data/bace/test.csv ADDED
The diff for this file is too large to render; see the raw diff.
 
data/bace/train.csv ADDED
The diff for this file is too large to render; see the raw diff.
 
data/bace/valid.csv ADDED
The diff for this file is too large to render; see the raw diff.
 
data/esol/test.csv ADDED
@@ -0,0 +1,109 @@
+ ,selfies,prop,smiles
+ 0,[Cl] [C] [=C] [Branch1] [C] [Cl] [C] [Branch1] [C] [Cl] [C] [C] [C] [C] [Branch1] [Branch2] [C] [O] [C] [Ring1] [=Branch1] [Ring1] [Ring1] [C] [Ring1] [Branch2] [C] [Ring1] [=C] [Branch1] [C] [Cl] [C] [Ring1] [=N] [Branch1] [C] [Cl] [Cl],-4.533,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl
+ 1,[C] [C] [C] [C] [C] [=O],-1.103,CCCCC=O
+ 2,[O] [C] [C] [C] [C] [=C],-0.7909999999999999,OCCCC=C
+ 3,[C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [N] [N] [=C] [C] [Branch1] [C] [N] [=C] [Branch1] [C] [Br] [C] [Ring1] [Branch2] [=O],-3.005,c1ccccc1n2ncc(N)c(Br)c2(=O)
+ 4,[N] [C] [=C] [C] [=C] [Branch1] [C] [O] [C] [=C] [Ring1] [#Branch1],-1.231,Nc1ccc(O)cc1
+ 5,[C] [C] [Branch1] [C] [C] [C] [C] [O] [C] [=Branch1] [C] [=O] [C],-1.817,CC(C)CCOC(=O)C
+ 6,[C] [O] [P] [=Branch1] [C] [=S] [Branch1] [Ring1] [O] [C] [S] [C] [C] [=Branch1] [C] [=O] [N] [Branch1] [C] [C] [C] [=O],-2.087,COP(=S)(OC)SCC(=O)N(C)C=O
+ 7,[Cl] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=Branch1] [Ring2] [=C] [Ring1] [#Branch1] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [Branch1] [C] [Cl] [=C] [Ring1] [Branch2],-6.312,Clc1ccc(Cl)c(c1)c2ccc(Cl)c(Cl)c2
+ 8,[C] [Branch1] [C] [Cl] [=C] [Branch1] [C] [Cl] [C] [Branch1] [C] [Cl] [=C] [N] [=C] [C] [=N] [C] [Ring1] [=Branch1] [=C] [Ring1] [=N] [Cl],-4.438,c2(Cl)c(Cl)c(Cl)c1nccnc1c2(Cl)
+ 9,[C] [C] [C] [C] [C] [=C] [Branch1] [C] [C] [N] [=C] [Branch1] [=Branch1] [N] [=C] [Ring1] [#Branch1] [O] [N] [Branch1] [C] [C] [C],-3.57,CCCCc1c(C)nc(nc1O)N(C)C
+ 10,[C] [C] [O] [C] [=Branch1] [C] [=O] [C] [C] [=Branch1] [C] [=O] [O] [C] [C],-1.413,CCOC(=O)CC(=O)OCC
+ 11,[C] [C] [Branch1] [C] [C] [Branch1] [C] [C] [C] [=C] [C] [=C] [Branch1] [C] [O] [C] [=C] [Ring1] [#Branch1],-3.192,CC(C)(C)c1ccc(O)cc1
+ 12,[C] [C] [=C] [C] [=C] [C] [Branch1] [C] [C] [=C] [Ring1] [#Branch1],-3.035,Cc1cccc(C)c1
+ 13,[C] [C] [C] [O] [C] [=Branch1] [C] [=O] [C],-1.125,CCCOC(=O)C
+ 14,[C] [S] [C] [=N] [N] [=C] [Branch1] [=Branch2] [C] [=Branch1] [C] [=O] [N] [Ring1] [#Branch1] [N] [C] [Branch1] [C] [C] [Branch1] [C] [C] [C],-2.324,CSc1nnc(c(=O)n1N)C(C)(C)C
+ 15,[Cl] [C] [=C] [C] [=C] [Branch1] [Branch1] [C] [=C] [Ring1] [=Branch1] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [Cl],-5.142,Clc1ccc(cc1)c2ccccc2Cl
+ 16,[C] [C] [C] [C] [Branch1] [C] [C] [C] [=Branch1] [C] [=O] [C] [Branch1] [Ring2] [C] [Ring1] [Branch2] [C] [Branch1] [C] [O] [C] [C] [C] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [C] [Ring1] [Branch2],-1.5319999999999998,CC1CC(C)C(=O)C(C1)C(O)CC2CC(=O)NC(=O)C2
+ 17,[C] [N] [C] [=Branch1] [C] [=O] [O] [C] [=C] [C] [=C] [C] [Branch1] [Branch2] [N] [=C] [N] [Branch1] [C] [C] [C] [=C] [Ring1] [O],-1.846,CNC(=O)Oc1cccc(N=CN(C)C)c1
+ 18,[C] [C] [=C] [C] [=N] [C] [N] [Branch1] [=Branch1] [C] [C] [C] [Ring1] [Ring1] [C] [=N] [C] [=C] [C] [=C] [Ring1] [=Branch1] [C] [=Branch1] [C] [=O] [N] [C] [Ring2] [Ring1] [Ring1] [=Ring1] [#C],-3.397,Cc3ccnc4N(C1CC1)c2ncccc2C(=O)Nc34
+ 19,[C] [C] [N] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1],-2.389,CCNc1ccccc1
+ 20,[C] [C] [=C] [C] [=C] [C] [=C] [C] [Ring1] [=Branch1] [=C] [Branch1] [C] [C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [Ring1] [=Branch1] [=C] [Ring2] [Ring1] [Ring1] [Ring1] [#Branch2],-6.297000000000001,Cc1c2ccccc2c(C)c3ccc4ccccc4c13
+ 21,[F] [C] [=C] [C] [=C] [C] [Branch1] [C] [F] [=C] [Ring1] [#Branch1] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [Branch1] [C] [Cl] [=C] [Branch1] [C] [F] [C] [Branch1] [C] [Cl] [=C] [Ring1] [=Branch2] [F],-5.462000000000001,Fc1cccc(F)c1C(=O)NC(=O)Nc2cc(Cl)c(F)c(Cl)c2F
+ 22,[C] [O] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [Ring1] [#Branch1],-3.057,COc1ccc(Cl)cc1
+ 23,[O] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=N] [Ring1] [=Branch1],-4.2010000000000005,o1c2ccccc2c3ccccc13
+ 24,[C] [=C] [C] [=C] [N] [=C] [C] [=C] [C] [=C] [C] [Ring1] [=Branch1] [=C] [C] [Ring1] [#Branch2] [=C] [Ring1] [=C],-3.846,c3ccc2nc1ccccc1cc2c3
+ 25,[C] [C] [C] [C] [=Branch1] [C] [=O] [C] [C] [Branch1] [P] [C] [C] [C] [=C] [C] [=Branch1] [C] [=O] [C] [C] [C] [Ring1] [O] [Ring1] [#Branch1] [C] [C] [Ring1] [P] [C] [C] [C] [Ring2] [Ring1] [Ring2] [Branch1] [C] [O] [C] [=Branch1] [C] [=O] [C] [O],-2.893,CC12CC(=O)C3C(CCC4=CC(=O)CCC34C)C2CCC1(O)C(=O)CO
+ 26,[C] [C] [C] [=C] [C] [=C] [C] [Branch1] [Ring1] [C] [C] [=C] [Ring1] [Branch2] [N] [Branch1] [Ring2] [C] [O] [C] [C] [=Branch1] [C] [=O] [C] [Cl],-3.319,CCc1cccc(CC)c1N(COC)C(=O)CCl
+ 27,[C] [C] [C] [C] [N] [Branch1] [C] [C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [Branch1] [C] [Cl] [=C] [Ring1] [Branch2],-4.157,CCCCN(C)C(=O)Nc1ccc(Cl)c(Cl)c1
+ 28,[C] [S] [C] [=Branch1] [C] [=S] [N] [C] [Ring1] [=Branch1] [=O],-0.396,C1SC(=S)NC1(=O)
+ 29,[O] [C] [=C] [C] [=C] [Branch1] [Branch2] [C] [Branch1] [C] [O] [=C] [Ring1] [#Branch1] [C] [O] [C] [=C] [C] [Branch1] [C] [O] [=C] [C] [Branch1] [C] [O] [=C] [Ring1] [Branch2] [C] [=Branch1] [C] [=O] [C] [=Ring1] [=N] [O],-2.7310000000000003,Oc1ccc(c(O)c1)c3oc2cc(O)cc(O)c2c(=O)c3O
+ 30,[C] [N] [Branch1] [C] [C] [C] [=N] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [Ring1] [#Branch1] [C],-3.164,CN(C)C=Nc1ccc(Cl)cc1C
+ 31,[N] [C] [=Branch1] [C] [=O] [N] [C] [N] [C] [=Branch1] [C] [=O] [N] [C] [Ring1] [=Branch1] [=O],0.652,NC(=O)NC1NC(=O)NC1=O
+ 32,[Cl] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [Ring1] [#Branch2] [Ring1] [=Branch1],-4.063,Clc1cccc2ccccc12
+ 33,[O] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [Branch1] [C] [Cl] [=C] [Ring1] [Branch2],-3.352,Oc1ccc(Cl)c(Cl)c1
+ 34,[C] [C] [Branch1] [C] [C] [C] [Branch1] [#Branch1] [C] [=C] [Branch1] [C] [Cl] [Cl] [C] [Ring1] [Branch2] [C] [=Branch1] [C] [=O] [O] [C] [Branch1] [Ring1] [C] [#N] [C] [=C] [C] [=C] [C] [Branch1] [#Branch2] [O] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [=C] [Ring1] [=N],-6.775,CC1(C)C(C=C(Cl)Cl)C1C(=O)OC(C#N)c2cccc(Oc3ccccc3)c2
+ 35,[C] [=C] [C] [=C] [NH1] [N] [=N] [C] [Ring1] [Branch1] [=C] [Ring1] [=Branch2],-2.21,c2ccc1[nH]nnc1c2
+ 36,[C] [C] [Branch1] [C] [C] [C] [Branch2] [Ring1] [Branch1] [N] [C] [=C] [C] [=C] [Branch1] [=Branch1] [C] [=C] [Ring1] [=Branch1] [Cl] [C] [Branch1] [C] [F] [Branch1] [C] [F] [F] [C] [=Branch1] [C] [=O] [O] [C] [Branch1] [Ring1] [C] [#N] [C] [=C] [C] [=C] [C] [Branch1] [#Branch2] [O] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [=C] [Ring1] [=N],-8.057,CC(C)C(Nc1ccc(cc1Cl)C(F)(F)F)C(=O)OC(C#N)c2cccc(Oc3ccccc3)c2
+ 37,[C] [C] [C],-1.5530000000000002,CCC
+ 38,[C] [C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [Ring1] [O] [=C] [Ring1] [#Branch2] [Ring1] [=Branch1],-3.792,C1Cc2cccc3cccc1c23
+ 39,[C] [C] [C] [#C],-1.092,CCC#C
+ 40,[Cl] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [Ring1] [#Branch1],-3.5580000000000003,Clc1ccc(Cl)cc1
+ 41,[C] [C] [=C] [NH1] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch2] [Ring1] [=Branch1],-2.9810000000000003,Cc1c[nH]c2ccccc12
+ 42,[C] [C] [#N],0.152,CC#N
+ 43,[C] [C] [C] [C] [O],-0.688,CCCCO
+ 44,[C] [C] [=Branch1] [C] [=C] [C] [=Branch1] [C] [=C] [C],-2.052,CC(=C)C(=C)C
+ 45,[C] [C] [C] [Branch1] [C] [C] [C] [C] [O],-1.308,CCC(C)CCO
+ 46,[Cl] [C] [=C] [C] [=C] [Branch1] [=Branch2] [C] [Branch1] [C] [Cl] [=C] [Ring1] [#Branch1] [Cl] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [Branch1] [C] [Cl] [=C] [Ring1] [Branch2] [Cl],-7.192,Clc1ccc(c(Cl)c1Cl)c2ccc(Cl)c(Cl)c2Cl
+ 47,[C] [C] [=C] [C] [=Branch2] [Ring1] [=Branch1] [=C] [C] [=C] [Ring1] [=Branch1] [N] [S] [=Branch1] [C] [=O] [=Branch1] [C] [=O] [C] [Branch1] [C] [F] [Branch1] [C] [F] [F] [S] [=Branch1] [C] [=O] [=Branch1] [C] [=O] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1],-4.945,Cc1cc(ccc1NS(=O)(=O)C(F)(F)F)S(=O)(=O)c2ccccc2
+ 48,[O] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [Ring1] [#Branch1] [Cl],-3.22,Oc1ccc(Cl)cc1Cl
+ 49,[C] [N] [C] [=Branch2] [Ring1] [Ring2] [=C] [Branch1] [C] [O] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [S] [Ring1] [O] [=Branch1] [C] [=O] [=O] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [C] [=N] [Ring1] [=Branch1],-3.4730000000000003,CN2C(=C(O)c1ccccc1S2(=O)=O)C(=O)Nc3ccccn3
+ 50,[C] [C] [C] [C] [C] [C] [Branch1] [S] [C] [C] [C] [=C] [C] [Branch1] [C] [O] [=C] [C] [=C] [Ring1] [O] [Ring1] [#Branch1] [C] [Ring1] [#C] [C] [C] [C] [Ring2] [Ring1] [C] [=O],-3.872,CC12CCC3C(CCc4cc(O)ccc34)C2CCC1=O
+ 51,[C] [C] [=C] [C] [=C] [C] [=C] [Branch1] [C] [C] [C] [=C] [C] [=C] [Ring1] [O] [Ring1] [#Branch1],-4.147,Cc1cccc2c(C)cccc12
+ 52,[N] [S] [=Branch1] [C] [=O] [=Branch1] [C] [=O] [C] [=C] [C] [=C] [Branch1] [O] [N] [C] [N] [S] [Ring1] [=Branch1] [=Branch1] [C] [=O] [=O] [C] [=C] [Ring1] [N] [Cl],-1.72,NS(=O)(=O)c2cc1c(NCNS1(=O)=O)cc2Cl
+ 53,[O] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [N] [=C] [Ring1] [#Branch2] [Ring1] [=Branch1],-2.725,Oc1cccc2cccnc12
+ 54,[C] [C] [C] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [C] [Ring1] [#Branch2],-3.447,C1CCc2ccccc2C1
+ 55,[C] [C] [O] [C] [Branch1] [C] [C] [O] [C] [C],-0.899,CCOC(C)OCC
+ 56,[C] [C] [C] [C] [Ring1] [Ring1] [Branch1] [C] [C] [C] [=Branch1] [C] [=O] [N] [Branch1] [Branch1] [C] [Ring1] [Branch2] [=O] [C] [=C] [C] [Branch1] [C] [Cl] [=C] [C] [Branch1] [C] [Cl] [=C] [Ring1] [Branch2],-3.464,CC12CC2(C)C(=O)N(C1=O)c3cc(Cl)cc(Cl)c3
+ 57,[C] [C] [=C] [C] [=C] [C] [=C] [C] [Ring1] [=Branch1] [=C] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=C] [Ring1] [=Branch1],-4.87,Cc1c2ccccc2cc3ccccc13
+ 58,[C] [C] [C] [C] [O] [C],-1.072,CCCCOC
+ 59,[C] [C] [C] [C] [C] [=Branch1] [C] [=O] [C] [=C] [Ring1] [#Branch1] [C] [C] [C] [C] [C] [C] [C] [Branch1] [#Branch1] [C] [=Branch1] [C] [=O] [C] [O] [C] [Ring1] [=Branch2] [Branch1] [N] [C] [C] [Branch1] [C] [O] [C] [Ring2] [Ring1] [#Branch1] [Ring1] [=C] [C] [=O],-3.0660000000000003,CC13CCC(=O)C=C1CCC4C2CCC(C(=O)CO)C2(CC(O)C34)C=O
+ 60,[C] [C] [C] [Branch1] [=Branch1] [C] [Branch1] [C] [C] [C] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [N] [C] [Ring1] [O] [=O],-1.6030000000000002,CCC1(C(C)C)C(=O)NC(=O)NC1=O
+ 61,[C] [C] [O] [C] [=Branch1] [C] [=O] [C] [=C] [C] [=C] [Branch1] [C] [O] [C] [=C] [Ring1] [#Branch1],-2.761,CCOC(=O)c1ccc(O)cc1
+ 62,[C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [C] [=C] [Ring2] [Ring1] [C] [C] [=C] [Ring2] [Ring1] [C] [C] [Ring1] [S] [=C] [Ring1] [=C] [C] [Ring1] [N] [=C] [Ring1] [#Branch2] [Ring1] [=Branch1],-6.885,c1cc2ccc3ccc4ccc5ccc6ccc1c7c2c3c4c5c67
+ 63,[C] [C] [N] [C] [=C] [C] [Branch1] [=Branch1] [N] [Branch1] [C] [C] [C] [=C] [C] [Branch1] [C] [C] [=C] [Ring1] [#Branch2] [N] [C] [=Branch1] [C] [=O] [C] [=C] [C] [=C] [N] [=C] [Ring2] [Ring1] [Ring2] [Ring1] [=Branch1],-4.408,CCN2c1cc(N(C)C)cc(C)c1NC(=O)c3cccnc23
+ 64,[C] [N] [Branch1] [C] [C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [Branch1] [C] [Cl] [=C] [Ring1] [Branch2],-3.301,CN(C)C(=O)Nc1ccc(Cl)c(Cl)c1
+ 65,[C] [C] [C] [C] [C] [C] [Branch1] [C] [C] [C],-3.3080000000000003,CCCCCC(C)C
+ 66,[C] [O] [C] [=C] [C] [=C] [Branch1] [C] [N] [N] [=C] [Branch1] [#C] [N] [=C] [Ring1] [#Branch1] [C] [Branch1] [Ring1] [O] [C] [=C] [Ring1] [=N] [O] [C] [N] [C] [C] [N] [Branch1] [Branch1] [C] [C] [Ring1] [=Branch1] [C] [=Branch1] [C] [=O] [O] [C] [C] [Branch1] [C] [C] [Branch1] [C] [C] [O],-3.958,COc2cc1c(N)nc(nc1c(OC)c2OC)N3CCN(CC3)C(=O)OCC(C)(C)O
+ 67,[C] [=C] [C] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [C] [Ring1] [#Branch1] [C] [=C] [Ring1] [O],-0.636,c1cC2C(=O)NC(=O)C2cc1
+ 68,[C] [C] [C] [=O],-0.3939999999999999,CCC=O
+ 69,[Cl] [C] [=C] [C] [=C] [Branch2] [Ring1] [=Branch2] [C] [N] [Branch1] [Branch2] [C] [C] [C] [C] [C] [Ring1] [Branch1] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [C] [=C] [Ring2] [Ring1] [=Branch1],-5.126,Clc1ccc(CN(C2CCCC2)C(=O)Nc3ccccc3)cc1
+ 70,[C] [C] [C] [C] [C] [Branch1] [Ring1] [C] [C] [C] [=O],-2.232,CCCCC(CC)C=O
+ 71,[O] [=C] [N] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [C] [Ring1] [Branch2] [Branch1] [Ring1] [C] [C] [C] [C] [C] [Branch1] [C] [C] [C],-2.312,O=C1NC(=O)NC(=O)C1(CC)CCC(C)C
+ 72,[C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1],-1.857,CC(=O)Nc1ccccc1
+ 73,[C] [=N] [C] [=C] [C] [Branch1] [#Branch1] [C] [=Branch1] [C] [=O] [N] [N] [=C] [Ring1] [#Branch2],-0.7170000000000001,c1nccc(C(=O)NN)c1
+ 74,[C] [C] [Branch1] [C] [C] [C] [C] [C] [C] [Branch1] [C] [C] [Branch1] [Ring2] [C] [Ring1] [=Branch1] [C] [Ring1] [=Branch2] [=O],-2.158,CC2(C)C1CCC(C)(C1)C2=O
+ 75,[C] [O] [C] [=C] [N] [=C] [C] [=N] [C] [=N] [C] [Ring1] [=Branch1] [=N] [Ring1] [#Branch2],-1.589,COc2cnc1cncnc1n2
+ 76,[C] [N] [C] [=Branch1] [C] [=O] [C] [=C] [Branch1] [C] [C] [O] [P] [=Branch1] [C] [=O] [Branch1] [Ring1] [O] [C] [O] [C],-0.949,CNC(=O)C=C(C)OP(=O)(OC)OC
+ 77,[O] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [N] [Branch1] [Ring1] [C] [C] [C] [=Branch1] [C] [=O] [C] [=C] [C] [=C] [C] [=C] [Ring2] [Ring1] [C] [Ring1] [=Branch1],-3.784,O2c1ccccc1N(CC)C(=O)c3ccccc23
+ 78,[C] [=C] [C] [=C] [C] [=C] [Branch1] [Ring1] [O] [C] [C] [Branch1] [Branch2] [C] [C] [=C] [Branch1] [C] [C] [C] [=C] [Ring1] [=N] [O] [C] [Ring1] [P] [=O],-4.0760000000000005,c1cc2ccc(OC)c(CC=C(C)(C))c2oc1=O
+ 79,[C] [C] [C] [S] [C] [C] [C],-2.307,CCCSCCC
+ 80,[C] [O] [N] [Branch1] [C] [C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [Ring1] [#Branch1],-2.948,CON(C)C(=O)Nc1ccc(Cl)cc1
+ 81,[C] [C] [O] [C] [C],-0.718,CCOCC
+ 82,[C] [C] [C] [C] [C] [C] [Branch1] [S] [C] [C] [C] [=C] [C] [Branch1] [C] [O] [=C] [C] [=C] [Ring1] [O] [Ring1] [#Branch1] [C] [Ring1] [#C] [C] [C] [Branch1] [C] [O] [C] [Ring2] [Ring1] [Ring1] [O],-3.858,CC34CCC1C(CCc2cc(O)ccc12)C3CC(O)C4O
+ 83,[C] [C] [N] [C] [=N] [C] [Branch1] [C] [Cl] [=N] [C] [Branch1] [O] [N] [C] [Branch1] [C] [C] [Branch1] [C] [C] [C] [#N] [=N] [Ring1] [=N],-2.49,CCNc1nc(Cl)nc(NC(C)(C)C#N)n1
+ 84,[C] [C] [Branch1] [C] [C] [C] [C] [Branch1] [C] [C] [Branch1] [C] [C] [O],-1.6469999999999998,CC(C)CC(C)(C)O
+ 85,[Cl] [C] [=C] [C] [=C] [C] [Branch1] [C] [Br] [=C] [Ring1] [#Branch1],-3.928,Clc1cccc(Br)c1
+ 86,[C] [C] [C] [C] [C] [C] [Branch1] [C] [O] [C] [C],-2.033,CCCCCC(O)CC
+ 87,[O] [=C] [N] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [C] [Ring1] [Branch2] [Branch1] [Ring1] [C] [C] [C] [C] [=C] [Branch1] [C] [C] [C],-2.126,O=C1NC(=O)NC(=O)C1(CC)CC=C(C)C
+ 88,[C] [C] [C] [Branch1] [C] [C] [C] [Branch1] [#Branch1] [C] [C] [Branch1] [C] [Br] [=C] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [N] [C] [Ring1] [N] [=O],-2.766,CCC(C)C1(CC(Br)=C)C(=O)NC(=O)NC1=O
+ 89,[C] [O] [C] [=Branch1] [C] [=O] [C],-0.416,COC(=O)C
+ 90,[C] [C] [Branch1] [C] [C] [C] [=C] [C] [=C] [Branch1] [C] [C] [C] [=C] [Ring1] [#Branch1] [O],-3.129,CC(C)c1ccc(C)cc1O
+ 91,[C],-0.636,C
+ 92,[N] [C] [=N] [C] [Branch1] [C] [O] [=N] [C] [N] [=C] [NH1] [C] [Ring1] [#Branch2] [=Ring1] [Branch1],-1.74,Nc1nc(O)nc2nc[nH]c12
+ 93,[F] [C] [=C] [C] [=C] [C] [Branch1] [C] [F] [=C] [Ring1] [#Branch1] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [Ring1] [#Branch1],-4.692,Fc1cccc(F)c1C(=O)NC(=O)Nc2ccc(Cl)cc2
+ 94,[C] [C] [C] [C] [C] [Branch1] [Branch1] [C] [C] [Ring1] [=Branch1] [C] [Branch1] [C] [C] [Branch1] [C] [C] [O] [Ring1] [#Branch2],-2.579,CC12CCC(CC1)C(C)(C)O2
+ 95,[C] [C] [O],0.02,CCO
+ 96,[C] [=C] [Branch2] [Ring1] [C] [N] [C] [=Branch1] [C] [=O] [O] [C] [Branch1] [C] [C] [C] [=Branch1] [C] [=O] [N] [C] [C] [C] [=C] [C] [=C] [Ring1] [P],-2.29,c1c(NC(=O)OC(C)C(=O)NCC)cccc1
+ 97,[C] [C] [Branch1] [C] [C] [=C] [C] [C] [Branch2] [Ring1] [#Branch2] [C] [=Branch1] [C] [=O] [O] [C] [C] [=C] [C] [=C] [C] [Branch1] [#Branch2] [O] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [=C] [Ring1] [=N] [C] [Ring2] [Ring1] [Ring2] [Branch1] [C] [C] [C],-6.763,CC(C)=CC3C(C(=O)OCc2cccc(Oc1ccccc1)c2)C3(C)C
+ 98,[C] [C] [C] [C] [N] [C] [=Branch1] [C] [=O] [N] [C] [Branch1] [Branch2] [N] [C] [=Branch1] [C] [=O] [O] [C] [=N] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=C] [Ring1] [=Branch1],-2.902,CCCCNC(=O)n1c(NC(=O)OC)nc2ccccc12
+ 99,[C] [N] [Branch1] [C] [C] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1],-2.542,CN(C)c1ccccc1
+ 100,[C] [O] [C] [=Branch1] [C] [=O] [C] [=C],-0.878,COC(=O)C=C
+ 101,[C] [N] [Branch1] [C] [C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [Branch1] [=N] [O] [C] [=C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [Ring1] [#Branch1] [C] [=C] [Ring1] [=C],-4.477,CN(C)C(=O)Nc2ccc(Oc1ccc(Cl)cc1)cc2
+ 102,[O] [=C] [N] [C] [=Branch1] [C] [=O] [N] [C] [=Branch1] [C] [=O] [C] [Ring1] [Branch2] [Branch1] [=Branch1] [C] [Branch1] [C] [C] [C] [C] [C] [=C] [Branch1] [C] [C] [C],-2.465,O=C1NC(=O)NC(=O)C1(C(C)C)CC=C(C)C
+ 103,[C] [C] [=C] [C] [=C] [Branch1] [C] [O] [C] [=C] [Ring1] [#Branch1] [C],-2.6210000000000004,Cc1ccc(O)cc1C
+ 104,[Cl] [C] [=C] [C] [=C] [C] [=Branch1] [Ring2] [=N] [Ring1] [=Branch1] [C] [Branch1] [C] [Cl] [Branch1] [C] [Cl] [Cl],-3.833,Clc1cccc(n1)C(Cl)(Cl)Cl
+ 105,[C] [C] [=Branch1] [C] [=O] [O] [C] [Branch2] [Ring1] [=C] [C] [C] [C] [C] [C] [C] [C] [=C] [C] [=Branch1] [C] [=O] [C] [C] [C] [Ring1] [#Branch1] [C] [Ring1] [O] [C] [C] [C] [Ring2] [Ring1] [C] [Ring1] [#C] [C] [C] [#C],-4.2410000000000005,CC(=O)OC3(CCC4C2CCC1=CC(=O)CCC1C2CCC34C)C#C
+ 106,[C] [N] [C] [=Branch1] [C] [=O] [O] [N] [=C] [Branch1] [Ring2] [C] [S] [C] [C] [Branch1] [C] [C] [Branch1] [C] [C] [C],-2.7,CNC(=O)ON=C(CSC)C(C)(C)C
+ 107,[C] [C] [C] [C] [C] [C] [C] [Branch1] [C] [C] [O],-2.033,CCCCCCC(C)O
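The selfies column stores space-separated tokens, so decoding requires re-joining them first, exactly as app.py does; a minimal sketch:

    import pandas as pd
    import selfies as sf

    df = pd.read_csv("data/esol/test.csv", index_col=0)
    tokens = df.loc[0, "selfies"]                     # e.g. "[Cl] [C] [=C] ..."
    smiles = sf.decoder(tokens.replace("] [", "]["))  # strip the spaces before decoding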
data/esol/train.csv ADDED
The diff for this file is too large to render; see the raw diff.
 
log.csv ADDED
@@ -0,0 +1 @@
+ ,Selected Models,Dataset,Task,Result
models/.DS_Store ADDED
Binary file (6.15 kB).
 
models/__pycache__/fm4m.cpython-310.pyc ADDED
Binary file (22.1 kB).
 
models/fm4m.py ADDED
@@ -0,0 +1,876 @@
1
+ from sklearn.metrics import roc_auc_score, roc_curve
2
+
3
+ import datetime
4
+ import os
5
+ import umap
6
+ import numpy as np
7
+
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pd
10
+ import pickle
11
+ import json
12
+
13
+ from xgboost import XGBClassifier, XGBRegressor
14
+ import xgboost as xgb
15
+ from sklearn.metrics import roc_auc_score, mean_squared_error
16
+ import xgboost as xgb
17
+ from sklearn.svm import SVR
18
+ from sklearn.linear_model import LinearRegression
19
+ from sklearn.kernel_ridge import KernelRidge
20
+ import json
21
+ from sklearn.compose import TransformedTargetRegressor
22
+ from sklearn.preprocessing import MinMaxScaler
23
+
24
+
25
+ import torch
26
+ from transformers import AutoTokenizer, AutoModel
27
+
28
+ import sys
29
+ sys.path.append("models/")
30
+
31
+ from models.selfies_model.load import SELFIES as bart
32
+ from models.mhg_model import load as mhg
33
+ from models.smi_ted.smi_ted_light.load import load_smi_ted
34
+
+ datasets = {}
+ models = {}
+ downstream_models = {}
+ 
+ 
+ def avail_models_data():
+     global datasets
+     global models
+ 
+     datasets = [{"Dataset": "hiv", "Input": "smiles", "Output": "HIV_active", "Path": "data/hiv", "Timestamp": "2024-06-26 11:27:37"},
+                 {"Dataset": "esol", "Input": "smiles", "Output": "ESOL predicted log solubility in mols per litre", "Path": "data/esol", "Timestamp": "2024-06-26 11:31:46"},
+                 {"Dataset": "freesolv", "Input": "smiles", "Output": "expt", "Path": "data/freesolv", "Timestamp": "2024-06-26 11:33:47"},
+                 {"Dataset": "lipo", "Input": "smiles", "Output": "y", "Path": "data/lipo", "Timestamp": "2024-06-26 11:34:37"},
+                 {"Dataset": "bace", "Input": "smiles", "Output": "Class", "Path": "data/bace", "Timestamp": "2024-06-26 11:36:40"},
+                 {"Dataset": "bbbp", "Input": "smiles", "Output": "p_np", "Path": "data/bbbp", "Timestamp": "2024-06-26 11:39:23"},
+                 {"Dataset": "clintox", "Input": "smiles", "Output": "CT_TOX", "Path": "data/clintox", "Timestamp": "2024-06-26 11:42:43"}]
+ 
+     models = [{"Name": "bart", "Model Name": "SELFIES-TED", "Description": "BART model for string based SELFIES modality", "Timestamp": "2024-06-21 12:32:20"},
+               {"Name": "mol-xl", "Model Name": "Molformer", "Description": "MolFormer model for string based SMILES modality", "Timestamp": "2024-06-21 12:35:56"},
+               {"Name": "mhg", "Model Name": "MHG-GED", "Description": "Molecular hypergraph model", "Timestamp": "2024-07-10 00:09:42"},
+               {"Name": "smi-ted", "Model Name": "SMI-TED", "Description": "SMILES based encoder decoder model", "Timestamp": "2024-07-10 00:09:42"}]
+ 
+ def avail_models(raw=False):
+     global models
+ 
+     models = [{"Name": "smi-ted", "Model Name": "SMI-TED", "Description": "SMILES based encoder decoder model"},
+               {"Name": "bart", "Model Name": "SELFIES-TED", "Description": "BART model for string based SELFIES modality"},
+               {"Name": "mol-xl", "Model Name": "Molformer", "Description": "MolFormer model for string based SMILES modality"},
+               {"Name": "mhg", "Model Name": "MHG-GED", "Description": "Molecular hypergraph model"},
+               ]
+ 
+     if raw:
+         return models
+     # note: an unreachable second `return models` after this branch was removed
+     return pd.DataFrame(models).drop('Name', axis=1)
+ 
+ def avail_downstream_models():
+     global downstream_models
+ 
+     with open("downstream_models.json", "r") as infile:
+         downstream_models = json.load(infile)
+     return downstream_models
+ 
+ 
+ def avail_datasets():
+     global datasets
+ 
+     datasets = [{"Dataset": "hiv", "Input": "smiles", "Output": "HIV_active", "Path": "data/hiv",
+                  "Timestamp": "2024-06-26 11:27:37"},
+                 {"Dataset": "esol", "Input": "smiles", "Output": "ESOL predicted log solubility in mols per litre",
+                  "Path": "data/esol", "Timestamp": "2024-06-26 11:31:46"},
+                 {"Dataset": "freesolv", "Input": "smiles", "Output": "expt", "Path": "data/freesolv",
+                  "Timestamp": "2024-06-26 11:33:47"},
+                 {"Dataset": "lipo", "Input": "smiles", "Output": "y", "Path": "data/lipo",
+                  "Timestamp": "2024-06-26 11:34:37"},
+                 {"Dataset": "bace", "Input": "smiles", "Output": "Class", "Path": "data/bace",
+                  "Timestamp": "2024-06-26 11:36:40"},
+                 {"Dataset": "bbbp", "Input": "smiles", "Output": "p_np", "Path": "data/bbbp",
+                  "Timestamp": "2024-06-26 11:39:23"},
+                 {"Dataset": "clintox", "Input": "smiles", "Output": "CT_TOX", "Path": "data/clintox",
+                  "Timestamp": "2024-06-26 11:42:43"}]
+ 
+     return datasets
+ 
+ def reset():
+     """datasets = {"esol": ["smiles", "ESOL predicted log solubility in mols per litre", "data/esol", "2024-06-26 11:36:46.509324"],
+     "freesolv": ["smiles", "expt", "data/freesolv", "2024-06-26 11:37:37.393273"],
+     "lipo": ["smiles", "y", "data/lipo", "2024-06-26 11:37:37.393273"],
+     "hiv": ["smiles", "HIV_active", "data/hiv", "2024-06-26 11:37:37.393273"],
+     "bace": ["smiles", "Class", "data/bace", "2024-06-26 11:38:40.058354"],
+     "bbbp": ["smiles", "p_np", "data/bbbp", "2024-06-26 11:38:40.058354"],
+     "clintox": ["smiles", "CT_TOX", "data/clintox", "2024-06-26 11:38:40.058354"],
+     "sider": ["smiles", "1:", "data/sider", "2024-06-26 11:38:40.058354"],
+     "tox21": ["smiles", ":-2", "data/tox21", "2024-06-26 11:38:40.058354"]
+     }"""
+ 
+     datasets = [
+         {"Dataset": "hiv", "Input": "smiles", "Output": "HIV_active", "Path": "data/hiv", "Timestamp": "2024-06-26 11:27:37"},
+         {"Dataset": "esol", "Input": "smiles", "Output": "ESOL predicted log solubility in mols per litre", "Path": "data/esol", "Timestamp": "2024-06-26 11:31:46"},
+         {"Dataset": "freesolv", "Input": "smiles", "Output": "expt", "Path": "data/freesolv", "Timestamp": "2024-06-26 11:33:47"},
+         {"Dataset": "lipo", "Input": "smiles", "Output": "y", "Path": "data/lipo", "Timestamp": "2024-06-26 11:34:37"},
+         {"Dataset": "bace", "Input": "smiles", "Output": "Class", "Path": "data/bace", "Timestamp": "2024-06-26 11:36:40"},
+         {"Dataset": "bbbp", "Input": "smiles", "Output": "p_np", "Path": "data/bbbp", "Timestamp": "2024-06-26 11:39:23"},
+         {"Dataset": "clintox", "Input": "smiles", "Output": "CT_TOX", "Path": "data/clintox", "Timestamp": "2024-06-26 11:42:43"},
+         # {"Dataset": "sider", "Input": "smiles", "Output": "1:", "path": "data/sider", "Timestamp": "2024-06-26 11:38:40.058354"},
+         # {"Dataset": "tox21", "Input": "smiles", "Output": ":-2", "path": "data/tox21", "Timestamp": "2024-06-26 11:38:40.058354"}
+     ]
+ 
+     models = [{"Name": "bart", "Description": "BART model for string based SELFIES modality",
+                "Timestamp": "2024-06-21 12:32:20"},
+               {"Name": "mol-xl", "Description": "MolFormer model for string based SMILES modality",
+                "Timestamp": "2024-06-21 12:35:56"},
+               {"Name": "mhg", "Description": "MHG", "Timestamp": "2024-07-10 00:09:42"},
+               {"Name": "spec-gru", "Description": "Spectrum modality with GRU", "Timestamp": "2024-07-10 00:09:42"},
+               {"Name": "spec-lstm", "Description": "Spectrum modality with LSTM", "Timestamp": "2024-07-10 00:09:54"},
+               {"Name": "3d-vae", "Description": "VAE model for 3D atom positions", "Timestamp": "2024-07-10 00:10:08"}]
+ 
+     downstream_models = [
+         {"Name": "XGBClassifier", "Description": "XGBoost Classifier",
+          "Timestamp": "2024-06-21 12:31:20"},
+         {"Name": "XGBRegressor", "Description": "XGBoost Regressor",
+          "Timestamp": "2024-06-21 12:32:56"},
+         {"Name": "2-FNN", "Description": "A two-layer feedforward network",
+          "Timestamp": "2024-06-24 14:34:16"},
+         {"Name": "3-FNN", "Description": "A three-layer feedforward network",
+          "Timestamp": "2024-06-24 14:38:37"},
+     ]
+ 
+     with open("datasets.json", "w") as outfile:
+         json.dump(datasets, outfile)
+ 
+     with open("models.json", "w") as outfile:
+         json.dump(models, outfile)
+ 
+     with open("downstream_models.json", "w") as outfile:
+         json.dump(downstream_models, outfile)
+ 
+ def update_data_list(list_data):
+     # datasets[list_data[0]] = list_data[1:]
+     # NOTE: list_data is currently unused; the existing global list is persisted as-is.
+     with open("datasets.json", "w") as outfile:
+         json.dump(datasets, outfile)
+ 
+     avail_models_data()
+ 
+ 
+ def update_model_list(list_model):
+     # models[list_model[0]] = list_model[1]
+     with open("models.json", "w") as outfile:
+         json.dump(list_model, outfile)
+ 
+     avail_models_data()
+ 
+ 
+ def update_downstream_model_list(list_model):
+     # models[list_model[0]] = list_model[1]
+     with open("downstream_models.json", "w") as outfile:
+         json.dump(list_model, outfile)
+ 
+     avail_models_data()
+ 
+ 
+ avail_models_data()
+ 
+ def list_models():
+     # print(*list(models.keys()), sep='\n')
+     data = avail_models(raw=True)
+     # Convert data to a pandas DataFrame
+     df = pd.DataFrame(data)
+ 
+     # Display row numbers starting from 1
+     df.index += 1
+ 
+     # Create dropdown widget for sorting
+     sort_dropdown = widgets.Dropdown(
+         options=['Name', 'Timestamp'],
+         value='Name',
+         description='Sort by:',
+         disabled=False,
+     )
+ 
+     # Output widget to display the table
+     output = widgets.Output()
+ 
+     # Update the display based on the selected sort column
+     def update_display(change):
+         with output:
+             output.clear_output(wait=True)
+             sorted_df = df.sort_values(by=sort_dropdown.value)
+             display(sorted_df.style.set_properties(**{
+                 'text-align': 'left', 'border': '1px solid #ddd',
+             }))
+ 
+     # Attach the update_display function to the dropdown widget
+     sort_dropdown.observe(update_display, names='value')
+ 
+     # Display the dropdown and the table initially
+     display(sort_dropdown, output)
+     update_display(None)
+ 
+ 
+ def list_downstream_models():
+     # print(*list(models.keys()), sep='\n')
+     data = avail_downstream_models()
+     # Convert data to a pandas DataFrame
+     df = pd.DataFrame(data)
+ 
+     # Display row numbers starting from 1
+     df.index += 1
+ 
+     # Create dropdown widget for sorting
+     sort_dropdown = widgets.Dropdown(
+         options=['Name', 'Timestamp'],
+         value='Timestamp',
+         description='Sort by:',
+         disabled=False,
+     )
+ 
+     # Output widget to display the table
+     output = widgets.Output()
+ 
+     # Update the display based on the selected sort column
+     def update_display(change):
+         with output:
+             output.clear_output(wait=True)
+             sorted_df = df.sort_values(by=sort_dropdown.value)
+             display(sorted_df.style.set_properties(**{
+                 'text-align': 'left', 'border': '1px solid #ddd',
+             }))
+ 
+     # Attach the update_display function to the dropdown widget
+     sort_dropdown.observe(update_display, names='value')
+ 
+     # Display the dropdown and the table initially
+     display(sort_dropdown, output)
+     update_display(None)
+ 
+ def list_data():
+     # print(*list(datasets.keys()), sep='\n')
+     data = avail_datasets()
+     # Convert data to a pandas DataFrame
+     df = pd.DataFrame(data)
+ 
+     # Display row numbers starting from 1
+     df.index += 1
+ 
+     # Create dropdown widget for sorting
+     sort_dropdown = widgets.Dropdown(
+         options=['Dataset', 'Input', 'Output', 'Path', 'Timestamp'],
+         value='Input',
+         description='Sort by:',
+         disabled=False,
+     )
+ 
+     # Output widget to display the table
+     output = widgets.Output()
+ 
+     # Update the display based on the selected sort column
+     def update_display(change):
+         with output:
+             output.clear_output(wait=True)
+             sorted_df = df.sort_values(by=sort_dropdown.value)
+             display(sorted_df.style.set_properties(**{
+                 'text-align': 'left', 'border': '1px solid #ddd',
+             }))
+ 
+     # Attach the update_display function to the dropdown widget
+     sort_dropdown.observe(update_display, names='value')
+ 
+     # Display the dropdown and the table initially
+     display(sort_dropdown, output)
+     update_display(None)
+ 
+ def vizualize(roc_auc, fpr, tpr, features, labels):
+     # def vizualize(features, labels):
+ 
+     reducer = umap.UMAP(metric="jaccard", n_neighbors=20, n_components=2, low_memory=True, min_dist=0.001, verbose=False)
+ 
+     features_umap = reducer.fit_transform(features)
+     x = labels.values
+     index_0 = [index for index in range(len(x)) if x[index] == 0]
+     index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+     class_0 = features_umap[index_0]
+     class_1 = features_umap[index_1]
+ 
+     # Create the ROC curve plot
+     def plot_roc_auc():
+         plt.figure(figsize=(8, 6))
+         plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
+         plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
+         plt.xlim([0.0, 1.0])
+         plt.ylim([0.0, 1.05])
+         plt.xlabel('False Positive Rate')
+         plt.ylabel('True Positive Rate')
+         plt.title('Receiver Operating Characteristic')
+         plt.legend(loc='lower right')
+         plt.show()
+ 
+     # Scatter plot of the dataset distribution in the UMAP space
+     def plot_distribution():
+         plt.figure(figsize=(8, 6))
+         # plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, edgecolors='k')
+         plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
+         plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
+         plt.xlabel('Feature 1')
+         plt.ylabel('Feature 2')
+         plt.title('Dataset Distribution')
+         plt.legend()  # added: labels were set but never shown
+         plt.show()
+ 
+     # Create tabs using ipywidgets
+     tab_contents = ['ROC AUC', 'Distribution']
+     children = [widgets.Output(), widgets.Output()]
+ 
+     tab = widgets.Tab()
+     tab.children = children
+     for i in range(len(tab_contents)):
+         tab.set_title(i, tab_contents[i])
+ 
+     # Display plots in their respective tabs
+     with children[0]:
+         plot_roc_auc()
+ 
+     with children[1]:
+         plot_distribution()
+ 
+     # Display the tab widget
+     display(tab)
+ 
+ def get_representation(train_data, test_data, model_type, return_tensor=True):
+     alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
+     if model_type in alias:
+         model_type = alias[model_type]
+ 
+     if model_type == "mhg":
+         model = mhg.load("models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
+         with torch.no_grad():
+             train_emb = model.encode(train_data)
+             x_batch = torch.stack(train_emb)
+ 
+             test_emb = model.encode(test_data)
+             x_batch_test = torch.stack(test_emb)
+         if not return_tensor:
+             x_batch = pd.DataFrame(x_batch)
+             x_batch_test = pd.DataFrame(x_batch_test)
+ 
+     elif model_type == "bart":
+         model = bart()
+         model.load()
+         x_batch = model.encode(train_data, return_tensor=return_tensor)
+         x_batch_test = model.encode(test_data, return_tensor=return_tensor)
+ 
+     elif model_type == "smi-ted":
+         model = load_smi_ted(folder='./models/smi_ted/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')
+         with torch.no_grad():
+             x_batch = model.encode(train_data, return_torch=return_tensor)
+             x_batch_test = model.encode(test_data, return_torch=return_tensor)
+ 
+     elif model_type == "mol-xl":
+         model = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True,
+                                           trust_remote_code=True)
+         tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)
+ 
+         if isinstance(train_data, list):
+             inputs = tokenizer(train_data, padding=True, return_tensors="pt")
+         else:
+             inputs = tokenizer(list(train_data.values), padding=True, return_tensors="pt")
+ 
+         with torch.no_grad():
+             outputs = model(**inputs)
+ 
+         x_batch = outputs.pooler_output
+ 
+         if isinstance(test_data, list):
+             inputs = tokenizer(test_data, padding=True, return_tensors="pt")
+         else:
+             inputs = tokenizer(list(test_data.values), padding=True, return_tensors="pt")
+ 
+         with torch.no_grad():
+             outputs = model(**inputs)
+ 
+         x_batch_test = outputs.pooler_output
+ 
+         if not return_tensor:
+             x_batch = pd.DataFrame(x_batch)
+             x_batch_test = pd.DataFrame(x_batch_test)
+ 
+     return x_batch, x_batch_test
+ 
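For reference, a minimal sketch of calling the encoder wrapper above (an editor's example, not part of the commit; it assumes the checkpoint paths hard-coded in `get_representation` exist locally):

```python
# Editor's sketch: embed two SMILES splits with one of the encoders above.
train_smiles = ["CCO", "O=C=O", "c1ccccc1O"]
test_smiles = ["CCN", "CC(=O)O"]

# Returns one embedding per molecule for the train and test splits.
x_train, x_test = get_representation(train_smiles, test_smiles, "MHG-GED")
print(x_train.shape, x_test.shape)
```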
+ def single_modal(model, dataset, downstream_model, params):
+     print(model)
+     alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "SMI-TED": "smi-ted"}
+     data = avail_models(raw=True)
+     df = pd.DataFrame(data)
+     print(list(df["Name"].values))
+     # alias.get avoids the KeyError the original lookup raised for short names
+     model_type = alias.get(model, model)
+     if model_type not in list(df["Name"].values):
+         print("Model not available")
+         return
+ 
+     data = avail_datasets()
+     df = pd.DataFrame(data)
+     print(list(df["Dataset"].values))
+ 
+     if dataset in list(df["Dataset"].values):
+         task = dataset
+         with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
+             x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
+         print("Representation loaded successfully")
+     else:
+         print("Custom Dataset")
+         # expected format: "train.csv,test.csv,input_column,output_column"
+         components = dataset.split(",")
+         train_data = pd.read_csv(components[0])[components[2]]
+         test_data = pd.read_csv(components[1])[components[2]]
+ 
+         y_batch = pd.read_csv(components[0])[components[3]]
+         y_batch_test = pd.read_csv(components[1])[components[3]]
+ 
+         x_batch, x_batch_test = get_representation(train_data, test_data, model_type)
+ 
+         print("Representation loaded successfully")
+ 
+     print("Fitting the downstream model ...")
+ 
+     if downstream_model == "XGBClassifier":
+         xgb_predict_concat = XGBClassifier(**params)  # e.g. n_estimators=5000, learning_rate=0.01, max_depth=10
+         xgb_predict_concat.fit(x_batch, y_batch)
+ 
+         y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
+ 
+         roc_auc = roc_auc_score(y_batch_test, y_prob)
+         fpr, tpr, _ = roc_curve(y_batch_test, y_prob)
+         print(f"ROC-AUC Score: {roc_auc:.4f}")
+ 
+         try:
+             with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
+                 class_0, class_1 = pickle.load(f1)
+         except Exception:  # no cached plot (and `task` is undefined for custom datasets)
+             print("Generating latent plots")
+             reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
+                                 verbose=False)
+             n_samples = np.minimum(1000, len(x_batch))
+             features_umap = reducer.fit_transform(x_batch[:n_samples])
+             x = y_batch.values[:n_samples]
+             index_0 = [index for index in range(len(x)) if x[index] == 0]
+             index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+             class_0 = features_umap[index_0]
+             class_1 = features_umap[index_1]
+             print("Generating latent plots : Done")
+ 
+         # vizualize(roc_auc, fpr, tpr, x_batch, y_batch)
+ 
+         result = f"ROC-AUC Score: {roc_auc:.4f}"
+ 
+         return result, roc_auc, fpr, tpr, class_0, class_1
+ 
+     elif downstream_model == "DefaultClassifier":
+         xgb_predict_concat = XGBClassifier()
+         xgb_predict_concat.fit(x_batch, y_batch)
+ 
+         y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
+ 
+         roc_auc = roc_auc_score(y_batch_test, y_prob)
+         fpr, tpr, _ = roc_curve(y_batch_test, y_prob)
+         print(f"ROC-AUC Score: {roc_auc:.4f}")
+ 
+         try:
+             with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
+                 class_0, class_1 = pickle.load(f1)
+         except Exception:
+             print("Generating latent plots")
+             reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
+             n_samples = np.minimum(1000, len(x_batch))
+             features_umap = reducer.fit_transform(x_batch[:n_samples])
+             x = y_batch.values[:n_samples]
+             index_0 = [index for index in range(len(x)) if x[index] == 0]
+             index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+             class_0 = features_umap[index_0]
+             class_1 = features_umap[index_1]
+             print("Generating latent plots : Done")
+ 
+         # vizualize(roc_auc, fpr, tpr, x_batch, y_batch)
+ 
+         result = f"ROC-AUC Score: {roc_auc:.4f}"
+ 
+         return result, roc_auc, fpr, tpr, class_0, class_1
+ 
+     elif downstream_model == "SVR":
+         regressor = SVR(**params)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         print("Generating latent plots")
+         reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
+                             verbose=False)
+         n_samples = np.minimum(1000, len(x_batch))
+         features_umap = reducer.fit_transform(x_batch[:n_samples])
+         x = y_batch.values[:n_samples]
+         # index_0 = [index for index in range(len(x)) if x[index] == 0]
+         # index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+         # regression targets have no class split; reuse the full embedding for both slots
+         class_0 = features_umap  # [index_0]
+         class_1 = features_umap  # [index_1]
+         print("Generating latent plots : Done")
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+     elif downstream_model == "Kernel Ridge":
+         regressor = KernelRidge(**params)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         print("Generating latent plots")
+         reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
+                             verbose=False)
+         n_samples = np.minimum(1000, len(x_batch))
+         features_umap = reducer.fit_transform(x_batch[:n_samples])
+         x = y_batch.values[:n_samples]
+         # index_0 = [index for index in range(len(x)) if x[index] == 0]
+         # index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+         class_0 = features_umap  # [index_0]
+         class_1 = features_umap  # [index_1]
+         print("Generating latent plots : Done")
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+     elif downstream_model == "Linear Regression":
+         regressor = LinearRegression(**params)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         print("Generating latent plots")
+         reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
+                             verbose=False)
+         n_samples = np.minimum(1000, len(x_batch))
+         features_umap = reducer.fit_transform(x_batch[:n_samples])
+         x = y_batch.values[:n_samples]
+         # index_0 = [index for index in range(len(x)) if x[index] == 0]
+         # index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+         class_0 = features_umap  # [index_0]
+         class_1 = features_umap  # [index_1]
+         print("Generating latent plots : Done")
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+     elif downstream_model == "DefaultRegressor":
+         regressor = SVR(kernel="rbf", degree=3, C=5, gamma="scale", epsilon=0.01)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         print("Generating latent plots")
+         reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
+                             verbose=False)
+         n_samples = np.minimum(1000, len(x_batch))
+         features_umap = reducer.fit_transform(x_batch[:n_samples])
+         x = y_batch.values[:n_samples]
+         # index_0 = [index for index in range(len(x)) if x[index] == 0]
+         # index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+         class_0 = features_umap  # [index_0]
+         class_1 = features_umap  # [index_1]
+         print("Generating latent plots : Done")
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+ def multi_modal(model_list, dataset, downstream_model, params):
+     print(model_list)
+     data = avail_datasets()
+     df = pd.DataFrame(data)
+ 
+     if dataset in list(df["Dataset"].values):
+         task = dataset
+         predefined = True
+     else:
+         predefined = False
+         # expected format: "train.csv,test.csv,input_column,output_column"
+         components = dataset.split(",")
+         train_data = pd.read_csv(components[0])[components[2]]
+         test_data = pd.read_csv(components[1])[components[2]]
+ 
+         y_batch = pd.read_csv(components[0])[components[3]]
+         y_batch_test = pd.read_csv(components[1])[components[3]]
+ 
+         print("Custom Dataset loaded")
+ 
+     data = avail_models(raw=True)
+     df = pd.DataFrame(data)
+ 
+     alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "SMI-TED": "smi-ted"}
+     # if set(model_list).issubset(list(df["Name"].values)):
+     if set(model_list).issubset(alias.keys()):
+         for i, model in enumerate(model_list):
+             model_type = alias.get(model, model)
+ 
+             if i == 0:
+                 if predefined:
+                     with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
+                         x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
+                     print(f"Loaded representation/{task}_{model_type}.pkl")
+                 else:
+                     x_batch, x_batch_test = get_representation(train_data, test_data, model_type)
+                     x_batch = pd.DataFrame(x_batch)
+                     x_batch_test = pd.DataFrame(x_batch_test)
+             else:
+                 if predefined:
+                     with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
+                         x_batch_1, y_batch_1, x_batch_test_1, y_batch_test_1 = pickle.load(f1)
+                     print(f"Loaded representation/{task}_{model_type}.pkl")
+                 else:
+                     x_batch_1, x_batch_test_1 = get_representation(train_data, test_data, model_type)
+                     x_batch_1 = pd.DataFrame(x_batch_1)
+                     x_batch_test_1 = pd.DataFrame(x_batch_test_1)
+ 
+                 # fuse the per-model embeddings feature-wise
+                 x_batch = pd.concat([x_batch, x_batch_1], axis=1)
+                 x_batch_test = pd.concat([x_batch_test, x_batch_test_1], axis=1)
+ 
+     else:
+         print("Model not available")
+         return
+ 
+     num_columns = x_batch_test.shape[1]
+     x_batch_test.columns = [f'{i + 1}' for i in range(num_columns)]
+ 
+     num_columns = x_batch.shape[1]
+     x_batch.columns = [f'{i + 1}' for i in range(num_columns)]
+ 
+     print("Representations loaded successfully")
+     try:
+         with open(f"plot_emb/{task}_multi.pkl", "rb") as f1:
+             class_0, class_1 = pickle.load(f1)
+     except Exception:  # no cached plot (custom datasets never define `task`)
+         print("Generating latent plots")
+         reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
+                             verbose=False)
+         n_samples = np.minimum(1000, len(x_batch))
+         features_umap = reducer.fit_transform(x_batch[:n_samples])
+ 
+         if "Classifier" in downstream_model:
+             x = y_batch.values[:n_samples]
+             index_0 = [index for index in range(len(x)) if x[index] == 0]
+             index_1 = [index for index in range(len(x)) if x[index] == 1]
+ 
+             class_0 = features_umap[index_0]
+             class_1 = features_umap[index_1]
+         else:
+             class_0 = features_umap
+             class_1 = features_umap
+ 
+         print("Generating latent plots : Done")
+ 
+     print("Fitting the downstream model ...")
+ 
+     if downstream_model == "XGBClassifier":
+         xgb_predict_concat = XGBClassifier(**params)  # e.g. n_estimators=5000, learning_rate=0.01, max_depth=10
+         xgb_predict_concat.fit(x_batch, y_batch)
+ 
+         y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
+ 
+         roc_auc = roc_auc_score(y_batch_test, y_prob)
+         fpr, tpr, _ = roc_curve(y_batch_test, y_prob)
+ 
+         # vizualize(roc_auc, fpr, tpr, x_batch, y_batch)
+         print(f"ROC-AUC Score: {roc_auc:.4f}")
+         result = f"ROC-AUC Score: {roc_auc:.4f}"
+ 
+         return result, roc_auc, fpr, tpr, class_0, class_1
+ 
+     elif downstream_model == "DefaultClassifier":
+         xgb_predict_concat = XGBClassifier()
+         xgb_predict_concat.fit(x_batch, y_batch)
+ 
+         y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
+ 
+         roc_auc = roc_auc_score(y_batch_test, y_prob)
+         fpr, tpr, _ = roc_curve(y_batch_test, y_prob)
+ 
+         # vizualize(roc_auc, fpr, tpr, x_batch, y_batch)
+         print(f"ROC-AUC Score: {roc_auc:.4f}")
+         result = f"ROC-AUC Score: {roc_auc:.4f}"
+ 
+         return result, roc_auc, fpr, tpr, class_0, class_1
+ 
+     elif downstream_model == "SVR":
+         regressor = SVR(**params)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+     elif downstream_model == "Linear Regression":
+         regressor = LinearRegression(**params)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+     elif downstream_model == "Kernel Ridge":
+         regressor = KernelRidge(**params)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+     elif downstream_model == "DefaultRegressor":
+         regressor = SVR(kernel="rbf", degree=3, C=5, gamma="scale", epsilon=0.01)
+         model = TransformedTargetRegressor(regressor=regressor,
+                                            transformer=MinMaxScaler(feature_range=(-1, 1))
+                                            ).fit(x_batch, y_batch)
+ 
+         y_prob = model.predict(x_batch_test)
+         RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
+ 
+         print(f"RMSE Score: {RMSE_score:.4f}")
+         result = f"RMSE Score: {RMSE_score:.4f}"
+ 
+         return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
+ 
+ def finetune_optuna(x_batch, y_batch, x_batch_test, y_test):
+     print("Finetuning with Optuna and calculating ROC AUC Score ...")
+     X_train = x_batch.values
+     y_train = y_batch.values
+     X_test = x_batch_test.values
+     y_test = y_test.values
+ 
+     def objective(trial):
+         # Parameters to be optimized
+         params = {
+             # 'objective': 'binary:logistic',
+             'eval_metric': 'auc',
+             'verbosity': 0,
+             # note: xgb.train ignores 'n_estimators'; num_boost_round controls rounds
+             'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
+             # 'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
+             # 'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
+             'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
+             'max_depth': trial.suggest_int('max_depth', 1, 12),
+             # 'eta': trial.suggest_float('eta', 1e-8, 1.0, log=True),
+             # 'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
+             # 'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
+             # "subsample": trial.suggest_float("subsample", 0.05, 1.0),
+             # "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
+         }
+ 
+         # Train XGBoost model
+         dtrain = xgb.DMatrix(X_train, label=y_train)
+         dtest = xgb.DMatrix(X_test, label=y_test)
+ 
+         model = xgb.train(params, dtrain)
+ 
+         # Predict probabilities
+         y_pred = model.predict(dtest)
+ 
+         # Calculate ROC AUC score
+         roc_auc = roc_auc_score(y_test, y_pred)
+         print("ROC_AUC : ", roc_auc)
+ 
+         return roc_auc
+ 
+     # The study invocation was missing in the original sketch; a minimal completion:
+     study = optuna.create_study(direction='maximize')
+     study.optimize(objective, n_trials=20)
+     return study.best_params
+ 
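To make the control flow above concrete, here is a hedged end-to-end sketch (an editor's example, not part of the commit; it assumes the cached `representation/bace_bart.pkl` produced for this Space exists):

```python
# Editor's sketch: one encoder plus an XGBoost head on a predefined dataset.
result, roc_auc, fpr, tpr, class_0, class_1 = single_modal(
    "SELFIES-TED", "bace", "XGBClassifier",
    {"n_estimators": 500, "learning_rate": 0.05, "max_depth": 6},
)
print(result)  # e.g. "ROC-AUC Score: 0.8..."

# Fusing several encoders feature-wise works the same way:
result, *_ = multi_modal(["SELFIES-TED", "MHG-GED"], "bace", "DefaultClassifier", {})
```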
models/mhg_model/.DS_Store ADDED
Binary file (8.2 kB).
 
models/mhg_model/README.md ADDED
@@ -0,0 +1,75 @@
+ # mhg-gnn
+ 
+ This repository provides the PyTorch source code associated with our publication, "MHG-GNN: Combination of Molecular Hypergraph Grammar with Graph Neural Network".
+ 
+ **Paper:** [Arxiv Link](https://arxiv.org/pdf/2309.16374)
+ 
+ ![mhg-gnn](images/mhg_example1.png)
+ 
+ ## Introduction
+ 
+ We present MHG-GNN, an autoencoder architecture whose encoder is based on a GNN and whose decoder is a sequential model built on MHG.
+ Since the encoder is a GNN variant, MHG-GNN can accept any molecule as input and demonstrates high predictive performance on molecular graph data.
+ In addition, the decoder inherits the theoretical guarantee of MHG that the output is always a structurally valid molecule.
+ 
+ ## Table of Contents
+ 
+ 1. [Getting Started](#getting-started)
+     1. [Pretrained Models and Training Logs](#pretrained-models-and-training-logs)
+     2. [Installation](#installation)
+ 2. [Feature Extraction](#feature-extraction)
+ 
+ ## Getting Started
+ 
+ **This code and environment have been tested on Intel E5-2667 CPUs at 3.30GHz and NVIDIA A100 Tensor Core GPUs.**
+ 
+ ### Pretrained Models and Training Logs
+ 
+ We provide checkpoints of the MHG-GNN model pre-trained on a dataset of ~1.34M molecules curated from PubChem. (later) For model weights: [HuggingFace Link]()
+ 
+ Add the MHG-GNN `pre-trained weights.pt` to the `models/` directory according to your needs.
+ 
+ ### Installation
+ 
+ We recommend creating a virtual environment, for example:
+ 
+ ```
+ python3 -m venv .venv
+ . .venv/bin/activate
+ ```
+ 
+ Once the virtual environment is activated, type the following commands:
+ 
+ ```
+ git clone [email protected]:CMD-TRL/mhg-gnn.git
+ cd ./mhg-gnn
+ pip install .
+ ```
+ 
+ ## Feature Extraction
+ 
+ The example notebook [mhg-gnn_encoder_decoder_example.ipynb](notebooks/mhg-gnn_encoder_decoder_example.ipynb) contains code to load checkpoint files and use the pre-trained model for encoding and decoding tasks.
+ 
+ To load mhg-gnn, you can simply use:
+ 
+ ```python
+ import torch
+ import load
+ 
+ model = load.load()
+ ```
+ 
+ To encode SMILES into embeddings, you can use:
+ 
+ ```python
+ with torch.no_grad():
+     repr = model.encode(["CCO", "O=C=O", "OC(=O)c1ccccc1C(=O)O"])
+ ```
+ 
+ To decode, i.e., to map embeddings back to SMILES strings, you can use:
+ 
+ ```python
+ orig = model.decode(repr)
+ ```
models/mhg_model/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # -*- coding:utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
models/mhg_model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (224 Bytes).
 
models/mhg_model/__pycache__/load.cpython-310.pyc ADDED
Binary file (3.16 kB).
 
models/mhg_model/graph_grammar/__init__.py ADDED
@@ -0,0 +1,19 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
+ """
+ 
+ """ Title """
+ 
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
+ __version__ = "0.1"
+ __date__ = "Jan 1 2018"
+ 
models/mhg_model/graph_grammar/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (676 Bytes).
 
models/mhg_model/graph_grammar/__pycache__/hypergraph.cpython-310.pyc ADDED
Binary file (15.3 kB).
 
models/mhg_model/graph_grammar/algo/__init__.py ADDED
@@ -0,0 +1,20 @@
+ #!/usr/bin/env python
+ # -*- coding:utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+ 
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
+ """
+ 
+ """ Title """
+ 
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
+ __version__ = "0.1"
+ __date__ = "Jan 1 2018"
+ 
models/mhg_model/graph_grammar/algo/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (681 Bytes).
 
models/mhg_model/graph_grammar/algo/__pycache__/tree_decomposition.cpython-310.pyc ADDED
Binary file (19.5 kB).
 
models/mhg_model/graph_grammar/algo/tree_decomposition.py ADDED
@@ -0,0 +1,821 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
+ """
+ 
+ """ Title """
+ 
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2017"
+ __version__ = "0.1"
+ __date__ = "Dec 11 2017"
+ 
+ from copy import deepcopy
+ from itertools import combinations
+ from ..hypergraph import Hypergraph
+ import networkx as nx
+ import numpy as np
+ 
+ class CliqueTree(nx.Graph):
+     ''' clique tree object
+ 
+     Attributes
+     ----------
+     hg : Hypergraph
+         This hypergraph will be decomposed.
+     root_hg : Hypergraph
+         Hypergraph on the root node.
+     ident_node_dict : dict
+         ident_node_dict[key_node] gives a list of nodes that are identical (i.e., the adjacent hyperedges are common)
+     '''
+     def __init__(self, hg=None, **kwargs):
+         self.hg = deepcopy(hg)
+         if self.hg is not None:
+             self.ident_node_dict = self.hg.get_identical_node_dict()
+         else:
+             self.ident_node_dict = {}
+         super().__init__(**kwargs)
+ 
+     @property
+     def root_hg(self):
+         ''' return the hypergraph on the root node
+         '''
+         return self.nodes[0]['subhg']
+ 
+     @root_hg.setter
+     def root_hg(self, hypergraph):
+         ''' set the hypergraph on the root node
+         '''
+         self.nodes[0]['subhg'] = hypergraph
+ 
+     def insert_subhg(self, subhypergraph: Hypergraph) -> None:
+         ''' insert a subhypergraph, which is extracted from the root hypergraph, into the tree.
+ 
+         Parameters
+         ----------
+         subhg : Hypergraph
+         '''
+         num_nodes = self.number_of_nodes()
+         self.add_node(num_nodes, subhg=subhypergraph)
+         self.add_edge(num_nodes, 0)
+         adj_nodes = deepcopy(list(self.adj[0].keys()))
+         for each_node in adj_nodes:
+             if len(self.nodes[each_node]["subhg"].nodes.intersection(
+                     self.nodes[num_nodes]["subhg"].nodes)
+                    - self.root_hg.nodes) != 0 and each_node != num_nodes:
+                 self.remove_edge(0, each_node)
+                 self.add_edge(each_node, num_nodes)
+ 
+     def to_irredundant(self) -> None:
+         ''' convert the clique tree to be irredundant
+         '''
+         for each_node in self.hg.nodes:
+             subtree = self.subgraph([
+                 each_tree_node for each_tree_node in self.nodes()
+                 if each_node in self.nodes[each_tree_node]["subhg"].nodes]).copy()
+             leaf_node_list = [x for x in subtree.nodes() if subtree.degree(x) == 1]
+             redundant_leaf_node_list = []
+             for each_leaf_node in leaf_node_list:
+                 if len(self.nodes[each_leaf_node]["subhg"].adj_edges(each_node)) == 0:
+                     redundant_leaf_node_list.append(each_leaf_node)
+             for each_red_leaf_node in redundant_leaf_node_list:
+                 current_node = each_red_leaf_node
+                 while subtree.degree(current_node) == 1 \
+                       and len(subtree.nodes[current_node]["subhg"].adj_edges(each_node)) == 0:
+                     self.nodes[current_node]["subhg"].remove_node(each_node)
+                     remove_node = current_node
+                     current_node = list(dict(subtree[remove_node]).keys())[0]
+                     subtree.remove_node(remove_node)
+ 
+         fixed_node_set = deepcopy(self.nodes)
+         for each_node in fixed_node_set:
+             if self.nodes[each_node]["subhg"].num_edges == 0:
+                 if len(self[each_node]) == 1:
+                     self.remove_node(each_node)
+                 elif len(self[each_node]) == 2:
+                     self.add_edge(*self[each_node])
+                     self.remove_node(each_node)
+                 else:
+                     pass
+             else:
+                 pass
+ 
+         redundant = True
+         while redundant:
+             redundant = False
+             fixed_edge_set = deepcopy(self.edges)
+             remove_node_set = set()
+             for node_1, node_2 in fixed_edge_set:
+                 if node_1 in remove_node_set or node_2 in remove_node_set:
+                     pass
+                 else:
+                     if self.nodes[node_1]['subhg'].is_subhg(self.nodes[node_2]['subhg']):
+                         redundant = True
+                         adj_node_list = set(self.adj[node_1]) - {node_2}
+                         self.remove_node(node_1)
+                         remove_node_set.add(node_1)
+                         for each_node in adj_node_list:
+                             self.add_edge(node_2, each_node)
+ 
+                     elif self.nodes[node_2]['subhg'].is_subhg(self.nodes[node_1]['subhg']):
+                         redundant = True
+                         adj_node_list = set(self.adj[node_2]) - {node_1}
+                         self.remove_node(node_2)
+                         remove_node_set.add(node_2)
+                         for each_node in adj_node_list:
+                             self.add_edge(node_1, each_node)
+ 
+     def node_update(self, key_node: str, subhg) -> None:
+         """ given a pair of a hypergraph, H, and its subhypergraph, sH, update the root to H minus sH.
+ 
+         Parameters
+         ----------
+         key_node : str
+             key node that must be removed.
+         subhg : Hypergraph
+         """
+         for each_edge in subhg.edges:
+             self.root_hg.remove_edge(each_edge)
+         self.root_hg.remove_nodes(self.ident_node_dict[key_node])
+ 
+         adj_node_list = list(subhg.nodes)
+         for each_node in subhg.nodes:
+             if each_node not in self.ident_node_dict[key_node]:
+                 if set(self.root_hg.adj_edges(each_node)).issubset(subhg.edges):
+                     self.root_hg.remove_node(each_node)
+                     adj_node_list.remove(each_node)
+             else:
+                 adj_node_list.remove(each_node)
+ 
+         for each_node_1, each_node_2 in combinations(adj_node_list, 2):
+             if not self.root_hg.is_adj(each_node_1, each_node_2):
+                 self.root_hg.add_edge(set([each_node_1, each_node_2]), attr_dict=dict(tmp=True))
+ 
+         subhg.remove_edges_with_attr({'tmp': True})
+         self.insert_subhg(subhg)
+ 
+     def update(self, subhg, remove_nodes=False):
+         """ given a pair of a hypergraph, H, and its subhypergraph, sH, update the root to H minus sH.
+ 
+         Parameters
+         ----------
+         subhg : Hypergraph
+         """
+         for each_edge in subhg.edges:
+             self.root_hg.remove_edge(each_edge)
+         if remove_nodes:
+             remove_edge_list = []
+             for each_edge in self.root_hg.edges:
+                 if set(self.root_hg.nodes_in_edge(each_edge)).issubset(subhg.nodes)\
+                    and self.root_hg.edge_attr(each_edge).get('tmp', False):
+                     remove_edge_list.append(each_edge)
+             self.root_hg.remove_edges(remove_edge_list)
+ 
+         adj_node_list = list(subhg.nodes)
+         for each_node in subhg.nodes:
+             if self.root_hg.degree(each_node) == 0:
+                 self.root_hg.remove_node(each_node)
+                 adj_node_list.remove(each_node)
+ 
+         if len(adj_node_list) != 1 and not remove_nodes:
+             self.root_hg.add_edge(set(adj_node_list), attr_dict=dict(tmp=True))
+         '''
+         else:
+             for each_node_1, each_node_2 in combinations(adj_node_list, 2):
+                 if not self.root_hg.is_adj(each_node_1, each_node_2):
+                     self.root_hg.add_edge(
+                         [each_node_1, each_node_2], attr_dict=dict(tmp=True))
+         '''
+         subhg.remove_edges_with_attr({'tmp': True})
+         self.insert_subhg(subhg)
+ 
+ def _get_min_deg_node(hg, ident_node_dict: dict, mode='mol'):
+     if mode == 'standard':
+         degree_dict = hg.degrees()
+         min_deg_node = min(degree_dict, key=degree_dict.get)
+         min_deg_subhg = hg.adj_subhg(min_deg_node, ident_node_dict)
+         return min_deg_node, min_deg_subhg
+     elif mode == 'mol':
+         degree_dict = hg.degrees()
+         min_deg = min(degree_dict.values())
+         min_deg_node_list = [each_node for each_node in hg.nodes if degree_dict[each_node] == min_deg]
+         min_deg_subhg_list = [hg.adj_subhg(each_min_deg_node, ident_node_dict)
+                               for each_min_deg_node in min_deg_node_list]
+         best_score = np.inf
+         best_idx = -1
+         for each_idx in range(len(min_deg_subhg_list)):
+             if min_deg_subhg_list[each_idx].num_nodes < best_score:
+                 best_score = min_deg_subhg_list[each_idx].num_nodes  # bug fix: best_score was never updated
+                 best_idx = each_idx
+         # bug fix: return the best candidate, not whichever index the loop ended on
+         return min_deg_node_list[best_idx], min_deg_subhg_list[best_idx]
+     else:
+         raise ValueError
+ 
+ def tree_decomposition(hg, irredundant=True):
+     """ compute a tree decomposition of the input hypergraph
+ 
+     Parameters
+     ----------
+     hg : Hypergraph
+         hypergraph to be decomposed
+     irredundant : bool
+         if True, an irredundant tree decomposition will be computed.
+ 
+     Returns
+     -------
+     clique_tree : nx.Graph
+         each node contains a subhypergraph of `hg`
+     """
+     org_hg = hg.copy()
+     ident_node_dict = hg.get_identical_node_dict()
+     clique_tree = CliqueTree(org_hg)
+     clique_tree.add_node(0, subhg=org_hg)
+     while True:
+         degree_dict = org_hg.degrees()
+         min_deg_node = min(degree_dict, key=degree_dict.get)
+         min_deg_subhg = org_hg.adj_subhg(min_deg_node, ident_node_dict)
+         if org_hg.nodes == min_deg_subhg.nodes:
+             break
+ 
+         # org_hg and min_deg_subhg are divided
+         clique_tree.node_update(min_deg_node, min_deg_subhg)
+ 
+     clique_tree.root_hg.remove_edges_with_attr({'tmp': True})
+ 
+     if irredundant:
+         clique_tree.to_irredundant()
+     return clique_tree
+ 
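For orientation, a hedged usage sketch of `tree_decomposition` (an editor's example, not part of the commit; `hg` is assumed to be a `Hypergraph` built elsewhere in this package, e.g. via the SMILES I/O helpers in `graph_grammar/io/smi.py`):

```python
# Editor's sketch: inspect the bags of a tree decomposition.
ct = tree_decomposition(hg, irredundant=True)   # `hg` assumed constructed upstream
for n in ct.nodes:
    print(n, ct.nodes[n]["subhg"].num_nodes)    # number of nodes in each bag
```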
+ 
+ def tree_decomposition_with_hrg(hg, hrg, irredundant=True, return_root=False):
+     ''' compute a tree decomposition given a hyperedge replacement grammar.
+     the resultant clique tree should induce a less compact HRG.
+ 
+     Parameters
+     ----------
+     hg : Hypergraph
+         hypergraph to be decomposed
+     hrg : HyperedgeReplacementGrammar
+         current HRG
+     irredundant : bool
+         if True, an irredundant tree decomposition will be computed.
+ 
+     Returns
+     -------
+     clique_tree : nx.Graph
+         each node contains a subhypergraph of `hg`
+     '''
+     org_hg = hg.copy()
+     ident_node_dict = hg.get_identical_node_dict()
+     clique_tree = CliqueTree(org_hg)
+     clique_tree.add_node(0, subhg=org_hg)
+     root_node = 0
+ 
+     # construct a clique tree using HRG
+     success_any = True
+     while success_any:
+         success_any = False
+         for each_prod_rule in hrg.prod_rule_list:
+             org_hg, success, subhg = each_prod_rule.revert(org_hg, True)
+             if success:
+                 if each_prod_rule.is_start_rule:
+                     root_node = clique_tree.number_of_nodes()
+                 success_any = True
+                 subhg.remove_edges_with_attr({'terminal': False})
+                 clique_tree.root_hg = org_hg
+                 clique_tree.insert_subhg(subhg)
+ 
+     clique_tree.root_hg = org_hg
+ 
+     for each_edge in deepcopy(org_hg.edges):
+         if not org_hg.edge_attr(each_edge)['terminal']:
+             node_list = org_hg.nodes_in_edge(each_edge)
+             org_hg.remove_edge(each_edge)
+ 
+             for each_node_1, each_node_2 in combinations(node_list, 2):
+                 if not org_hg.is_adj(each_node_1, each_node_2):
+                     org_hg.add_edge([each_node_1, each_node_2], attr_dict=dict(tmp=True))
+ 
+     # construct a clique tree using the existing algorithm
+     degree_dict = org_hg.degrees()
+     if degree_dict:
+         while True:
+             min_deg_node, min_deg_subhg = _get_min_deg_node(org_hg, ident_node_dict)
+             if org_hg.nodes == min_deg_subhg.nodes:
+                 break
+ 
+             # org_hg and min_deg_subhg are divided
+             clique_tree.node_update(min_deg_node, min_deg_subhg)
+ 
+     clique_tree.root_hg.remove_edges_with_attr({'tmp': True})
+     if irredundant:
+         clique_tree.to_irredundant()
+ 
+     if return_root:
+         if root_node == 0 and 0 not in clique_tree.nodes:
+             root_node = clique_tree.number_of_nodes()
+             while root_node not in clique_tree.nodes:
+                 root_node -= 1
+         elif root_node not in clique_tree.nodes:
+             while root_node not in clique_tree.nodes:
+                 root_node -= 1
+         else:
+             pass
+         return clique_tree, root_node
+     else:
+         return clique_tree
+ 
+ def tree_decomposition_from_leaf(hg, irredundant=True):
+     """ compute a tree decomposition of the input hypergraph
+ 
+     Parameters
+     ----------
+     hg : Hypergraph
+         hypergraph to be decomposed
+     irredundant : bool
+         if True, an irredundant tree decomposition will be computed.
+ 
+     Returns
+     -------
+     clique_tree : nx.Graph
+         each node contains a subhypergraph of `hg`
+     """
+     def apply_normal_decomposition(clique_tree):
+         degree_dict = clique_tree.root_hg.degrees()
+         min_deg_node = min(degree_dict, key=degree_dict.get)
+         min_deg_subhg = clique_tree.root_hg.adj_subhg(min_deg_node, clique_tree.ident_node_dict)
+         if clique_tree.root_hg.nodes == min_deg_subhg.nodes:
+             return clique_tree, False
+         clique_tree.node_update(min_deg_node, min_deg_subhg)
+         return clique_tree, True
+ 
+     def apply_min_edge_deg_decomposition(clique_tree):
+         edge_degree_dict = clique_tree.root_hg.edge_degrees()
+         non_tmp_edge_list = [each_edge for each_edge in clique_tree.root_hg.edges
+                              if not clique_tree.root_hg.edge_attr(each_edge).get('tmp')]
+         if not non_tmp_edge_list:
+             return clique_tree, False
+         min_deg_edge = None
+         min_deg = np.inf
+         for each_edge in non_tmp_edge_list:
+             if min_deg > edge_degree_dict[each_edge]:
+                 min_deg_edge = each_edge
+                 min_deg = edge_degree_dict[each_edge]
+         node_list = clique_tree.root_hg.nodes_in_edge(min_deg_edge)
+         min_deg_subhg = clique_tree.root_hg.get_subhg(
+             node_list, [min_deg_edge], clique_tree.ident_node_dict)
+         if clique_tree.root_hg.nodes == min_deg_subhg.nodes:
+             return clique_tree, False
+         clique_tree.update(min_deg_subhg)
+         return clique_tree, True
+ 
+     org_hg = hg.copy()
+     clique_tree = CliqueTree(org_hg)
+     clique_tree.add_node(0, subhg=org_hg)
+ 
+     success = True
+     while success:
+         clique_tree, success = apply_min_edge_deg_decomposition(clique_tree)
+         if not success:
+             clique_tree, success = apply_normal_decomposition(clique_tree)
+ 
+     clique_tree.root_hg.remove_edges_with_attr({'tmp': True})
+     if irredundant:
+         clique_tree.to_irredundant()
+     return clique_tree
+ 
+ def topological_tree_decomposition(
+         hg, irredundant=True, rip_labels=True, shrink_cycle=False, contract_cycles=False):
+     ''' compute a tree decomposition of the input hypergraph
+ 
+     Parameters
+     ----------
+     hg : Hypergraph
+         hypergraph to be decomposed
+     irredundant : bool
+         if True, an irredundant tree decomposition will be computed.
+ 
+     Returns
+     -------
+     clique_tree : CliqueTree
+         each node contains a subhypergraph of `hg`
+     '''
+     def _contract_tree(clique_tree):
+         ''' contract a single leaf
+ 
+         Parameters
+         ----------
+         clique_tree : CliqueTree
+ 
+         Returns
+         -------
+         CliqueTree, bool
+             bool represents whether this operation succeeds or not.
+         '''
+         edge_degree_dict = clique_tree.root_hg.edge_degrees()
+         leaf_edge_list = [each_edge for each_edge in clique_tree.root_hg.edges
+                           if (not clique_tree.root_hg.edge_attr(each_edge).get('tmp'))
+                           and edge_degree_dict[each_edge] == 1]
+         if not leaf_edge_list:
+             return clique_tree, False
+         min_deg_edge = leaf_edge_list[0]
+         node_list = clique_tree.root_hg.nodes_in_edge(min_deg_edge)
+         min_deg_subhg = clique_tree.root_hg.get_subhg(
+             node_list, [min_deg_edge], clique_tree.ident_node_dict)
+         if clique_tree.root_hg.nodes == min_deg_subhg.nodes:
+             return clique_tree, False
+         clique_tree.update(min_deg_subhg)
+         return clique_tree, True
+ 
+     def _rip_labels_from_cycles(clique_tree, org_hg):
+         ''' rip hyperedge-labels off
+ 
+         Parameters
+         ----------
+         clique_tree : CliqueTree
+         org_hg : Hypergraph
+ 
+         Returns
+         -------
+         CliqueTree, bool
+             bool represents whether this operation succeeds or not.
+         '''
+         ident_node_dict = clique_tree.ident_node_dict  # hg.get_identical_node_dict()
+         for each_edge in clique_tree.root_hg.edges:
+             if each_edge in org_hg.edges:
+                 if org_hg.in_cycle(each_edge):
+                     node_list = clique_tree.root_hg.nodes_in_edge(each_edge)
+                     subhg = clique_tree.root_hg.get_subhg(
+                         node_list, [each_edge], ident_node_dict)
+                     if clique_tree.root_hg.nodes == subhg.nodes:
+                         return clique_tree, False
+                     clique_tree.update(subhg)
+                     '''
+                     in_cycle_dict = {each_node: org_hg.node_attr(each_node)['is_in_ring'] for each_node in node_list}
+                     if not all(in_cycle_dict.values()):
+                         node_not_in_cycle = [each_node for each_node in in_cycle_dict.keys() if not in_cycle_dict[each_node]][0]
+                         node_list = [node_not_in_cycle]
+                         node_list.extend(clique_tree.root_hg.adj_nodes(node_not_in_cycle))
+                         edge_list = clique_tree.root_hg.adj_edges(node_not_in_cycle)
+                         import pdb; pdb.set_trace()
+                         subhg = clique_tree.root_hg.get_subhg(
+                             node_list, edge_list, ident_node_dict)
+ 
+                         clique_tree.update(subhg)
+                     '''
+                     return clique_tree, True
+         return clique_tree, False
+ 
+     def _shrink_cycle(clique_tree):
+         ''' shrink a cycle
+ 
+         Parameters
+         ----------
+         clique_tree : CliqueTree
+ 
+         Returns
+         -------
+         CliqueTree, bool
+             bool represents whether this operation succeeds or not.
+         '''
+         def filter_subhg(subhg, hg, key_node):
+             num_nodes_cycle = 0
+             nodes_in_cycle_list = []
+             for each_node in subhg.nodes:
+                 if hg.in_cycle(each_node):
+                     num_nodes_cycle += 1
+                     if each_node != key_node:
+                         nodes_in_cycle_list.append(each_node)
+                 if num_nodes_cycle > 3:
+                     break
+             if num_nodes_cycle != 3:
+                 return False
+             else:
+                 for each_edge in hg.edges:
+                     if set(nodes_in_cycle_list).issubset(hg.nodes_in_edge(each_edge)):
+                         return False
+                 return True
+ 
+         # ident_node_dict = hg.get_identical_node_dict()
+         ident_node_dict = clique_tree.ident_node_dict
+         for each_node in clique_tree.root_hg.nodes:
+             if clique_tree.root_hg.in_cycle(each_node)\
+                and filter_subhg(clique_tree.root_hg.adj_subhg(each_node, ident_node_dict),
+                                 clique_tree.root_hg,
+                                 each_node):
+                 target_node = each_node
+                 target_subhg = clique_tree.root_hg.adj_subhg(target_node, ident_node_dict)
+                 if clique_tree.root_hg.nodes == target_subhg.nodes:
+                     return clique_tree, False
+                 clique_tree.update(target_subhg)
+                 return clique_tree, True
+         return clique_tree, False
+ 
+     def _contract_cycles(clique_tree):
+         '''
+         remove a subhypergraph that looks like a cycle on a leaf.
+ 
+         Parameters
+         ----------
+         clique_tree : CliqueTree
+ 
+         Returns
+         -------
+         CliqueTree, bool
+             bool represents whether this operation succeeds or not.
+         '''
+         def _divide_hg(hg):
+             ''' divide a hypergraph into subhypergraphs such that
+             each subhypergraph is connected to each other in a tree-like way.
+ 
+             Parameters
+             ----------
+             hg : Hypergraph
+ 
+             Returns
+             -------
+             list of Hypergraphs
+                 each element corresponds to a subhypergraph of `hg`
+             '''
+             for each_node in hg.nodes:
+                 if hg.is_dividable(each_node):
+                     adj_edges_dict = {each_edge: hg.in_cycle(each_edge) for each_edge in hg.adj_edges(each_node)}
+                     '''
+                     if any(adj_edges_dict.values()):
+                         import pdb; pdb.set_trace()
+                         edge_in_cycle = [each_key for each_key, each_val in adj_edges_dict.items() if each_val][0]
+                         subhg1, subhg2, subhg3 = hg.divide(each_node, edge_in_cycle)
+                         return _divide_hg(subhg1) + _divide_hg(subhg2) + _divide_hg(subhg3)
+                     else:
+                     '''
+                     subhg1, subhg2 = hg.divide(each_node)
+                     return _divide_hg(subhg1) + _divide_hg(subhg2)
+             return [hg]
+ 
+         def _is_leaf(hg, divided_subhg) -> bool:
+             ''' judge whether subhg is leaf-like in the original hypergraph
+ 
+             Parameters
+             ----------
+             hg : Hypergraph
+             divided_subhg : Hypergraph
+                 `divided_subhg` is a subhypergraph of `hg`
+ 
+             Returns
+             -------
+             bool
+             '''
+             '''
+             adj_edges_set = set([])
+             for each_node in divided_subhg.nodes:
+                 adj_edges_set.update(set(hg.adj_edges(each_node)))
+ 
+             _hg = deepcopy(hg)
+             _hg.remove_subhg(divided_subhg)
+             if nx.is_connected(_hg.hg) != (len(adj_edges_set - divided_subhg.edges) == 1):
+                 import pdb; pdb.set_trace()
+             return len(adj_edges_set - divided_subhg.edges) == 1
+             '''
+             _hg = deepcopy(hg)
+             _hg.remove_subhg(divided_subhg)
+             return nx.is_connected(_hg.hg)
+ 
+         subhg_list = _divide_hg(clique_tree.root_hg)
+         if len(subhg_list) == 1:
+             return clique_tree, False
+         else:
+             while len(subhg_list) > 1:
+                 max_leaf_subhg = None
+                 for each_subhg in subhg_list:
+                     if _is_leaf(clique_tree.root_hg, each_subhg):
+                         if max_leaf_subhg is None:
+                             max_leaf_subhg = each_subhg
+                         elif max_leaf_subhg.num_nodes < each_subhg.num_nodes:
+                             max_leaf_subhg = each_subhg
+                 clique_tree.update(max_leaf_subhg)
+                 subhg_list.remove(max_leaf_subhg)
+             return clique_tree, True
+ 
+     org_hg = hg.copy()
+     clique_tree = CliqueTree(org_hg)
+     clique_tree.add_node(0, subhg=org_hg)
+ 
+     success = True
+     while success:
+         '''
+         clique_tree, success = _rip_labels_from_cycles(clique_tree, hg)
+         if not success:
+             clique_tree, success = _contract_cycles(clique_tree)
+         '''
+         clique_tree, success = _contract_tree(clique_tree)
+         if not success:
+             if rip_labels:
+                 clique_tree, success = _rip_labels_from_cycles(clique_tree, hg)
+             if not success:
+                 if shrink_cycle:
+                     clique_tree, success = _shrink_cycle(clique_tree)
+                 if not success:
+                     if contract_cycles:
+                         clique_tree, success = _contract_cycles(clique_tree)
+     clique_tree.root_hg.remove_edges_with_attr({'tmp': True})
+     if irredundant:
+         clique_tree.to_irredundant()
+     return clique_tree
+ 
+ 
+ def molecular_tree_decomposition(hg, irredundant=True):
+     """ compute a tree decomposition of the input molecular hypergraph
637
+
638
+ Parameters
639
+ ----------
640
+ hg : Hypergraph
641
+ molecular hypergraph to be decomposed
642
+ irredundant : bool
643
+ if True, irredundant tree decomposition will be computed.
644
+
645
+ Returns
646
+ -------
647
+ clique_tree : CliqueTree
648
+ each node contains a subhypergraph of `hg`
649
+ """
650
+ def _divide_hg(hg):
651
+ ''' divide a hypergraph into subhypergraphs such that
652
+ each subhypergraph is connected to each other in a tree-like way.
653
+
654
+ Parameters
655
+ ----------
656
+ hg : Hypergraph
657
+
658
+ Returns
659
+ -------
660
+ list of Hypergraphs
661
+ each element corresponds to a subhypergraph of `hg`
662
+ '''
663
+ is_ring = False
664
+ for each_node in hg.nodes:
665
+ if hg.node_attr(each_node)['is_in_ring']:
666
+ is_ring = True
667
+ if not hg.node_attr(each_node)['is_in_ring'] \
668
+ and hg.degree(each_node) == 2:
669
+ subhg1, subhg2 = hg.divide(each_node)
670
+ return _divide_hg(subhg1) + _divide_hg(subhg2)
671
+
672
+ if is_ring:
673
+ subhg_list = []
674
+ remove_edge_list = []
675
+ remove_node_list = []
676
+ for each_edge in hg.edges:
677
+ node_list = hg.nodes_in_edge(each_edge)
678
+ subhg = hg.get_subhg(node_list, [each_edge], hg.get_identical_node_dict())
679
+ subhg_list.append(subhg)
680
+ remove_edge_list.append(each_edge)
681
+ for each_node in node_list:
682
+ if not hg.node_attr(each_node)['is_in_ring']:
683
+ remove_node_list.append(each_node)
684
+ hg.remove_edges(remove_edge_list)
685
+ hg.remove_nodes(remove_node_list, False)
686
+ return subhg_list + [hg]
687
+ else:
688
+ return [hg]
689
+
690
+ org_hg = hg.copy()
691
+ clique_tree = CliqueTree(org_hg)
692
+ clique_tree.add_node(0, subhg=org_hg)
693
+
694
+ subhg_list = _divide_hg(deepcopy(clique_tree.root_hg))
695
+ #_subhg_list = deepcopy(subhg_list)
696
+ if len(subhg_list) == 1:
697
+ pass
698
+ else:
699
+ while len(subhg_list) > 1:
700
+ max_leaf_subhg = None
701
+ for each_subhg in subhg_list:
702
+ if _is_leaf(clique_tree.root_hg, each_subhg) and not _is_ring(each_subhg):
703
+ if max_leaf_subhg is None:
704
+ max_leaf_subhg = each_subhg
705
+ elif max_leaf_subhg.num_nodes < each_subhg.num_nodes:
706
+ max_leaf_subhg = each_subhg
707
+
708
+ if max_leaf_subhg is None:
709
+ for each_subhg in subhg_list:
710
+ if _is_ring_label(clique_tree.root_hg, each_subhg):
711
+ if max_leaf_subhg is None:
712
+ max_leaf_subhg = each_subhg
713
+ elif max_leaf_subhg.num_nodes < each_subhg.num_nodes:
714
+ max_leaf_subhg = each_subhg
715
+ if max_leaf_subhg is not None:
716
+ clique_tree.update(max_leaf_subhg)
717
+ subhg_list.remove(max_leaf_subhg)
718
+ else:
719
+ for each_subhg in subhg_list:
720
+ if _is_leaf(clique_tree.root_hg, each_subhg):
721
+ if max_leaf_subhg is None:
722
+ max_leaf_subhg = each_subhg
723
+ elif max_leaf_subhg.num_nodes < each_subhg.num_nodes:
724
+ max_leaf_subhg = each_subhg
725
+ if max_leaf_subhg is not None:
726
+ clique_tree.update(max_leaf_subhg, True)
727
+ subhg_list.remove(max_leaf_subhg)
728
+ else:
729
+ break
730
+ if len(subhg_list) > 1:
731
+ '''
732
+ for each_idx, each_subhg in enumerate(subhg_list):
733
+ each_subhg.draw(f'{each_idx}', True)
734
+ clique_tree.root_hg.draw('root', True)
735
+ import pickle
736
+ with open('buggy_hg.pkl', 'wb') as f:
737
+ pickle.dump(hg, f)
738
+ return clique_tree, subhg_list, _subhg_list
739
+ '''
740
+ raise RuntimeError('bug in tree decomposition algorithm')
741
+ clique_tree.root_hg.remove_edges_with_attr({'tmp' : True})
742
+
743
+ '''
744
+ for each_tree_node in clique_tree.adj[0]:
745
+ subhg = clique_tree.nodes[each_tree_node]['subhg']
746
+ for each_edge in subhg.edges:
747
+ if set(subhg.nodes_in_edge(each_edge)).issubset(clique_tree.root_hg.nodes):
748
+ clique_tree.root_hg.add_edge(set(subhg.nodes_in_edge(each_edge)), attr_dict=dict(tmp=True))
749
+ '''
750
+ if irredundant:
751
+ clique_tree.to_irredundant()
752
+ return clique_tree #, _subhg_list
753
+
754
+ def _is_leaf(hg, subhg) -> bool:
755
+ ''' judge whether subhg is a leaf-like in the original hypergraph
756
+
757
+ Parameters
758
+ ----------
759
+ hg : Hypergraph
760
+ subhg : Hypergraph
761
+ `subhg` is a subhypergraph of `hg`
762
+
763
+ Returns
764
+ -------
765
+ bool
766
+ '''
767
+ if len(subhg.edges) == 0:
768
+ adj_edge_set = set([])
769
+ subhg_edge_set = set([])
770
+ for each_edge in hg.edges:
771
+ if set(hg.nodes_in_edge(each_edge)).issubset(subhg.nodes) and hg.edge_attr(each_edge).get('tmp', False):
772
+ subhg_edge_set.add(each_edge)
773
+ for each_node in subhg.nodes:
774
+ adj_edge_set.update(set(hg.adj_edges(each_node)))
775
+ if subhg_edge_set.issubset(adj_edge_set) and len(adj_edge_set.difference(subhg_edge_set)) == 1:
776
+ return True
777
+ else:
778
+ return False
779
+ elif len(subhg.edges) == 1:
780
+ adj_edge_set = set([])
781
+ subhg_edge_set = subhg.edges
782
+ for each_node in subhg.nodes:
783
+ for each_adj_edge in hg.adj_edges(each_node):
784
+ adj_edge_set.add(each_adj_edge)
785
+ if subhg_edge_set.issubset(adj_edge_set) and len(adj_edge_set.difference(subhg_edge_set)) == 1:
786
+ return True
787
+ else:
788
+ return False
789
+ else:
790
+ raise ValueError('subhg should be nodes only or one-edge hypergraph.')
791
+
792
+ def _is_ring_label(hg, subhg):
793
+ if len(subhg.edges) != 1:
794
+ return False
795
+ edge_name = list(subhg.edges)[0]
796
+ #assert edge_name in hg.edges, f'{edge_name}'
797
+ is_in_ring = False
798
+ for each_node in subhg.nodes:
799
+ if subhg.node_attr(each_node)['is_in_ring']:
800
+ is_in_ring = True
801
+ else:
802
+ adj_edge_list = list(hg.adj_edges(each_node))
803
+ adj_edge_list.remove(edge_name)
804
+ if len(adj_edge_list) == 1:
805
+ if not hg.edge_attr(adj_edge_list[0]).get('tmp', False):
806
+ return False
807
+ elif len(adj_edge_list) == 0:
808
+ pass
809
+ else:
810
+ raise ValueError
811
+ if is_in_ring:
812
+ return True
813
+ else:
814
+ return False
815
+
816
+ def _is_ring(hg):
817
+ for each_node in hg.nodes:
818
+ if not hg.node_attr(each_node)['is_in_ring']:
819
+ return False
820
+ return True
821
+
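All the decomposition routines above share the same driver pattern: repeatedly peel leaf-like subhypergraphs off the root hypergraph and record each one as a clique-tree node. A minimal usage sketch (the construction of `hg` from a molecule is handled by the SMILES converter added elsewhere in this commit, in `graph_grammar/io/smi.py`; its exact call is not shown here and is an assumption):

    from models.mhg_model.graph_grammar.algo.tree_decomposition import molecular_tree_decomposition

    # hg is a molecular Hypergraph, e.g. built from a SMILES string
    clique_tree = molecular_tree_decomposition(hg, irredundant=True)
    for node in clique_tree.nodes:
        subhg = clique_tree.nodes[node]['subhg']  # one bag of the decomposition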
models/mhg_model/graph_grammar/graph_grammar/__init__.py ADDED
@@ -0,0 +1,20 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE, SO A CAREFUL CHECK NEEDS TO BE DONE.
+ """
+
+ """ Title """
+
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
+ __version__ = "0.1"
+ __date__ = "Jan 1 2018"
+
models/mhg_model/graph_grammar/graph_grammar/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (690 Bytes).
 
models/mhg_model/graph_grammar/graph_grammar/__pycache__/base.cpython-310.pyc ADDED
Binary file (1.19 kB).
 
models/mhg_model/graph_grammar/graph_grammar/__pycache__/corpus.cpython-310.pyc ADDED
Binary file (4.73 kB).
 
models/mhg_model/graph_grammar/graph_grammar/__pycache__/hrg.cpython-310.pyc ADDED
Binary file (29.1 kB).
 
models/mhg_model/graph_grammar/graph_grammar/__pycache__/symbols.cpython-310.pyc ADDED
Binary file (5.39 kB).
 
models/mhg_model/graph_grammar/graph_grammar/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.65 kB).
 
models/mhg_model/graph_grammar/graph_grammar/base.py ADDED
@@ -0,0 +1,30 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE, SO A CAREFUL CHECK NEEDS TO BE DONE.
+ """
+
+ """ Title """
+
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2017"
+ __version__ = "0.1"
+ __date__ = "Dec 11 2017"
+
+ from abc import ABCMeta, abstractmethod
+
+ class GraphGrammarBase(metaclass=ABCMeta):
+     @abstractmethod
+     def learn(self):
+         pass
+
+     @abstractmethod
+     def sample(self):
+         pass
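`GraphGrammarBase` only fixes the two-method contract (`learn`, `sample`) that every grammar in this package implements. A minimal sketch of a conforming subclass, purely illustrative and not part of the commit:

    class ConstantGrammar(GraphGrammarBase):
        def learn(self, hg_list):
            self.hg_list = hg_list  # memorize the training hypergraphs
            return self

        def sample(self):
            return self.hg_list[0]  # always return the first one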
models/mhg_model/graph_grammar/graph_grammar/corpus.py ADDED
@@ -0,0 +1,152 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE, SO A CAREFUL CHECK NEEDS TO BE DONE.
+ """
+
+ """ Title """
+
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
+ __version__ = "0.1"
+ __date__ = "Jun 4 2018"
+
+ from collections import Counter
+ from functools import partial
+ from .utils import _easy_node_match, _edge_match, _node_match, common_node_list, _node_match_prod_rule
+ from networkx.algorithms.isomorphism import GraphMatcher
+ import os
+
+
+ class CliqueTreeCorpus(object):
+
+     ''' clique tree corpus
+
+     Attributes
+     ----------
+     clique_tree_list : list of CliqueTree
+     subhg_list : list of Hypergraph
+     '''
+
+     def __init__(self):
+         self.clique_tree_list = []
+         self.subhg_list = []
+
+     @property
+     def size(self):
+         return len(self.subhg_list)
+
+     def add_clique_tree(self, clique_tree):
+         for each_node in clique_tree.nodes:
+             subhg = clique_tree.nodes[each_node]['subhg']
+             subhg_idx, _ = self.add_subhg(subhg)  # add_subhg returns (idx, is_new)
+             clique_tree.nodes[each_node]['subhg_idx'] = subhg_idx
+         self.clique_tree_list.append(clique_tree)
+
+     def add_to_subhg_list(self, clique_tree, root_node):
+         parent_node_dict = {}
+         current_node = None
+         parent_node_dict[root_node] = None
+         stack = [root_node]
+         while stack:
+             current_node = stack.pop()
+             current_subhg = clique_tree.nodes[current_node]['subhg']
+             for each_child in clique_tree.adj[current_node]:
+                 if each_child != parent_node_dict[current_node]:
+                     stack.append(each_child)
+                     parent_node_dict[each_child] = current_node
+             if parent_node_dict[current_node] is not None:
+                 parent_subhg = clique_tree.nodes[parent_node_dict[current_node]]['subhg']
+                 common, _ = common_node_list(parent_subhg, current_subhg)
+                 parent_subhg.add_edge(set(common), attr_dict={'tmp': True})
+
+         parent_node_dict = {}
+         current_node = None
+         parent_node_dict[root_node] = None
+         stack = [root_node]
+         while stack:
+             current_node = stack.pop()
+             current_subhg = clique_tree.nodes[current_node]['subhg']
+             for each_child in clique_tree.adj[current_node]:
+                 if each_child != parent_node_dict[current_node]:
+                     stack.append(each_child)
+                     parent_node_dict[each_child] = current_node
+             if parent_node_dict[current_node] is not None:
+                 parent_subhg = clique_tree.nodes[parent_node_dict[current_node]]['subhg']
+                 common, _ = common_node_list(parent_subhg, current_subhg)
+                 for each_idx, each_node in enumerate(common):
+                     current_subhg.set_node_attr(each_node, {'ext_id': each_idx})
+
+             subhg_idx, is_new = self.add_subhg(current_subhg)
+             clique_tree.nodes[current_node]['subhg_idx'] = subhg_idx
+         return clique_tree
+
+     def add_subhg(self, subhg):
+         if len(self.subhg_list) == 0:
+             node_dict = {}
+             for each_node in subhg.nodes:
+                 node_dict[each_node] = subhg.node_attr(each_node)['symbol'].__hash__()
+             node_list = []
+             for each_key, _ in sorted(node_dict.items(), key=lambda x: x[1]):
+                 node_list.append(each_key)
+             for each_idx, each_node in enumerate(node_list):
+                 subhg.node_attr(each_node)['order4hrg'] = each_idx
+             self.subhg_list.append(subhg)
+             return 0, True
+         else:
+             match = False
+             subhg_bond_symbol_counter \
+                 = Counter([subhg.node_attr(each_node)['symbol'] \
+                            for each_node in subhg.nodes])
+             subhg_atom_symbol_counter \
+                 = Counter([subhg.edge_attr(each_edge).get('symbol', None) \
+                            for each_edge in subhg.edges])
+             for each_idx, each_subhg in enumerate(self.subhg_list):
+                 each_bond_symbol_counter \
+                     = Counter([each_subhg.node_attr(each_node)['symbol'] \
+                                for each_node in each_subhg.nodes])
+                 each_atom_symbol_counter \
+                     = Counter([each_subhg.edge_attr(each_edge).get('symbol', None) \
+                                for each_edge in each_subhg.edges])
+                 if not match \
+                    and (subhg.num_nodes == each_subhg.num_nodes
+                         and subhg.num_edges == each_subhg.num_edges
+                         and subhg_bond_symbol_counter == each_bond_symbol_counter
+                         and subhg_atom_symbol_counter == each_atom_symbol_counter):
+                     gm = GraphMatcher(each_subhg.hg,
+                                       subhg.hg,
+                                       node_match=_easy_node_match,
+                                       edge_match=_edge_match)
+                     try:
+                         isomap = next(gm.isomorphisms_iter())
+                         match = True
+                         for each_node in each_subhg.nodes:
+                             subhg.node_attr(isomap[each_node])['order4hrg'] \
+                                 = each_subhg.node_attr(each_node)['order4hrg']
+                             if 'ext_id' in each_subhg.node_attr(each_node):
+                                 subhg.node_attr(isomap[each_node])['ext_id'] \
+                                     = each_subhg.node_attr(each_node)['ext_id']
+                         return each_idx, False
+                     except StopIteration:
+                         match = False
+             if not match:
+                 node_dict = {}
+                 for each_node in subhg.nodes:
+                     node_dict[each_node] = subhg.node_attr(each_node)['symbol'].__hash__()
+                 node_list = []
+                 for each_key, _ in sorted(node_dict.items(), key=lambda x: x[1]):
+                     node_list.append(each_key)
+                 for each_idx, each_node in enumerate(node_list):
+                     subhg.node_attr(each_node)['order4hrg'] = each_idx
+
+                 #for each_idx, each_node in enumerate(subhg.nodes):
+                 #    subhg.node_attr(each_node)['order4hrg'] = each_idx
+                 self.subhg_list.append(subhg)
+                 return len(self.subhg_list) - 1, True
@@ -0,0 +1,1065 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Rhizome
4
+ # Version beta 0.0, August 2023
5
+ # Property of IBM Research, Accelerated Discovery
6
+ #
7
+
8
+ """
9
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
10
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
11
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
12
+ """
13
+
14
+ """ Title """
15
+
16
+ __author__ = "Hiroshi Kajino <[email protected]>"
17
+ __copyright__ = "(c) Copyright IBM Corp. 2017"
18
+ __version__ = "0.1"
19
+ __date__ = "Dec 11 2017"
20
+
21
+ from .corpus import CliqueTreeCorpus
22
+ from .base import GraphGrammarBase
23
+ from .symbols import TSymbol, NTSymbol, BondSymbol
24
+ from .utils import _node_match, _node_match_prod_rule, _edge_match, masked_softmax, common_node_list
25
+ from ..hypergraph import Hypergraph
26
+ from collections import Counter
27
+ from copy import deepcopy
28
+ from ..algo.tree_decomposition import (
29
+ tree_decomposition,
30
+ tree_decomposition_with_hrg,
31
+ tree_decomposition_from_leaf,
32
+ topological_tree_decomposition,
33
+ molecular_tree_decomposition)
34
+ from functools import partial
35
+ from networkx.algorithms.isomorphism import GraphMatcher
36
+ from typing import List, Dict, Tuple
37
+ import networkx as nx
38
+ import numpy as np
39
+ import torch
40
+ import os
41
+ import random
42
+
43
+ DEBUG = False
44
+
45
+
46
+ class ProductionRule(object):
47
+ """ A class of a production rule
48
+
49
+ Attributes
50
+ ----------
51
+ lhs : Hypergraph or None
52
+ the left hand side of the production rule.
53
+ if None, the rule is a starting rule.
54
+ rhs : Hypergraph
55
+ the right hand side of the production rule.
56
+ """
57
+ def __init__(self, lhs, rhs):
58
+ self.lhs = lhs
59
+ self.rhs = rhs
60
+
61
+ @property
62
+ def is_start_rule(self) -> bool:
63
+ return self.lhs.num_nodes == 0
64
+
65
+ @property
66
+ def ext_node(self) -> Dict[int, str]:
67
+ """ return a dict of external nodes
68
+ """
69
+ if self.is_start_rule:
70
+ return {}
71
+ else:
72
+ ext_node_dict = {}
73
+ for each_node in self.lhs.nodes:
74
+ ext_node_dict[self.lhs.node_attr(each_node)["ext_id"]] = each_node
75
+ return ext_node_dict
76
+
77
+ @property
78
+ def lhs_nt_symbol(self) -> NTSymbol:
79
+ if self.is_start_rule:
80
+ return NTSymbol(degree=0, is_aromatic=False, bond_symbol_list=[])
81
+ else:
82
+ return self.lhs.edge_attr(list(self.lhs.edges)[0])['symbol']
83
+
84
+ def rhs_adj_mat(self, node_edge_list):
85
+ ''' return the adjacency matrix of rhs of the production rule
86
+ '''
87
+ return nx.adjacency_matrix(self.rhs.hg, node_edge_list)
88
+
89
+ def draw(self, file_path=None):
90
+ return self.rhs.draw(file_path)
91
+
92
+ def is_same(self, prod_rule, ignore_order=False):
93
+ """ judge whether this production rule is
94
+ the same as the input one, `prod_rule`
95
+
96
+ Parameters
97
+ ----------
98
+ prod_rule : ProductionRule
99
+ production rule to be compared
100
+
101
+ Returns
102
+ -------
103
+ is_same : bool
104
+ isomap : dict
105
+ isomorphism of nodes and hyperedges.
106
+ ex) {'bond_42': 'bond_37', 'bond_2': 'bond_1',
107
+ 'e36': 'e11', 'e16': 'e12', 'e25': 'e18',
108
+ 'bond_40': 'bond_38', 'e26': 'e21', 'bond_41': 'bond_39'}.
109
+ key comes from `prod_rule`, value comes from `self`.
110
+ """
111
+ if self.is_start_rule:
112
+ if not prod_rule.is_start_rule:
113
+ return False, {}
114
+ else:
115
+ if prod_rule.is_start_rule:
116
+ return False, {}
117
+ else:
118
+ if prod_rule.lhs.num_nodes != self.lhs.num_nodes:
119
+ return False, {}
120
+
121
+ if prod_rule.rhs.num_nodes != self.rhs.num_nodes:
122
+ return False, {}
123
+ if prod_rule.rhs.num_edges != self.rhs.num_edges:
124
+ return False, {}
125
+
126
+ subhg_bond_symbol_counter \
127
+ = Counter([prod_rule.rhs.node_attr(each_node)['symbol'] \
128
+ for each_node in prod_rule.rhs.nodes])
129
+ each_bond_symbol_counter \
130
+ = Counter([self.rhs.node_attr(each_node)['symbol'] \
131
+ for each_node in self.rhs.nodes])
132
+ if subhg_bond_symbol_counter != each_bond_symbol_counter:
133
+ return False, {}
134
+
135
+ subhg_atom_symbol_counter \
136
+ = Counter([prod_rule.rhs.edge_attr(each_edge)['symbol'] \
137
+ for each_edge in prod_rule.rhs.edges])
138
+ each_atom_symbol_counter \
139
+ = Counter([self.rhs.edge_attr(each_edge)['symbol'] \
140
+ for each_edge in self.rhs.edges])
141
+ if subhg_atom_symbol_counter != each_atom_symbol_counter:
142
+ return False, {}
143
+
144
+ gm = GraphMatcher(prod_rule.rhs.hg,
145
+ self.rhs.hg,
146
+ partial(_node_match_prod_rule,
147
+ ignore_order=ignore_order),
148
+ partial(_edge_match,
149
+ ignore_order=ignore_order))
150
+ try:
151
+ return True, next(gm.isomorphisms_iter())
152
+ except StopIteration:
153
+ return False, {}
154
+
155
+ def applied_to(self,
156
+ hg: Hypergraph,
157
+ edge: str) -> Tuple[Hypergraph, List[str]]:
158
+ """ augment `hg` by replacing `edge` with `self.rhs`.
159
+
160
+ Parameters
161
+ ----------
162
+ hg : Hypergraph
163
+ edge : str
164
+ `edge` must belong to `hg`
165
+
166
+ Returns
167
+ -------
168
+ hg : Hypergraph
169
+ resultant hypergraph
170
+ nt_edge_list : list
171
+ list of non-terminal edges
172
+ """
173
+ nt_edge_dict = {}
174
+ if self.is_start_rule:
175
+ if (edge is not None) or (hg is not None):
176
+ ValueError("edge and hg must be None for this prod rule.")
177
+ hg = Hypergraph()
178
+ node_map_rhs = {} # node id in rhs -> node id in hg, where rhs is augmented.
179
+ for num_idx, each_node in enumerate(self.rhs.nodes):
180
+ hg.add_node(f"bond_{num_idx}",
181
+ #attr_dict=deepcopy(self.rhs.node_attr(each_node)))
182
+ attr_dict=self.rhs.node_attr(each_node))
183
+ node_map_rhs[each_node] = f"bond_{num_idx}"
184
+ for each_edge in self.rhs.edges:
185
+ node_list = []
186
+ for each_node in self.rhs.nodes_in_edge(each_edge):
187
+ node_list.append(node_map_rhs[each_node])
188
+ if isinstance(self.rhs.nodes_in_edge(each_edge), set):
189
+ node_list = set(node_list)
190
+ edge_id = hg.add_edge(
191
+ node_list,
192
+ #attr_dict=deepcopy(self.rhs.edge_attr(each_edge)))
193
+ attr_dict=self.rhs.edge_attr(each_edge))
194
+ if "nt_idx" in hg.edge_attr(edge_id):
195
+ nt_edge_dict[hg.edge_attr(edge_id)["nt_idx"]] = edge_id
196
+ nt_edge_list = [nt_edge_dict[key] for key in range(len(nt_edge_dict))]
197
+ return hg, nt_edge_list
198
+ else:
199
+ if edge not in hg.edges:
200
+ raise ValueError("the input hyperedge does not exist.")
201
+ if hg.edge_attr(edge)["terminal"]:
202
+ raise ValueError("the input hyperedge is terminal.")
203
+ if hg.edge_attr(edge)['symbol'] != self.lhs_nt_symbol:
204
+ print(hg.edge_attr(edge)['symbol'], self.lhs_nt_symbol)
205
+ raise ValueError("the input hyperedge and lhs have inconsistent number of nodes.")
206
+ if DEBUG:
207
+ for node_idx, each_node in enumerate(hg.nodes_in_edge(edge)):
208
+ other_node = self.lhs.nodes_in_edge(list(self.lhs.edges)[0])[node_idx]
209
+ attr = deepcopy(self.lhs.node_attr(other_node))
210
+ attr.pop('ext_id')
211
+ if hg.node_attr(each_node) != attr:
212
+ raise ValueError('node attributes are inconsistent.')
213
+
214
+ # order of nodes that belong to the non-terminal edge in hg
215
+ nt_order_dict = {} # hg_node -> order ("bond_17" : 1)
216
+ nt_order_dict_inv = {} # order -> hg_node
217
+ for each_idx, each_node in enumerate(hg.nodes_in_edge(edge)):
218
+ nt_order_dict[each_node] = each_idx
219
+ nt_order_dict_inv[each_idx] = each_node
220
+
221
+ # construct a node_map_rhs: rhs -> new hg
222
+ node_map_rhs = {} # node id in rhs -> node id in hg, where rhs is augmented.
223
+ node_idx = hg.num_nodes
224
+ for each_node in self.rhs.nodes:
225
+ if "ext_id" in self.rhs.node_attr(each_node):
226
+ node_map_rhs[each_node] \
227
+ = nt_order_dict_inv[
228
+ self.rhs.node_attr(each_node)["ext_id"]]
229
+ else:
230
+ node_map_rhs[each_node] = f"bond_{node_idx}"
231
+ node_idx += 1
232
+
233
+ # delete non-terminal
234
+ hg.remove_edge(edge)
235
+
236
+ # add nodes to hg
237
+ for each_node in self.rhs.nodes:
238
+ hg.add_node(node_map_rhs[each_node],
239
+ attr_dict=self.rhs.node_attr(each_node))
240
+
241
+ # add hyperedges to hg
242
+ for each_edge in self.rhs.edges:
243
+ node_list_hg = []
244
+ for each_node in self.rhs.nodes_in_edge(each_edge):
245
+ node_list_hg.append(node_map_rhs[each_node])
246
+ edge_id = hg.add_edge(
247
+ node_list_hg,
248
+ attr_dict=self.rhs.edge_attr(each_edge))#deepcopy(self.rhs.edge_attr(each_edge)))
249
+ if "nt_idx" in hg.edge_attr(edge_id):
250
+ nt_edge_dict[hg.edge_attr(edge_id)["nt_idx"]] = edge_id
251
+ nt_edge_list = [nt_edge_dict[key] for key in range(len(nt_edge_dict))]
252
+ return hg, nt_edge_list
253
+
254
+ def revert(self, hg: Hypergraph, return_subhg=False):
255
+ ''' revert applying this production rule.
256
+ i.e., if there exists a subhypergraph that matches the r.h.s. of this production rule,
257
+ this method replaces the subhypergraph with a non-terminal hyperedge.
258
+
259
+ Parameters
260
+ ----------
261
+ hg : Hypergraph
262
+ hypergraph to be reverted
263
+ return_subhg : bool
264
+ if True, the removed subhypergraph will be returned.
265
+
266
+ Returns
267
+ -------
268
+ hg : Hypergraph
269
+ the resultant hypergraph. if it cannot be reverted, the original one is returned without any replacement.
270
+ success : bool
271
+ this indicates whether reverting is successed or not.
272
+ '''
273
+ gm = GraphMatcher(hg.hg, self.rhs.hg, node_match=_node_match_prod_rule,
274
+ edge_match=_edge_match)
275
+ try:
276
+ # in case when the matched subhg is connected to the other part via external nodes and more.
277
+ not_iso = True
278
+ while not_iso:
279
+ isomap = next(gm.subgraph_isomorphisms_iter())
280
+ adj_node_set = set([]) # reachable nodes from the internal nodes
281
+ subhg_node_set = set(isomap.keys()) # nodes in subhg
282
+ for each_node in subhg_node_set:
283
+ adj_node_set.add(each_node)
284
+ if isomap[each_node] not in self.ext_node.values():
285
+ adj_node_set.update(hg.hg.adj[each_node])
286
+ if adj_node_set == subhg_node_set:
287
+ not_iso = False
288
+ else:
289
+ if return_subhg:
290
+ return hg, False, Hypergraph()
291
+ else:
292
+ return hg, False
293
+ inv_isomap = {v: k for k, v in isomap.items()}
294
+ '''
295
+ isomap = {'e35': 'e8', 'bond_13': 'bond_18', 'bond_14': 'bond_19',
296
+ 'bond_15': 'bond_17', 'e29': 'e23', 'bond_12': 'bond_20'}
297
+ where keys come from `hg` and values come from `self.rhs`
298
+ '''
299
+ except StopIteration:
300
+ if return_subhg:
301
+ return hg, False, Hypergraph()
302
+ else:
303
+ return hg, False
304
+
305
+ if return_subhg:
306
+ subhg = Hypergraph()
307
+ for each_node in hg.nodes:
308
+ if each_node in isomap:
309
+ subhg.add_node(each_node, attr_dict=hg.node_attr(each_node))
310
+ for each_edge in hg.edges:
311
+ if each_edge in isomap:
312
+ subhg.add_edge(hg.nodes_in_edge(each_edge),
313
+ attr_dict=hg.edge_attr(each_edge),
314
+ edge_name=each_edge)
315
+ subhg.edge_idx = hg.edge_idx
316
+
317
+ # remove subhg except for the externael nodes
318
+ for each_key, each_val in isomap.items():
319
+ if each_key.startswith('e'):
320
+ hg.remove_edge(each_key)
321
+ for each_key, each_val in isomap.items():
322
+ if each_key.startswith('bond_'):
323
+ if each_val not in self.ext_node.values():
324
+ hg.remove_node(each_key)
325
+
326
+ # add non-terminal hyperedge
327
+ nt_node_list = []
328
+ for each_ext_id in self.ext_node.keys():
329
+ nt_node_list.append(inv_isomap[self.ext_node[each_ext_id]])
330
+
331
+ hg.add_edge(nt_node_list,
332
+ attr_dict=dict(
333
+ terminal=False,
334
+ symbol=self.lhs_nt_symbol))
335
+ if return_subhg:
336
+ return hg, True, subhg
337
+ else:
338
+ return hg, True
339
+
340
+
341
+ class ProductionRuleCorpus(object):
342
+
343
+ '''
344
+ A corpus of production rules.
345
+ This class maintains
346
+ (i) list of unique production rules,
347
+ (ii) list of unique edge symbols (both terminal and non-terminal), and
348
+ (iii) list of unique node symbols.
349
+
350
+ Attributes
351
+ ----------
352
+ prod_rule_list : list
353
+ list of unique production rules
354
+ edge_symbol_list : list
355
+ list of unique symbols (including both terminal and non-terminal)
356
+ node_symbol_list : list
357
+ list of node symbols
358
+ nt_symbol_list : list
359
+ list of unique lhs symbols
360
+ ext_id_list : list
361
+ list of ext_ids
362
+ lhs_in_prod_rule : array
363
+ a matrix of lhs vs prod_rule (= lhs_in_prod_rule)
364
+ '''
365
+
366
+ def __init__(self):
367
+ self.prod_rule_list = []
368
+ self.edge_symbol_list = []
369
+ self.edge_symbol_dict = {}
370
+ self.node_symbol_list = []
371
+ self.node_symbol_dict = {}
372
+ self.nt_symbol_list = []
373
+ self.ext_id_list = []
374
+ self._lhs_in_prod_rule = None
375
+ self.lhs_in_prod_rule_row_list = []
376
+ self.lhs_in_prod_rule_col_list = []
377
+
378
+ @property
379
+ def lhs_in_prod_rule(self):
380
+ if self._lhs_in_prod_rule is None:
381
+ self._lhs_in_prod_rule = torch.sparse.FloatTensor(
382
+ torch.LongTensor(list(zip(self.lhs_in_prod_rule_row_list, self.lhs_in_prod_rule_col_list))).t(),
383
+ torch.FloatTensor([1.0]*len(self.lhs_in_prod_rule_col_list)),
384
+ torch.Size([len(self.nt_symbol_list), len(self.prod_rule_list)])
385
+ ).to_dense()
386
+ return self._lhs_in_prod_rule
387
+
388
+ @property
389
+ def num_prod_rule(self):
390
+ ''' return the number of production rules
391
+
392
+ Returns
393
+ -------
394
+ int : the number of unique production rules
395
+ '''
396
+ return len(self.prod_rule_list)
397
+
398
+ @property
399
+ def start_rule_list(self):
400
+ ''' return a list of start rules
401
+
402
+ Returns
403
+ -------
404
+ list : list of start rules
405
+ '''
406
+ start_rule_list = []
407
+ for each_prod_rule in self.prod_rule_list:
408
+ if each_prod_rule.is_start_rule:
409
+ start_rule_list.append(each_prod_rule)
410
+ return start_rule_list
411
+
412
+ @property
413
+ def num_edge_symbol(self):
414
+ return len(self.edge_symbol_list)
415
+
416
+ @property
417
+ def num_node_symbol(self):
418
+ return len(self.node_symbol_list)
419
+
420
+ @property
421
+ def num_ext_id(self):
422
+ return len(self.ext_id_list)
423
+
424
+ def construct_feature_vectors(self):
425
+ ''' this method constructs feature vectors for the production rules collected so far.
426
+ currently, NTSymbol and TSymbol are treated in the same manner.
427
+ '''
428
+ feature_id_dict = {}
429
+ feature_id_dict['TSymbol'] = 0
430
+ feature_id_dict['NTSymbol'] = 1
431
+ feature_id_dict['BondSymbol'] = 2
432
+ for each_edge_symbol in self.edge_symbol_list:
433
+ for each_attr in each_edge_symbol.__dict__.keys():
434
+ each_val = each_edge_symbol.__dict__[each_attr]
435
+ if isinstance(each_val, list):
436
+ each_val = tuple(each_val)
437
+ if (each_attr, each_val) not in feature_id_dict:
438
+ feature_id_dict[(each_attr, each_val)] = len(feature_id_dict)
439
+
440
+ for each_node_symbol in self.node_symbol_list:
441
+ for each_attr in each_node_symbol.__dict__.keys():
442
+ each_val = each_node_symbol.__dict__[each_attr]
443
+ if isinstance(each_val, list):
444
+ each_val = tuple(each_val)
445
+ if (each_attr, each_val) not in feature_id_dict:
446
+ feature_id_dict[(each_attr, each_val)] = len(feature_id_dict)
447
+ for each_ext_id in self.ext_id_list:
448
+ feature_id_dict[('ext_id', each_ext_id)] = len(feature_id_dict)
449
+ dim = len(feature_id_dict)
450
+
451
+ feature_dict = {}
452
+ for each_edge_symbol in self.edge_symbol_list:
453
+ idx_list = []
454
+ idx_list.append(feature_id_dict[each_edge_symbol.__class__.__name__])
455
+ for each_attr in each_edge_symbol.__dict__.keys():
456
+ each_val = each_edge_symbol.__dict__[each_attr]
457
+ if isinstance(each_val, list):
458
+ each_val = tuple(each_val)
459
+ idx_list.append(feature_id_dict[(each_attr, each_val)])
460
+ feature = torch.sparse.LongTensor(
461
+ torch.LongTensor([idx_list]),
462
+ torch.ones(len(idx_list)),
463
+ torch.Size([len(feature_id_dict)])
464
+ )
465
+ feature_dict[each_edge_symbol] = feature
466
+
467
+ for each_node_symbol in self.node_symbol_list:
468
+ idx_list = []
469
+ idx_list.append(feature_id_dict[each_node_symbol.__class__.__name__])
470
+ for each_attr in each_node_symbol.__dict__.keys():
471
+ each_val = each_node_symbol.__dict__[each_attr]
472
+ if isinstance(each_val, list):
473
+ each_val = tuple(each_val)
474
+ idx_list.append(feature_id_dict[(each_attr, each_val)])
475
+ feature = torch.sparse.LongTensor(
476
+ torch.LongTensor([idx_list]),
477
+ torch.ones(len(idx_list)),
478
+ torch.Size([len(feature_id_dict)])
479
+ )
480
+ feature_dict[each_node_symbol] = feature
481
+ for each_ext_id in self.ext_id_list:
482
+ idx_list = [feature_id_dict[('ext_id', each_ext_id)]]
483
+ feature_dict[('ext_id', each_ext_id)] \
484
+ = torch.sparse.LongTensor(
485
+ torch.LongTensor([idx_list]),
486
+ torch.ones(len(idx_list)),
487
+ torch.Size([len(feature_id_dict)])
488
+ )
489
+ return feature_dict, dim
490
+
491
+ def edge_symbol_idx(self, symbol):
492
+ return self.edge_symbol_dict[symbol]
493
+
494
+ def node_symbol_idx(self, symbol):
495
+ return self.node_symbol_dict[symbol]
496
+
497
+ def append(self, prod_rule: ProductionRule) -> Tuple[int, ProductionRule]:
498
+ """ return whether the input production rule is new or not, and its production rule id.
499
+ Production rules are regarded as the same if
500
+ i) there exists a one-to-one mapping of nodes and edges, and
501
+ ii) all the attributes associated with nodes and hyperedges are the same.
502
+
503
+ Parameters
504
+ ----------
505
+ prod_rule : ProductionRule
506
+
507
+ Returns
508
+ -------
509
+ prod_rule_id : int
510
+ production rule index. if new, a new index will be assigned.
511
+ prod_rule : ProductionRule
512
+ """
513
+ num_lhs = len(self.nt_symbol_list)
514
+ for each_idx, each_prod_rule in enumerate(self.prod_rule_list):
515
+ is_same, isomap = prod_rule.is_same(each_prod_rule)
516
+ if is_same:
517
+ # we do not care about edge and node names, but care about the order of non-terminal edges.
518
+ for key, val in isomap.items(): # key : edges & nodes in each_prod_rule.rhs , val : those in prod_rule.rhs
519
+ if key.startswith("bond_"):
520
+ continue
521
+
522
+ # rewrite `nt_idx` in `prod_rule` for further processing
523
+ if "nt_idx" in prod_rule.rhs.edge_attr(val).keys():
524
+ if "nt_idx" not in each_prod_rule.rhs.edge_attr(key).keys():
525
+ raise ValueError
526
+ prod_rule.rhs.set_edge_attr(
527
+ val,
528
+ {'nt_idx': each_prod_rule.rhs.edge_attr(key)["nt_idx"]})
529
+ return each_idx, prod_rule
530
+ self.prod_rule_list.append(prod_rule)
531
+ self._update_edge_symbol_list(prod_rule)
532
+ self._update_node_symbol_list(prod_rule)
533
+ self._update_ext_id_list(prod_rule)
534
+
535
+ lhs_idx = self.nt_symbol_list.index(prod_rule.lhs_nt_symbol)
536
+ self.lhs_in_prod_rule_row_list.append(lhs_idx)
537
+ self.lhs_in_prod_rule_col_list.append(len(self.prod_rule_list)-1)
538
+ self._lhs_in_prod_rule = None
539
+ return len(self.prod_rule_list)-1, prod_rule
540
+
541
+ def get_prod_rule(self, prod_rule_idx: int) -> ProductionRule:
542
+ return self.prod_rule_list[prod_rule_idx]
543
+
544
+ def sample(self, unmasked_logit_array, nt_symbol, deterministic=False):
545
+ ''' sample a production rule whose lhs is `nt_symbol`, followihng `unmasked_logit_array`.
546
+
547
+ Parameters
548
+ ----------
549
+ unmasked_logit_array : array-like, length `num_prod_rule`
550
+ nt_symbol : NTSymbol
551
+ '''
552
+ if not isinstance(unmasked_logit_array, np.ndarray):
553
+ unmasked_logit_array = unmasked_logit_array.numpy().astype(np.float64)
554
+ if deterministic:
555
+ prob = masked_softmax(unmasked_logit_array,
556
+ self.lhs_in_prod_rule[self.nt_symbol_list.index(nt_symbol)].numpy().astype(np.float64))
557
+ return self.prod_rule_list[np.argmax(prob)]
558
+ else:
559
+ return np.random.choice(
560
+ self.prod_rule_list, 1,
561
+ p=masked_softmax(unmasked_logit_array,
562
+ self.lhs_in_prod_rule[self.nt_symbol_list.index(nt_symbol)].numpy().astype(np.float64)))[0]
563
+
564
+ def masked_logprob(self, unmasked_logit_array, nt_symbol):
565
+ if not isinstance(unmasked_logit_array, np.ndarray):
566
+ unmasked_logit_array = unmasked_logit_array.numpy().astype(np.float64)
567
+ prob = masked_softmax(unmasked_logit_array,
568
+ self.lhs_in_prod_rule[self.nt_symbol_list.index(nt_symbol)].numpy().astype(np.float64))
569
+ return np.log(prob)
570
+
571
+ def _update_edge_symbol_list(self, prod_rule: ProductionRule):
572
+ ''' update edge symbol list
573
+
574
+ Parameters
575
+ ----------
576
+ prod_rule : ProductionRule
577
+ '''
578
+ if prod_rule.lhs_nt_symbol not in self.nt_symbol_list:
579
+ self.nt_symbol_list.append(prod_rule.lhs_nt_symbol)
580
+
581
+ for each_edge in prod_rule.rhs.edges:
582
+ if prod_rule.rhs.edge_attr(each_edge)['symbol'] not in self.edge_symbol_dict:
583
+ edge_symbol_idx = len(self.edge_symbol_list)
584
+ self.edge_symbol_list.append(prod_rule.rhs.edge_attr(each_edge)['symbol'])
585
+ self.edge_symbol_dict[prod_rule.rhs.edge_attr(each_edge)['symbol']] = edge_symbol_idx
586
+ else:
587
+ edge_symbol_idx = self.edge_symbol_dict[prod_rule.rhs.edge_attr(each_edge)['symbol']]
588
+ prod_rule.rhs.edge_attr(each_edge)['symbol_idx'] = edge_symbol_idx
589
+ pass
590
+
591
+ def _update_node_symbol_list(self, prod_rule: ProductionRule):
592
+ ''' update node symbol list
593
+
594
+ Parameters
595
+ ----------
596
+ prod_rule : ProductionRule
597
+ '''
598
+ for each_node in prod_rule.rhs.nodes:
599
+ if prod_rule.rhs.node_attr(each_node)['symbol'] not in self.node_symbol_dict:
600
+ node_symbol_idx = len(self.node_symbol_list)
601
+ self.node_symbol_list.append(prod_rule.rhs.node_attr(each_node)['symbol'])
602
+ self.node_symbol_dict[prod_rule.rhs.node_attr(each_node)['symbol']] = node_symbol_idx
603
+ else:
604
+ node_symbol_idx = self.node_symbol_dict[prod_rule.rhs.node_attr(each_node)['symbol']]
605
+ prod_rule.rhs.node_attr(each_node)['symbol_idx'] = node_symbol_idx
606
+
607
+ def _update_ext_id_list(self, prod_rule: ProductionRule):
608
+ for each_node in prod_rule.rhs.nodes:
609
+ if 'ext_id' in prod_rule.rhs.node_attr(each_node):
610
+ if prod_rule.rhs.node_attr(each_node)['ext_id'] not in self.ext_id_list:
611
+ self.ext_id_list.append(prod_rule.rhs.node_attr(each_node)['ext_id'])
612
+
613
+
614
+ class HyperedgeReplacementGrammar(GraphGrammarBase):
615
+ """
616
+ Learn a hyperedge replacement grammar from a set of hypergraphs.
617
+
618
+ Attributes
619
+ ----------
620
+ prod_rule_list : list of ProductionRule
621
+ production rules learned from the input hypergraphs
622
+ """
623
+ def __init__(self,
624
+ tree_decomposition=molecular_tree_decomposition,
625
+ ignore_order=False, **kwargs):
626
+ from functools import partial
627
+ self.prod_rule_corpus = ProductionRuleCorpus()
628
+ self.clique_tree_corpus = CliqueTreeCorpus()
629
+ self.ignore_order = ignore_order
630
+ self.tree_decomposition = partial(tree_decomposition, **kwargs)
631
+
632
+ @property
633
+ def num_prod_rule(self):
634
+ ''' return the number of production rules
635
+
636
+ Returns
637
+ -------
638
+ int : the number of unique production rules
639
+ '''
640
+ return self.prod_rule_corpus.num_prod_rule
641
+
642
+ @property
643
+ def start_rule_list(self):
644
+ ''' return a list of start rules
645
+
646
+ Returns
647
+ -------
648
+ list : list of start rules
649
+ '''
650
+ return self.prod_rule_corpus.start_rule_list
651
+
652
+ @property
653
+ def prod_rule_list(self):
654
+ return self.prod_rule_corpus.prod_rule_list
655
+
656
+ def learn(self, hg_list, logger=print, max_mol=np.inf, print_freq=500):
657
+ """ learn from a list of hypergraphs
658
+
659
+ Parameters
660
+ ----------
661
+ hg_list : list of Hypergraph
662
+
663
+ Returns
664
+ -------
665
+ prod_rule_seq_list : list of integers
666
+ each element corresponds to a sequence of production rules to generate each hypergraph.
667
+ """
668
+ prod_rule_seq_list = []
669
+ idx = 0
670
+ for each_idx, each_hg in enumerate(hg_list):
671
+ clique_tree = self.tree_decomposition(each_hg)
672
+
673
+ # get a pair of myself and children
674
+ root_node = _find_root(clique_tree)
675
+ clique_tree = self.clique_tree_corpus.add_to_subhg_list(clique_tree, root_node)
676
+ prod_rule_seq = []
677
+ stack = []
678
+
679
+ children = sorted(list(clique_tree[root_node].keys()))
680
+
681
+ # extract a temporary production rule
682
+ prod_rule = extract_prod_rule(
683
+ None,
684
+ clique_tree.nodes[root_node]["subhg"],
685
+ [clique_tree.nodes[each_child]["subhg"]
686
+ for each_child in children],
687
+ clique_tree.nodes[root_node].get('subhg_idx', None))
688
+
689
+ # update the production rule list
690
+ prod_rule_id, prod_rule = self.update_prod_rule_list(prod_rule)
691
+ children = reorder_children(root_node,
692
+ children,
693
+ prod_rule,
694
+ clique_tree)
695
+ stack.extend([(root_node, each_child) for each_child in children[::-1]])
696
+ prod_rule_seq.append(prod_rule_id)
697
+
698
+ while len(stack) != 0:
699
+ # get a triple of parent, myself, and children
700
+ parent, myself = stack.pop()
701
+ children = sorted(list(dict(clique_tree[myself]).keys()))
702
+ children.remove(parent)
703
+
704
+ # extract a temp prod rule
705
+ prod_rule = extract_prod_rule(
706
+ clique_tree.nodes[parent]["subhg"],
707
+ clique_tree.nodes[myself]["subhg"],
708
+ [clique_tree.nodes[each_child]["subhg"]
709
+ for each_child in children],
710
+ clique_tree.nodes[myself].get('subhg_idx', None))
711
+
712
+ # update the prod rule list
713
+ prod_rule_id, prod_rule = self.update_prod_rule_list(prod_rule)
714
+ children = reorder_children(myself,
715
+ children,
716
+ prod_rule,
717
+ clique_tree)
718
+ stack.extend([(myself, each_child)
719
+ for each_child in children[::-1]])
720
+ prod_rule_seq.append(prod_rule_id)
721
+ prod_rule_seq_list.append(prod_rule_seq)
722
+ if (each_idx+1) % print_freq == 0:
723
+ msg = f'#(molecules processed)={each_idx+1}\t'\
724
+ f'#(production rules)={self.prod_rule_corpus.num_prod_rule}\t#(subhg in corpus)={self.clique_tree_corpus.size}'
725
+ logger(msg)
726
+ if each_idx > max_mol:
727
+ break
728
+
729
+ print(f'corpus_size = {self.clique_tree_corpus.size}')
730
+ return prod_rule_seq_list
731
+
732
+ def sample(self, z, deterministic=False):
733
+ """ sample a new hypergraph from HRG.
734
+
735
+ Parameters
736
+ ----------
737
+ z : array-like, shape (len, num_prod_rule)
738
+ logit
739
+ deterministic : bool
740
+ if True, deterministic sampling
741
+
742
+ Returns
743
+ -------
744
+ Hypergraph
745
+ """
746
+ seq_idx = 0
747
+ stack = []
748
+ z = z[:, :-1]
749
+ init_prod_rule = self.prod_rule_corpus.sample(z[0], NTSymbol(degree=0,
750
+ is_aromatic=False,
751
+ bond_symbol_list=[]),
752
+ deterministic=deterministic)
753
+ hg, nt_edge_list = init_prod_rule.applied_to(None, None)
754
+ stack = deepcopy(nt_edge_list[::-1])
755
+ while len(stack) != 0 and seq_idx < z.shape[0]-1:
756
+ seq_idx += 1
757
+ nt_edge = stack.pop()
758
+ nt_symbol = hg.edge_attr(nt_edge)['symbol']
759
+ prod_rule = self.prod_rule_corpus.sample(z[seq_idx], nt_symbol, deterministic=deterministic)
760
+ hg, nt_edge_list = prod_rule.applied_to(hg, nt_edge)
761
+ stack.extend(nt_edge_list[::-1])
762
+ if len(stack) != 0:
763
+ raise RuntimeError(f'{len(stack)} non-terminals are left.')
764
+ return hg
765
+
766
+ def construct(self, prod_rule_seq):
767
+ """ construct a hypergraph following `prod_rule_seq`
768
+
769
+ Parameters
770
+ ----------
771
+ prod_rule_seq : list of integers
772
+ a sequence of production rules.
773
+
774
+ Returns
775
+ -------
776
+ UndirectedHypergraph
777
+ """
778
+ seq_idx = 0
779
+ init_prod_rule = self.prod_rule_corpus.get_prod_rule(prod_rule_seq[seq_idx])
780
+ hg, nt_edge_list = init_prod_rule.applied_to(None, None)
781
+ stack = deepcopy(nt_edge_list[::-1])
782
+ while len(stack) != 0:
783
+ seq_idx += 1
784
+ nt_edge = stack.pop()
785
+ hg, nt_edge_list = self.prod_rule_corpus.get_prod_rule(prod_rule_seq[seq_idx]).applied_to(hg, nt_edge)
786
+ stack.extend(nt_edge_list[::-1])
787
+ return hg
788
+
789
+ def update_prod_rule_list(self, prod_rule):
790
+ """ return whether the input production rule is new or not, and its production rule id.
791
+ Production rules are regarded as the same if
792
+ i) there exists a one-to-one mapping of nodes and edges, and
793
+ ii) all the attributes associated with nodes and hyperedges are the same.
794
+
795
+ Parameters
796
+ ----------
797
+ prod_rule : ProductionRule
798
+
799
+ Returns
800
+ -------
801
+ is_new : bool
802
+ if True, this production rule is new
803
+ prod_rule_id : int
804
+ production rule index. if new, a new index will be assigned.
805
+ """
806
+ return self.prod_rule_corpus.append(prod_rule)
807
+
808
+
809
+ class IncrementalHyperedgeReplacementGrammar(HyperedgeReplacementGrammar):
810
+ '''
811
+ This class learns HRG incrementally leveraging the previously obtained production rules.
812
+ '''
813
+ def __init__(self, tree_decomposition=tree_decomposition_with_hrg, ignore_order=False):
814
+ self.prod_rule_list = []
815
+ self.tree_decomposition = tree_decomposition
816
+ self.ignore_order = ignore_order
817
+
818
+ def learn(self, hg_list):
819
+ """ learn from a list of hypergraphs
820
+
821
+ Parameters
822
+ ----------
823
+ hg_list : list of UndirectedHypergraph
824
+
825
+ Returns
826
+ -------
827
+ prod_rule_seq_list : list of integers
828
+ each element corresponds to a sequence of production rules to generate each hypergraph.
829
+ """
830
+ prod_rule_seq_list = []
831
+ for each_hg in hg_list:
832
+ clique_tree, root_node = tree_decomposition_with_hrg(each_hg, self, return_root=True)
833
+
834
+ prod_rule_seq = []
835
+ stack = []
836
+
837
+ # get a pair of myself and children
838
+ children = sorted(list(clique_tree[root_node].keys()))
839
+
840
+ # extract a temporary production rule
841
+ prod_rule = extract_prod_rule(None, clique_tree.nodes[root_node]["subhg"],
842
+ [clique_tree.nodes[each_child]["subhg"] for each_child in children])
843
+
844
+ # update the production rule list
845
+ prod_rule_id, prod_rule = self.update_prod_rule_list(prod_rule)
846
+ children = reorder_children(root_node, children, prod_rule, clique_tree)
847
+ stack.extend([(root_node, each_child) for each_child in children[::-1]])
848
+ prod_rule_seq.append(prod_rule_id)
849
+
850
+ while len(stack) != 0:
851
+ # get a triple of parent, myself, and children
852
+ parent, myself = stack.pop()
853
+ children = sorted(list(dict(clique_tree[myself]).keys()))
854
+ children.remove(parent)
855
+
856
+ # extract a temp prod rule
857
+ prod_rule = extract_prod_rule(
858
+ clique_tree.nodes[parent]["subhg"], clique_tree.nodes[myself]["subhg"],
859
+ [clique_tree.nodes[each_child]["subhg"] for each_child in children])
860
+
861
+ # update the prod rule list
862
+ prod_rule_id, prod_rule = self.update_prod_rule_list(prod_rule)
863
+ children = reorder_children(myself, children, prod_rule, clique_tree)
864
+ stack.extend([(myself, each_child) for each_child in children[::-1]])
865
+ prod_rule_seq.append(prod_rule_id)
866
+ prod_rule_seq_list.append(prod_rule_seq)
867
+ self._compute_stats()
868
+ return prod_rule_seq_list
869
+
870
+
871
+ def reorder_children(myself, children, prod_rule, clique_tree):
872
+ """ reorder children so that they match the order in `prod_rule`.
873
+
874
+ Parameters
875
+ ----------
876
+ myself : int
877
+ children : list of int
878
+ prod_rule : ProductionRule
879
+ clique_tree : nx.Graph
880
+
881
+ Returns
882
+ -------
883
+ new_children : list of str
884
+ reordered children
885
+ """
886
+ perm = {} # key : `nt_idx`, val : child
887
+ for each_edge in prod_rule.rhs.edges:
888
+ if "nt_idx" in prod_rule.rhs.edge_attr(each_edge).keys():
889
+ for each_child in children:
890
+ common_node_set = set(
891
+ common_node_list(clique_tree.nodes[myself]["subhg"],
892
+ clique_tree.nodes[each_child]["subhg"])[0])
893
+ if set(prod_rule.rhs.nodes_in_edge(each_edge)) == common_node_set:
894
+ assert prod_rule.rhs.edge_attr(each_edge)["nt_idx"] not in perm
895
+ perm[prod_rule.rhs.edge_attr(each_edge)["nt_idx"]] = each_child
896
+ new_children = []
897
+ assert len(perm) == len(children)
898
+ for i in range(len(perm)):
899
+ new_children.append(perm[i])
900
+ return new_children
901
+
902
+
903
+ def extract_prod_rule(parent_hg, myself_hg, children_hg_list, subhg_idx=None):
904
+ """ extract a production rule from a triple of `parent_hg`, `myself_hg`, and `children_hg_list`.
905
+
906
+ Parameters
907
+ ----------
908
+ parent_hg : Hypergraph
909
+ myself_hg : Hypergraph
910
+ children_hg_list : list of Hypergraph
911
+
912
+ Returns
913
+ -------
914
+ ProductionRule, consisting of
915
+ lhs : Hypergraph or None
916
+ rhs : Hypergraph
917
+ """
918
+ def _add_ext_node(hg, ext_nodes):
919
+ """ mark nodes to be external (ordered ids are assigned)
920
+
921
+ Parameters
922
+ ----------
923
+ hg : UndirectedHypergraph
924
+ ext_nodes : list of str
925
+ list of external nodes
926
+
927
+ Returns
928
+ -------
929
+ hg : Hypergraph
930
+ nodes in `ext_nodes` are marked to be external
931
+ """
932
+ ext_id = 0
933
+ ext_id_exists = []
934
+ for each_node in ext_nodes:
935
+ ext_id_exists.append('ext_id' in hg.node_attr(each_node))
936
+ if ext_id_exists and any(ext_id_exists) != all(ext_id_exists):
937
+ raise ValueError
938
+ if not all(ext_id_exists):
939
+ for each_node in ext_nodes:
940
+ hg.node_attr(each_node)['ext_id'] = ext_id
941
+ ext_id += 1
942
+ return hg
943
+
944
+ def _check_aromatic(hg, node_list):
945
+ is_aromatic = False
946
+ node_aromatic_list = []
947
+ for each_node in node_list:
948
+ if hg.node_attr(each_node)['symbol'].is_aromatic:
949
+ is_aromatic = True
950
+ node_aromatic_list.append(True)
951
+ else:
952
+ node_aromatic_list.append(False)
953
+ return is_aromatic, node_aromatic_list
954
+
955
+ def _check_ring(hg):
956
+ for each_edge in hg.edges:
957
+ if not ('tmp' in hg.edge_attr(each_edge) or (not hg.edge_attr(each_edge)['terminal'])):
958
+ return False
959
+ return True
960
+
961
+ if parent_hg is None:
962
+ lhs = Hypergraph()
963
+ node_list = []
964
+ else:
965
+ lhs = Hypergraph()
966
+ node_list, edge_exists = common_node_list(parent_hg, myself_hg)
967
+ for each_node in node_list:
968
+ lhs.add_node(each_node,
969
+ deepcopy(myself_hg.node_attr(each_node)))
970
+ is_aromatic, _ = _check_aromatic(parent_hg, node_list)
971
+ for_ring = _check_ring(myself_hg)
972
+ bond_symbol_list = []
973
+ for each_node in node_list:
974
+ bond_symbol_list.append(parent_hg.node_attr(each_node)['symbol'])
975
+ lhs.add_edge(
976
+ node_list,
977
+ attr_dict=dict(
978
+ terminal=False,
979
+ edge_exists=edge_exists,
980
+ symbol=NTSymbol(
981
+ degree=len(node_list),
982
+ is_aromatic=is_aromatic,
983
+ bond_symbol_list=bond_symbol_list,
984
+ for_ring=for_ring)))
985
+ try:
986
+ lhs = _add_ext_node(lhs, node_list)
987
+ except ValueError:
988
+ import pdb; pdb.set_trace()
989
+
990
+ rhs = remove_tmp_edge(deepcopy(myself_hg))
991
+ #rhs = remove_ext_node(rhs)
992
+ #rhs = remove_nt_edge(rhs)
993
+ try:
994
+ rhs = _add_ext_node(rhs, node_list)
995
+ except ValueError:
996
+ import pdb; pdb.set_trace()
997
+
998
+ nt_idx = 0
999
+ if children_hg_list is not None:
1000
+ for each_child_hg in children_hg_list:
1001
+ node_list, edge_exists = common_node_list(myself_hg, each_child_hg)
1002
+ is_aromatic, _ = _check_aromatic(myself_hg, node_list)
1003
+ for_ring = _check_ring(each_child_hg)
1004
+ bond_symbol_list = []
1005
+ for each_node in node_list:
1006
+ bond_symbol_list.append(myself_hg.node_attr(each_node)['symbol'])
1007
+ rhs.add_edge(
1008
+ node_list,
1009
+ attr_dict=dict(
1010
+ terminal=False,
1011
+ nt_idx=nt_idx,
1012
+ edge_exists=edge_exists,
1013
+ symbol=NTSymbol(degree=len(node_list),
1014
+ is_aromatic=is_aromatic,
1015
+ bond_symbol_list=bond_symbol_list,
1016
+ for_ring=for_ring)))
1017
+ nt_idx += 1
1018
+ prod_rule = ProductionRule(lhs, rhs)
1019
+ prod_rule.subhg_idx = subhg_idx
1020
+ if DEBUG:
1021
+ if sorted(list(prod_rule.ext_node.keys())) \
1022
+ != list(np.arange(len(prod_rule.ext_node))):
1023
+ raise RuntimeError('ext_id is not continuous')
1024
+ return prod_rule
1025
+
1026
+
1027
+ def _find_root(clique_tree):
1028
+ max_node = None
1029
+ num_nodes_max = -np.inf
1030
+ for each_node in clique_tree.nodes:
1031
+ if clique_tree.nodes[each_node]['subhg'].num_nodes > num_nodes_max:
1032
+ max_node = each_node
1033
+ num_nodes_max = clique_tree.nodes[each_node]['subhg'].num_nodes
1034
+ '''
1035
+ children = sorted(list(clique_tree[each_node].keys()))
1036
+ prod_rule = extract_prod_rule(None,
1037
+ clique_tree.nodes[each_node]["subhg"],
1038
+ [clique_tree.nodes[each_child]["subhg"]
1039
+ for each_child in children])
1040
+ for each_start_rule in start_rule_list:
1041
+ if prod_rule.is_same(each_start_rule):
1042
+ return each_node
1043
+ '''
1044
+ return max_node
1045
+
1046
+ def remove_ext_node(hg):
1047
+ for each_node in hg.nodes:
1048
+ hg.node_attr(each_node).pop('ext_id', None)
1049
+ return hg
1050
+
1051
+ def remove_nt_edge(hg):
1052
+ remove_edge_list = []
1053
+ for each_edge in hg.edges:
1054
+ if not hg.edge_attr(each_edge)['terminal']:
1055
+ remove_edge_list.append(each_edge)
1056
+ hg.remove_edges(remove_edge_list)
1057
+ return hg
1058
+
1059
+ def remove_tmp_edge(hg):
1060
+ remove_edge_list = []
1061
+ for each_edge in hg.edges:
1062
+ if hg.edge_attr(each_edge).get('tmp', False):
1063
+ remove_edge_list.append(each_edge)
1064
+ hg.remove_edges(remove_edge_list)
1065
+ return hg
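For orientation, a minimal sketch of what `_add_ext_node` does to the nodes shared between a parent and a child subhypergraph (the `Hypergraph` class is added later in this commit; the node names are illustrative):

hg = Hypergraph()
hg.add_node('bond_0', attr_dict={})
hg.add_node('bond_1', attr_dict={})
# external ids are assigned in the order the nodes are listed
for ext_id, name in enumerate(['bond_0', 'bond_1']):
    hg.node_attr(name)['ext_id'] = ext_id
assert hg.node_attr('bond_1')['ext_id'] == 1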
models/mhg_model/graph_grammar/graph_grammar/symbols.py ADDED
@@ -0,0 +1,180 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Rhizome
4
+ # Version beta 0.0, August 2023
5
+ # Property of IBM Research, Accelerated Discovery
6
+ #
7
+
8
+ """
9
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
10
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
11
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
12
+ """
13
+
14
+
15
+ """ Title """
16
+
17
+ __author__ = "Hiroshi Kajino <[email protected]>"
18
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
19
+ __version__ = "0.1"
20
+ __date__ = "Jan 1 2018"
21
+
22
+ from typing import List
23
+
24
+ class TSymbol(object):
25
+
26
+ ''' terminal symbol
27
+
28
+ Attributes
29
+ ----------
30
+ degree : int
31
+ the number of nodes in a hyperedge
32
+ is_aromatic : bool
33
+ whether or not the hyperedge is in an aromatic ring
34
+ symbol : str
35
+ atomic symbol
36
+ num_explicit_Hs : int
37
+ the number of hydrogens associated to this hyperedge
38
+ formal_charge : int
39
+ charge
40
+ chirality : int
41
+ chirality
42
+ '''
43
+
44
+ def __init__(self, degree, is_aromatic,
45
+ symbol, num_explicit_Hs, formal_charge, chirality):
46
+ self.degree = degree
47
+ self.is_aromatic = is_aromatic
48
+ self.symbol = symbol
49
+ self.num_explicit_Hs = num_explicit_Hs
50
+ self.formal_charge = formal_charge
51
+ self.chirality = chirality
52
+
53
+ @property
54
+ def terminal(self):
55
+ return True
56
+
57
+ def __eq__(self, other):
58
+ if not isinstance(other, TSymbol):
59
+ return False
60
+ if self.degree != other.degree:
61
+ return False
62
+ if self.is_aromatic != other.is_aromatic:
63
+ return False
64
+ if self.symbol != other.symbol:
65
+ return False
66
+ if self.num_explicit_Hs != other.num_explicit_Hs:
67
+ return False
68
+ if self.formal_charge != other.formal_charge:
69
+ return False
70
+ if self.chirality != other.chirality:
71
+ return False
72
+ return True
73
+
74
+ def __hash__(self):
75
+ return self.__str__().__hash__()
76
+
77
+ def __str__(self):
78
+ return f'degree={self.degree}, is_aromatic={self.is_aromatic}, '\
79
+ f'symbol={self.symbol}, '\
80
+ f'num_explicit_Hs={self.num_explicit_Hs}, '\
81
+ f'formal_charge={self.formal_charge}, chirality={self.chirality}'
82
+
83
+
84
+ class NTSymbol(object):
85
+
86
+ ''' non-terminal symbol
87
+
88
+ Attributes
89
+ ----------
90
+ degree : int
91
+ degree of the hyperedge
92
+ is_aromatic : bool
93
+ if True, at least one of the associated bonds must be aromatic.
94
+ bond_symbol_list : list of BondSymbol
+ bond symbol associated with each node in the hyperedge
+ for_ring : bool
+ if True, this non-terminal symbol stands for a ring substructure
98
+ '''
99
+
100
+ def __init__(self, degree: int, is_aromatic: bool,
101
+ bond_symbol_list: list,
102
+ for_ring=False):
103
+ self.degree = degree
104
+ self.is_aromatic = is_aromatic
105
+ self.for_ring = for_ring
106
+ self.bond_symbol_list = bond_symbol_list
107
+
108
+ @property
109
+ def terminal(self) -> bool:
110
+ return False
111
+
112
+ @property
113
+ def symbol(self):
114
+ return f'NT{self.degree}'
115
+
116
+ def __eq__(self, other) -> bool:
117
+ if not isinstance(other, NTSymbol):
118
+ return False
119
+
120
+ if self.degree != other.degree:
121
+ return False
122
+ if self.is_aromatic != other.is_aromatic:
123
+ return False
124
+ if self.for_ring != other.for_ring:
125
+ return False
126
+ if len(self.bond_symbol_list) != len(other.bond_symbol_list):
127
+ return False
128
+ for each_idx in range(len(self.bond_symbol_list)):
129
+ if self.bond_symbol_list[each_idx] != other.bond_symbol_list[each_idx]:
130
+ return False
131
+ return True
132
+
133
+ def __hash__(self):
134
+ return self.__str__().__hash__()
135
+
136
+ def __str__(self) -> str:
137
+ return f'degree={self.degree}, is_aromatic={self.is_aromatic}, '\
138
+ f'bond_symbol_list={[str(each_symbol) for each_symbol in self.bond_symbol_list]}, '\
139
+ f'for_ring={self.for_ring}'
140
+
141
+
142
+ class BondSymbol(object):
143
+
144
+
145
+ ''' Bond symbol
146
+
147
+ Attributes
148
+ ----------
149
+ is_aromatic : bool
150
+ if True, the bond is aromatic.
+ bond_type : int
+ bond type, as an integer code (see `bond_attr` in `io/smi.py` for the mapping)
+ stereo : int
+ stereo configuration of the bond
153
+ '''
154
+
155
+ def __init__(self, is_aromatic: bool,
156
+ bond_type: int,
157
+ stereo: int):
158
+ self.is_aromatic = is_aromatic
159
+ self.bond_type = bond_type
160
+ self.stereo = stereo
161
+
162
+ def __eq__(self, other) -> bool:
163
+ if not isinstance(other, BondSymbol):
164
+ return False
165
+
166
+ if self.is_aromatic != other.is_aromatic:
167
+ return False
168
+ if self.bond_type != other.bond_type:
169
+ return False
170
+ if self.stereo != other.stereo:
171
+ return False
172
+ return True
173
+
174
+ def __hash__(self):
175
+ return self.__str__().__hash__()
176
+
177
+ def __str__(self) -> str:
178
+ return f'is_aromatic={self.is_aromatic}, '\
179
+ f'bond_type={self.bond_type}, '\
180
+ f'stereo={self.stereo}, '
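A minimal usage sketch of the three symbol classes above; equality is attribute-wise and hashing goes through `__str__`:

carbon = TSymbol(degree=0, is_aromatic=False, symbol='C',
                 num_explicit_Hs=0, formal_charge=0, chirality=0)
assert carbon == TSymbol(degree=0, is_aromatic=False, symbol='C',
                         num_explicit_Hs=0, formal_charge=0, chirality=0)
single = BondSymbol(is_aromatic=False, bond_type=1, stereo=0)
nt = NTSymbol(degree=2, is_aromatic=False,
              bond_symbol_list=[single, single], for_ring=False)
assert not nt.terminal and nt.symbol == 'NT2'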
models/mhg_model/graph_grammar/graph_grammar/utils.py ADDED
@@ -0,0 +1,130 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Rhizome
4
+ # Version beta 0.0, August 2023
5
+ # Property of IBM Research, Accelerated Discovery
6
+ #
7
+
8
+ """
9
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
10
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
11
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
12
+ """
13
+
14
+ """ Title """
15
+
16
+ __author__ = "Hiroshi Kajino <[email protected]>"
17
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
18
+ __version__ = "0.1"
19
+ __date__ = "Jun 4 2018"
20
+
21
+ from ..hypergraph import Hypergraph
22
+ from copy import deepcopy
23
+ from typing import List, Tuple
24
+ import numpy as np
25
+
26
+
27
+ def common_node_list(hg1: Hypergraph, hg2: Hypergraph) -> Tuple[List[str], bool]:
28
+ """ return a list of common nodes
29
+
30
+ Parameters
31
+ ----------
32
+ hg1, hg2 : Hypergraph
33
+
34
+ Returns
35
+ -------
36
+ node_list : list of str
+ list of common nodes
+ edge_exists : bool
+ True if `hg1` already has a hyperedge spanning exactly the common nodes
38
+ """
39
+ if hg1 is None or hg2 is None:
40
+ return [], False
41
+ else:
42
+ node_set = hg1.nodes.intersection(hg2.nodes)
43
+ node_dict = {}
44
+ if 'order4hrg' in hg1.node_attr(list(hg1.nodes)[0]):
45
+ for each_node in node_set:
46
+ node_dict[each_node] = hg1.node_attr(each_node)['order4hrg']
47
+ else:
48
+ for each_node in node_set:
49
+ node_dict[each_node] = hg1.node_attr(each_node)['symbol'].__hash__()
50
+ node_list = []
51
+ for each_key, _ in sorted(node_dict.items(), key=lambda x:x[1]):
52
+ node_list.append(each_key)
53
+ edge_name = hg1.has_edge(node_list, ignore_order=True)
54
+ if edge_name:
55
+ if not hg1.edge_attr(edge_name).get('terminal', True):
56
+ node_list = hg1.nodes_in_edge(edge_name)
57
+ return node_list, True
58
+ else:
59
+ return node_list, False
60
+
61
+
62
+ def _node_match(node1, node2):
63
+ # if the nodes are hyperedges, `atom_attr` determines the match
64
+ if node1['bipartite'] == 'edge' and node2['bipartite'] == 'edge':
65
+ return node1["attr_dict"]['symbol'] == node2["attr_dict"]['symbol']
66
+ elif node1['bipartite'] == 'node' and node2['bipartite'] == 'node':
67
+ # bond_symbol
68
+ return node1['attr_dict']['symbol'] == node2['attr_dict']['symbol']
69
+ else:
70
+ return False
71
+
72
+ def _easy_node_match(node1, node2):
73
+ # if the nodes are hyperedges, `atom_attr` determines the match
74
+ if node1['bipartite'] == 'edge' and node2['bipartite'] == 'edge':
75
+ return node1["attr_dict"].get('symbol', None) == node2["attr_dict"].get('symbol', None)
76
+ elif node1['bipartite'] == 'node' and node2['bipartite'] == 'node':
77
+ # bond_symbol
78
+ return node1['attr_dict'].get('ext_id', -1) == node2['attr_dict'].get('ext_id', -1)\
79
+ and node1['attr_dict']['symbol'] == node2['attr_dict']['symbol']
80
+ else:
81
+ return False
82
+
83
+
84
+ def _node_match_prod_rule(node1, node2, ignore_order=False):
85
+ # if the nodes are hyperedges, `atom_attr` determines the match
86
+ if node1['bipartite'] == 'edge' and node2['bipartite'] == 'edge':
87
+ return node1["attr_dict"]['symbol'] == node2["attr_dict"]['symbol']
88
+ elif node1['bipartite'] == 'node' and node2['bipartite'] == 'node':
89
+ # ext_id, order4hrg, bond_symbol
90
+ if ignore_order:
91
+ return node1['attr_dict']['symbol'] == node2['attr_dict']['symbol']
92
+ else:
93
+ return node1['attr_dict']['symbol'] == node2['attr_dict']['symbol']\
94
+ and node1['attr_dict'].get('ext_id', -1) == node2['attr_dict'].get('ext_id', -1)
95
+ else:
96
+ return False
97
+
98
+
99
+ def _edge_match(edge1, edge2, ignore_order=False):
100
+ #return True
101
+ if ignore_order:
102
+ return True
103
+ else:
104
+ return edge1["order"] == edge2["order"]
105
+
106
+ def masked_softmax(logit, mask):
107
+ ''' compute a probability distribution from logit
108
+
109
+ Parameters
110
+ ----------
111
+ logit : array-like, length D
112
+ each element indicates how each dimension is likely to be chosen
113
+ (the larger, the more likely)
114
+ mask : array-like, length D
115
+ each element is either 0 or 1.
116
+ if 0, the dimension is ignored
117
+ when computing the probability distribution.
118
+
119
+ Returns
120
+ -------
121
+ prob_dist : array, length D
122
+ probability distribution computed from logit.
123
+ if `mask[d] = 0`, `prob_dist[d] = 0`.
124
+ '''
125
+ if logit.shape != mask.shape:
126
+ raise ValueError('logit and mask must have the same shape')
127
+ c = np.max(logit)
128
+ exp_logit = np.exp(logit - c) * mask
129
+ sum_exp_logit = exp_logit @ mask
130
+ return exp_logit / sum_exp_logit
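A quick numerical check of `masked_softmax` (a sketch that assumes only numpy):

import numpy as np
logit = np.array([1.0, 2.0, 3.0])
mask = np.array([1.0, 0.0, 1.0])
prob = masked_softmax(logit, mask)
# the masked dimension gets zero probability; the rest renormalize to 1
assert prob[1] == 0.0 and abs(prob.sum() - 1.0) < 1e-12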
models/mhg_model/graph_grammar/hypergraph.py ADDED
@@ -0,0 +1,544 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Rhizome
4
+ # Version beta 0.0, August 2023
5
+ # Property of IBM Research, Accelerated Discovery
6
+ #
7
+
8
+ """
9
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
10
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
11
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
12
+ """
13
+
14
+ """ Title """
15
+
16
+ __author__ = "Hiroshi Kajino <[email protected]>"
17
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
18
+ __version__ = "0.1"
19
+ __date__ = "Jan 31 2018"
20
+
21
+ from copy import deepcopy
22
+ from typing import List, Dict, Tuple
23
+ import networkx as nx
24
+ import numpy as np
25
+ import os
26
+
27
+
28
+ class Hypergraph(object):
29
+ '''
30
+ A class of a hypergraph.
31
+ Each hyperedge can be ordered. For the ordered case,
32
+ edges adjacent to the hyperedge node are labeled by their orders.
33
+
34
+ Attributes
35
+ ----------
36
+ hg : nx.Graph
37
+ a bipartite graph representation of a hypergraph
38
+ edge_idx : int
39
+ total number of hyperedges that exist so far
40
+ '''
41
+ def __init__(self):
42
+ self.hg = nx.Graph()
43
+ self.edge_idx = 0
44
+ self.nodes = set([])
45
+ self.num_nodes = 0
46
+ self.edges = set([])
47
+ self.num_edges = 0
48
+ self.nodes_in_edge_dict = {}
49
+
50
+ def add_node(self, node: str, attr_dict=None):
51
+ ''' add a node to hypergraph
52
+
53
+ Parameters
54
+ ----------
55
+ node : str
56
+ node name
57
+ attr_dict : dict
58
+ dictionary of node attributes
59
+ '''
60
+ self.hg.add_node(node, bipartite='node', attr_dict=attr_dict)
61
+ if node not in self.nodes:
62
+ self.num_nodes += 1
63
+ self.nodes.add(node)
64
+
65
+ def add_edge(self, node_list: List[str], attr_dict=None, edge_name=None):
66
+ ''' add an edge consisting of nodes `node_list`
67
+
68
+ Parameters
69
+ ----------
70
+ node_list : list
71
+ ordered list of nodes that consist the edge
72
+ attr_dict : dict
73
+ dictionary of edge attributes
74
+ '''
75
+ if edge_name is None:
76
+ edge = 'e{}'.format(self.edge_idx)
77
+ else:
78
+ assert edge_name not in self.edges
79
+ edge = edge_name
80
+ self.hg.add_node(edge, bipartite='edge', attr_dict=attr_dict)
81
+ if edge not in self.edges:
82
+ self.num_edges += 1
83
+ self.edges.add(edge)
84
+ self.nodes_in_edge_dict[edge] = node_list
85
+ if type(node_list) == list:
86
+ for node_idx, each_node in enumerate(node_list):
87
+ self.hg.add_edge(edge, each_node, order=node_idx)
88
+ if each_node not in self.nodes:
89
+ self.num_nodes += 1
90
+ self.nodes.add(each_node)
91
+
92
+ elif type(node_list) == set:
93
+ for each_node in node_list:
94
+ self.hg.add_edge(edge, each_node, order=-1)
95
+ if each_node not in self.nodes:
96
+ self.num_nodes += 1
97
+ self.nodes.add(each_node)
98
+ else:
99
+ raise ValueError
100
+ self.edge_idx += 1
101
+ return edge
102
+
103
+ def remove_node(self, node: str, remove_connected_edges=True):
104
+ ''' remove a node
105
+
106
+ Parameters
107
+ ----------
108
+ node : str
109
+ node name
110
+ remove_connected_edges : bool
111
+ if True, remove edges that are adjacent to the node
112
+ '''
113
+ if remove_connected_edges:
114
+ connected_edges = deepcopy(self.adj_edges(node))
115
+ for each_edge in connected_edges:
116
+ self.remove_edge(each_edge)
117
+ self.hg.remove_node(node)
118
+ self.num_nodes -= 1
119
+ self.nodes.remove(node)
120
+
121
+ def remove_nodes(self, node_iter, remove_connected_edges=True):
122
+ ''' remove a set of nodes
123
+
124
+ Parameters
125
+ ----------
126
+ node_iter : iterator of strings
127
+ nodes to be removed
128
+ remove_connected_edges : bool
129
+ if True, remove edges that are adjacent to the node
130
+ '''
131
+ for each_node in node_iter:
132
+ self.remove_node(each_node, remove_connected_edges)
133
+
134
+ def remove_edge(self, edge: str):
135
+ ''' remove an edge
136
+
137
+ Parameters
138
+ ----------
139
+ edge : str
140
+ edge to be removed
141
+ '''
142
+ self.hg.remove_node(edge)
143
+ self.edges.remove(edge)
144
+ self.num_edges -= 1
145
+ self.nodes_in_edge_dict.pop(edge)
146
+
147
+ def remove_edges(self, edge_iter):
148
+ ''' remove a set of edges
149
+
150
+ Parameters
151
+ ----------
152
+ edge_iter : iterator of strings
153
+ edges to be removed
154
+ '''
155
+ for each_edge in edge_iter:
156
+ self.remove_edge(each_edge)
157
+
158
+ def remove_edges_with_attr(self, edge_attr_dict):
159
+ remove_edge_list = []
160
+ for each_edge in self.edges:
161
+ satisfy = True
162
+ for each_key, each_val in edge_attr_dict.items():
163
+ if not satisfy:
164
+ break
165
+ try:
166
+ if self.edge_attr(each_edge)[each_key] != each_val:
167
+ satisfy = False
168
+ except KeyError:
169
+ satisfy = False
170
+ if satisfy:
171
+ remove_edge_list.append(each_edge)
172
+ self.remove_edges(remove_edge_list)
173
+
174
+ def remove_subhg(self, subhg):
175
+ ''' remove subhypergraph.
176
+ all of the hyperedges are removed.
177
+ each node of subhg is removed if its degree becomes 0 after removing hyperedges.
178
+
179
+ Parameters
180
+ ----------
181
+ subhg : Hypergraph
182
+ '''
183
+ for each_edge in subhg.edges:
184
+ self.remove_edge(each_edge)
185
+ for each_node in subhg.nodes:
186
+ if self.degree(each_node) == 0:
187
+ self.remove_node(each_node)
188
+
189
+ def nodes_in_edge(self, edge):
190
+ ''' return an ordered list of nodes in a given edge.
191
+
192
+ Parameters
193
+ ----------
194
+ edge : str
195
+ edge whose nodes are returned
196
+
197
+ Returns
198
+ -------
199
+ list or set
200
+ ordered list or set of nodes that belong to the edge
201
+ '''
202
+ if edge.startswith('e'):
203
+ return self.nodes_in_edge_dict[edge]
204
+ else:
205
+ adj_node_list = self.hg.adj[edge]
206
+ adj_node_order_list = []
207
+ adj_node_name_list = []
208
+ for each_node in adj_node_list:
209
+ adj_node_order_list.append(adj_node_list[each_node]['order'])
210
+ adj_node_name_list.append(each_node)
211
+ if adj_node_order_list == [-1] * len(adj_node_order_list):
212
+ return set(adj_node_name_list)
213
+ else:
214
+ return [adj_node_name_list[each_idx] for each_idx
215
+ in np.argsort(adj_node_order_list)]
216
+
217
+ def adj_edges(self, node):
218
+ ''' return a dict of adjacent hyperedges
219
+
220
+ Parameters
221
+ ----------
222
+ node : str
223
+
224
+ Returns
225
+ -------
226
+ set
227
+ set of edges that are adjacent to `node`
228
+ '''
229
+ return self.hg.adj[node]
230
+
231
+ def adj_nodes(self, node):
232
+ ''' return a set of adjacent nodes
233
+
234
+ Parameters
235
+ ----------
236
+ node : str
237
+
238
+ Returns
239
+ -------
240
+ set
241
+ set of nodes that are adjacent to `node`
242
+ '''
243
+ node_set = set([])
244
+ for each_adj_edge in self.adj_edges(node):
245
+ node_set.update(set(self.nodes_in_edge(each_adj_edge)))
246
+ node_set.discard(node)
247
+ return node_set
248
+
249
+ def has_edge(self, node_list, ignore_order=False):
250
+ for each_edge in self.edges:
251
+ if ignore_order:
252
+ if set(self.nodes_in_edge(each_edge)) == set(node_list):
253
+ return each_edge
254
+ else:
255
+ if self.nodes_in_edge(each_edge) == node_list:
256
+ return each_edge
257
+ return False
258
+
259
+ def degree(self, node):
260
+ return len(self.hg.adj[node])
261
+
262
+ def degrees(self):
263
+ return {each_node: self.degree(each_node) for each_node in self.nodes}
264
+
265
+ def edge_degree(self, edge):
266
+ return len(self.nodes_in_edge(edge))
267
+
268
+ def edge_degrees(self):
269
+ return {each_edge: self.edge_degree(each_edge) for each_edge in self.edges}
270
+
271
+ def is_adj(self, node1, node2):
272
+ return node1 in self.adj_nodes(node2)
273
+
274
+ def adj_subhg(self, node, ident_node_dict=None):
275
+ """ return a subhypergraph consisting of a set of nodes and hyperedges adjacent to `node`.
276
+ if an adjacent node has a self-loop hyperedge, it will be also added to the subhypergraph.
277
+
278
+ Parameters
279
+ ----------
280
+ node : str
281
+ ident_node_dict : dict
282
+ dict containing identical nodes. see `get_identical_node_dict` for more details
283
+
284
+ Returns
285
+ -------
286
+ subhg : Hypergraph
287
+ """
288
+ if ident_node_dict is None:
289
+ ident_node_dict = self.get_identical_node_dict()
290
+ adj_node_set = set(ident_node_dict[node])
291
+ adj_edge_set = set([])
292
+ for each_node in ident_node_dict[node]:
293
+ adj_edge_set.update(set(self.adj_edges(each_node)))
294
+ fixed_adj_edge_set = deepcopy(adj_edge_set)
295
+ for each_edge in fixed_adj_edge_set:
296
+ other_nodes = self.nodes_in_edge(each_edge)
297
+ adj_node_set.update(other_nodes)
298
+
299
+ # if the adjacent node has self-loop edge, it will be appended to adj_edge_list.
300
+ for each_node in other_nodes:
301
+ for other_edge in set(self.adj_edges(each_node)) - set([each_edge]):
302
+ if len(set(self.nodes_in_edge(other_edge)) \
303
+ - set(self.nodes_in_edge(each_edge))) == 0:
304
+ adj_edge_set.update(set([other_edge]))
305
+ subhg = Hypergraph()
306
+ for each_node in adj_node_set:
307
+ subhg.add_node(each_node, attr_dict=self.node_attr(each_node))
308
+ for each_edge in adj_edge_set:
309
+ subhg.add_edge(self.nodes_in_edge(each_edge),
310
+ attr_dict=self.edge_attr(each_edge),
311
+ edge_name=each_edge)
312
+ subhg.edge_idx = self.edge_idx
313
+ return subhg
314
+
315
+ def get_subhg(self, node_list, edge_list, ident_node_dict=None):
316
+ """ return a subhypergraph consisting of a set of nodes and hyperedges adjacent to `node`.
317
+ if an adjacent node has a self-loop hyperedge, it will be also added to the subhypergraph.
318
+
319
+ Parameters
320
+ ----------
321
+ node : str
322
+ ident_node_dict : dict
323
+ dict containing identical nodes. see `get_identical_node_dict` for more details
324
+
325
+ Returns
326
+ -------
327
+ subhg : Hypergraph
328
+ """
329
+ if ident_node_dict is None:
330
+ ident_node_dict = self.get_identical_node_dict()
331
+ adj_node_set = set([])
332
+ for each_node in node_list:
333
+ adj_node_set.update(set(ident_node_dict[each_node]))
334
+ adj_edge_set = set(edge_list)
335
+
336
+ subhg = Hypergraph()
337
+ for each_node in adj_node_set:
338
+ subhg.add_node(each_node,
339
+ attr_dict=deepcopy(self.node_attr(each_node)))
340
+ for each_edge in adj_edge_set:
341
+ subhg.add_edge(self.nodes_in_edge(each_edge),
342
+ attr_dict=deepcopy(self.edge_attr(each_edge)),
343
+ edge_name=each_edge)
344
+ subhg.edge_idx = self.edge_idx
345
+ return subhg
346
+
347
+ def copy(self):
348
+ ''' return a copy of the object
349
+
350
+ Returns
351
+ -------
352
+ Hypergraph
353
+ '''
354
+ return deepcopy(self)
355
+
356
+ def node_attr(self, node):
357
+ return self.hg.nodes[node]['attr_dict']
358
+
359
+ def edge_attr(self, edge):
360
+ return self.hg.nodes[edge]['attr_dict']
361
+
362
+ def set_node_attr(self, node, attr_dict):
363
+ for each_key, each_val in attr_dict.items():
364
+ self.hg.nodes[node]['attr_dict'][each_key] = each_val
365
+
366
+ def set_edge_attr(self, edge, attr_dict):
367
+ for each_key, each_val in attr_dict.items():
368
+ self.hg.nodes[edge]['attr_dict'][each_key] = each_val
369
+
370
+ def get_identical_node_dict(self):
371
+ ''' get identical nodes
372
+ nodes are identical if they share the same set of adjacent edges.
373
+
374
+ Returns
375
+ -------
376
+ ident_node_dict : dict
377
+ ident_node_dict[node] returns a list of nodes that are identical to `node`.
378
+ '''
379
+ ident_node_dict = {}
380
+ for each_node in self.nodes:
381
+ ident_node_list = []
382
+ for each_other_node in self.nodes:
383
+ if each_other_node == each_node:
384
+ ident_node_list.append(each_other_node)
385
+ elif self.adj_edges(each_node) == self.adj_edges(each_other_node) \
386
+ and len(self.adj_edges(each_node)) != 0:
387
+ ident_node_list.append(each_other_node)
388
+ ident_node_dict[each_node] = ident_node_list
389
+ return ident_node_dict
390
+ '''
391
+ ident_node_dict = {}
392
+ for each_node in self.nodes:
393
+ ident_node_dict[each_node] = [each_node]
394
+ return ident_node_dict
395
+ '''
396
+
397
+ def get_leaf_edge(self):
398
+ ''' get an edge that is incident only to one edge
399
+
400
+ Returns
401
+ -------
402
+ if exists, return a leaf edge. otherwise, return None.
403
+ '''
404
+ for each_edge in self.edges:
405
+ if len(self.adj_nodes(each_edge)) == 1:
406
+ if 'tmp' not in self.edge_attr(each_edge):
407
+ return each_edge
408
+ return None
409
+
410
+ def get_nontmp_edge(self):
411
+ for each_edge in self.edges:
412
+ if 'tmp' not in self.edge_attr(each_edge):
413
+ return each_edge
414
+ return None
415
+
416
+ def is_subhg(self, hg):
417
+ ''' return whether this hypergraph is a subhypergraph of `hg`
418
+
419
+ Returns
420
+ -------
421
+ True if self is a subhypergraph of `hg`,
422
+ False otherwise.
423
+ '''
424
+ for each_node in self.nodes:
425
+ if each_node not in hg.nodes:
426
+ return False
427
+ for each_edge in self.edges:
428
+ if each_edge not in hg.edges:
429
+ return False
430
+ return True
431
+
432
+ def in_cycle(self, node, visited=None, parent='', root_node='') -> bool:
433
+ ''' if `node` is in a cycle, then return True. otherwise, False.
434
+
435
+ Parameters
436
+ ----------
437
+ node : str
438
+ node in a hypergraph
439
+ visited : list
440
+ list of visited nodes, used for recursion
441
+ parent : str
442
+ parent node, used to eliminate a cycle consisting of two nodes and one edge.
443
+
444
+ Returns
445
+ -------
446
+ bool
447
+ '''
448
+ if visited is None:
449
+ visited = []
450
+ if parent == '':
451
+ visited = []
452
+ if root_node == '':
453
+ root_node = node
454
+ visited.append(node)
455
+ for each_adj_node in self.adj_nodes(node):
456
+ if each_adj_node not in visited:
457
+ if self.in_cycle(each_adj_node, visited, node, root_node):
458
+ return True
459
+ elif each_adj_node != parent and each_adj_node == root_node:
460
+ return True
461
+ return False
462
+
463
+
464
+ def draw(self, file_path=None, with_node=False, with_edge_name=False):
465
+ ''' draw hypergraph
466
+ '''
467
+ import graphviz
468
+ G = graphviz.Graph(format='png')
469
+ for each_node in self.nodes:
470
+ if 'ext_id' in self.node_attr(each_node):
471
+ G.node(each_node, label='',
472
+ shape='circle', width='0.1', height='0.1', style='filled',
473
+ fillcolor='black')
474
+ else:
475
+ if with_node:
476
+ G.node(each_node, label='',
477
+ shape='circle', width='0.1', height='0.1', style='filled',
478
+ fillcolor='gray')
479
+ edge_list = []
480
+ for each_edge in self.edges:
481
+ if self.edge_attr(each_edge).get('terminal', False):
482
+ G.node(each_edge,
483
+ label=self.edge_attr(each_edge)['symbol'].symbol if not with_edge_name \
484
+ else self.edge_attr(each_edge)['symbol'].symbol + ', ' + each_edge,
485
+ fontcolor='black', shape='square')
486
+ elif self.edge_attr(each_edge).get('tmp', False):
487
+ G.node(each_edge, label='tmp' if not with_edge_name else 'tmp, ' + each_edge,
488
+ fontcolor='black', shape='square')
489
+ else:
490
+ G.node(each_edge,
491
+ label=self.edge_attr(each_edge)['symbol'].symbol if not with_edge_name \
492
+ else self.edge_attr(each_edge)['symbol'].symbol + ', ' + each_edge,
493
+ fontcolor='black', shape='square', style='filled')
494
+ if with_node:
495
+ for each_node in self.nodes_in_edge(each_edge):
496
+ G.edge(each_edge, each_node)
497
+ else:
498
+ for each_node in self.nodes_in_edge(each_edge):
499
+ if 'ext_id' in self.node_attr(each_node)\
500
+ and set([each_node, each_edge]) not in edge_list:
501
+ G.edge(each_edge, each_node)
502
+ edge_list.append(set([each_node, each_edge]))
503
+ for each_other_edge in self.adj_nodes(each_edge):
504
+ if set([each_edge, each_other_edge]) not in edge_list:
505
+ num_bond = 0
506
+ common_node_set = set(self.nodes_in_edge(each_edge))\
507
+ .intersection(set(self.nodes_in_edge(each_other_edge)))
508
+ for each_node in common_node_set:
509
+ if self.node_attr(each_node)['symbol'].bond_type in [1, 2, 3]:
510
+ num_bond += self.node_attr(each_node)['symbol'].bond_type
511
+ elif self.node_attr(each_node)['symbol'].bond_type in [12]:
512
+ num_bond += 1
513
+ else:
514
+ raise NotImplementedError('unsupported bond type')
515
+ for _ in range(num_bond):
516
+ G.edge(each_edge, each_other_edge)
517
+ edge_list.append(set([each_edge, each_other_edge]))
518
+ if file_path is not None:
519
+ G.render(file_path, cleanup=True)
520
+ #os.remove(file_path)
521
+ return G
522
+
523
+ def is_dividable(self, node):
524
+ _hg = deepcopy(self.hg)
525
+ _hg.remove_node(node)
526
+ return (not nx.is_connected(_hg))
527
+
528
+ def divide(self, node):
529
+ subhg_list = []
530
+
531
+ hg_wo_node = deepcopy(self)
532
+ hg_wo_node.remove_node(node, remove_connected_edges=False)
533
+ connected_components = nx.connected_components(hg_wo_node.hg)
534
+ for each_component in connected_components:
535
+ node_list = [node]
536
+ edge_list = []
537
+ node_list.extend([each_node for each_node in each_component
538
+ if each_node.startswith('bond_')])
539
+ edge_list.extend([each_edge for each_edge in each_component
540
+ if each_edge.startswith('e')])
541
+ subhg_list.append(self.get_subhg(node_list, edge_list))
542
+ #subhg_list[-1].set_node_attr(node, {'divided': True})
543
+ return subhg_list
544
+
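A minimal sketch of the `Hypergraph` API defined above; in the molecular encoding used by this model, nodes stand for bonds and hyperedges for atoms:

hg = Hypergraph()
hg.add_node('bond_0', attr_dict={})
hg.add_node('bond_1', attr_dict={})
e = hg.add_edge(['bond_0', 'bond_1'], attr_dict={'terminal': True})
assert hg.nodes_in_edge(e) == ['bond_0', 'bond_1']  # order preserved for lists
assert hg.degree('bond_0') == 1 and hg.edge_degree(e) == 2
assert hg.adj_nodes('bond_0') == {'bond_1'}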
models/mhg_model/graph_grammar/io/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Rhizome
4
+ # Version beta 0.0, August 2023
5
+ # Property of IBM Research, Accelerated Discovery
6
+ #
7
+
8
+ """
9
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
10
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
11
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
12
+ """
13
+
14
+ """ Title """
15
+
16
+ __author__ = "Hiroshi Kajino <[email protected]>"
17
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
18
+ __version__ = "0.1"
19
+ __date__ = "Jan 1 2018"
20
+
models/mhg_model/graph_grammar/io/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (679 Bytes). View file
 
models/mhg_model/graph_grammar/io/__pycache__/smi.cpython-310.pyc ADDED
Binary file (13 kB). View file
 
models/mhg_model/graph_grammar/io/smi.py ADDED
@@ -0,0 +1,559 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Rhizome
4
+ # Version beta 0.0, August 2023
5
+ # Property of IBM Research, Accelerated Discovery
6
+ #
7
+
8
+ """
9
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
10
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
11
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
12
+ """
13
+
14
+ """ Title """
15
+
16
+ __author__ = "Hiroshi Kajino <[email protected]>"
17
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
18
+ __version__ = "0.1"
19
+ __date__ = "Jan 12 2018"
20
+
21
+ from copy import deepcopy
22
+ from rdkit import Chem
23
+ from rdkit import RDLogger
24
+ import networkx as nx
25
+ import numpy as np
26
+ from ..hypergraph import Hypergraph
27
+ from ..graph_grammar.symbols import TSymbol, BondSymbol
28
+
29
+ # suppress RDKit warnings
30
+ lg = RDLogger.logger()
31
+ lg.setLevel(RDLogger.CRITICAL)
32
+
33
+
34
+ class HGGen(object):
35
+ """
36
+ load .smi file and yield a hypergraph.
37
+
38
+ Attributes
39
+ ----------
40
+ path_to_file : str
41
+ path to .smi file
42
+ kekulize : bool
43
+ kekulize or not
44
+ add_Hs : bool
45
+ add implicit hydrogens to the molecule or not.
46
+ all_single : bool
47
+ if True, all multiple bonds are summarized into a single bond with some attributes
48
+
49
+ Yields
50
+ ------
51
+ Hypergraph
52
+ """
53
+ def __init__(self, path_to_file, kekulize=True, add_Hs=False, all_single=True):
54
+ self.num_line = 1
55
+ self.mol_gen = Chem.SmilesMolSupplier(path_to_file, titleLine=False)
56
+ self.kekulize = kekulize
57
+ self.add_Hs = add_Hs
58
+ self.all_single = all_single
59
+
60
+ def __iter__(self):
61
+ return self
62
+
63
+ def __next__(self):
64
+ '''
65
+ each_mol = None
66
+ while each_mol is None:
67
+ each_mol = next(self.mol_gen)
68
+ '''
69
+ # not ignoring parse errors
70
+ each_mol = next(self.mol_gen)
71
+ if each_mol is None:
72
+ raise ValueError(f'incorrect smiles in line {self.num_line}')
73
+ else:
74
+ self.num_line += 1
75
+ return mol_to_hg(each_mol, self.kekulize, self.add_Hs)
76
+
77
+
78
+ def mol_to_bipartite(mol, kekulize):
79
+ """
80
+ get a bipartite representation of a molecule.
81
+
82
+ Parameters
83
+ ----------
84
+ mol : rdkit.Chem.rdchem.Mol
85
+ molecule object
86
+
87
+ Returns
88
+ -------
89
+ nx.Graph
90
+ a bipartite graph representing which bond is connected to which atoms.
91
+ """
92
+ try:
93
+ mol = standardize_stereo(mol)
94
+ except KeyError:
95
+ print(Chem.MolToSmiles(mol))
96
+ raise KeyError
97
+
98
+ if kekulize:
99
+ Chem.Kekulize(mol)
100
+
101
+ bipartite_g = nx.Graph()
102
+ for each_atom in mol.GetAtoms():
103
+ bipartite_g.add_node(f"atom_{each_atom.GetIdx()}",
104
+ atom_attr=atom_attr(each_atom, kekulize))
105
+
106
+ for each_bond in mol.GetBonds():
107
+ bond_idx = each_bond.GetIdx()
108
+ bipartite_g.add_node(
109
+ f"bond_{bond_idx}",
110
+ bond_attr=bond_attr(each_bond, kekulize))
111
+ bipartite_g.add_edge(
112
+ f"atom_{each_bond.GetBeginAtomIdx()}",
113
+ f"bond_{bond_idx}")
114
+ bipartite_g.add_edge(
115
+ f"atom_{each_bond.GetEndAtomIdx()}",
116
+ f"bond_{bond_idx}")
117
+ return bipartite_g
118
+
119
+
120
+ def mol_to_hg(mol, kekulize, add_Hs):
121
+ """
122
+ convert a molecule into a hypergraph representation.
123
+
124
+ Parameters
125
+ ----------
126
+ mol : rdkit.Chem.rdchem.Mol
127
+ molecule object
128
+ kekulize : bool
129
+ kekulize or not
130
+ add_Hs : bool
131
+ add implicit hydrogens to the molecule or not.
132
+
133
+ Returns
134
+ -------
135
+ Hypergraph
136
+ """
137
+ if add_Hs:
138
+ mol = Chem.AddHs(mol)
139
+
140
+ if kekulize:
141
+ Chem.Kekulize(mol)
142
+
143
+ bipartite_g = mol_to_bipartite(mol, kekulize)
144
+ hg = Hypergraph()
145
+ for each_atom in [each_node for each_node in bipartite_g.nodes()
146
+ if each_node.startswith('atom_')]:
147
+ node_set = set([])
148
+ for each_bond in bipartite_g.adj[each_atom]:
149
+ hg.add_node(each_bond,
150
+ attr_dict=bipartite_g.nodes[each_bond]['bond_attr'])
151
+ node_set.add(each_bond)
152
+ hg.add_edge(node_set,
153
+ attr_dict=bipartite_g.nodes[each_atom]['atom_attr'])
154
+ return hg
155
+
156
+
157
+ def hg_to_mol(hg, verbose=False):
158
+ """ convert a hypergraph into Mol object
159
+
160
+ Parameters
161
+ ----------
162
+ hg : Hypergraph
163
+
164
+ Returns
165
+ -------
166
+ mol : Chem.Mol
167
+ """
168
+ mol = Chem.RWMol()
169
+ atom_dict = {}
170
+ bond_set = set([])
171
+ for each_edge in hg.edges:
172
+ atom = Chem.Atom(hg.edge_attr(each_edge)['symbol'].symbol)
173
+ atom.SetNumExplicitHs(hg.edge_attr(each_edge)['symbol'].num_explicit_Hs)
174
+ atom.SetFormalCharge(hg.edge_attr(each_edge)['symbol'].formal_charge)
175
+ atom.SetChiralTag(
176
+ Chem.rdchem.ChiralType.values[
177
+ hg.edge_attr(each_edge)['symbol'].chirality])
178
+ atom_idx = mol.AddAtom(atom)
179
+ atom_dict[each_edge] = atom_idx
180
+
181
+ for each_node in hg.nodes:
182
+ edge_1, edge_2 = hg.adj_edges(each_node)
183
+ if edge_1+edge_2 not in bond_set:
184
+ if hg.node_attr(each_node)['symbol'].bond_type <= 3:
185
+ num_bond = hg.node_attr(each_node)['symbol'].bond_type
186
+ elif hg.node_attr(each_node)['symbol'].bond_type == 12:
187
+ num_bond = 1
188
+ else:
189
+ raise ValueError(f'too many bonds; {hg.node_attr(each_node)["symbol"].bond_type}')
190
+ _ = mol.AddBond(atom_dict[edge_1],
191
+ atom_dict[edge_2],
192
+ order=Chem.rdchem.BondType.values[num_bond])
193
+ bond_idx = mol.GetBondBetweenAtoms(atom_dict[edge_1], atom_dict[edge_2]).GetIdx()
194
+
195
+ # stereo
196
+ mol.GetBondWithIdx(bond_idx).SetStereo(
197
+ Chem.rdchem.BondStereo.values[hg.node_attr(each_node)['symbol'].stereo])
198
+ bond_set.update([edge_1+edge_2])
199
+ bond_set.update([edge_2+edge_1])
200
+ mol.UpdatePropertyCache()
201
+ mol = mol.GetMol()
202
+ not_stereo_mol = deepcopy(mol)
203
+ if Chem.MolFromSmiles(Chem.MolToSmiles(not_stereo_mol)) is None:
204
+ raise RuntimeError('no valid molecule was obtained.')
205
+ try:
206
+ mol = set_stereo(mol)
207
+ is_stereo = True
208
+ except Exception:
209
+ import traceback
210
+ traceback.print_exc()
211
+ is_stereo = False
212
+ mol_tmp = deepcopy(mol)
213
+ Chem.SetAromaticity(mol_tmp)
214
+ if Chem.MolFromSmiles(Chem.MolToSmiles(mol_tmp)) is not None:
215
+ mol = mol_tmp
216
+ else:
217
+ if Chem.MolFromSmiles(Chem.MolToSmiles(mol)) is None:
218
+ mol = not_stereo_mol
219
+ mol.UpdatePropertyCache()
220
+ Chem.GetSymmSSSR(mol)
221
+ mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
222
+ if verbose:
223
+ return mol, is_stereo
224
+ else:
225
+ return mol
226
+
227
+ def hgs_to_mols(hg_list, ignore_error=False):
228
+ if ignore_error:
229
+ mol_list = []
230
+ for each_hg in hg_list:
231
+ try:
232
+ mol = hg_to_mol(each_hg)
233
+ except Exception:
234
+ mol = None
235
+ mol_list.append(mol)
236
+ else:
237
+ mol_list = [hg_to_mol(each_hg) for each_hg in hg_list]
238
+ return mol_list
239
+
240
+ def hgs_to_smiles(hg_list, ignore_error=False):
241
+ mol_list = hgs_to_mols(hg_list, ignore_error)
242
+ smiles_list = []
243
+ for each_mol in mol_list:
244
+ try:
245
+ smiles_list.append(
246
+ Chem.MolToSmiles(
247
+ Chem.MolFromSmiles(
248
+ Chem.MolToSmiles(
249
+ each_mol))))
250
+ except Exception:
251
+ smiles_list.append(None)
252
+ return smiles_list
253
+
254
+ def atom_attr(atom, kekulize):
255
+ """
256
+ get atom's attributes
257
+
258
+ Parameters
259
+ ----------
260
+ atom : rdkit.Chem.rdchem.Atom
261
+ kekulize : bool
262
+ kekulize or not
263
+
264
+ Returns
265
+ -------
266
+ atom_attr : dict
267
+ "is_aromatic" : bool
268
+ the atom is aromatic or not.
269
+ "smarts" : str
270
+ SMARTS representation of the atom.
271
+ """
272
+ if kekulize:
273
+ return {'terminal': True,
274
+ 'is_in_ring': atom.IsInRing(),
275
+ 'symbol': TSymbol(degree=0,
276
+ #degree=atom.GetTotalDegree(),
277
+ is_aromatic=False,
278
+ symbol=atom.GetSymbol(),
279
+ num_explicit_Hs=atom.GetNumExplicitHs(),
280
+ formal_charge=atom.GetFormalCharge(),
281
+ chirality=atom.GetChiralTag().real
282
+ )}
283
+ else:
284
+ return {'terminal': True,
285
+ 'is_in_ring': atom.IsInRing(),
286
+ 'symbol': TSymbol(degree=0,
287
+ #degree=atom.GetTotalDegree(),
288
+ is_aromatic=atom.GetIsAromatic(),
289
+ symbol=atom.GetSymbol(),
290
+ num_explicit_Hs=atom.GetNumExplicitHs(),
291
+ formal_charge=atom.GetFormalCharge(),
292
+ chirality=atom.GetChiralTag().real
293
+ )}
294
+
295
+ def bond_attr(bond, kekulize):
296
+ """
297
+ get bond's attributes
298
+
299
+ Parameters
300
+ ----------
301
+ bond : rdkit.Chem.rdchem.Bond
302
+ kekulize : bool
303
+ kekulize or not
304
+
305
+ Returns
306
+ -------
307
+ bond_attr : dict
308
+ "bond_type" : int
309
+ {0: rdkit.Chem.rdchem.BondType.UNSPECIFIED,
310
+ 1: rdkit.Chem.rdchem.BondType.SINGLE,
311
+ 2: rdkit.Chem.rdchem.BondType.DOUBLE,
312
+ 3: rdkit.Chem.rdchem.BondType.TRIPLE,
313
+ 4: rdkit.Chem.rdchem.BondType.QUADRUPLE,
314
+ 5: rdkit.Chem.rdchem.BondType.QUINTUPLE,
315
+ 6: rdkit.Chem.rdchem.BondType.HEXTUPLE,
316
+ 7: rdkit.Chem.rdchem.BondType.ONEANDAHALF,
317
+ 8: rdkit.Chem.rdchem.BondType.TWOANDAHALF,
318
+ 9: rdkit.Chem.rdchem.BondType.THREEANDAHALF,
319
+ 10: rdkit.Chem.rdchem.BondType.FOURANDAHALF,
320
+ 11: rdkit.Chem.rdchem.BondType.FIVEANDAHALF,
321
+ 12: rdkit.Chem.rdchem.BondType.AROMATIC,
322
+ 13: rdkit.Chem.rdchem.BondType.IONIC,
323
+ 14: rdkit.Chem.rdchem.BondType.HYDROGEN,
324
+ 15: rdkit.Chem.rdchem.BondType.THREECENTER,
325
+ 16: rdkit.Chem.rdchem.BondType.DATIVEONE,
326
+ 17: rdkit.Chem.rdchem.BondType.DATIVE,
327
+ 18: rdkit.Chem.rdchem.BondType.DATIVEL,
328
+ 19: rdkit.Chem.rdchem.BondType.DATIVER,
329
+ 20: rdkit.Chem.rdchem.BondType.OTHER,
330
+ 21: rdkit.Chem.rdchem.BondType.ZERO}
331
+ """
332
+ if kekulize:
333
+ is_aromatic = False
334
+ if bond.GetBondType().real == 12:
335
+ bond_type = 1
336
+ else:
337
+ bond_type = bond.GetBondType().real
338
+ else:
339
+ is_aromatic = bond.GetIsAromatic()
340
+ bond_type = bond.GetBondType().real
341
+ return {'symbol': BondSymbol(is_aromatic=is_aromatic,
342
+ bond_type=bond_type,
343
+ stereo=int(bond.GetStereo())),
344
+ 'is_in_ring': bond.IsInRing()}
345
+
346
+
347
+ def standardize_stereo(mol):
348
+ '''
349
+ 0: rdkit.Chem.rdchem.BondDir.NONE,
350
+ 1: rdkit.Chem.rdchem.BondDir.BEGINWEDGE,
351
+ 2: rdkit.Chem.rdchem.BondDir.BEGINDASH,
352
+ 3: rdkit.Chem.rdchem.BondDir.ENDDOWNRIGHT,
353
+ 4: rdkit.Chem.rdchem.BondDir.ENDUPRIGHT,
354
+
355
+ '''
356
+ # mol = Chem.AddHs(mol) # this removes CIPRank !!!
357
+ for each_bond in mol.GetBonds():
358
+ if int(each_bond.GetStereo()) in [2, 3]: #2=Z (same side), 3=E
359
+ begin_stereo_atom_idx = each_bond.GetBeginAtomIdx()
360
+ end_stereo_atom_idx = each_bond.GetEndAtomIdx()
361
+ atom_idx_1 = each_bond.GetStereoAtoms()[0]
362
+ atom_idx_2 = each_bond.GetStereoAtoms()[1]
363
+ if mol.GetBondBetweenAtoms(atom_idx_1, begin_stereo_atom_idx):
364
+ begin_atom_idx = atom_idx_1
365
+ end_atom_idx = atom_idx_2
366
+ else:
367
+ begin_atom_idx = atom_idx_2
368
+ end_atom_idx = atom_idx_1
369
+
370
+ begin_another_atom_idx = None
371
+ assert len(mol.GetAtomWithIdx(begin_stereo_atom_idx).GetNeighbors()) <= 3
372
+ for each_neighbor in mol.GetAtomWithIdx(begin_stereo_atom_idx).GetNeighbors():
373
+ each_neighbor_idx = each_neighbor.GetIdx()
374
+ if each_neighbor_idx not in [end_stereo_atom_idx, begin_atom_idx]:
375
+ begin_another_atom_idx = each_neighbor_idx
376
+
377
+ end_another_atom_idx = None
378
+ assert len(mol.GetAtomWithIdx(end_stereo_atom_idx).GetNeighbors()) <= 3
379
+ for each_neighbor in mol.GetAtomWithIdx(end_stereo_atom_idx).GetNeighbors():
380
+ each_neighbor_idx = each_neighbor.GetIdx()
381
+ if each_neighbor_idx not in [begin_stereo_atom_idx, end_atom_idx]:
382
+ end_another_atom_idx = each_neighbor_idx
383
+
384
+ '''
385
+ relationship between begin_atom_idx and end_atom_idx is encoded in GetStereo
386
+ '''
387
+ begin_atom_rank = int(mol.GetAtomWithIdx(begin_atom_idx).GetProp('_CIPRank'))
388
+ end_atom_rank = int(mol.GetAtomWithIdx(end_atom_idx).GetProp('_CIPRank'))
389
+ try:
390
+ begin_another_atom_rank = int(mol.GetAtomWithIdx(begin_another_atom_idx).GetProp('_CIPRank'))
391
+ except Exception:
392
+ begin_another_atom_rank = np.inf
393
+ try:
394
+ end_another_atom_rank = int(mol.GetAtomWithIdx(end_another_atom_idx).GetProp('_CIPRank'))
395
+ except Exception:
396
+ end_another_atom_rank = np.inf
397
+ if begin_atom_rank < begin_another_atom_rank\
398
+ and end_atom_rank < end_another_atom_rank:
399
+ pass
400
+ elif begin_atom_rank < begin_another_atom_rank\
401
+ and end_atom_rank > end_another_atom_rank:
402
+ # (begin_atom_idx +) end_another_atom_idx should be in StereoAtoms
403
+ if each_bond.GetStereo() == 2:
404
+ # set stereo
405
+ each_bond.SetStereo(Chem.rdchem.BondStereo.values[3])
406
+ # set bond dir
407
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 3)
408
+ mol = safe_set_bond_dir(mol, begin_another_atom_idx, begin_stereo_atom_idx, 0)
409
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 0)
410
+ mol = safe_set_bond_dir(mol, end_another_atom_idx, end_stereo_atom_idx, 3)
411
+ elif each_bond.GetStereo() == 3:
412
+ # set stereo
413
+ each_bond.SetStereo(Chem.rdchem.BondStereo.values[2])
414
+ # set bond dir
415
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 3)
416
+ mol = safe_set_bond_dir(mol, begin_another_atom_idx, begin_stereo_atom_idx, 0)
417
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 0)
418
+ mol = safe_set_bond_dir(mol, end_another_atom_idx, end_stereo_atom_idx, 4)
419
+ else:
420
+ raise ValueError
421
+ each_bond.SetStereoAtoms(begin_atom_idx, end_another_atom_idx)
422
+ elif begin_atom_rank > begin_another_atom_rank\
423
+ and end_atom_rank < end_another_atom_rank:
424
+ # (end_atom_idx +) begin_another_atom_idx should be in StereoAtoms
425
+ if each_bond.GetStereo() == 2:
426
+ # set stereo
427
+ each_bond.SetStereo(Chem.rdchem.BondStereo.values[3])
428
+ # set bond dir
429
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 0)
430
+ mol = safe_set_bond_dir(mol, begin_another_atom_idx, begin_stereo_atom_idx, 4)
431
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 4)
432
+ mol = safe_set_bond_dir(mol, end_another_atom_idx, end_stereo_atom_idx, 0)
433
+ elif each_bond.GetStereo() == 3:
434
+ # set stereo
435
+ each_bond.SetStereo(Chem.rdchem.BondStereo.values[2])
436
+ # set bond dir
437
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 0)
438
+ mol = safe_set_bond_dir(mol, begin_another_atom_idx, begin_stereo_atom_idx, 4)
439
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 3)
440
+ mol = safe_set_bond_dir(mol, end_another_atom_idx, end_stereo_atom_idx, 0)
441
+ else:
442
+ raise ValueError
443
+ each_bond.SetStereoAtoms(begin_another_atom_idx, end_atom_idx)
444
+ elif begin_atom_rank > begin_another_atom_rank\
445
+ and end_atom_rank > end_another_atom_rank:
446
+ # begin_another_atom_idx + end_another_atom_idx should be in StereoAtoms
447
+ if each_bond.GetStereo() == 2:
448
+ # set bond dir
449
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 0)
450
+ mol = safe_set_bond_dir(mol, begin_another_atom_idx, begin_stereo_atom_idx, 4)
451
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 0)
452
+ mol = safe_set_bond_dir(mol, end_another_atom_idx, end_stereo_atom_idx, 3)
453
+ elif each_bond.GetStereo() == 3:
454
+ # set bond dir
455
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 0)
456
+ mol = safe_set_bond_dir(mol, begin_another_atom_idx, begin_stereo_atom_idx, 4)
457
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 0)
458
+ mol = safe_set_bond_dir(mol, end_another_atom_idx, end_stereo_atom_idx, 4)
459
+ else:
460
+ raise ValueError
461
+ each_bond.SetStereoAtoms(begin_another_atom_idx, end_another_atom_idx)
462
+ else:
463
+ raise RuntimeError
464
+ return mol
465
+
466
+
467
+ def set_stereo(mol):
468
+ '''
469
+ 0: rdkit.Chem.rdchem.BondDir.NONE,
470
+ 1: rdkit.Chem.rdchem.BondDir.BEGINWEDGE,
471
+ 2: rdkit.Chem.rdchem.BondDir.BEGINDASH,
472
+ 3: rdkit.Chem.rdchem.BondDir.ENDDOWNRIGHT,
473
+ 4: rdkit.Chem.rdchem.BondDir.ENDUPRIGHT,
474
+ '''
475
+ _mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
476
+ Chem.Kekulize(_mol, True)
477
+ substruct_match = mol.GetSubstructMatch(_mol)
478
+ if not substruct_match:
479
+ ''' mol and _mol are kekulized.
480
+ sometimes, the order of '=' and '-' changes, which causes mol and _mol not matched.
481
+ '''
482
+ Chem.SetAromaticity(mol)
483
+ Chem.SetAromaticity(_mol)
484
+ substruct_match = mol.GetSubstructMatch(_mol)
485
+ try:
486
+ atom_match = {substruct_match[_mol_atom_idx]: _mol_atom_idx for _mol_atom_idx in range(_mol.GetNumAtoms())} # mol to _mol
487
+ except Exception:
488
+ raise ValueError('two molecules obtained from the same data do not match.')
489
+
490
+ for each_bond in mol.GetBonds():
491
+ begin_atom_idx = each_bond.GetBeginAtomIdx()
492
+ end_atom_idx = each_bond.GetEndAtomIdx()
493
+ _bond = _mol.GetBondBetweenAtoms(atom_match[begin_atom_idx], atom_match[end_atom_idx])
494
+ _bond.SetStereo(each_bond.GetStereo())
495
+
496
+ mol = _mol
497
+ for each_bond in mol.GetBonds():
498
+ if int(each_bond.GetStereo()) in [2, 3]: #2=Z (same side), 3=E
499
+ begin_stereo_atom_idx = each_bond.GetBeginAtomIdx()
500
+ end_stereo_atom_idx = each_bond.GetEndAtomIdx()
501
+ begin_atom_idx_set = set([each_neighbor.GetIdx()
502
+ for each_neighbor
503
+ in mol.GetAtomWithIdx(begin_stereo_atom_idx).GetNeighbors()
504
+ if each_neighbor.GetIdx() != end_stereo_atom_idx])
505
+ end_atom_idx_set = set([each_neighbor.GetIdx()
506
+ for each_neighbor
507
+ in mol.GetAtomWithIdx(end_stereo_atom_idx).GetNeighbors()
508
+ if each_neighbor.GetIdx() != begin_stereo_atom_idx])
509
+ if not begin_atom_idx_set:
510
+ each_bond.SetStereo(Chem.rdchem.BondStereo(0))
511
+ continue
512
+ if not end_atom_idx_set:
513
+ each_bond.SetStereo(Chem.rdchem.BondStereo(0))
514
+ continue
515
+ if len(begin_atom_idx_set) == 1:
516
+ begin_atom_idx = begin_atom_idx_set.pop()
517
+ begin_another_atom_idx = None
518
+ if len(end_atom_idx_set) == 1:
519
+ end_atom_idx = end_atom_idx_set.pop()
520
+ end_another_atom_idx = None
521
+ if len(begin_atom_idx_set) == 2:
522
+ atom_idx_1 = begin_atom_idx_set.pop()
523
+ atom_idx_2 = begin_atom_idx_set.pop()
524
+ if int(mol.GetAtomWithIdx(atom_idx_1).GetProp('_CIPRank')) < int(mol.GetAtomWithIdx(atom_idx_2).GetProp('_CIPRank')):
525
+ begin_atom_idx = atom_idx_1
526
+ begin_another_atom_idx = atom_idx_2
527
+ else:
528
+ begin_atom_idx = atom_idx_2
529
+ begin_another_atom_idx = atom_idx_1
530
+ if len(end_atom_idx_set) == 2:
531
+ atom_idx_1 = end_atom_idx_set.pop()
532
+ atom_idx_2 = end_atom_idx_set.pop()
533
+ if int(mol.GetAtomWithIdx(atom_idx_1).GetProp('_CIPRank')) < int(mol.GetAtomWithIdx(atom_idx_2).GetProp('_CIPRank')):
534
+ end_atom_idx = atom_idx_1
535
+ end_another_atom_idx = atom_idx_2
536
+ else:
537
+ end_atom_idx = atom_idx_2
538
+ end_another_atom_idx = atom_idx_1
539
+
540
+ if each_bond.GetStereo() == 2: # same side
541
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 3)
542
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 4)
543
+ each_bond.SetStereoAtoms(begin_atom_idx, end_atom_idx)
544
+ elif each_bond.GetStereo() == 3: # opposite side
545
+ mol = safe_set_bond_dir(mol, begin_atom_idx, begin_stereo_atom_idx, 3)
546
+ mol = safe_set_bond_dir(mol, end_atom_idx, end_stereo_atom_idx, 3)
547
+ each_bond.SetStereoAtoms(begin_atom_idx, end_atom_idx)
548
+ else:
549
+ raise ValueError
550
+ return mol
551
+
552
+
553
+ def safe_set_bond_dir(mol, atom_idx_1, atom_idx_2, bond_dir_val):
554
+ if atom_idx_1 is None or atom_idx_2 is None:
555
+ return mol
556
+ else:
557
+ mol.GetBondBetweenAtoms(atom_idx_1, atom_idx_2).SetBondDir(Chem.rdchem.BondDir.values[bond_dir_val])
558
+ return mol
559
+
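A round-trip sketch for the converters above (assumes RDKit is installed; the SMILES string is an arbitrary example):

from rdkit import Chem
mol = Chem.MolFromSmiles('CC(=O)O')
hg = mol_to_hg(mol, kekulize=True, add_Hs=False)
assert hg.num_edges == mol.GetNumAtoms()  # one hyperedge per atom
assert hg.num_nodes == mol.GetNumBonds()  # one node per bond
print(Chem.MolToSmiles(hg_to_mol(hg)))    # expected: CC(=O)O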
models/mhg_model/graph_grammar/nn/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # -*- coding:utf-8 -*-
2
+ # Rhizome
3
+ # Version beta 0.0, August 2023
4
+ # Property of IBM Research, Accelerated Discovery
5
+ #
6
+
7
+ """
8
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
9
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
10
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
11
+ """
models/mhg_model/graph_grammar/nn/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (518 Bytes). View file
 
models/mhg_model/graph_grammar/nn/__pycache__/decoder.cpython-310.pyc ADDED
Binary file (3.99 kB). View file
 
models/mhg_model/graph_grammar/nn/__pycache__/encoder.cpython-310.pyc ADDED
Binary file (5.39 kB). View file
 
models/mhg_model/graph_grammar/nn/dataset.py ADDED
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Rhizome
4
+ # Version beta 0.0, August 2023
5
+ # Property of IBM Research, Accelerated Discovery
6
+ #
7
+
8
+ """
9
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
10
+ OF THE MHG IMPLEMENTATION OF HIROSHI KAJINO AT IBM TRL ALREADY PUBLICLY AVAILABLE.
11
+ THIS MIGHT INFLUENCE THE DECISION OF THE FINAL LICENSE SO CAREFUL CHECK NEEDS BE DONE.
12
+ """
13
+
14
+ """ Title """
15
+
16
+ __author__ = "Hiroshi Kajino <[email protected]>"
17
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
18
+ __version__ = "0.1"
19
+ __date__ = "Apr 18 2018"
20
+
21
+ from torch.utils.data import Dataset, DataLoader
22
+ import torch
23
+ import numpy as np
24
+
25
+
26
+ def left_padding(sentence_list, max_len, pad_idx=-1, inverse=False):
27
+ ''' pad left
28
+
29
+ Parameters
30
+ ----------
31
+ sentence_list : list of sequences of integers
32
+ max_len : int
33
+ maximum length of sentences.
34
+ if a sentence is shorter than `max_len`, its left part is padded.
35
+ pad_idx : int
36
+ integer for padding
37
+ inverse : bool
38
+ if True, the sequence is inversed.
39
+
40
+ Returns
41
+ -------
42
+ List of torch.LongTensor
43
+ each sentence is left-padded.
44
+ '''
45
+ max_in_list = max([len(each_sen) for each_sen in sentence_list])
46
+
47
+ if max_in_list > max_len:
48
+ raise ValueError('`max_len` should be larger than the maximum length of input sequences, {}.'.format(max_in_list))
49
+
50
+ if inverse:
51
+ return [torch.LongTensor([pad_idx] * (max_len - len(each_sen)) + each_sen[::-1]) for each_sen in sentence_list]
52
+ else:
53
+ return [torch.LongTensor([pad_idx] * (max_len - len(each_sen)) + each_sen) for each_sen in sentence_list]
54
+
55
+
56
+ def right_padding(sentence_list, max_len, pad_idx=-1):
57
+ ''' pad right
58
+
59
+ Parameters
60
+ ----------
61
+ sentence_list : list of sequences of integers
62
+ max_len : int
63
+ maximum length of sentences.
64
+ if a sentence is shorter than `max_len`, its right part is padded.
65
+ pad_idx : int
66
+ integer for padding
67
+
68
+ Returns
69
+ -------
70
+ List of torch.LongTensor
71
+ each sentence is right-padded.
72
+ '''
73
+ max_in_list = max([len(each_sen) for each_sen in sentence_list])
74
+ if max_in_list > max_len:
75
+ raise ValueError('`max_len` should be larger than the maximum length of input sequences, {}.'.format(max_in_list))
76
+
77
+ return [torch.LongTensor(each_sen + [pad_idx] * (max_len - len(each_sen))) for each_sen in sentence_list]
78
+
79
+
80
+ class HRGDataset(Dataset):
81
+
82
+ '''
83
+ A class of HRG data
84
+ '''
85
+
86
+ def __init__(self, hrg, prod_rule_seq_list, max_len, target_val_list=None, inversed_input=False):
87
+ self.hrg = hrg
88
+ self.left_prod_rule_seq_list = left_padding(prod_rule_seq_list,
89
+ max_len,
90
+ inverse=inversed_input)
91
+
92
+ self.right_prod_rule_seq_list = right_padding(prod_rule_seq_list, max_len)
93
+ self.inversed_input = inversed_input
94
+ self.target_val_list = target_val_list
95
+ if target_val_list is not None:
96
+ if len(prod_rule_seq_list) != len(target_val_list):
97
+ raise ValueError(f'prod_rule_seq_list and target_val_list have inconsistent lengths: {len(prod_rule_seq_list)}, {len(target_val_list)}')
98
+
99
+ def __len__(self):
100
+ return len(self.left_prod_rule_seq_list)
101
+
102
+ def __getitem__(self, idx):
103
+ if self.target_val_list is not None:
104
+ return self.left_prod_rule_seq_list[idx], self.right_prod_rule_seq_list[idx], np.float32(self.target_val_list[idx])
105
+ else:
106
+ return self.left_prod_rule_seq_list[idx], self.right_prod_rule_seq_list[idx]
107
+
108
+ @property
109
+ def vocab_size(self):
110
+ return self.hrg.num_prod_rule
111
+
112
+ def batch_padding(each_batch, batch_size, padding_idx):
113
+ num_pad = batch_size - len(each_batch[0])
114
+ if num_pad:
115
+ each_batch[0] = torch.cat([each_batch[0],
116
+ padding_idx * torch.ones((batch_size - len(each_batch[0]),
117
+ len(each_batch[0][0])), dtype=torch.int64)], dim=0)
118
+ each_batch[1] = torch.cat([each_batch[1],
119
+ padding_idx * torch.ones((batch_size - len(each_batch[1]),
120
+ len(each_batch[1][0])), dtype=torch.int64)], dim=0)
121
+ return each_batch, num_pad
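
For orientation, a minimal usage sketch of the padding helpers and HRGDataset above. The toy sequences and the `_ToyHRG` stand-in are hypothetical; a real HRG comes from the grammar-inference step elsewhere in this repo, and the only attribute the dataset touches is `num_prod_rule`.

import torch
from torch.utils.data import DataLoader

seqs = [[0, 3, 1], [2, 4], [5]]          # toy production-rule index sequences
left = left_padding(seqs, max_len=5)     # e.g. tensor([-1, -1, 0, 3, 1])
right = right_padding(seqs, max_len=5)   # e.g. tensor([0, 3, 1, -1, -1])

class _ToyHRG:                           # stand-in exposing the one attribute HRGDataset uses
    num_prod_rule = 6

dataset = HRGDataset(_ToyHRG(), seqs, max_len=5, target_val_list=[0.1, 0.2, 0.3])
in_seq, out_seq, target = next(iter(DataLoader(dataset, batch_size=2)))
print(in_seq.shape, out_seq.shape, target.shape)   # (2, 5), (2, 5), (2,)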
models/mhg_model/graph_grammar/nn/decoder.py ADDED
@@ -0,0 +1,158 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION BY HIROSHI KAJINO AT IBM TRL, WHICH IS ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION ON THE FINAL LICENSE, SO A CAREFUL CHECK NEEDS TO BE DONE.
+ """
+
+ """ Title """
+
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
+ __version__ = "0.1"
+ __date__ = "Aug 9 2018"
+
+
+ import abc
+ import numpy as np
+ import torch
+ from torch import nn
+
+
+ class DecoderBase(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+         self.hidden_dict = {}
+
+     @abc.abstractmethod
+     def forward_one_step(self, tgt_emb_in):
+         ''' one-step forward model
+
+         Parameters
+         ----------
+         tgt_emb_in : Tensor, shape (batch_size, input_dim)
+
+         Returns
+         -------
+         Tensor, shape (batch_size, 1, hidden_dim)
+         '''
+         tgt_emb_out = None
+         return tgt_emb_out
+
+     @abc.abstractmethod
+     def init_hidden(self):
+         ''' initialize the hidden states
+         '''
+         pass
+
+     def feed_hidden(self, hidden_dict_0):
+         for each_hidden in self.hidden_dict.keys():
+             self.hidden_dict[each_hidden][0] = hidden_dict_0[each_hidden]
+
+
+ class GRUDecoder(DecoderBase):
+
+     def __init__(self, input_dim: int, hidden_dim: int, num_layers: int,
+                  dropout: float, batch_size: int, use_gpu: bool,
+                  no_dropout=False):
+         super().__init__()
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.num_layers = num_layers
+         self.dropout = dropout
+         self.batch_size = batch_size
+         self.use_gpu = use_gpu
+         self.model = nn.GRU(input_size=self.input_dim,
+                             hidden_size=self.hidden_dim,
+                             num_layers=self.num_layers,
+                             batch_first=True,
+                             bidirectional=False,
+                             dropout=self.dropout if not no_dropout else 0)
+         if self.use_gpu:
+             self.model.cuda()
+         self.init_hidden()
+
+     def init_hidden(self):
+         self.hidden_dict['h'] = torch.zeros((self.num_layers,
+                                              self.batch_size,
+                                              self.hidden_dim),
+                                             requires_grad=False)
+         if self.use_gpu:
+             self.hidden_dict['h'] = self.hidden_dict['h'].cuda()
+
+     def forward_one_step(self, tgt_emb_in):
+         ''' one-step forward model
+
+         Parameters
+         ----------
+         tgt_emb_in : Tensor, shape (batch_size, input_dim)
+
+         Returns
+         -------
+         Tensor, shape (batch_size, 1, hidden_dim)
+         '''
+         tgt_emb_out, self.hidden_dict['h'] \
+             = self.model(tgt_emb_in.view(self.batch_size, 1, -1),
+                          self.hidden_dict['h'])
+         return tgt_emb_out
+
+
+ class LSTMDecoder(DecoderBase):
+
+     def __init__(self, input_dim: int, hidden_dim: int, num_layers: int,
+                  dropout: float, batch_size: int, use_gpu: bool,
+                  no_dropout=False):
+         super().__init__()
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.num_layers = num_layers
+         self.dropout = dropout
+         self.batch_size = batch_size
+         self.use_gpu = use_gpu
+         self.model = nn.LSTM(input_size=self.input_dim,
+                              hidden_size=self.hidden_dim,
+                              num_layers=self.num_layers,
+                              batch_first=True,
+                              bidirectional=False,
+                              dropout=self.dropout if not no_dropout else 0)
+         if self.use_gpu:
+             self.model.cuda()
+         self.init_hidden()
+
+     def init_hidden(self):
+         self.hidden_dict['h'] = torch.zeros((self.num_layers,
+                                              self.batch_size,
+                                              self.hidden_dim),
+                                             requires_grad=False)
+         self.hidden_dict['c'] = torch.zeros((self.num_layers,
+                                              self.batch_size,
+                                              self.hidden_dim),
+                                             requires_grad=False)
+         if self.use_gpu:
+             for each_hidden in self.hidden_dict.keys():
+                 self.hidden_dict[each_hidden] = self.hidden_dict[each_hidden].cuda()
+
+     def forward_one_step(self, tgt_emb_in):
+         ''' one-step forward model
+
+         Parameters
+         ----------
+         tgt_emb_in : Tensor, shape (batch_size, input_dim)
+
+         Returns
+         -------
+         Tensor, shape (batch_size, 1, hidden_dim)
+         '''
+         # nn.LSTM takes and returns the recurrent state as a tuple (h, c)
+         tgt_hidden_out, (self.hidden_dict['h'], self.hidden_dict['c']) \
+             = self.model(tgt_emb_in.view(self.batch_size, 1, -1),
+                          (self.hidden_dict['h'], self.hidden_dict['c']))
+         return tgt_hidden_out
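
A short sketch of the intended decoding loop, with toy dimensions as an assumption: init_hidden() resets the recurrent state, after which forward_one_step() is called once per generated token.

import torch

decoder = GRUDecoder(input_dim=8, hidden_dim=16, num_layers=1,
                     dropout=0.0, batch_size=4, use_gpu=False)
decoder.init_hidden()                    # reset state before decoding a new batch
emb = torch.zeros(4, 8)                  # embedding of the current token, one per batch row
for _ in range(3):                       # unroll three steps
    step_out = decoder.forward_one_step(emb)   # shape (4, 1, 16)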
models/mhg_model/graph_grammar/nn/encoder.py ADDED
@@ -0,0 +1,199 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION BY HIROSHI KAJINO AT IBM TRL, WHICH IS ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION ON THE FINAL LICENSE, SO A CAREFUL CHECK NEEDS TO BE DONE.
+ """
+
+ """ Title """
+
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
+ __version__ = "0.1"
+ __date__ = "Aug 9 2018"
+
+
+ import abc
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from typing import List
+
+
+ class EncoderBase(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+
+     @abc.abstractmethod
+     def forward(self, in_seq_emb):
+         ''' forward model
+
+         Parameters
+         ----------
+         in_seq_emb : Tensor, shape (batch_size, max_len, input_dim)
+
+         Returns
+         -------
+         hidden_seq_emb : Tensor, shape (batch_size, max_len, 1 + bidirectional, hidden_dim)
+         '''
+         pass
+
+     @abc.abstractmethod
+     def init_hidden(self):
+         ''' initialize the hidden states
+         '''
+         pass
+
+
+ class GRUEncoder(EncoderBase):
+
+     def __init__(self, input_dim: int, hidden_dim: int, num_layers: int,
+                  bidirectional: bool, dropout: float, batch_size: int, use_gpu: bool,
+                  no_dropout=False):
+         super().__init__()
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.num_layers = num_layers
+         self.bidirectional = bidirectional
+         self.dropout = dropout
+         self.batch_size = batch_size
+         self.use_gpu = use_gpu
+         self.model = nn.GRU(input_size=self.input_dim,
+                             hidden_size=self.hidden_dim,
+                             num_layers=self.num_layers,
+                             batch_first=True,
+                             bidirectional=self.bidirectional,
+                             dropout=self.dropout if not no_dropout else 0)
+         if self.use_gpu:
+             self.model.cuda()
+         self.init_hidden()
+
+     def init_hidden(self):
+         self.h0 = torch.zeros(((self.bidirectional + 1) * self.num_layers,
+                                self.batch_size,
+                                self.hidden_dim),
+                               requires_grad=False)
+         if self.use_gpu:
+             self.h0 = self.h0.cuda()
+
+     def forward(self, in_seq_emb):
+         ''' forward model
+
+         Parameters
+         ----------
+         in_seq_emb : Tensor, shape (batch_size, max_len, input_dim)
+
+         Returns
+         -------
+         hidden_seq_emb : Tensor, shape (batch_size, max_len, 1 + bidirectional, hidden_dim)
+         '''
+         max_len = in_seq_emb.size(1)
+         hidden_seq_emb, self.h0 = self.model(
+             in_seq_emb, self.h0)
+         hidden_seq_emb = hidden_seq_emb.view(self.batch_size,
+                                              max_len,
+                                              1 + self.bidirectional,
+                                              self.hidden_dim)
+         return hidden_seq_emb
+
+
+ class LSTMEncoder(EncoderBase):
+
+     def __init__(self, input_dim: int, hidden_dim: int, num_layers: int,
+                  bidirectional: bool, dropout: float, batch_size: int, use_gpu: bool,
+                  no_dropout=False):
+         super().__init__()
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.num_layers = num_layers
+         self.bidirectional = bidirectional
+         self.dropout = dropout
+         self.batch_size = batch_size
+         self.use_gpu = use_gpu
+         self.model = nn.LSTM(input_size=self.input_dim,
+                              hidden_size=self.hidden_dim,
+                              num_layers=self.num_layers,
+                              batch_first=True,
+                              bidirectional=self.bidirectional,
+                              dropout=self.dropout if not no_dropout else 0)
+         if self.use_gpu:
+             self.model.cuda()
+         self.init_hidden()
+
+     def init_hidden(self):
+         self.h0 = torch.zeros(((self.bidirectional + 1) * self.num_layers,
+                                self.batch_size,
+                                self.hidden_dim),
+                               requires_grad=False)
+         self.c0 = torch.zeros(((self.bidirectional + 1) * self.num_layers,
+                                self.batch_size,
+                                self.hidden_dim),
+                               requires_grad=False)
+         if self.use_gpu:
+             self.h0 = self.h0.cuda()
+             self.c0 = self.c0.cuda()
+
+     def forward(self, in_seq_emb):
+         ''' forward model
+
+         Parameters
+         ----------
+         in_seq_emb : Tensor, shape (batch_size, max_len, input_dim)
+
+         Returns
+         -------
+         hidden_seq_emb : Tensor, shape (batch_size, max_len, 1 + bidirectional, hidden_dim)
+         '''
+         max_len = in_seq_emb.size(1)
+         hidden_seq_emb, (self.h0, self.c0) = self.model(
+             in_seq_emb, (self.h0, self.c0))
+         hidden_seq_emb = hidden_seq_emb.view(self.batch_size,
+                                              max_len,
+                                              1 + self.bidirectional,
+                                              self.hidden_dim)
+         return hidden_seq_emb
+
+
+ class FullConnectedEncoder(EncoderBase):
+
+     def __init__(self, input_dim: int, hidden_dim: int, max_len: int, hidden_dim_list: List[int],
+                  batch_size: int, use_gpu: bool):
+         super().__init__()
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.max_len = max_len
+         self.hidden_dim_list = hidden_dim_list
+         self.use_gpu = use_gpu
+         in_out_dim_list = [input_dim * max_len] + list(hidden_dim_list) + [hidden_dim]
+         self.linear_list = nn.ModuleList(
+             [nn.Linear(in_out_dim_list[each_idx], in_out_dim_list[each_idx + 1])
+              for each_idx in range(len(in_out_dim_list) - 1)])
+
+     def forward(self, in_seq_emb):
+         ''' forward model
+
+         Parameters
+         ----------
+         in_seq_emb : Tensor, shape (batch_size, max_len, input_dim)
+
+         Returns
+         -------
+         hidden_seq_emb : Tensor, shape (batch_size, 1, hidden_dim)
+         '''
+         batch_size = in_seq_emb.size(0)
+         x = in_seq_emb.view(batch_size, -1)
+         for each_linear in self.linear_list:
+             x = F.relu(each_linear(x))
+         return x.view(batch_size, 1, -1)
+
+     def init_hidden(self):
+         pass
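
A corresponding sketch for the encoder side, again with toy dimensions as an assumption: with bidirectional=True the output keeps the forward and backward hidden states in a separate axis.

import torch

encoder = GRUEncoder(input_dim=8, hidden_dim=16, num_layers=1, bidirectional=True,
                     dropout=0.0, batch_size=4, use_gpu=False)
x = torch.zeros(4, 10, 8)                # (batch_size, max_len, input_dim)
h = encoder(x)                           # shape (4, 10, 2, 16); axis 2 = forward/backward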
models/mhg_model/graph_grammar/nn/graph.py ADDED
@@ -0,0 +1,313 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # Rhizome
+ # Version beta 0.0, August 2023
+ # Property of IBM Research, Accelerated Discovery
+ #
+
+ """
+ PLEASE NOTE THIS IMPLEMENTATION INCLUDES THE ORIGINAL SOURCE CODE (AND SOME ADAPTATIONS)
+ OF THE MHG IMPLEMENTATION BY HIROSHI KAJINO AT IBM TRL, WHICH IS ALREADY PUBLICLY AVAILABLE.
+ THIS MIGHT INFLUENCE THE DECISION ON THE FINAL LICENSE, SO A CAREFUL CHECK NEEDS TO BE DONE.
+ """
+
+ """ Title """
+
+ __author__ = "Hiroshi Kajino <[email protected]>"
+ __copyright__ = "(c) Copyright IBM Corp. 2018"
+ __version__ = "0.1"
+ __date__ = "Jan 1 2018"
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from graph_grammar.graph_grammar.hrg import ProductionRuleCorpus
+ from torch import nn
+ from torch.autograd import Variable
+
+
+ class MolecularProdRuleEmbedding(nn.Module):
+
+     ''' molecular fingerprint layer
+     '''
+
+     def __init__(self, prod_rule_corpus, layer2layer_activation, layer2out_activation,
+                  out_dim=32, element_embed_dim=32,
+                  num_layers=3, padding_idx=None, use_gpu=False):
+         super().__init__()
+         if padding_idx is not None:
+             assert padding_idx == -1, 'padding_idx must be -1.'
+         self.prod_rule_corpus = prod_rule_corpus
+         self.layer2layer_activation = layer2layer_activation
+         self.layer2out_activation = layer2out_activation
+         self.out_dim = out_dim
+         self.element_embed_dim = element_embed_dim
+         self.num_layers = num_layers
+         self.padding_idx = padding_idx
+         self.use_gpu = use_gpu
+
+         # embeddings are registered as Parameters and the per-layer linear maps
+         # as ModuleLists, so that optimizers and .cuda() see all of them
+         self.atom_embed = nn.Parameter(torch.randn(self.prod_rule_corpus.num_edge_symbol,
+                                                    self.element_embed_dim))
+         self.bond_embed = nn.Parameter(torch.randn(self.prod_rule_corpus.num_node_symbol,
+                                                    self.element_embed_dim))
+         self.ext_id_embed = nn.Parameter(torch.randn(self.prod_rule_corpus.num_ext_id,
+                                                      self.element_embed_dim))
+         self.layer2layer_list = nn.ModuleList()
+         self.layer2out_list = nn.ModuleList()
+         for _ in range(num_layers):
+             self.layer2layer_list.append(nn.Linear(self.element_embed_dim, self.element_embed_dim))
+             self.layer2out_list.append(nn.Linear(self.element_embed_dim, self.out_dim))
+         if self.use_gpu:
+             self.cuda()
+
+     def forward(self, prod_rule_idx_seq):
+         ''' forward model for mini-batch
+
+         Parameters
+         ----------
+         prod_rule_idx_seq : (batch_size, length)
+
+         Returns
+         -------
+         Variable, shape (batch_size, length, out_dim)
+         '''
+         batch_size, length = prod_rule_idx_seq.shape
+         if self.use_gpu:
+             out = Variable(torch.zeros((batch_size, length, self.out_dim))).cuda()
+         else:
+             out = Variable(torch.zeros((batch_size, length, self.out_dim)))
+         for each_batch_idx in range(batch_size):
+             for each_idx in range(length):
+                 if int(prod_rule_idx_seq[each_batch_idx, each_idx]) == len(self.prod_rule_corpus.prod_rule_list):
+                     # padding index: contributes nothing to the output
+                     continue
+                 else:
+                     each_prod_rule = self.prod_rule_corpus.prod_rule_list[int(prod_rule_idx_seq[each_batch_idx, each_idx])]
+                     layer_wise_embed_dict = {each_edge: self.atom_embed[
+                         each_prod_rule.rhs.edge_attr(each_edge)['symbol_idx']]
+                                              for each_edge in each_prod_rule.rhs.edges}
+                     layer_wise_embed_dict.update({each_node: self.bond_embed[
+                         each_prod_rule.rhs.node_attr(each_node)['symbol_idx']]
+                                                   for each_node in each_prod_rule.rhs.nodes})
+                     for each_node in each_prod_rule.rhs.nodes:
+                         if 'ext_id' in each_prod_rule.rhs.node_attr(each_node):
+                             layer_wise_embed_dict[each_node] \
+                                 = layer_wise_embed_dict[each_node] \
+                                 + self.ext_id_embed[each_prod_rule.rhs.node_attr(each_node)['ext_id']]
+
+                     for each_layer in range(self.num_layers):
+                         next_layer_embed_dict = {}
+                         for each_edge in each_prod_rule.rhs.edges:
+                             v = layer_wise_embed_dict[each_edge]
+                             for each_node in each_prod_rule.rhs.nodes_in_edge(each_edge):
+                                 v = v + layer_wise_embed_dict[each_node]
+                             next_layer_embed_dict[each_edge] = self.layer2layer_activation(self.layer2layer_list[each_layer](v))
+                             out[each_batch_idx, each_idx, :] \
+                                 = out[each_batch_idx, each_idx, :] + self.layer2out_activation(self.layer2out_list[each_layer](v))
+                         for each_node in each_prod_rule.rhs.nodes:
+                             v = layer_wise_embed_dict[each_node]
+                             for each_edge in each_prod_rule.rhs.adj_edges(each_node):
+                                 v = v + layer_wise_embed_dict[each_edge]
+                             next_layer_embed_dict[each_node] = self.layer2layer_activation(self.layer2layer_list[each_layer](v))
+                             out[each_batch_idx, each_idx, :] \
+                                 = out[each_batch_idx, each_idx, :] + self.layer2out_activation(self.layer2out_list[each_layer](v))
+                         layer_wise_embed_dict = next_layer_embed_dict
+
+         return out
+
+
+ class MolecularProdRuleEmbeddingLastLayer(nn.Module):
+
+     ''' molecular fingerprint layer
+     '''
+
+     def __init__(self, prod_rule_corpus, layer2layer_activation, layer2out_activation,
+                  out_dim=32, element_embed_dim=32,
+                  num_layers=3, padding_idx=None, use_gpu=False):
+         super().__init__()
+         if padding_idx is not None:
+             assert padding_idx == -1, 'padding_idx must be -1.'
+         self.prod_rule_corpus = prod_rule_corpus
+         self.layer2layer_activation = layer2layer_activation
+         self.layer2out_activation = layer2out_activation
+         self.out_dim = out_dim
+         self.element_embed_dim = element_embed_dim
+         self.num_layers = num_layers
+         self.padding_idx = padding_idx
+         self.use_gpu = use_gpu
+
+         self.atom_embed = nn.Embedding(self.prod_rule_corpus.num_edge_symbol, self.element_embed_dim)
+         self.bond_embed = nn.Embedding(self.prod_rule_corpus.num_node_symbol, self.element_embed_dim)
+         self.layer2layer_list = nn.ModuleList()
+         self.layer2out_list = nn.ModuleList()
+         for _ in range(num_layers + 1):
+             self.layer2layer_list.append(nn.Linear(self.element_embed_dim, self.element_embed_dim))
+             self.layer2out_list.append(nn.Linear(self.element_embed_dim, self.out_dim))
+         if self.use_gpu:
+             self.cuda()
+
+     def forward(self, prod_rule_idx_seq):
+         ''' forward model for mini-batch
+
+         Parameters
+         ----------
+         prod_rule_idx_seq : (batch_size, length)
+
+         Returns
+         -------
+         Variable, shape (batch_size, length, out_dim)
+         '''
+         batch_size, length = prod_rule_idx_seq.shape
+         if self.use_gpu:
+             out = Variable(torch.zeros((batch_size, length, self.out_dim))).cuda()
+         else:
+             out = Variable(torch.zeros((batch_size, length, self.out_dim)))
+         for each_batch_idx in range(batch_size):
+             for each_idx in range(length):
+                 if int(prod_rule_idx_seq[each_batch_idx, each_idx]) == len(self.prod_rule_corpus.prod_rule_list):
+                     continue
+                 else:
+                     each_prod_rule = self.prod_rule_corpus.prod_rule_list[int(prod_rule_idx_seq[each_batch_idx, each_idx])]
+
+                     if self.use_gpu:
+                         layer_wise_embed_dict = {each_edge: self.atom_embed(
+                             Variable(torch.LongTensor(
+                                 [each_prod_rule.rhs.edge_attr(each_edge)['symbol_idx']]
+                             ), requires_grad=False).cuda())
+                             for each_edge in each_prod_rule.rhs.edges}
+                         layer_wise_embed_dict.update({each_node: self.bond_embed(
+                             Variable(
+                                 torch.LongTensor([
+                                     each_prod_rule.rhs.node_attr(each_node)['symbol_idx']]),
+                                 requires_grad=False).cuda()
+                         ) for each_node in each_prod_rule.rhs.nodes})
+                     else:
+                         layer_wise_embed_dict = {each_edge: self.atom_embed(
+                             Variable(torch.LongTensor(
+                                 [each_prod_rule.rhs.edge_attr(each_edge)['symbol_idx']]
+                             ), requires_grad=False))
+                             for each_edge in each_prod_rule.rhs.edges}
+                         layer_wise_embed_dict.update({each_node: self.bond_embed(
+                             Variable(
+                                 torch.LongTensor([
+                                     each_prod_rule.rhs.node_attr(each_node)['symbol_idx']]),
+                                 requires_grad=False)
+                         ) for each_node in each_prod_rule.rhs.nodes})
+
+                     for each_layer in range(self.num_layers):
+                         next_layer_embed_dict = {}
+                         for each_edge in each_prod_rule.rhs.edges:
+                             v = layer_wise_embed_dict[each_edge]
+                             for each_node in each_prod_rule.rhs.nodes_in_edge(each_edge):
+                                 v = v + layer_wise_embed_dict[each_node]
+                             next_layer_embed_dict[each_edge] = self.layer2layer_activation(self.layer2layer_list[each_layer](v))
+                         for each_node in each_prod_rule.rhs.nodes:
+                             v = layer_wise_embed_dict[each_node]
+                             for each_edge in each_prod_rule.rhs.adj_edges(each_node):
+                                 v = v + layer_wise_embed_dict[each_edge]
+                             next_layer_embed_dict[each_node] = self.layer2layer_activation(self.layer2layer_list[each_layer](v))
+                         layer_wise_embed_dict = next_layer_embed_dict
+                     for each_edge in each_prod_rule.rhs.edges:
+                         out[each_batch_idx, each_idx, :] = self.layer2out_activation(self.layer2out_list[self.num_layers](v))
+
+         return out
+
+
+ class MolecularProdRuleEmbeddingUsingFeatures(nn.Module):
+
+     ''' molecular fingerprint layer
+     '''
+
+     def __init__(self, prod_rule_corpus, layer2layer_activation, layer2out_activation,
+                  out_dim=32, num_layers=3, padding_idx=None, use_gpu=False):
+         super().__init__()
+         if padding_idx is not None:
+             assert padding_idx == -1, 'padding_idx must be -1.'
+         self.feature_dict, self.feature_dim = prod_rule_corpus.construct_feature_vectors()
+         self.prod_rule_corpus = prod_rule_corpus
+         self.layer2layer_activation = layer2layer_activation
+         self.layer2out_activation = layer2out_activation
+         self.out_dim = out_dim
+         self.num_layers = num_layers
+         self.padding_idx = padding_idx
+         self.use_gpu = use_gpu
+
+         self.layer2layer_list = nn.ModuleList()
+         self.layer2out_list = nn.ModuleList()
+         for _ in range(num_layers):
+             self.layer2layer_list.append(nn.Linear(self.feature_dim, self.feature_dim))
+             self.layer2out_list.append(nn.Linear(self.feature_dim, self.out_dim))
+         if self.use_gpu:
+             for each_key in self.feature_dict:
+                 self.feature_dict[each_key] = self.feature_dict[each_key].to_dense().cuda()
+             self.cuda()
+
+     def forward(self, prod_rule_idx_seq):
+         ''' forward model for mini-batch
+
+         Parameters
+         ----------
+         prod_rule_idx_seq : (batch_size, length)
+
+         Returns
+         -------
+         Variable, shape (batch_size, length, out_dim)
+         '''
+         batch_size, length = prod_rule_idx_seq.shape
+         if self.use_gpu:
+             out = Variable(torch.zeros((batch_size, length, self.out_dim))).cuda()
+         else:
+             out = Variable(torch.zeros((batch_size, length, self.out_dim)))
+         for each_batch_idx in range(batch_size):
+             for each_idx in range(length):
+                 if int(prod_rule_idx_seq[each_batch_idx, each_idx]) == len(self.prod_rule_corpus.prod_rule_list):
+                     continue
+                 else:
+                     each_prod_rule = self.prod_rule_corpus.prod_rule_list[int(prod_rule_idx_seq[each_batch_idx, each_idx])]
+                     edge_list = sorted(list(each_prod_rule.rhs.edges))
+                     node_list = sorted(list(each_prod_rule.rhs.nodes))
+                     adj_mat = torch.FloatTensor(each_prod_rule.rhs_adj_mat(edge_list + node_list).todense()
+                                                 + np.identity(len(edge_list) + len(node_list)))
+                     if self.use_gpu:
+                         adj_mat = adj_mat.cuda()
+                     layer_wise_embed = [
+                         self.feature_dict[each_prod_rule.rhs.edge_attr(each_edge)['symbol']]
+                         for each_edge in edge_list]\
+                         + [self.feature_dict[each_prod_rule.rhs.node_attr(each_node)['symbol']]
+                            for each_node in node_list]
+                     for each_node in each_prod_rule.ext_node.values():
+                         layer_wise_embed[each_prod_rule.rhs.num_edges + node_list.index(each_node)] \
+                             = layer_wise_embed[each_prod_rule.rhs.num_edges + node_list.index(each_node)] \
+                             + self.feature_dict[('ext_id', each_prod_rule.rhs.node_attr(each_node)['ext_id'])]
+                     layer_wise_embed = torch.stack(layer_wise_embed)
+
+                     for each_layer in range(self.num_layers):
+                         # one round of message passing over the rhs hypergraph
+                         message = adj_mat @ layer_wise_embed
+                         next_layer_embed = self.layer2layer_activation(self.layer2layer_list[each_layer](message))
+                         out[each_batch_idx, each_idx, :] \
+                             = out[each_batch_idx, each_idx, :] \
+                             + self.layer2out_activation(self.layer2out_list[each_layer](message)).sum(dim=0)
+                         layer_wise_embed = next_layer_embed
+         return out
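
All three classes above implement the same message-passing idea over a production rule's right-hand side: neighbour features are aggregated (via explicit loops or via an adjacency matrix with self-loops), transformed by a per-layer linear map, and a per-layer readout is summed into the rule embedding. A self-contained sketch of that update, with toy tensors standing in for the corpus-derived features:

import torch
from torch import nn

n_elem, feat_dim, out_dim, num_layers = 5, 8, 16, 3
adj = torch.eye(n_elem)                        # toy adjacency matrix incl. self-loops
feats = torch.randn(n_elem, feat_dim)          # one feature vector per node/edge of the rhs
layer2layer = nn.ModuleList(nn.Linear(feat_dim, feat_dim) for _ in range(num_layers))
layer2out = nn.ModuleList(nn.Linear(feat_dim, out_dim) for _ in range(num_layers))

rule_embed = torch.zeros(out_dim)
for each_layer in range(num_layers):
    message = adj @ feats                      # aggregate neighbouring features
    feats = torch.relu(layer2layer[each_layer](message))
    rule_embed = rule_embed + torch.relu(layer2out[each_layer](message)).sum(dim=0)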
models/mhg_model/images/mhg_example.png ADDED
models/mhg_model/images/mhg_example1.png ADDED