Spaces:

arikat
/

Glydentify

Sleeping

App Files Files Community

Aarya Venkat committed on Apr 8

Commit

d1ca73b

•

1 Parent(s): bc978c9

Update -- need to add new model

Browse files

Files changed (9) hide show

Dockerfile +30 -0
app.py +263 -304
backup/family.pth +0 -3
backup/family_labels.pkl +0 -3
best_model_35M_t12_5v5.pth +0 -3
donor_labels.pkl +0 -3
family.pth +0 -3
family_labels.pkl +0 -3
requirements.txt +2 -1

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+# Use an official Python runtime as a parent image
+FROM python:3.9
+# Set the working directory in the container
+WORKDIR /usr/src/app
+# Install any needed packages specified in requirements.txt
+# (copied on its own first so this layer is reused until requirements.txt changes)
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+# Install BLAST; remove the apt package lists afterwards so they are not baked into the layer
+RUN apt-get update && apt-get install -y ncbi-blast+ \
+    && rm -rf /var/lib/apt/lists/*
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory and put its local bin dir on PATH.
+# A single trailing backslash continues the ENV instruction onto the next line.
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Define environment variable (key=value form; the space-separated form is legacy syntax)
+ENV NAME=Glydentify
+# Run app.py when the container launches
+# NOTE(review): --host/--port only take effect if app.py parses them itself; the
+# launch call visible in this diff is app.launch(show_error=True) — confirm the
+# server actually binds 0.0.0.0:7860 inside the container.
+CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ from tqdm import tqdm
 import numpy as np
 import seaborn as sns
 from sklearn.model_selection import train_test_split
 import matplotlib.pyplot as plt
 import pickle
 import torch.nn.functional as F
@@ -16,164 +18,196 @@ import io
 from PIL import Image
 import Bio
 from Bio import SeqIO
 import zipfile
 import os
-# Load the model from the file
-with open('family_labels.pkl', 'rb') as filefam:
-    yfam = pickle.load(filefam)
-tokenizerfam = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D") #facebook/esm2_t33_650M_UR50D
-device = 'cpu'
-device
-modelfam = EsmForSequenceClassification.from_pretrained("facebook/esm2_t12_35M_UR50D", num_labels=len(yfam.classes_))
-modelfam = modelfam.to('cpu')
-modelfam.load_state_dict(torch.load("family.pth", map_location=torch.device('cpu')))
-modelfam.eval()
-x_testfam = ["""MAEVLRTLAGKPKCHALRPMILFLIMLVLVLFGYGVLSPRSLMPGSLERGFCMAVREPDH
-LQRVSLPRMVYPQPKVLTPCRKDVLVVTPWLAPIVWEGTFNIDILNEQFRLQNTTIGLTV
-FAIKKYVAFLKLFLETAEKHFMVGHRVHYYVFTDQPAAVPRVTLGTGRQLSVLEVRAYKR
-WQDVSMRRMEMISDFCERRFLSEVDYLVCVDVDMEFRDHVGVEILTPLFGTLHPGFYGSS
-REAFTYERRPQSQAYIPKDEGDFYYLGGFFGGSVQEVQRLTRACHQAMMVDQANGIEAVW
-HDESHLNKYLLRHKPTKVLSPEYLWDQQLLGWPAVLRKLRFTAVPKNHQAVRNP
-"""]
-encoded_inputfam = tokenizerfam(x_testfam, padding=True, truncation=True, max_length=512, return_tensors="pt")
-input_idsfam = encoded_inputfam["input_ids"]
-attention_maskfam = encoded_inputfam["attention_mask"]
-with torch.no_grad():
-    outputfam = modelfam(input_idsfam, attention_mask=attention_maskfam)
-    logitsfam = outputfam.logits
-    probabilitiesfam = F.softmax(logitsfam, dim=1)
-    _, predicted_labelsfam = torch.max(logitsfam, dim=1)
-probabilitiesfam[0]
-decoded_labelsfam = yfam.inverse_transform(predicted_labelsfam.tolist())
-decoded_labelsfam
-#Load donor model from file
-tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")
-with open('donor_labels.pkl', 'rb') as file:
-    label_encoder = pickle.load(file)
-# encoded_labels = label_encoder.fit(y)
-# labels = torch.tensor(encoded_labels)
-model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t12_35M_UR50D", num_labels=len(label_encoder.classes_))
-model = model.to('cpu')
-model.load_state_dict(torch.load("best_model_35M_t12_5v5.pth", map_location=torch.device('cpu'))) #model_best_35v2M.pth
-model.eval()
-x_test = ["""MAEVLRTLAGKPKCHALRPMILFLIMLVLVLFGYGVLSPRSLMPGSLERGFCMAVREPDH
-LQRVSLPRMVYPQPKVLTPCRKDVLVVTPWLAPIVWEGTFNIDILNEQFRLQNTTIGLTV
-FAIKKYVAFLKLFLETAEKHFMVGHRVHYYVFTDQPAAVPRVTLGTGRQLSVLEVRAYKR
-WQDVSMRRMEMISDFCERRFLSEVDYLVCVDVDMEFRDHVGVEILTPLFGTLHPGFYGSS
-REAFTYERRPQSQAYIPKDEGDFYYLGGFFGGSVQEVQRLTRACHQAMMVDQANGIEAVW
-HDESHLNKYLLRHKPTKVLSPEYLWDQQLLGWPAVLRKLRFTAVPKNHQAVRNP
-"""]
-encoded_input = tokenizer(x_test, padding=True, truncation=True, max_length=512, return_tensors="pt")
-input_ids = encoded_input["input_ids"]
-attention_mask = encoded_input["attention_mask"]
-with torch.no_grad():
-    output = model(input_ids, attention_mask=attention_mask)
-    logits = output.logits
-    probabilities = F.softmax(logits, dim=1)
-    _, predicted_labels = torch.max(logits, dim=1)
-probabilities[0]
-decoded_labels = label_encoder.inverse_transform(predicted_labels.tolist())
-decoded_labels
 glycosyltransferase_db = {
-    "GT31-chsy"      : {'CAZy Name': 'GT31', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT31.html'},
-    "GT2-CesA2"      : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT43-arath"     : {'CAZy Name': 'GT43', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT43.html'},
-    "GT8-Met1"       : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT8.html' },
-    "GT32-higher"    : {'CAZy Name': 'GT32', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT32.html'},
     "GT40"           : {'CAZy Name': 'GT40', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT40.html'},
     "GT16"           : {'CAZy Name': 'GT16', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT16.html'},
     "GT27"           : {'CAZy Name': 'GT27', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT27.html'},
     "GT55"           : {'CAZy Name': 'GT55', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT55.html'},
-    "GT8-Glycogenin" : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT8.html' },
-    "GT8-1"          : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT8.html' },
     "GT25"           : {'CAZy Name': 'GT25', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT25.html'},
-    "GT2-DPM_like"   : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT31-fringe"    : {'CAZy Name': 'GT31', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT31.html'},
-    "GT2-Bact_puta"  : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT84"           : {'CAZy Name': 'GT84', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT84.html'},
     "GT13"           : {'CAZy Name': 'GT13', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT13.html'},
-    "GT43-cele"      : {'CAZy Name': 'GT43', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT43.html'},
-    "GT2-Bact_LPS1"  : {'CAZy Name': 'GT92', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT2-Bact_Oant"  : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '   ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT67"           : {'CAZy Name': 'GT67', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT67.html'},
-    "GT2-HAS"        : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT82"           : {'CAZy Name': 'GT82', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '7  ', 'More Info': 'http://www.cazy.org/GT82.html'},
     "GT24"           : {'CAZy Name': 'GT24', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT24.html'},
-    "GT31-plant"     : {'CAZy Name': 'GT31', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT31.html'},
-    "GT81-Bact"      : {'CAZy Name': 'GT81', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT81.html'},
-    "GT2-Bact_gt25Me": {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '   ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT2-B3GntL"     : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '4  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT49"           : {'CAZy Name': 'GT49', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT49.html'},
     "GT34"           : {'CAZy Name': 'GT34', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT34.html'},
     "GT45"           : {'CAZy Name': 'GT45', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT45.html'},
-    "GT32-lower"     : {'CAZy Name': 'GT32', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT32.html'},
     "GT88"           : {'CAZy Name': 'GT88', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT88.html'},
     "GT21"           : {'CAZy Name': 'GT21', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT21.html'},
-    "GT2-DPG_synt"   : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT43-b3gat2"    : {'CAZy Name': 'GT43', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT43.html'},
-    "GT2-Chitin_synt": {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT8-Bact"       : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT8.html' },
-    "GT8-Met2"       : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT8.html' },
-    "GT2-Bact_Chlor1": {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '   ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT54"           : {'CAZy Name': 'GT54', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT54.html'},
-    "GT2-Cel_bre3"   : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT2-Bact_Rham"  : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT6"            : {'CAZy Name': 'GT6 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT6.html' },
-    "GT2-Bact_puta2" : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '   ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT7-1"          : {'CAZy Name': 'GT7 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT7.html' },
-    "GT2-Csl"        : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '4  ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT2-ExoU"       : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '   ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT2-Csl2"       : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '4  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT64"           : {'CAZy Name': 'GT64', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT64.html'},
-    "GT2-Bact_Chlor2": {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '   ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT78"           : {'CAZy Name': 'GT78', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT78.html'},
     "GT12"           : {'CAZy Name': 'GT12', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT12.html'},
-    "GT31-gnt"       : {'CAZy Name': 'GT31', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT31.html'},
-    "GT2-Bact_CHS"   : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT62"           : {'CAZy Name': 'GT62', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '3  ', 'More Info': 'http://www.cazy.org/GT62.html'},
-    "GT8-Met_Pla"    : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT8.html' },
     "GT15"           : {'CAZy Name': 'GT15', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT15.html'},
-    "GT43-b3gat1"    : {'CAZy Name': 'GT43', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT43.html'},
-    "GT31-b3glt"     : {'CAZy Name': 'GT31', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT31.html'},
-    "GT2-CesA1"      : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT60"           : {'CAZy Name': 'GT60', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT60.html'},
     "GT14"           : {'CAZy Name': 'GT14', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '7  ', 'More Info': 'http://www.cazy.org/GT14.html'},
-    "GT2-Bact_DPM_sy": {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT17"           : {'CAZy Name': 'GT17', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '7  ', 'More Info': 'http://www.cazy.org/GT17.html'},
-    "GT2-Bact_LPS2"  : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '3  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT77"           : {'CAZy Name': 'GT77', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT77.html'},
-    "GT2-Bact_EpsO"  : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '   ', 'More Info': 'http://www.cazy.org/GT2.html' },
-    "GT43-b3gat3"    : {'CAZy Name': 'GT43', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT43.html'},
-    "GT8-Fun"        : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT8.html' },
     "GT75"           : {'CAZy Name': 'GT75', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT75.html'},
-    "GT2-Bact_GlfT"  : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT2.html' },
 }
 def get_family_info(family_name):
@@ -201,26 +235,46 @@ def fig_to_img(fig):
 def preprocess_protein_sequence(protein_fasta):
     lines = protein_fasta.split('\n')
     headers = [line for line in lines if line.startswith('>')]
     if len(headers) > 1:
-        return None, "Multiple fasta sequences detected. Please upload a fasta file with only one sequence."
     protein_sequence = ''.join(line for line in lines if not line.startswith('>'))
-    # Check for invalid characters
-    valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")  # the 20 standard amino acids
-    if not set(protein_sequence).issubset(valid_characters):
-        return None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?"
-    return protein_sequence, None
-def process_family_sequence(protein_fasta):
-    protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
-    if error_msg:
-        return None, None, None, error_msg
     encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
     input_idsfam = encoded_input["input_ids"]
     attention_maskfam = encoded_input["attention_mask"]
@@ -231,28 +285,26 @@ def process_family_sequence(protein_fasta):
         probabilitiesfam = F.softmax(logitsfam, dim=1)
         _, predicted_labelsfam = torch.max(logitsfam, dim=1)
-    decoded_labelsfam = yfam.inverse_transform(predicted_labelsfam.tolist())
-    family_info = get_family_info(decoded_labelsfam[0])
-    figfam = plt.figure(figsize=(10, 5))
-    labelsfam = yfam.classes_
-    probabilitiesfam = probabilitiesfam.tolist()
-    # Convert the nested list to a flat list of probabilities
-    probabilitiesfam_flat = probabilitiesfam[0] if probabilitiesfam else []
-    # Sort labels and probabilities by probability
-    labels_probsfam = list(zip(labelsfam, probabilitiesfam_flat))
-    labels_probsfam.sort(key=lambda x: x[1], reverse=True)
-    # Select the top 5 fams
-    labels_probs_top5fam = labels_probsfam[:5]
-    labels_top5, probabilities_top5 = zip(*labels_probs_top5fam)
-    y_posfam = np.arange(len(labels_top5))
-    plt.barh(y_posfam, [prob*100 for prob in probabilities_top5], align='center', alpha=0.5)
-    plt.yticks(y_posfam, labels_top5)
     plt.xlabel('Probability (%)')
     plt.title('Top 5 Family Class Probabilities')
     plt.xlim(0, 100)
@@ -261,171 +313,96 @@ def process_family_sequence(protein_fasta):
     img = fig_to_img(figfam)
     if len(protein_sequence) < 100:
-        return decoded_labelsfam[0], img, None, f"**Warning:** The sequence is relatively short. Fragmentary and partial sequences may result in incorrect predictions. \n\n {family_info}"
-    return decoded_labelsfam[0], img, None, family_info
-def process_single_sequence(protein_fasta): #, protein_file
-    protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
-    if error_msg:
-        return None, None, None, error_msg
     encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
-    input_ids = encoded_input["input_ids"]
-    attention_mask = encoded_input["attention_mask"]
     with torch.no_grad():
-        output = model(input_ids, attention_mask=attention_mask)
-        logits = output.logits
-        dprobabilities = F.softmax(logits, dim=1)[0]
-        _, predicted_labels = torch.max(logits, dim=1)
-    decoded_labels = label_encoder.inverse_transform(predicted_labels.tolist())
-    family_info = get_family_info(decoded_labels[0])
-    fig = plt.figure(figsize=(10, 5))
-    labels = label_encoder.classes_
-    dprobabilities = dprobabilities.tolist()
-    # Sort labels and probabilities by probability
-    labels_probs = list(zip(labels, dprobabilities))
-    labels_probs.sort(key=lambda x: x[1], reverse=True)
-    # Select the top 3 donors
-    labels_probs_top3 = labels_probs[:3]
-    labels_top3, probabilities_top3 = zip(*labels_probs_top3)
-    y_pos = np.arange(len(labels_top3))
-    plt.barh(y_pos, [prob*100 for prob in probabilities_top3], align='center', alpha=0.5)
-    plt.yticks(y_pos, labels_top3)
     plt.xlabel('Probability (%)')
     plt.title('Top 3 Donor Class Probabilities')
-    plt.xlim(0, 100)
-    plt.close(fig)
-    img = fig_to_img(fig)
     if len(protein_sequence) < 100:
-        return decoded_labels[0], img, None, f"**Warning:** The sequence is relatively short. Fragmentary and partial sequences may result in incorrect predictions. \n\n {family_info}"
-    return decoded_labels[0], img, None, None
-def process_sequence_file(protein_file):  # added progress parameter that is displayed in gradio #, progress=gr.Progress()
-    try:
-        records = list(SeqIO.parse(protein_file.name, "fasta"))
-    except Exception as e:
-        return str(e)
-    if not os.path.exists('results'):
-        os.makedirs('results')
-    total = len(records)
-    for idx, record in enumerate(records):
-        protein_sequence = str(record.seq)
-        valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")
-        if not set(protein_sequence).issubset(valid_characters):
-            with open(f'results/result_{idx+1}.txt', 'w') as file:
-                file.write("Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?")
-            continue
-        label, img, _, info = process_single_sequence(protein_sequence)
-        img.save(f'results/result_{idx+1}.png')
-        with open(f'results/result_{idx+1}.txt', 'w') as file:
-            file.write(f'Predicted Donor: {label}\n\n{info}')
-        # progress(idx/total)  # Update the progress bar
-    # Create a zip file w/ results -- To Do: Figure out how to improve compression for large files
-    with zipfile.ZipFile('predicted_results.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-        for root, dirs, files in os.walk('results/'):
-            for file in files:
-                zipf.write(os.path.join(root, file))
-    return 'predicted_results.zip' #Provide indication of how to interpret downloaded zip file? f"**Warning:** The sequence is relatively short. Fragmentary and partial sequences may result in incorrect predictions.
-    # Function to mask a residue at a particular position
-def mask_residue(sequence, position):
-    return sequence[:position] + 'X' + sequence[position+1:]
-def generate_heatmap(protein_fasta):
-    protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
-    # Tokenize and predict for original sequence
-    encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
-    with torch.no_grad():
-        original_output = model(encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"])
-    original_probabilities = F.softmax(original_output.logits, dim=1).cpu().numpy()[0]
-    # Define the size of each group
-    group_size = 10  # allow user to change this
-    # Calculate the number of groups
-    num_groups = len(protein_sequence) // group_size + (len(protein_sequence) % group_size > 0)
-    # Initialize an array to hold the importance scores
-    importance_scores = np.zeros((num_groups, len(original_probabilities)))
-    # Initialize tqdm progress bar
-    # with tqdm(total=num_groups, desc="Processing groups", position=0, leave=True) as pbar:
-    #     # Loop through each group of residues in the sequence
-    for i in range(0, len(protein_sequence), group_size):
-        # Mask the residues in the group at positions [i, i + group_size)
-        masked_sequence = protein_sequence[:i] + 'X' * min(group_size, len(protein_sequence) - i) + protein_sequence[i + group_size:]
-        # Tokenize and predict for the masked sequence
-        encoded_input = tokenizer([masked_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
-        with torch.no_grad():
-            masked_output = model(encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"])
-        masked_probabilities = F.softmax(masked_output.logits, dim=1).cpu().numpy()[0]
-        # Calculate the change in probabilities and store it as the importance score
-        group_index = i // group_size
-        importance_scores[group_index, :] = np.abs(original_probabilities - masked_probabilities)
-        progress = (i // group_size + 1) / num_groups * 100
-        print(f"Progress: {progress:.2f}%")
-    figmap, ax = plt.subplots(figsize=(20, 20))
-    sns.heatmap(importance_scores, annot=True, cmap="coolwarm", xticklabels=label_encoder.classes_, yticklabels=[f"{i}-{i+group_size-1}" for i in range(0, len(protein_sequence), group_size)], ax=ax)
-    ax.set_xlabel("Predicted Labels")
-    ax.set_ylabel("Residue Position Groups")
-    img = fig_to_img(figmap)
-    return img
-def main_function_single(sequence, show_explanation):
-    # Process seq, and return outputs for both fam and don
-    family_label, family_img, _, family_info = process_family_sequence(sequence)
-    donor_label, donor_img, *_ = process_single_sequence(sequence)
-    figmap = None
-    if show_explanation:
-        figmap = generate_heatmap(sequence)
-    return family_label, family_img, family_info, donor_label, donor_img, figmap
-def main_function_upload(protein_file): #, progress=gr.Progress()
-    return process_sequence_file(protein_file) #, progress
 prediction_imagefam = gr.outputs.Image(type='pil', label="Family prediction graph")
 prediction_imagedonor = gr.outputs.Image(type='pil', label="Donor prediction graph")
-prediction_explain = gr.outputs.Image(type='pil', label="Donor prediction explanation")
 with gr.Blocks() as app:
-    gr.Markdown("# Glydentify (alpha v0.3)")
     with gr.Tab("Single Sequence Prediction"):
         with gr.Row().style(equal_height=True):
             with gr.Column():
                 sequence = gr.inputs.Textbox(lines=16, placeholder='Enter Protein Sequence Here...', label="Protein Sequence")
-                explanation_checkbox = gr.inputs.Checkbox(label="Show Explanation", default=False)
             with gr.Column():
                 with gr.Accordion("Example:"):
                     gr.Markdown("""
@@ -443,37 +420,19 @@ with gr.Blocks() as app:
         with gr.Row().style(equal_height=True):
             with gr.Column():
                 predict_button = gr.Button("Predict")
-                predict_button.click(main_function_single, inputs=[sequence, explanation_checkbox],
                                      outputs=[family_prediction, prediction_imagefam, info_markdown,
-                                              donor_prediction, prediction_imagedonor, prediction_explain])
         # Family & Donor Section
         with gr.Row().style(equal_height=True):
             with gr.Column():
-                with gr.Accordion("Prediction Bar Graphs:"):
                     prediction_imagefam.render() # = gr.outputs.Image(type='pil', label="Family prediction graph")
-                    prediction_imagedonor.render() # = gr.outputs.Image(type='pil', label="Donor prediction graph")
-            # Explain Section
             with gr.Column():
-                if explanation_checkbox:  # Only render if the checkbox is checked
-                    with gr.Accordion("Donor explanation"):
-                        prediction_explain.render() # = gr.outputs.Image(type='pil', label="Donor prediction explaination")
-    with gr.Tab("Multiple Sequence Prediction"):
-        with gr.Row().style(equal_height=True):
-            with gr.Column():
-                protein_file = gr.inputs.File(label="Upload FASTA file")
-            with gr.Column():
-                result_file = gr.outputs.File(label="Download predictions of uploaded sequences")
-                with gr.Row().style(equal_height=True):
-                    with gr.Column():
-                        process_button = gr.Button("Process")
-                        process_button.click(main_function_upload, inputs=protein_file, outputs=[result_file])
-                    with gr.Column():
-                        clear = gr.Button("Clear")
-                        clear.click(lambda: None)
-            # clear.click()
 app.launch(show_error=True)

 import numpy as np
 import seaborn as sns
 from sklearn.model_selection import train_test_split
+import matplotlib
+matplotlib.use('Agg')  # Use the non-interactive Agg backend
 import matplotlib.pyplot as plt
 import pickle
 import torch.nn.functional as F
 from PIL import Image
 import Bio
 from Bio import SeqIO
+from Bio.Blast import NCBIXML
+import subprocess
 import zipfile
 import os
+GTA_fam_dict = {  # integer index -> family label string (41 entries); presumably the class order of a GT-A family classifier — TODO confirm against the model
+  0: "GT116",
+  1: "GT12",
+  2: "GT13",
+  3: "GT14",
+  4: "GT15",
+  5: "GT16",
+  6: "GT17",
+  7: "GT2-clade1",
+  8: "GT2-clade2",
+  9: "GT2-clade3",
+  10: "GT2-clade4",
+  11: "GT2-clade5",
+  12: "GT2-related",
+  13: "GT21",
+  14: "GT24",
+  15: "GT25",
+  16: "GT27",
+  17: "GT31",
+  18: "GT32",
+  19: "GT34",
+  20: "GT40",
+  21: "GT43",
+  22: "GT45",
+  23: "GT49",
+  24: "GT54",
+  25: "GT55",
+  26: "GT6",
+  27: "GT60",
+  28: "GT62",
+  29: "GT64",
+  30: "GT67",
+  31: "GT7",
+  32: "GT75",
+  33: "GT77",
+  34: "GT78",
+  35: "GT8",
+  36: "GT81",
+  37: "GT82",
+  38: "GT84",
+  39: "GT88",
+  40: "GT92"
+}
+GTA_don_dict = {  # integer index -> sugar name (10 entries); presumably the donor classes paired with GTA_fam_dict — TODO confirm against the model
+  0: "N-Acetyl Galactosamine",
+  1: "N-Acetyl Glucosamine",
+  2: "Arabinose",
+  3: "Galactose",
+  4: "Galacturonic Acid",
+  5: "Glucose",
+  6: "Glucuronic Acid",
+  7: "Mannose",
+  8: "Rhamnose",
+  9: "Xylose"
+}
+GTB_fam_dict = {  # integer index -> family label string (27 entries); presumably the class order of a GT-B family classifier — TODO confirm against the model
+  0: "GT1",
+  1: "GT10",
+  2: "GT104",
+  3: "GT11",
+  4: "GT18",
+  5: "GT19",
+  6: "GT20",
+  7: "GT23",
+  8: "GT28",
+  9: "GT3",
+  10: "GT30",
+  11: "GT35",
+  12: "GT37",
+  13: "GT38",
+  14: "GT4",
+  15: "GT41",
+  16: "GT5",
+  17: "GT52",
+  18: "GT63",
+  19: "GT65",
+  20: "GT68",
+  21: "GT70",
+  22: "GT72",
+  23: "GT80",
+  24: "GT9",
+  25: "GT90",
+  26: "GT99"
+}
+GTB_don_dict = {  # integer index -> sugar name or "Other" (9 entries); presumably the donor classes paired with GTB_fam_dict — note the indices differ from GTA_don_dict
+  0: "Fucose",
+  1: "Galactose",
+  2: "N-Acetyl Galactosamine",
+  3: "Glucuronic Acid",
+  4: "N-Acetyl Glucosamine",
+  5: "Glucose",
+  6: "Mannose",
+  7: "Other",
+  8: "Xylose"
+}
+tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D") #facebook/esm2_t33_650M_UR50D
 glycosyltransferase_db = {
     "GT40"           : {'CAZy Name': 'GT40', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT40.html'},
     "GT16"           : {'CAZy Name': 'GT16', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT16.html'},
     "GT27"           : {'CAZy Name': 'GT27', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT27.html'},
     "GT55"           : {'CAZy Name': 'GT55', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT55.html'},
     "GT25"           : {'CAZy Name': 'GT25', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT25.html'},
+    "GT2"            : {'CAZy Name': 'GT2 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT2.html' },
     "GT84"           : {'CAZy Name': 'GT84', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT84.html'},
     "GT13"           : {'CAZy Name': 'GT13', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT13.html'},
     "GT67"           : {'CAZy Name': 'GT67', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT67.html'},
     "GT82"           : {'CAZy Name': 'GT82', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '7  ', 'More Info': 'http://www.cazy.org/GT82.html'},
     "GT24"           : {'CAZy Name': 'GT24', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT24.html'},
+    "GT81"           : {'CAZy Name': 'GT81', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT81.html'},
     "GT49"           : {'CAZy Name': 'GT49', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT49.html'},
     "GT34"           : {'CAZy Name': 'GT34', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT34.html'},
     "GT45"           : {'CAZy Name': 'GT45', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT45.html'},
+    "GT32"           : {'CAZy Name': 'GT32', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT32.html'},
     "GT88"           : {'CAZy Name': 'GT88', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT88.html'},
     "GT21"           : {'CAZy Name': 'GT21', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '1  ', 'More Info': 'http://www.cazy.org/GT21.html'},
     "GT54"           : {'CAZy Name': 'GT54', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '6  ', 'More Info': 'http://www.cazy.org/GT54.html'},
     "GT6"            : {'CAZy Name': 'GT6 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT6.html' },
+    "GT7"            : {'CAZy Name': 'GT7 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT7.html' },
     "GT64"           : {'CAZy Name': 'GT64', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT64.html'},
     "GT78"           : {'CAZy Name': 'GT78', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '2  ', 'More Info': 'http://www.cazy.org/GT78.html'},
     "GT12"           : {'CAZy Name': 'GT12', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT12.html'},
+    "GT31"           : {'CAZy Name': 'GT31', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT31.html'},
     "GT62"           : {'CAZy Name': 'GT62', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '3  ', 'More Info': 'http://www.cazy.org/GT62.html'},
+    "GT8"            : {'CAZy Name': 'GT8 ', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT8.html' },
     "GT15"           : {'CAZy Name': 'GT15', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '8  ', 'More Info': 'http://www.cazy.org/GT15.html'},
+    "GT43"           : {'CAZy Name': 'GT43', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT43.html'},
     "GT60"           : {'CAZy Name': 'GT60', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '5  ', 'More Info': 'http://www.cazy.org/GT60.html'},
     "GT14"           : {'CAZy Name': 'GT14', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '7  ', 'More Info': 'http://www.cazy.org/GT14.html'},
     "GT17"           : {'CAZy Name': 'GT17', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': '7  ', 'More Info': 'http://www.cazy.org/GT17.html'},
     "GT77"           : {'CAZy Name': 'GT77', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Retaining', 'Clade': '9  ', 'More Info': 'http://www.cazy.org/GT77.html'},
     "GT75"           : {'CAZy Name': 'GT75', 'Alternative Name': '', 'Fold': 'A', 'Mechanism': 'Inverting', 'Clade': 'N/A', 'More Info': 'http://www.cazy.org/GT75.html'},
 }
+def parse_blast_output_for_best_evalue(output_file):
+    """Return the smallest HSP e-value found in a single-query BLAST XML report.
+    Args:
+        output_file: Path to a BLAST result file in XML format.
+    Returns:
+        The lowest ``expect`` value over every HSP of every alignment, or
+        ``None`` when the report contains no hits at all.
+    """
+    with open(output_file) as result_handle:
+        blast_record = NCBIXML.read(result_handle)
+    # Scan every HSP rather than trusting alignments[0].hsps[0]: the result no
+    # longer depends on how the report is ordered, and an alignment that
+    # carries no HSPs cannot raise IndexError.
+    evalues = [
+        hsp.expect
+        for alignment in blast_record.alignments
+        for hsp in alignment.hsps
+    ]
+    return min(evalues, default=None)
+def run_local_blast(sequence, database):
+    """Search ``sequence`` against a local protein BLAST database with blastp.
+    Args:
+        sequence: Amino-acid sequence used as the query (no FASTA header).
+        database: Database path/prefix handed to blastp's ``-db`` option.
+    Returns:
+        Whatever ``parse_blast_output_for_best_evalue`` reports for the run:
+        the best e-value, or ``None`` when no hit passed the 1e-2 cut-off.
+    Raises:
+        subprocess.CalledProcessError: If blastp exits with a non-zero status.
+    """
+    import tempfile  # function-scope import: only this helper needs it
+    # Use a private scratch directory per call. Fixed file names in the
+    # working directory collide when two requests overlap, require a writable
+    # cwd, and were left behind whenever blastp raised. The context manager
+    # removes the directory on every exit path.
+    with tempfile.TemporaryDirectory() as workdir:
+        query_file = os.path.join(workdir, "temp_query.fasta")
+        output_file = os.path.join(workdir, "blast_results.xml")
+        with open(query_file, "w") as file:
+            file.write(">Query\n" + sequence + "\n")
+        blast_cmd = [
+            "blastp",
+            "-query", query_file,
+            "-db", database,
+            "-out", output_file,
+            "-outfmt", "5",  # Output format 5 is XML
+            "-evalue", "1e-2",  # Hits above this e-value are not reported
+        ]
+        # Argument list (no shell), so the sequence text is never interpreted
+        # by a shell; check=True turns a blastp failure into an exception.
+        subprocess.run(blast_cmd, check=True)
+        return parse_blast_output_for_best_evalue(output_file)
 def get_family_info(family_name):
 def preprocess_protein_sequence(protein_fasta):
+    """Validate the submitted sequence and choose the GT-A or GT-B model pair.
+    Args:
+        protein_fasta: Raw text, either a bare sequence or one FASTA record.
+    Returns:
+        ``(protein_sequence, model_fam, model_don, error_message)``. On
+        success the first three are the upper-cased sequence and the two
+        checkpoint file names, and the message is ``None``. On failure the
+        first three are ``None`` and the message says why.
+    """
+    # splitlines() + strip() absorb CRLF line endings and stray spaces from
+    # pasted text, which would otherwise be rejected as invalid residues.
+    lines = [line.strip() for line in (protein_fasta or "").splitlines()]
     headers = [line for line in lines if line.startswith('>')]
     if len(headers) > 1:
         return None, None, None, "Multiple fasta sequences detected. Please upload a fasta file with only one sequence."
+    residues = ''.join(line for line in lines if not line.startswith('>'))
+    # Drop whitespace left inside a sequence line as well.
+    protein_sequence = ''.join(residues.split())
+    if not protein_sequence:
+        # Reject empty input here instead of handing blastp an empty query.
+        return None, None, None, "No protein sequence detected. Please enter a protein sequence."
     valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")
     # Check if every character in the sequence is in the set of valid characters.
     if any(char.upper() not in valid_characters for char in protein_sequence):
         return None, None, None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids."
+    # Validation is case-insensitive, so normalise once: BLAST and the
+    # classifiers then always receive the same canonical upper-case form.
+    protein_sequence = protein_sequence.upper()
     print("Running Blast.")
     gta_db_path = "blast_data/GTA/GTA.db"
     gtb_db_path = "blast_data/GTB/GTB.db"
+    # A search with no hit returns None; map it to a huge e-value so the
+    # comparisons below treat it as "worse than any real hit".
     evalue_gta = run_local_blast(protein_sequence, gta_db_path)
     evalue_gta = evalue_gta if evalue_gta is not None else 1e+100
     evalue_gtb = run_local_blast(protein_sequence, gtb_db_path)
     evalue_gtb = evalue_gtb if evalue_gtb is not None else 1e+100
     print("E-value GT-A:", evalue_gta, "E-value GT-B:", evalue_gtb)
     print("Blast finished running. Checking sequence against known data.")
+    # Reject first: when neither database gives a hit under the threshold
+    # there is no meaningful model to select.
+    if evalue_gta > 1e-2 and evalue_gtb > 1e-2:
+        return None, None, None, "**Warning:** The sequence does not appear to be a GT-A or GT-B. Please ensure you are submitting a sequence from these families."
+    # The fold with the strictly better (smaller) e-value wins; a tie goes to GT-B.
+    use_gta = evalue_gta < evalue_gtb
+    model_fam = "GTA_fam.pth" if use_gta else "GTB_fam.pth"
+    model_don = "GTA_don.pth" if use_gta else "GTB_don.pth"
+    print("Selected model for family:", model_fam, "and donor:", model_don)
     return protein_sequence, model_fam, model_don, None
+def process_family_sequence(protein_sequence, modelfam, label_dict):
     encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
     input_idsfam = encoded_input["input_ids"]
     attention_maskfam = encoded_input["attention_mask"]
         probabilitiesfam = F.softmax(logitsfam, dim=1)
         _, predicted_labelsfam = torch.max(logitsfam, dim=1)
+    predicted_label_index_fam = predicted_labelsfam.item()  # Assuming single sample prediction
+    decoded_label_fam = label_dict.get(predicted_label_index_fam, "Unknown Label")  # Decoding label using the dictionary
+    family_info = get_family_info(decoded_label_fam)
+    figfam = plt.figure(figsize=(10, 5))
+    # probabilitiesfam_flat = probabilitiesfam.squeeze().tolist()  # Flatten probabilities
+    # Extract and sort top 5 label probabilities
+    top5_probs, top5_labels = torch.topk(probabilitiesfam, 5)
+    top5_labels = top5_labels.squeeze().tolist()
+    top5_decoded_labels = [label_dict.get(label, "Unknown") for label in top5_labels]
+    # For debugging
+    print("Top 5 labels:", top5_labels)
+    print("Available keys in label_dict:", label_dict.keys())
+    y_posfam = np.arange(len(top5_decoded_labels))
+    plt.barh(y_posfam, [prob * 100 for prob in top5_probs.squeeze().tolist()], align='center', alpha=0.5)
+    plt.yticks(y_posfam, top5_decoded_labels)
     plt.xlabel('Probability (%)')
     plt.title('Top 5 Family Class Probabilities')
     plt.xlim(0, 100)
     img = fig_to_img(figfam)
     if len(protein_sequence) < 100:
+        return decoded_label_fam, img, None, "**Warning:** The sequence is relatively short. Fragmentary and partial sequences may result in incorrect predictions. \n\n {family_info}"
+    return decoded_label_fam, img, None, family_info
+def process_donor_sequence(protein_sequence, modeldon, label_dict):
+    """Predict the donor class of one sequence and chart the top predictions.
+    Args:
+        protein_sequence: Amino-acid sequence string to classify.
+        modeldon: Classifier called with the token ids and attention mask; its
+            output must expose ``.logits``.
+        label_dict: Mapping from class index to donor name.
+    Returns:
+        ``(decoded_label, image, None)``: the name of the most probable class
+        ("Unknown Label" if its index is missing from ``label_dict``), the bar
+        chart returned by ``fig_to_img``, and a ``None`` placeholder. The
+        tuple always has three items, which is what the caller unpacks.
+    """
     encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
+    input_idsdon = encoded_input["input_ids"]
+    attention_maskdon = encoded_input["attention_mask"]
     with torch.no_grad():
+        outputdon = modeldon(input_idsdon, attention_mask=attention_maskdon)
+        logitsdon = outputdon.logits
+        probabilitiesdon = F.softmax(logitsdon, dim=1)
+        _, predicted_labelsdon = torch.max(logitsdon, dim=1)
+    predicted_label_index_don = predicted_labelsdon.item()  # batch holds exactly one sequence
+    decoded_label_don = label_dict.get(predicted_label_index_don, "Unknown Label")
+    # Keep the three most probable classes (fewer if the model has fewer).
+    top_k = min(3, probabilitiesdon.shape[1])
+    top_probs, top_labels = torch.topk(probabilitiesdon, top_k)
+    # squeeze(0) removes only the batch axis, so tolist() is a list even for k == 1.
+    top_labels = top_labels.squeeze(0).tolist()
+    top_percentages = [prob * 100 for prob in top_probs.squeeze(0).tolist()]
+    top_decoded_labels = [label_dict.get(label, "Unknown") for label in top_labels]
+    y_posdon = np.arange(len(top_decoded_labels))
+    figdon = plt.figure(figsize=(10, 5))
+    try:
+        plt.barh(y_posdon, top_percentages, align='center', alpha=0.5)
+        plt.yticks(y_posdon, top_decoded_labels)
+        plt.xlabel('Probability (%)')
+        plt.title('Top 3 Donor Class Probabilities')
+        plt.xlim(0, 100)
+        # Render first and close afterwards, so fig_to_img never receives a
+        # figure that pyplot has already discarded.
+        img = fig_to_img(figdon)
+    finally:
+        plt.close(figdon)
+    # The former short-sequence branch returned four items (the caller unpacks
+    # three) and a message containing the unformatted text "{family_info}".
+    # It could only raise, so the return shape is now uniform.
+    return decoded_label_don, img, None
+# Classifiers already loaded, keyed by checkpoint file name, so each one is
+# built and read from disk once per process instead of on every request.
+_MODEL_CACHE = {}
+def _load_classifier(checkpoint_path, num_labels):
+    """Return the CPU classifier for ``checkpoint_path``, loading it on first use."""
+    model = _MODEL_CACHE.get(checkpoint_path)
+    if model is None:
+        model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t12_35M_UR50D", num_labels=num_labels)
+        # NOTE(review): strict=False is kept from the original code; it lets
+        # load_state_dict ignore missing/unexpected keys silently, so confirm
+        # that each checkpoint really matches this architecture.
+        model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')), strict=False)
+        model.eval()
+        model.to('cpu')
+        _MODEL_CACHE[checkpoint_path] = model
+    return model
+def main_function_single(sequence):
+    """Run family and donor prediction for one submitted sequence.
+    Args:
+        sequence: Raw text from the sequence input box.
+    Returns:
+        ``(family_label, family_img, family_info, donor_label, donor_img)``.
+        When preprocessing rejects the input, every item is ``None`` except
+        the third, which carries the error message.
+    """
+    # Validation plus BLAST-based choice between the GT-A and GT-B checkpoints.
+    protein_sequence, model_fam_path, model_don_path, error_msg = preprocess_protein_sequence(sequence)
+    if error_msg:
+        print(error_msg)
+        return None, None, error_msg, None, None
+    # Output size and index -> label table belonging to each checkpoint file.
+    model_config = {
+        "GTA_fam.pth": {"num_labels": 41, "label_dict": GTA_fam_dict},
+        "GTB_fam.pth": {"num_labels": 27, "label_dict": GTB_fam_dict},
+        "GTA_don.pth": {"num_labels": 10, "label_dict": GTA_don_dict},
+        "GTB_don.pth": {"num_labels": 9, "label_dict": GTB_don_dict},
+    }
+    config_fam = model_config[model_fam_path]
+    config_don = model_config[model_don_path]
+    model_fam = _load_classifier(model_fam_path, config_fam["num_labels"])
+    model_don = _load_classifier(model_don_path, config_don["num_labels"])
+    # Each processing function decodes class indices with its own label table.
+    family_label, family_img, _, family_info = process_family_sequence(protein_sequence, model_fam, config_fam["label_dict"])
+    donor_label, donor_img, _ = process_donor_sequence(protein_sequence, model_don, config_don["label_dict"])
+    return family_label, family_img, family_info, donor_label, donor_img
 prediction_imagefam = gr.outputs.Image(type='pil', label="Family prediction graph")
 prediction_imagedonor = gr.outputs.Image(type='pil', label="Donor prediction graph")
 with gr.Blocks() as app:
+    gr.Markdown("# Glydentify (alpha v0.5)")
     with gr.Tab("Single Sequence Prediction"):
         with gr.Row().style(equal_height=True):
             with gr.Column():
                 sequence = gr.inputs.Textbox(lines=16, placeholder='Enter Protein Sequence Here...', label="Protein Sequence")
+                # explanation_checkbox = gr.inputs.Checkbox(label="Show Explanation", default=False)
             with gr.Column():
                 with gr.Accordion("Example:"):
                     gr.Markdown("""
         with gr.Row().style(equal_height=True):
             with gr.Column():
                 predict_button = gr.Button("Predict")
+                predict_button.click(main_function_single, inputs=[sequence],
                                      outputs=[family_prediction, prediction_imagefam, info_markdown,
+                                              donor_prediction, prediction_imagedonor])
         # Family & Donor Section
         with gr.Row().style(equal_height=True):
             with gr.Column():
+                with gr.Accordion("Family Prediction:"):
                     prediction_imagefam.render() # = gr.outputs.Image(type='pil', label="Family prediction graph")
             with gr.Column():
+                with gr.Accordion("Donor Prediction:"):
+                    prediction_imagedonor.render() # = gr.outputs.Image(type='pil', label="Donor prediction graph")
 app.launch(show_error=True)

backup/family.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:03dcff847ada129cd2889ea3f62071b666b009087829dabf85301210c7fe8382
-size 136199341

backup/family_labels.pkl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c3e8fe9ddb883008ab377fba3837200626ee609fbe892950b1fada9ff078eca4
-size 4559

best_model_35M_t12_5v5.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1621bc6500a0dc3510af6d53cc405d4ac7cc8e0827e23b74f488867d321bc0e8
-size 136069629

donor_labels.pkl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:504b291ac3a1de0e767117935a5546dd8d38b1150bc9183c9a3a8fbce3897a96
-size 679

family.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:725b2904a82171be55bf702f10e01d6185806e2556578d4cc99e1af9711b3952
-size 136163661

family_labels.pkl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9f0cac818ec047a4e6c0ca9a9d3026bd3d224a8d492ea533dae107a4a8269db5
-size 3419

requirements.txt CHANGED Viewed

@@ -14,4 +14,5 @@ transformers==4.31.0
 scikit-learn==1.3.0
 torch==2.0.1
 torchaudio==2.0.2
-torchvision==0.15.2

 scikit-learn==1.3.0
 torch==2.0.1
 torchaudio==2.0.2
+torchvision==0.15.2
+accelerate==0.29.1