Update app.py
app.py
CHANGED
@@ -48,7 +48,6 @@ def set_seed(_hashed_seed):

 set_seed(13)

-original_response=""
 print(f"Starting to load the model to memory")
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

@@ -155,12 +154,12 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,threshold):
     responses = engine(input_embeds.to(torch.bfloat16))
     original_response=responses[0]
     if not with_defense:
-        return (False,None,None)
+        return (False,None,None,original_response)
     labels=get_labels(responses)
     results=[None for _ in range(perturb_times+1)]
     results[0]=sum(labels)/len(labels)
     if results[0]>=0.5:
-        return (True,1-results[0],None)
+        return (True,1-results[0],None,original_response)

     # second-stage rejection
     shift_embeddings=[]
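For readers of the diff: results[0] is the fraction of refusals among the sampled responses to the unperturbed query, and results[1..perturb_times] (filled between these hunks) hold the same statistic for the perturbed queries; the first stage rejects outright when results[0] >= 0.5, and the second stage below turns the remaining entries into a gradient estimate. A minimal sketch of the first-stage vote, assuming get_labels maps a response to 1 for refusal and 0 for compliance:

    labels = get_labels(responses)            # 1 = refusal, 0 = compliance (assumed convention)
    refusal_rate = sum(labels) / len(labels)  # this is results[0] in the diff
    first_stage_reject = refusal_rate >= 0.5  # majority of sampled responses refused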
@@ -182,9 +181,9 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,threshold):
     est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
     est_grad=sum(est_grad)/len(est_grad)
     if est_grad.norm().item()>threshold:
-        return (True,1-results[0],est_grad.norm().item())
+        return (True,1-results[0],est_grad.norm().item(),original_response)

-    return (False,1-results[0],est_grad.norm().item())
+    return (False,1-results[0],est_grad.norm().item(),original_response)

 def chat(message, history, with_defense,threshold):
     perturb_times=9
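The unchanged est_grad lines implement a zeroth-order (finite-difference) gradient estimate: each perturbed refusal rate is compared against results[0], divided by the 0.02 step size, and scaled along its random perturbation direction, and the norm of the averaged vector is the second-stage test statistic. A self-contained sketch of that estimator, with refusal_rate as a hypothetical stand-in for the sampled refusal statistic:

    import torch

    def estimate_grad_norm(x, refusal_rate, directions, mu=0.02):
        # Zeroth-order estimate mirroring est_grad in gradient_cuff_reject:
        # average of directional finite differences (mu matches the 0.02 step).
        base = refusal_rate(x)
        est_grad = [(refusal_rate(x + mu * u) - base) / mu * u for u in directions]
        est_grad = sum(est_grad) / len(est_grad)
        return est_grad.norm().item()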
@@ -212,7 +211,7 @@ def chat(message, history, with_defense,threshold):
     input_ids = tok([messages], return_tensors="pt")["input_ids"]
     #response= "[Gradient Cuff Checking: "+reject_information + "]\n"+ chat_engine(input_ids)
     #response=chat_engine(input_ids)
-    response=
+    response=return_value[-1]
     response=response.split(" ")

     # Initialize an empty string to store the generated text
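Taken together, the commit drops the module-level original_response global and instead threads the undefended model response through gradient_cuff_reject's return tuple, which now has four elements; chat reads it back as return_value[-1]. A hedged sketch of the updated call site, with the argument list assumed from the signature in the hunk headers and the streaming loop body omitted:

    # (rejected, 1 - refusal_rate, gradient_norm, original_response)
    return_value = gradient_cuff_reject(message, with_defense, sample_times, perturb_times, threshold)
    response = return_value[-1]        # the undefended response now travels in the tuple
    for token in response.split(" "):
        pass                           # chat streams these tokens to the Gradio UI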