Update app.py
app.py CHANGED
@@ -98,9 +98,10 @@ def user(message, history):
 
 def gradient_cuff_reject(message,sample_times,perturb_times,threshold):
     #to determine whether the query is malicious
-
-    return True
+
     # first-stage rejection
+    if sample_times==0:
+        return False
     sft_embed=shift_direction_embedding[0]*0.0
     original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
     original_embedding=embedding_func(original_input_id.to(device)).cpu()
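The new guard makes N=0 disable the defense (return False) instead of the old stub that rejected every query unconditionally. For context, the first-stage check that follows is a refusal-rate vote: sample N responses to the unperturbed query and reject when more than half are refusals. A minimal sketch of that logic, where `generate` stands in for the app's sampling engine and `REFUSAL_MARKERS`/`is_refusal` are illustrative assumptions in place of the app's actual `get_labels`:

# Sketch of the first-stage check: sample N responses to the raw query
# and reject when more than half of them are refusals.
# `generate` is a caller-supplied text generator; the marker list and
# `is_refusal` are hypothetical stand-ins for the app's labeler.
REFUSAL_MARKERS = ("I cannot", "I can't", "I'm sorry", "As an AI")

def is_refusal(response: str) -> bool:
    # Hypothetical labeler: treat a response that opens with a refusal phrase as a refusal.
    return response.strip().startswith(REFUSAL_MARKERS)

def first_stage_reject(query: str, sample_times: int, generate) -> bool:
    if sample_times == 0:        # mirrors the new guard: N=0 turns the defense off
        return False
    labels = [is_refusal(generate(query)) for _ in range(sample_times)]
    refusal_rate = sum(labels) / len(labels)    # this is results[0] in app.py
    return refusal_rate > 0.5                   # first-stage rejection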
@@ -115,39 +116,42 @@ def gradient_cuff_reject(message,sample_times,perturb_times,threshold):
     results[0]=sum(labels)/len(labels)
     if results[0]>0.5:
         return True
-    # second-stage rejection
-    shift_embeddings=[]
-    for sft_embed in shift_direction_embedding[:perturb_times]:
-        #original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
-        #original_embedding=embedding_func(original_input_id.to(device)).cpu()
-        shift_embeddings+=[0.02*sft_embed for _ in range(sample_times)]
 
+    if perturb_times>0:
+        # second-stage rejection
+        if threshold==0:
+            return True
+        shift_embeddings=[]
+        for sft_embed in shift_direction_embedding[:perturb_times]:
+            #original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
+            #original_embedding=embedding_func(original_input_id.to(device)).cpu()
+            shift_embeddings+=[0.02*sft_embed for _ in range(sample_times)]
+
+        input_embeds=embedding_shift(
+            original_embedding,shift_embeddings,prefix_embedding,suffix_embedding
     )
+        with torch.no_grad():
+            responses = engine(input_embeds)
+        for idx in range(perturb_times):
+            labels=get_labels(
+                responses[idx*sample_times:(idx+1)*sample_times]
+            )
+            results[idx+1]=sum(labels)/len(labels)
+        est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
+        est_grad=sum(est_grad)/len(est_grad)
+        if est_grad.norm().item()>threshold:
+            return True
     return False
 
-def chat(message, history, sample_times, perturb_times):
-    if sample_times
-        return 0
+def chat(message, history, sample_times, perturb_times,threshold):
+    if gradient_cuff_reject(message,sample_times,perturb_times,threshold):
+        answer="[Gradient Cuff Rejection] I cannot fulfill your request".split(" ")
+        partial_text = ""
+        for new_text in answer:
+            partial_text += (new_text+" ")
+            # Yield an empty string to cleanup the message textbox and the updated conversation history
+            yield partial_text
+        return 0
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
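The second stage is a zeroth-order gradient estimate of the refusal rate f at the query's embedding x: for each of the P perturbation directions u_j it forms (f(x + 0.02*u_j) - f(x)) / 0.02 * u_j, averages the P estimates, and rejects when the norm of the result exceeds the threshold t (which is why the new `if threshold==0: return True` short-circuit rejects everything at t=0). Each f value is itself a refusal-rate average over N sampled responses, which is why `shift_embeddings` holds P*N shifted copies batched through one `engine` call. A self-contained sketch of the estimator, with a toy smooth f standing in for the sampled refusal rate (all names here are illustrative, not app.py's):

import torch

def estimate_refusal_gradient(f, x, directions, mu=0.02):
    # Zeroth-order (finite-difference) gradient estimate of f at x:
    #   g ~ mean_j [ (f(x + mu*u_j) - f(x)) / mu * u_j ]
    # f          : embedding -> refusal rate in [0, 1] (stand-in for the
    #              sampled-response average that app.py computes)
    # x          : query embedding, shape (d,)
    # directions : the P perturbation directions u_j, each shape (d,)
    # mu         : perturbation step, 0.02 in app.py
    f0 = f(x)                                            # results[0]
    est = [(f(x + mu * u) - f0) / mu * u for u in directions]
    return sum(est) / len(est)                           # est_grad

# Toy usage with a smooth surrogate f; app.py instead averages N sampled labels.
d = 8
x = torch.zeros(d)
dirs = [torch.randn(d) for _ in range(4)]                # P = 4
f = lambda v: torch.sigmoid(v.sum()).item()
reject = estimate_refusal_gradient(f, x, dirs).norm().item() > 0.5   # compare against t

The rewritten `chat` wrapper then streams the fixed rejection message word by word via `yield`, so a rejected query renders incrementally in the UI just like a normal generated reply.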
@@ -181,7 +185,8 @@ def chat(message, history, sample_times, perturb_times):
 #demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Gradient Cuff Vicuna-7B-V1.5")
 with gr.ChatInterface(fn=chat, title="Gradient Cuff Stablelm-2-zephyr-1_6b",additional_inputs=[
     gr.Slider(minimum=0, maximum=10, step=1, value=2, label="N - Sample times"),
-    gr.Slider(minimum=0, maximum=10, step=1, value=2, label="P - Perturb times")
+    gr.Slider(minimum=0, maximum=10, step=1, value=2, label="P - Perturb times"),
+    gr.Slider(minimum=0, maximum=100, step=1, value=50, label="t - threshold")
     ]
 ) as demo:
     with gr.Tab("benign"):
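Each component in `additional_inputs` supplies one extra positional argument to `fn` after `message` and `history`, which is why adding the threshold slider requires the matching `threshold` parameter on `chat`. A minimal runnable illustration of that wiring (the echo-style `chat` body is a placeholder, not the app's logic):

import gradio as gr

def chat(message, history, sample_times, perturb_times, threshold):
    # Slider values arrive positionally, in the order the components are listed.
    yield f"(N={sample_times}, P={perturb_times}, t={threshold}) you said: {message}"

demo = gr.ChatInterface(
    fn=chat,
    title="additional_inputs wiring demo",
    additional_inputs=[
        gr.Slider(minimum=0, maximum=10, step=1, value=2, label="N - Sample times"),
        gr.Slider(minimum=0, maximum=10, step=1, value=2, label="P - Perturb times"),
        gr.Slider(minimum=0, maximum=100, step=1, value=50, label="t - threshold"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Keeping the slider labels aligned with the parameter order avoids silently feeding N into P or t when the list is edited.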