aws-neuron
/

optimum-neuron-cache

dacorvo HF staff committed on Apr 9

Commit

2d87237

•

1 Parent(s): aeec59c

Add more llama config

Files changed (1) hide show

inference-cache-config/llama.json CHANGED Viewed

@@ -59,13 +59,25 @@
       "sequence_length": 4096,
       "num_cores": 24,
       "auto_cast_type": "fp16"
     }
   ],
   "meta-llama/Llama-2-13b-chat-hf": [
     {
       "batch_size": 1,
       "sequence_length": 4096,
-      "num_cores": 8,
       "auto_cast_type": "fp16"
     },
     {
@@ -77,7 +89,7 @@
     {
       "batch_size": 4,
       "sequence_length": 4096,
-      "num_cores": 8,
       "auto_cast_type": "fp16"
     },
     {
@@ -89,7 +101,7 @@
     {
       "batch_size": 8,
       "sequence_length": 4096,
-      "num_cores": 8,
       "auto_cast_type": "fp16"
     },
     {
@@ -97,6 +109,30 @@
       "sequence_length": 4096,
       "num_cores": 24,
       "auto_cast_type": "fp16"
     }
   ],
   "meta-llama/Llama-2-70b-chat-hf": [

       "sequence_length": 4096,
       "num_cores": 24,
       "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 32,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 32,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
     }
   ],
   "meta-llama/Llama-2-13b-chat-hf": [
     {
       "batch_size": 1,
       "sequence_length": 4096,
+      "num_cores": 12,
       "auto_cast_type": "fp16"
     },
     {
     {
       "batch_size": 4,
       "sequence_length": 4096,
+      "num_cores": 12,
       "auto_cast_type": "fp16"
     },
     {
     {
       "batch_size": 8,
       "sequence_length": 4096,
+      "num_cores": 12,
       "auto_cast_type": "fp16"
     },
     {
       "sequence_length": 4096,
       "num_cores": 24,
       "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 16,
+      "sequence_length": 4096,
+      "num_cores": 12,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 16,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 32,
+      "sequence_length": 4096,
+      "num_cores": 12,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 32,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
     }
   ],
   "meta-llama/Llama-2-70b-chat-hf": [