Upload 13 files
- .gitattributes +12 -0
- Mistral-Nemo-Instruct-2407.IQ1_M.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ1_S.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ2_M.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ2_S.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ2_XS.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ2_XXS.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ3_M.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ3_S.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ3_XS.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ3_XXS.gguf +3 -0
- Mistral-Nemo-Instruct-2407.IQ4_XS.gguf +3 -0
- Mistral-Nemo-Instruct-2407.imatrix.dat +3 -0
- README.md +286 -3
.gitattributes CHANGED
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.imatrix.dat filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ1_M.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ1_S.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ2_S.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ2_XS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ3_S.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ3_XS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-Nemo-Instruct-2407.IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
Mistral-Nemo-Instruct-2407.IQ1_M.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b940cc0ffd6e1a493afd429842fe99d967d2173b136eba35b0828a72f56f562a
+size 3221627296

Mistral-Nemo-Instruct-2407.IQ1_S.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82dd0d71d3bae34a9776ec56b9b521bad2193b2f0e7d29002efed379db99d29a
+size 2999214496

Mistral-Nemo-Instruct-2407.IQ2_M.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97f9afd43bc903b36d49781de3152e25ca6f91f848f01312647868250936b938
+size 4435026336

Mistral-Nemo-Instruct-2407.IQ2_S.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:269e5f67b72c449603b58ac4ca7deb0b54ba688803d85542d02483f105770ebe
+size 4138475936

Mistral-Nemo-Instruct-2407.IQ2_XS.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b4ff647558433d2d013201553c72c2e27d819d055435f75ff70fae3e3e723d2
+size 3915080096

Mistral-Nemo-Instruct-2407.IQ2_XXS.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:332d9a99a4d1012c6adc650e696c6b3762dba47e05abcc21cde5925837bc2a30
+size 3592315296

Mistral-Nemo-Instruct-2407.IQ3_M.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adda54bc47e014739d3f54700dc352bfe9d7dc939a752402e4b562b65110bb5b
+size 5722235296

Mistral-Nemo-Instruct-2407.IQ3_S.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9888c00e27c193ac59e230a04cad89f9925f66f54253e4e5eff0d423390dea7
+size 5562081696

Mistral-Nemo-Instruct-2407.IQ3_XS.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:150ff66d862134c5a54423f11230c40233e3dc22af8f04fd8d129c6184965c36
+size 5306491296

Mistral-Nemo-Instruct-2407.IQ3_XXS.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f62b4cd119b6270dd92ec9effa9cefd97b910e1aa0dbdab6eaa4a05d30e91d20
+size 4945387936

Mistral-Nemo-Instruct-2407.IQ4_XS.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75cd95b015d33455a76b71a2cdeedc80d2100569654d62d375e5ce0f5b0982f4
+size 6742712736

Mistral-Nemo-Instruct-2407.imatrix.dat ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1379ab50b21551efc0c561e9ca8d864dcc95d42c10082d7f7e3ae4e991d79dd6
+size 7054413
README.md CHANGED
@@ -1,3 +1,286 @@
(removed: the previous 3-line README stub, an empty `---` front-matter block; it is replaced in full by the content below)
---
base_model: mistralai/Mistral-Nemo-Instruct-2407
language:
- en
pipeline_tag: text-generation
license: apache-2.0
model_creator: Mistral AI
model_name: Mistral-Nemo-Instruct-2407
model_type: mistral
quantized_by: CISC
---

# Mistral-Nemo-Instruct-2407 - SOTA GGUF
- Model creator: [Mistral AI](https://huggingface.co/mistralai)
- Original model: [Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)

<!-- description start -->
## Description

This repo contains State Of The Art quantized GGUF format model files for [Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407).

Quantization was done with an importance matrix that was trained for ~1M tokens (256 batches of 4096 tokens) of [groups_merged.txt](https://github.com/ggerganov/llama.cpp/discussions/5263#discussioncomment-8395384) and [wiki.train.raw](https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/train.txt) concatenated.
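For reference, an importance matrix like this is produced with llama.cpp's `llama-imatrix` tool; a minimal sketch (file names and chunk settings here are illustrative assumptions, not the exact command used for this repo):

```shell
# Sketch: concatenate the calibration data and run the imatrix tool against a
# full-precision GGUF; adjust -c/--chunks and file names to your own setup.
cat groups_merged.txt wiki.train.raw > calibration.txt
./llama-imatrix -m Mistral-Nemo-Instruct-2407.FP16.gguf -f calibration.txt \
    -o Mistral-Nemo-Instruct-2407.imatrix.dat -c 4096 --chunks 256
```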

The embedded chat template is the updated one with correct Tekken tokenization and function calling support via the OpenAI-compatible `tools` parameter, see the [example](#simple-llama-cpp-python-example-function-calling-code).

<!-- description end -->


<!-- prompt-template start -->
## Prompt template: Mistral Tekken

```
[AVAILABLE_TOOLS][{"name": "function_name", "description": "Description", "parameters": {...}}, ...][/AVAILABLE_TOOLS][INST]{prompt}[/INST]
```

<!-- prompt-template end -->
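To make the template concrete, here is a small illustrative snippet that renders a single tool-enabled user turn into the prompt string above (the tool definition is made up for the example; BOS/EOS handling is left to the tokenizer):

```python
import json

# Illustrative only: fill in the Tekken-style template for one user turn with one tool.
# Note that there are no spaces between the special tokens and the surrounding content.
tools = [{
    "name": "get_current_weather",
    "description": "Get the current weather in a given location",
    "parameters": {
        "type": "object",
        "properties": {"location": {"type": "string"}},
        "required": ["location"],
    },
}]
prompt = "What's the weather like in Oslo?"
rendered = f"[AVAILABLE_TOOLS]{json.dumps(tools)}[/AVAILABLE_TOOLS][INST]{prompt}[/INST]"
print(rendered)
```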


<!-- compatibility_gguf start -->
## Compatibility

These quantised GGUFv3 files are compatible with llama.cpp from July 22nd 2024 onwards, as of commit [50e0535](https://github.com/ggerganov/llama.cpp/commit/50e05353e88d50b644688caa91f5955e8bdb9eb9).

They are also compatible with many third-party UIs and libraries, provided they are built with a recent llama.cpp.

## Explanation of quantisation methods

<details>
<summary>Click to see details</summary>

The new methods available are:

* GGML_TYPE_IQ1_S - 1-bit quantization in super-blocks with an importance matrix applied, effectively using 1.56 bits per weight (bpw)
* GGML_TYPE_IQ1_M - 1-bit quantization in super-blocks with an importance matrix applied, effectively using 1.75 bpw
* GGML_TYPE_IQ2_XXS - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.06 bpw
* GGML_TYPE_IQ2_XS - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.31 bpw
* GGML_TYPE_IQ2_S - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.5 bpw
* GGML_TYPE_IQ2_M - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.7 bpw
* GGML_TYPE_IQ3_XXS - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.06 bpw
* GGML_TYPE_IQ3_XS - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.3 bpw
* GGML_TYPE_IQ3_S - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.44 bpw
* GGML_TYPE_IQ3_M - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.66 bpw
* GGML_TYPE_IQ4_XS - 4-bit quantization in super-blocks with an importance matrix applied, effectively using 4.25 bpw
* GGML_TYPE_IQ4_NL - 4-bit non-linearly mapped quantization with an importance matrix applied, effectively using 4.5 bpw

Refer to the Provided Files table below to see what files use which methods, and how.
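As a rough sanity check of what these bpw figures mean in practice, file size is approximately parameters × bpw / 8 (a sketch assuming ~12.2B parameters for Mistral Nemo; real files come out a bit larger because some tensors stay at higher precision), which lines up roughly with the sizes in the table below:

```python
# Back-of-the-envelope file size from bits per weight (bpw); ~12.2B parameters assumed.
n_params = 12.2e9
for name, bpw in [("IQ2_M", 2.7), ("IQ3_M", 3.66), ("IQ4_XS", 4.25)]:
    print(f"{name}: ~{n_params * bpw / 8 / 1e9:.1f} GB")
```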
</details>
<!-- compatibility_gguf end -->

<!-- README_GGUF.md-provided-files start -->
## Provided files

| Name | Quant method | Bits | Size | Max RAM required | Use case |
| ---- | ---- | ---- | ---- | ---- | ----- |
| [Mistral-Nemo-Instruct-2407.IQ1_S.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ1_S.gguf) | IQ1_S | 1 | 2.8 GB | 3.4 GB | smallest, significant quality loss |
| [Mistral-Nemo-Instruct-2407.IQ1_M.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ1_M.gguf) | IQ1_M | 1 | 3.0 GB | 3.6 GB | very small, significant quality loss |
| [Mistral-Nemo-Instruct-2407.IQ2_XXS.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ2_XXS.gguf) | IQ2_XXS | 2 | 3.3 GB | 3.9 GB | very small, high quality loss |
| [Mistral-Nemo-Instruct-2407.IQ2_XS.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ2_XS.gguf) | IQ2_XS | 2 | 3.6 GB | 4.2 GB | very small, high quality loss |
| [Mistral-Nemo-Instruct-2407.IQ2_S.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ2_S.gguf) | IQ2_S | 2 | 3.9 GB | 4.4 GB | small, substantial quality loss |
| [Mistral-Nemo-Instruct-2407.IQ2_M.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ2_M.gguf) | IQ2_M | 2 | 4.1 GB | 4.7 GB | small, greater quality loss |
| [Mistral-Nemo-Instruct-2407.IQ3_XXS.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ3_XXS.gguf) | IQ3_XXS | 3 | 4.6 GB | 5.2 GB | very small, high quality loss |
| [Mistral-Nemo-Instruct-2407.IQ3_XS.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ3_XS.gguf) | IQ3_XS | 3 | 4.9 GB | 5.5 GB | small, substantial quality loss |
| [Mistral-Nemo-Instruct-2407.IQ3_S.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ3_S.gguf) | IQ3_S | 3 | 5.2 GB | 5.8 GB | small, greater quality loss |
| [Mistral-Nemo-Instruct-2407.IQ3_M.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ3_M.gguf) | IQ3_M | 3 | 5.3 GB | 5.9 GB | medium, balanced quality - recommended |
| [Mistral-Nemo-Instruct-2407.IQ4_XS.gguf](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.IQ4_XS.gguf) | IQ4_XS | 4 | 6.3 GB | 6.9 GB | small, substantial quality loss |

Generated importance matrix file: [Mistral-Nemo-Instruct-2407.imatrix.dat](https://huggingface.co/CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF/blob/main/Mistral-Nemo-Instruct-2407.imatrix.dat)
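If you want to requantize to another type yourself, this imatrix file is what gets passed to llama.cpp's `llama-quantize` tool; a minimal sketch (the full-precision input file name is an assumption):

```shell
# Sketch: requantize a full-precision GGUF using the provided importance matrix
./llama-quantize --imatrix Mistral-Nemo-Instruct-2407.imatrix.dat \
    Mistral-Nemo-Instruct-2407.FP16.gguf Mistral-Nemo-Instruct-2407.IQ4_XS.gguf IQ4_XS
```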

**Note**: the above RAM figures assume no GPU offloading with 4K context. If layers are offloaded to the GPU, this will reduce RAM usage and use VRAM instead.
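To grab a single quant without cloning the whole repository, something like the following `huggingface-cli` invocation should work (shown for IQ4_XS; substitute whichever file you want; it downloads from the Hugging Face Hub):

```shell
pip install -U "huggingface_hub[cli]"
huggingface-cli download CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF \
    Mistral-Nemo-Instruct-2407.IQ4_XS.gguf --local-dir .
```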

<!-- README_GGUF.md-provided-files end -->

<!-- README_GGUF.md-how-to-run start -->
## Example `llama.cpp` command

Make sure you are using `llama.cpp` from commit [50e0535](https://github.com/ggerganov/llama.cpp/commit/50e05353e88d50b644688caa91f5955e8bdb9eb9) or later.

```shell
./llama-cli -ngl 41 -m Mistral-Nemo-Instruct-2407.IQ4_XS.gguf --color -c 131072 --temp 0.3 --repeat-penalty 1.1 -p "[AVAILABLE_TOOLS]{tools}[/AVAILABLE_TOOLS][INST]{prompt}[/INST]"
```

This model is very temperature sensitive; keep the temperature between 0.3 and 0.4 for best results! Also note the lack of spaces between special tokens and input in the prompt; this model does not use the regular Mistral chat template.

Change `-ngl 41` to the number of layers to offload to GPU. Remove it if you don't have GPU acceleration.

Change `-c 131072` to the desired sequence length.

If you are low on VRAM/RAM, try quantizing the K-cache with `-ctk q8_0` or even `-ctk q4_0` for big memory savings (depending on context size).
There is a similar option for the V-cache (`-ctv`), however that is [not working yet](https://github.com/ggerganov/llama.cpp/issues/4425) unless you also enable Flash Attention (`-fa`).
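For example, a variant of the command above with Flash Attention enabled and both caches quantized to `q8_0` (a sketch; adjust the offloaded layers and context size to your hardware):

```shell
./llama-cli -ngl 41 -m Mistral-Nemo-Instruct-2407.IQ4_XS.gguf --color -c 32768 -fa -ctk q8_0 -ctv q8_0 --temp 0.3 --repeat-penalty 1.1 -p "[INST]{prompt}[/INST]"
```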

For other parameters and how to use them, please refer to [the llama.cpp documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/main/README.md).

## How to run from Python code

You can use GGUF models from Python using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) module.

### How to load this model in Python code, using llama-cpp-python

For full documentation, please see: [llama-cpp-python docs](https://llama-cpp-python.readthedocs.io/en/latest/).

#### First install the package

Run one of the following commands, according to your system:

```shell
# Prebuilt wheel with basic CPU support
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
# Prebuilt wheel with NVidia CUDA acceleration
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121  # or cu122 etc. to match your CUDA version
# Prebuilt wheel with Metal GPU acceleration
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
# Build base version with no GPU acceleration
pip install llama-cpp-python
# With NVidia CUDA acceleration
CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
# Or with OpenBLAS acceleration
CMAKE_ARGS="-DGGML_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
# Or with AMD ROCm GPU acceleration (Linux only)
CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install llama-cpp-python
# Or with Metal GPU acceleration for macOS systems only
CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python
# Or with Vulkan acceleration
CMAKE_ARGS="-DGGML_VULKAN=on" pip install llama-cpp-python
# Or with SYCL acceleration
CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python

# On Windows, set the CMAKE_ARGS variable in PowerShell before installing; e.g. for NVidia CUDA:
$env:CMAKE_ARGS = "-DGGML_CUDA=on"
pip install llama-cpp-python
```

#### Simple llama-cpp-python example code

```python
from llama_cpp import Llama

# Chat Completion API

llm = Llama(model_path="./Mistral-Nemo-Instruct-2407.IQ4_XS.gguf", n_gpu_layers=41, n_ctx=131072)
print(llm.create_chat_completion(
    messages = [
        {
            "role": "user",
            "content": "Pick a LeetCode challenge and solve it in Python."
        }
    ]
))
```
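llama-cpp-python can also fetch the GGUF straight from the Hub instead of using a local path; a small sketch using `Llama.from_pretrained` (requires the `huggingface_hub` package; parameters mirror the example above):

```python
from llama_cpp import Llama

# Downloads (and caches) the chosen quant from the Hub, then loads it as usual
llm = Llama.from_pretrained(
    repo_id="CISCai/Mistral-Nemo-Instruct-2407-SOTA-GGUF",
    filename="Mistral-Nemo-Instruct-2407.IQ4_XS.gguf",
    n_gpu_layers=41,
    n_ctx=131072,
)
```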

#### Simple llama-cpp-python example function calling code

```python
import json

from llama_cpp import Llama, LlamaGrammar

# Chat Completion API

grammar = LlamaGrammar.from_json_schema(json.dumps({
    "type": "array",
    "items": {
        "type": "object",
        "required": [ "name", "arguments" ],
        "properties": {
            "name": {
                "type": "string"
            },
            "arguments": {
                "type": "object"
            }
        }
    }
}))

llm = Llama(model_path="./Mistral-Nemo-Instruct-2407.IQ4_XS.gguf", n_gpu_layers=41, n_ctx=131072)
response = llm.create_chat_completion(
    temperature = 0.0,
    repeat_penalty = 1.1,
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Oslo and Stockholm?"
        }
    ],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA"
                    },
                    "unit": {
                        "type": "string",
                        "enum": [ "celsius", "fahrenheit" ]
                    }
                },
                "required": [ "location" ]
            }
        }
    }],
    grammar = grammar
)
print(json.loads(response["choices"][0]["message"]["content"]))

print(llm.create_chat_completion(
    temperature = 0.0,
    repeat_penalty = 1.1,
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Oslo?"
        },
        { # The tool_calls below are from the response to the request above (with tool_choice active)
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call__0_get_current_weather_cmpl-..."[:9], # Make sure to truncate the ID (the chat template requires it)
                    "type": "function",
                    "function": {
                        "name": "get_current_weather",
                        "arguments": '{ "location": "Oslo, NO" ,"unit": "celsius"} '
                    }
                }
            ]
        },
        { # The tool_call_id is taken from tool_calls and content is the result of the function call you made
            "role": "tool",
            "content": "20",
            "tool_call_id": "call__0_get_current_weather_cmpl-..."[:9] # Make sure to truncate the ID (the chat template requires it)
        }
    ],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA"
                    },
                    "unit": {
                        "type": "string",
                        "enum": [ "celsius", "fahrenheit" ]
                    }
                },
                "required": [ "location" ]
            }
        }
    }],
    #tool_choice={
    #    "type": "function",
    #    "function": {
    #        "name": "get_current_weather"
    #    }
    #}
))
```

<!-- README_GGUF.md-how-to-run end -->