OlivierDehaene committed · commit 691db44 · 1 parent: 7d2ded6

update tokenizer

Browse files:
- README.md +14 -7
- config.json +0 -1
- special_tokens_map.json +9 -1
- tokenizer.json +47 -1
- tokenizer_config.json +5 -5
README.md CHANGED

@@ -168,11 +168,11 @@ model-index:
       verified: false
 ---
 
-# SantaCoder
+# Optimized SantaCoder
 
 ![banner](https://huggingface.co/datasets/bigcode/admin/resolve/main/banner.png)
 
-
+An up to 60% faster version of bigcode/santacoder.
 
 # Table of Contents
 
@@ -191,7 +191,7 @@ In addition there are several models that were trained on datasets with differen
 
 - **Repository:** [bigcode/Megatron-LM](https://github.com/bigcode-project/Megatron-LM)
 - **Project Website:** [bigcode-project.org](www.bigcode-project.org)
-- **Paper:** [🎅SantaCoder: Don't reach for the stars!🌟](https://
+- **Paper:** [🎅SantaCoder: Don't reach for the stars!🌟](https://arxiv.org/abs/2301.03988)
 - **Point of Contact:** [[email protected]](mailto:[email protected])
 - **Languages:** Python, Java, and JavaScript
 
@@ -224,7 +224,7 @@ You should phrase commands like they occur in source code such as comments (e.g.
 # pip install -q transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-checkpoint = "
+checkpoint = "olivierdehaene/optimized-santacoder"
 device = "cuda" # for GPU usage or "cpu" for CPU usage
 
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
@@ -246,11 +246,11 @@ print(tokenizer.decode(outputs[0]))
 ```
 
 ### Load other checkpoints
-We upload the checkpoint of each experiment to a
+We upload the checkpoint of each experiment to a separate branch, as well as the intermediate checkpoints as commits on those branches. You can load them with the `revision` flag:
 
 ```python
 model = AutoModelForCausalLM.from_pretrained(
-    "
+    "olivierdehaene/optimized-santacoder",
     revision="no-fim", # name of branch or commit hash
     trust_remote_code=True
 )
@@ -289,4 +289,11 @@ The model has been trained on source code in Python, Java, and JavaScript. The p
 The model is licensed under the CodeML Open RAIL-M v0.1 license. You can find the full license [here](https://huggingface.co/spaces/bigcode/license).
 
 # Citation
-
+```
+@article{allal2023santacoder,
+  title={SantaCoder: don't reach for the stars!},
+  author={Allal, Loubna Ben and Li, Raymond and Kocetkov, Denis and Mou, Chenghao and Akiki, Christopher and Ferrandis, Carlos Munoz and Muennighoff, Niklas and Mishra, Mayank and Gu, Alex and Dey, Manan and others},
+  journal={arXiv preprint arXiv:2301.03988},
+  year={2023}
+}
+```
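Putting the README changes together, here is a minimal end-to-end usage sketch assembled from the snippets in the diff above. It assumes only that the `transformers` library is installed; the prompt string and `max_new_tokens` value are illustrative choices, and `trust_remote_code=True` is required because the repository ships a custom model class (`GPT2LMHeadCustomModel`, per config.json below).

```python
# Minimal usage sketch assembled from the README snippets above.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "olivierdehaene/optimized-santacoder"
device = "cuda"  # for GPU usage, or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    revision="main",        # use e.g. "no-fim" to load another experiment's branch
    trust_remote_code=True, # the repo ships custom modeling code
).to(device)

# Illustrative prompt; any code fragment works.
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```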
config.json CHANGED

@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "bigcode/santacoder",
   "activation_function": "gelu_fast",
   "architectures": [
     "GPT2LMHeadCustomModel"
special_tokens_map.json CHANGED

@@ -1 +1,9 @@
-{
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim-prefix>",
+    "<fim-middle>",
+    "<fim-suffix>",
+    "<fim-pad>"
+  ]
+}
tokenizer.json CHANGED

@@ -2,7 +2,53 @@
   "version": "1.0",
   "truncation": null,
   "padding": null,
-  "added_tokens": [
+  "added_tokens": [
+    {
+      "id": 49152,
+      "content": "<|endoftext|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49153,
+      "content": "<fim-prefix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49154,
+      "content": "<fim-middle>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49155,
+      "content": "<fim-suffix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49156,
+      "content": "<fim-pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
   "normalizer": null,
   "pre_tokenizer": {
     "type": "Sequence",
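The five tokens registered above (ids 49152–49156) are SantaCoder's end-of-text and fill-in-the-middle (FIM) sentinels. As a rough sketch of how they are used: the prefix-suffix-middle prompt layout below follows the SantaCoder paper and is an assumption on my part, not something stated in this diff.

```python
# Sketch: building a fill-in-the-middle prompt with the tokens added above.
# The <fim-prefix>/<fim-suffix>/<fim-middle> ordering is assumed from the
# SantaCoder paper; the model then generates the missing middle span.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("olivierdehaene/optimized-santacoder")

prefix = "def fib(n):\n    "
suffix = "\n    return fib(n - 1) + fib(n - 2)"
prompt = f"<fim-prefix>{prefix}<fim-suffix>{suffix}<fim-middle>"

ids = tokenizer(prompt)["input_ids"]
# The sentinels should map to the ids declared in tokenizer.json (49153-49155):
print(tokenizer.convert_tokens_to_ids(["<fim-prefix>", "<fim-suffix>", "<fim-middle>"]))
```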
tokenizer_config.json CHANGED

@@ -1,7 +1,7 @@
 {
-  "
-  "
-  "
-  "
+  "errors": "replace",
+  "tokenizer_class": "GPT2TokenizerFast",
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
   "model_max_length": 2048
-}
+}
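A quick way to sanity-check the updated tokenizer files is to load the tokenizer and inspect the values this commit sets. A minimal sketch; the commented values are what the JSON above declares, assuming the commit is what gets loaded:

```python
# Sketch: verify the tokenizer picks up the settings from this commit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("olivierdehaene/optimized-santacoder")

print(type(tok).__name__)             # GPT2TokenizerFast (tokenizer_config.json)
print(tok.model_max_length)           # 2048
print(tok.bos_token, tok.eos_token)   # <|endoftext|> <|endoftext|>
print(tok.additional_special_tokens)  # the five tokens from special_tokens_map.json
```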