OlivierDehaene committed · commit 691db44 · 1 parent: 7d2ded6

update tokenizer

Browse files:
- README.md +14 -7
- config.json +0 -1
- special_tokens_map.json +9 -1
- tokenizer.json +47 -1
- tokenizer_config.json +5 -5
README.md CHANGED

@@ -168,11 +168,11 @@ model-index:
       verified: false
 ---
 
-# SantaCoder
+# Optimized SantaCoder
 
 ![banner](https://huggingface.co/datasets/bigcode/admin/resolve/main/banner.png)
 
-
+An up to 60% faster version of bigcode/santacoder.
 
 # Table of Contents
 
@@ -191,7 +191,7 @@ In addition there are several models that were trained on datasets with differen
 
 - **Repository:** [bigcode/Megatron-LM](https://github.com/bigcode-project/Megatron-LM)
 - **Project Website:** [bigcode-project.org](www.bigcode-project.org)
-- **Paper:** [🎅SantaCoder: Don't reach for the stars!🌟](https://
+- **Paper:** [🎅SantaCoder: Don't reach for the stars!🌟](https://arxiv.org/abs/2301.03988)
 - **Point of Contact:** [[email protected]](mailto:[email protected])
 - **Languages:** Python, Java, and JavaScript
 
@@ -224,7 +224,7 @@ You should phrase commands like they occur in source code such as comments (e.g.
 # pip install -q transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-checkpoint = "
+checkpoint = "olivierdehaene/optimized-santacoder"
 device = "cuda" # for GPU usage or "cpu" for CPU usage
 
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
@@ -246,11 +246,11 @@ print(tokenizer.decode(outputs[0]))
 ```
 
 ### Load other checkpoints
-We upload the checkpoint of each experiment to a
+We upload the checkpoint of each experiment to a separate branch, as well as the intermediate checkpoints as commits on those branches. You can load them with the `revision` flag:
 
 ```python
 model = AutoModelForCausalLM.from_pretrained(
-    "
+    "olivierdehaene/optimized-santacoder",
     revision="no-fim", # name of branch or commit hash
     trust_remote_code=True
 )
@@ -289,4 +289,11 @@ The model has been trained on source code in Python, Java, and JavaScript. The p
 The model is licensed under the CodeML Open RAIL-M v0.1 license. You can find the full license [here](https://huggingface.co/spaces/bigcode/license).
 
 # Citation
-
+```
+@article{allal2023santacoder,
+  title={SantaCoder: don't reach for the stars!},
+  author={Allal, Loubna Ben and Li, Raymond and Kocetkov, Denis and Mou, Chenghao and Akiki, Christopher and Ferrandis, Carlos Munoz and Muennighoff, Niklas and Mishra, Mayank and Gu, Alex and Dey, Manan and others},
+  journal={arXiv preprint arXiv:2301.03988},
+  year={2023}
+}
+```
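Putting the README changes together, here is a minimal end-to-end usage sketch assembled from the snippets in the diff above. It assumes only that the `transformers` library is installed; the prompt string and `max_new_tokens` value are illustrative choices, and `trust_remote_code=True` is required because the repository ships a custom model class (`GPT2LMHeadCustomModel`, per config.json below).

```python
# Minimal usage sketch assembled from the README snippets above.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "olivierdehaene/optimized-santacoder"
device = "cuda"  # for GPU usage, or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    revision="main",        # use e.g. "no-fim" to load another experiment's branch
    trust_remote_code=True, # the repo ships custom modeling code
).to(device)

# Illustrative prompt; any code fragment works.
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```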
config.json CHANGED

@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "bigcode/santacoder",
   "activation_function": "gelu_fast",
   "architectures": [
     "GPT2LMHeadCustomModel"
special_tokens_map.json CHANGED

@@ -1 +1,9 @@
-{
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim-prefix>",
+    "<fim-middle>",
+    "<fim-suffix>",
+    "<fim-pad>"
+  ]
+}
tokenizer.json CHANGED

@@ -2,7 +2,53 @@
   "version": "1.0",
   "truncation": null,
   "padding": null,
-  "added_tokens": [
+  "added_tokens": [
+    {
+      "id": 49152,
+      "content": "<|endoftext|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49153,
+      "content": "<fim-prefix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49154,
+      "content": "<fim-middle>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49155,
+      "content": "<fim-suffix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49156,
+      "content": "<fim-pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
   "normalizer": null,
   "pre_tokenizer": {
     "type": "Sequence",
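The five tokens registered above (ids 49152–49156) are SantaCoder's end-of-text and fill-in-the-middle (FIM) sentinels. As a rough sketch of how they are used: the prefix-suffix-middle prompt layout below follows the SantaCoder paper and is an assumption on my part, not something stated in this diff.

```python
# Sketch: building a fill-in-the-middle prompt with the tokens added above.
# The <fim-prefix>/<fim-suffix>/<fim-middle> ordering is assumed from the
# SantaCoder paper; the model then generates the missing middle span.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("olivierdehaene/optimized-santacoder")

prefix = "def fib(n):\n    "
suffix = "\n    return fib(n - 1) + fib(n - 2)"
prompt = f"<fim-prefix>{prefix}<fim-suffix>{suffix}<fim-middle>"

ids = tokenizer(prompt)["input_ids"]
# The sentinels should map to the ids declared in tokenizer.json (49153-49155):
print(tokenizer.convert_tokens_to_ids(["<fim-prefix>", "<fim-suffix>", "<fim-middle>"]))
```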
tokenizer_config.json CHANGED

@@ -1,7 +1,7 @@
 {
-  "
-  "
-  "
-  "
+  "errors": "replace",
+  "tokenizer_class": "GPT2TokenizerFast",
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
   "model_max_length": 2048
-}
+}
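A quick way to sanity-check the updated tokenizer files is to load the tokenizer and inspect the values this commit sets. A minimal sketch; the commented values are what the JSON above declares, assuming the commit is what gets loaded:

```python
# Sketch: verify the tokenizer picks up the settings from this commit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("olivierdehaene/optimized-santacoder")

print(type(tok).__name__)             # GPT2TokenizerFast (tokenizer_config.json)
print(tok.model_max_length)           # 2048
print(tok.bos_token, tok.eos_token)   # <|endoftext|> <|endoftext|>
print(tok.additional_special_tokens)  # the five tokens from special_tokens_map.json
```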