add low VRAM example
README.md
CHANGED
@@ -313,4 +313,32 @@ def generate(text):
 
 # Now you can simply call the generate function with an English text you want to translate:
 generate("I'm super excited about this Norwegian NORA model! Can it translate these sentences?")
+```
+
+## Example usage on a GPU with ~16GB VRAM (try for yourself [in Google Colab](https://colab.research.google.com/drive/1AQgJ8lN-SNOqkUKj4xpQI5rr0R7V2Xzy?usp=sharing))
+
+Install bitsandbytes and accelerate if you want to load the model in 8-bit:
+
+```bash
+pip install bitsandbytes
+pip install accelerate
+```
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "norallm/norbloom-7b-scratch"
+)
+
+# This setup needs about 8 GB of VRAM
+# Setting `load_in_8bit=False` -> about 15 GB of VRAM
+# Using `torch.float32` and `load_in_8bit=False` -> about 21 GB of VRAM
+model = AutoModelForCausalLM.from_pretrained(
+    "norallm/norbloom-7b-scratch",
+    device_map='auto',
+    load_in_8bit=True,
+    torch_dtype=torch.bfloat16
+)
 ```
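Once the model is loaded in 8-bit, generation works the same way as for the full-precision model. The following is a minimal sketch of a generation call against the quantized model loaded above; the prompt and decoding settings are illustrative and not part of the original README.

```python
# Illustrative smoke test for the 8-bit model loaded above.
# Prompt and decoding settings are example values, not from the model card.
prompt = "Norway is a country where"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.inference_mode():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

To verify how much memory the quantized weights actually occupy, `model.get_memory_footprint()` reports the model's footprint in bytes after loading.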