markussagen
committed on
Commit • 71b4b78
1 Parent(s): 4ee7c27
initial commit
Browse files
- README.md +69 -0
- config.json +38 -0
- pytorch_model.bin +3 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,69 @@
## XLM-R Longformer Model
XLM-R Longformer is an XLM-R model that has been extended to allow sequence lengths up to 4096 tokens, instead of the regular 512. The model was pre-trained from the XLM-RoBERTa checkpoint using the Longformer [pre-training scheme](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) on the English WikiText-103 corpus.
The goal was to investigate methods for creating efficient Transformers for low-resource languages, such as Swedish, without having to pre-train them on long-context datasets in each respective language. The model is the result of a master thesis project at [Peltarion](https://peltarion.com/) and was fine-tuned on multilingual question-answering tasks, with code available [here](https://github.com/MarkusSagen/Master-Thesis-Multilingual-Longformer#xlm-r).
Since both the XLM-R model and the Longformer are large models, it is recommended to run them with NVIDIA Apex (16-bit precision), a large GPU, and several gradient accumulation steps.
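
As a rough illustration (not the exact setup used for this checkpoint), 16-bit precision and gradient accumulation can be enabled through the Hugging Face `Trainer` arguments; the values below simply mirror the training script further down.

```python
# Illustrative sketch only: fp16 + gradient accumulation with the
# Hugging Face Trainer. The values mirror the training script below.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    fp16=True,                       # 16-bit mixed precision
    per_device_train_batch_size=1,   # small per-device batch to fit 4096-token inputs
    gradient_accumulation_steps=64,  # effective batch size of 64
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    max_steps=6000,
)
```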
## How to Use
The model can be fine-tuned on a downstream task as usual, for instance question answering (QA).
```python
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

MAX_SEQUENCE_LENGTH = 4096
MODEL_NAME_OR_PATH = "markussagen/xlm-roberta-longformer-base-4096"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME_OR_PATH,
    max_length=MAX_SEQUENCE_LENGTH,
    padding="max_length",
    truncation=True,
)
model = AutoModelForQuestionAnswering.from_pretrained(
    MODEL_NAME_OR_PATH,
    max_length=MAX_SEQUENCE_LENGTH,
)
```
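
Note that the config in this repo lists `LongModelForMaskedLM`, so the QA head loaded above starts out untrained. After fine-tuning on a QA dataset, inference could look roughly like the sketch below; the question/context pair is a made-up example.

```python
# Hypothetical example: a minimal extractive-QA forward pass. The predicted
# span is only meaningful after the model has been fine-tuned on QA.
question = "Vad är huvudstaden i Sverige?"
context = "Sverige är ett land i norra Europa. Huvudstaden i Sverige är Stockholm."

inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Pick the most likely start/end tokens and decode the predicted span.
start = int(torch.argmax(outputs.start_logits))
end = int(torch.argmax(outputs.end_logits)) + 1
answer = tokenizer.decode(inputs["input_ids"][0][start:end])
print(answer)
```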
## Training Procedure
The model has been trained on the WikiText-103 corpus, using a **48GB** GPU with the following training script and parameters. The model was pre-trained for 6000 iterations, which took ~5 days. See the full [training script](https://github.com/MarkusSagen/Master-Thesis-Multilingual-Longformer/blob/main/scripts/finetune_qa_models.py) and the [GitHub repo](https://github.com/MarkusSagen/Master-Thesis-Multilingual-Longformer) for more information.
```sh
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip

export DATA_DIR=./wikitext-103-raw

scripts/run_long_lm.py \
    --model_name_or_path xlm-roberta-base \
    --model_name xlm-roberta-to-longformer \
    --output_dir ./output \
    --logging_dir ./logs \
    --val_file_path $DATA_DIR/wiki.valid.raw \
    --train_file_path $DATA_DIR/wiki.train.raw \
    --seed 42 \
    --max_pos 4096 \
    --adam_epsilon 1e-8 \
    --warmup_steps 500 \
    --learning_rate 3e-5 \
    --weight_decay 0.01 \
    --max_steps 6000 \
    --evaluate_during_training \
    --logging_steps 50 \
    --eval_steps 50 \
    --save_steps 6000 \
    --max_grad_norm 1.0 \
    --per_device_eval_batch_size 2 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 64 \
    --overwrite_output_dir \
    --fp16 \
    --do_train \
    --do_eval
```
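
For reference, the core of the Longformer conversion recipe linked above is to copy XLM-R's 512 learned position embeddings over and over until the 4096-token position table is filled (the full recipe also swaps in Longformer's sliding-window self-attention, which is omitted here). The following is a simplified sketch of that position-embedding step, not the exact code behind this checkpoint.

```python
# Simplified, illustrative sketch of extending XLM-R's position embeddings
# to 4096 tokens, following the linked Longformer conversion recipe.
# It omits the swap to Longformer sliding-window self-attention.
import torch.nn as nn
from transformers import XLMRobertaForMaskedLM

model = XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-base")
old_embed = model.roberta.embeddings.position_embeddings  # shape (514, 768)
hidden_size = old_embed.weight.size(1)
max_pos = 4096 + 2  # RoBERTa-style models reserve two extra position slots

new_embed = nn.Embedding(max_pos, hidden_size, padding_idx=old_embed.padding_idx)
new_embed.weight.data[:2] = old_embed.weight.data[:2]

# Copy the original 512 learned positions repeatedly until the table is full.
k, step = 2, old_embed.weight.size(0) - 2
while k < max_pos:
    span = min(step, max_pos - k)
    new_embed.weight.data[k : k + span] = old_embed.weight.data[2 : 2 + span]
    k += span

model.roberta.embeddings.position_embeddings = new_embed
model.config.max_position_embeddings = max_pos
```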
config.json
ADDED
@@ -0,0 +1,38 @@
{
  "_name_or_path": "/workspace/models/xlm-roberta-base-4096-seed-42-fastest-lm-complete",
  "architectures": [
    "LongModelForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 250002
}
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6424e018ecc504ad17de8a41f749f442696b4df17e1d893c6d3befdd9754627a
size 1124321824
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 4096, "name_or_path": "xlm-roberta-base"}
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5981796ba7fa7515e0a01f5f9877710858d1d660e6e4669ccda4356a5224e278
size 1903