fabiannagel committed
Commit ded35c4
1 Parent(s): e6410a2

Upload 13 files


Configured German model added

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Dense/config.json ADDED
@@ -0,0 +1 @@
+{"in_features": 768, "out_features": 128, "bias": false, "activation_function": "torch.nn.modules.linear.Identity"}
1_Dense/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:082cc1e4b9bf47d7eb8979a9e2cc643f3c3ea07dc68a706c58823316235025e1
+size 393304
config.json ADDED
@@ -0,0 +1,116 @@
+{
+  "_name_or_path": "antoinelouis/colbert-xm",
+  "adapter_layer_norm": false,
+  "adapter_reduction_factor": 2,
+  "adapter_reuse_layer_norm": true,
+  "architectures": [
+    "XmodModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "default_language": "de_DE",
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "languages": [
+    "en_XX",
+    "id_ID",
+    "vi_VN",
+    "ru_RU",
+    "fa_IR",
+    "sv_SE",
+    "ja_XX",
+    "fr_XX",
+    "de_DE",
+    "ro_RO",
+    "ko_KR",
+    "hu_HU",
+    "es_XX",
+    "fi_FI",
+    "uk_UA",
+    "da_DK",
+    "pt_XX",
+    "no_XX",
+    "th_TH",
+    "pl_PL",
+    "bg_BG",
+    "nl_XX",
+    "zh_CN",
+    "he_IL",
+    "el_GR",
+    "it_IT",
+    "sk_SK",
+    "hr_HR",
+    "tr_TR",
+    "ar_AR",
+    "cs_CZ",
+    "lt_LT",
+    "hi_IN",
+    "zh_TW",
+    "ca_ES",
+    "ms_MY",
+    "sl_SI",
+    "lv_LV",
+    "ta_IN",
+    "bn_IN",
+    "et_EE",
+    "az_AZ",
+    "sq_AL",
+    "sr_RS",
+    "kk_KZ",
+    "ka_GE",
+    "tl_XX",
+    "ur_PK",
+    "is_IS",
+    "hy_AM",
+    "ml_IN",
+    "mk_MK",
+    "be_BY",
+    "la_VA",
+    "te_IN",
+    "eu_ES",
+    "gl_ES",
+    "mn_MN",
+    "kn_IN",
+    "ne_NP",
+    "sw_KE",
+    "si_LK",
+    "mr_IN",
+    "af_ZA",
+    "gu_IN",
+    "cy_GB",
+    "eo_EO",
+    "km_KH",
+    "ky_KG",
+    "uz_UZ",
+    "ps_AF",
+    "pa_IN",
+    "ga_IE",
+    "ha_NG",
+    "am_ET",
+    "lo_LA",
+    "ku_TR",
+    "so_SO",
+    "my_MM",
+    "or_IN",
+    "sa_IN"
+  ],
+  "layer_norm_eps": 1e-05,
+  "ln_before_adapter": true,
+  "max_position_embeddings": 514,
+  "model_type": "xmod",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "pre_norm": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.45.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250004
+}
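
The notable setting here is `"default_language": "de_DE"`, which pins the XMOD backbone to its German language adapter when no language is set explicitly. A quick sanity check after download (a sketch; the repo id is a placeholder for wherever this upload lives on the Hub):

```python
from transformers import AutoConfig

# Placeholder repo id -- substitute the actual Hub path of this upload.
config = AutoConfig.from_pretrained("fabiannagel/pylate-colbert-xm-de_DE")
print(config.model_type)            # xmod
print(config.default_language)      # de_DE
print("de_DE" in config.languages)  # True
```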
config_sentence_transformers.json ADDED
@@ -0,0 +1,49 @@
+{
+  "__version__": {
+    "sentence_transformers": "3.0.1",
+    "transformers": "4.45.1",
+    "pytorch": "2.4.1"
+  },
+  "prompts": {},
+  "default_prompt_name": null,
+  "similarity_fn_name": null,
+  "query_prefix": "[unused0]",
+  "document_prefix": "[unused1]",
+  "query_length": 32,
+  "document_length": 180,
+  "attend_to_expansion_tokens": false,
+  "skiplist_words": [
+    "!",
+    "\"",
+    "#",
+    "$",
+    "%",
+    "&",
+    "'",
+    "(",
+    ")",
+    "*",
+    "+",
+    ",",
+    "-",
+    ".",
+    "/",
+    ":",
+    ";",
+    "<",
+    "=",
+    ">",
+    "?",
+    "@",
+    "[",
+    "\\",
+    "]",
+    "^",
+    "_",
+    "`",
+    "{",
+    "|",
+    "}",
+    "~"
+  ]
+}
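
These are PyLate's ColBERT encoding settings: queries get the `[unused0]` marker and are padded with `<mask>` expansion tokens up to `query_length` 32, documents get `[unused1]` and are truncated at `document_length` 180, and the skiplist punctuation is excluded from document-side scoring. The relevance score these token embeddings feed into is ColBERT's late-interaction MaxSim; a self-contained sketch of that operation (illustrative only, with random embeddings standing in for model output):

```python
import torch

def maxsim_score(q_emb: torch.Tensor, d_emb: torch.Tensor) -> torch.Tensor:
    """Late-interaction (MaxSim) scoring: each query token takes its
    best match over all document tokens; matches are summed."""
    sim = q_emb @ d_emb.T               # (num_query_tokens, num_doc_tokens)
    return sim.max(dim=1).values.sum()  # best doc token per query token

q = torch.nn.functional.normalize(torch.randn(32, 128), dim=-1)   # query_length = 32
d = torch.nn.functional.normalize(torch.randn(180, 128), dim=-1)  # document_length = 180
print(maxsim_score(q, d))
```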
convert_model.py ADDED
@@ -0,0 +1,28 @@
+# pip install pylate==1.1.2
+from pylate import models
+
+
+def get_pylate_model(language: str, train: bool) -> models.ColBERT:
+    """
+    Configures the antoinelouis/colbert-xm model for use with PyLate.
+    See the discussion here: https://github.com/lightonai/pylate/discussions/50#discussioncomment-10691630
+    For language, use a code from https://huggingface.co/facebook/xmod-base#languages
+    """
+    colbert_model = models.ColBERT(model_name_or_path='antoinelouis/colbert-xm')
+    backbone = colbert_model[0].auto_model
+
+    # XMOD models route through per-language adapters; pin the adapter to `language`.
+    if backbone.__class__.__name__.lower().startswith('xmod'):
+        backbone.set_default_language(language)
+
+    # For fine-tuning, keep the shared embeddings and language adapters frozen.
+    if train:
+        backbone.freeze_embeddings_and_language_adapters()
+
+    suffix = '_train' if train else ''
+    colbert_model.save_pretrained(f'pylate-colbert-xm-{language}{suffix}/')
+    return colbert_model
+
+
+if __name__ == '__main__':
+    get_pylate_model(language='de_DE', train=False)
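
Once the script has run, the saved directory can be loaded back with PyLate for encoding. A minimal usage sketch, assuming the pinned pylate==1.1.2 (`is_query` switches between the query and document encoding paths):

```python
from pylate import models

# Load the converted German checkpoint written by convert_model.py.
model = models.ColBERT(model_name_or_path='pylate-colbert-xm-de_DE/')

queries = ['Wie ist das Wetter heute?']
documents = ['Heute ist es sonnig und warm.', 'Er fuhr mit dem Auto zum Stadion.']

# Token-level embeddings; queries and documents take different paths.
query_embeddings = model.encode(queries, is_query=True)
document_embeddings = model.encode(documents, is_query=False)
```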
model.md ADDED
@@ -0,0 +1,144 @@
+---
+base_model: antoinelouis/colbert-xm
+datasets: []
+language: []
+library_name: sentence-transformers
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- sentence-similarity
+- feature-extraction
+widget: []
+---
+
+# SentenceTransformer based on antoinelouis/colbert-xm
+
+This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [antoinelouis/colbert-xm](https://huggingface.co/antoinelouis/colbert-xm). It maps sentences & paragraphs to a 128-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+
+## Model Details
+
+### Model Description
+- **Model Type:** Sentence Transformer
+- **Base model:** [antoinelouis/colbert-xm](https://huggingface.co/antoinelouis/colbert-xm) <!-- at revision f406563b621f86d96899d4652dfbc562692bb526 -->
+- **Maximum Sequence Length:** 514 tokens
+- **Output Dimensionality:** 128 dimensions
+- **Similarity Function:** Cosine Similarity
+<!-- - **Training Dataset:** Unknown -->
+<!-- - **Language:** Unknown -->
+<!-- - **License:** Unknown -->
+
+### Model Sources
+
+- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+
+### Full Model Architecture
+
+```
+ColBERT(
+  (0): Transformer({'max_seq_length': 514, 'do_lower_case': False}) with Transformer model: XmodModel
+  (1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
+)
+```
+
+## Usage
+
+### Direct Usage (Sentence Transformers)
+
+First install the Sentence Transformers library:
+
+```bash
+pip install -U sentence-transformers
+```
+
+Then you can load this model and run inference.
+```python
+from sentence_transformers import SentenceTransformer
+
+# Download from the 🤗 Hub
+model = SentenceTransformer("sentence_transformers_model_id")
+# Run inference
+sentences = [
+    'The weather is lovely today.',
+    "It's so sunny outside!",
+    'He drove to the stadium.',
+]
+embeddings = model.encode(sentences)
+print(embeddings.shape)
+# [3, 128]
+
+# Get the similarity scores for the embeddings
+similarities = model.similarity(embeddings, embeddings)
+print(similarities.shape)
+# [3, 3]
+```
+
+<!--
+### Direct Usage (Transformers)
+
+<details><summary>Click to see the direct usage in Transformers</summary>
+
+</details>
+-->
+
+<!--
+### Downstream Usage (Sentence Transformers)
+
+You can finetune this model on your own dataset.
+
+<details><summary>Click to expand</summary>
+
+</details>
+-->
+
+<!--
+### Out-of-Scope Use
+
+*List how the model may foreseeably be misused and address what users ought not to do with the model.*
+-->
+
+<!--
+## Bias, Risks and Limitations
+
+*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+-->
+
+<!--
+### Recommendations
+
+*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+-->
+
+## Training Details
+
+### Framework Versions
+- Python: 3.12.5
+- Sentence Transformers: 3.0.1
+- Transformers: 4.45.1
+- PyTorch: 2.4.1
+- Accelerate: 0.34.2
+- Datasets: 3.0.1
+- Tokenizers: 0.20.0
+
+## Citation
+
+### BibTeX
+
+<!--
+## Glossary
+
+*Clearly define terms in order to be accessible across audiences.*
+-->
+
+<!--
+## Model Card Authors
+
+*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+-->
+
+<!--
+## Model Card Contact
+
+*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+-->
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b02095e5c4746382923f4c26caf58e2433a34134ef57a8f1ede723f4ef9cab
+size 3410427464
modules.json ADDED
@@ -0,0 +1,14 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Dense",
+    "type": "pylate.models.Dense"
+  }
+]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 514,
+  "do_lower_case": false
+}
special_tokens_map.json ADDED
@@ -0,0 +1,45 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<mask>",
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:410ff101f2c7d6eb4f670a9410e9e27063dc10cd0c82fb1925ee779788d3036d
+size 17083106
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250002": {
+      "content": "[unused0]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250003": {
+      "content": "[unused1]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<mask>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
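
The `[unused0]`/`[unused1]` markers referenced by `config_sentence_transformers.json` are registered here as vocabulary ids 250002 and 250003 (hence `"vocab_size": 250004` in `config.json`). A quick check, again with a placeholder repo id:

```python
from transformers import AutoTokenizer

# Placeholder repo id -- substitute the actual Hub path of this upload.
tokenizer = AutoTokenizer.from_pretrained("fabiannagel/pylate-colbert-xm-de_DE")
print(tokenizer.convert_tokens_to_ids("[unused0]"))  # 250002
print(tokenizer.convert_tokens_to_ids("[unused1]"))  # 250003
```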