system HF staff committed on
Commit
44c70f0
1 Parent(s): 64d3029

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +52 -0
README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ```python
3
# Builds a tiny RoBERTa masked-LM checkpoint (PyTorch and TensorFlow weights)
# together with matching vocab.json / merges.txt tokenizer files in DIRNAME.
import json
import os

# Import the config class from the package root: the per-model module path
# `transformers.configuration_roberta` is not available in later releases.
from transformers import RobertaConfig, RobertaForMaskedLM, TFRobertaForMaskedLM

DIRNAME = "./dummy-unknown"

# Create the output directory up front so nothing below depends on it
# already existing.
os.makedirs(DIRNAME, exist_ok=True)

# Positional arguments are passed through unchanged.
# NOTE(review): presumably (vocab_size, hidden_size, num_hidden_layers,
# num_attention_heads, intermediate_size) -- confirm against the installed
# RobertaConfig signature. If so, vocab_size (10) is smaller than the
# 20-entry vocabulary written below; verify that this is intentional.
config = RobertaConfig(10, 20, 1, 1, 40)

# Save randomly initialised PyTorch weights first ...
model = RobertaForMaskedLM(config)
model.save_pretrained(DIRNAME)

# ... then load them into the TensorFlow class and save again so that both
# weight formats end up in the same directory.
tf_model = TFRobertaForMaskedLM.from_pretrained(DIRNAME, from_pt=True)
tf_model.save_pretrained(DIRNAME)

# Tokenizer:

vocab = [
    "l",
    "o",
    "w",
    "e",
    "r",
    "s",
    "t",
    "i",
    "d",
    "n",
    "\u0120",
    "\u0120l",
    "\u0120n",
    "\u0120lo",
    "\u0120low",
    "er",
    "\u0120lowest",
    "\u0120newer",
    "\u0120wider",
    "<unk>",
]
# Map each token to its position in the list above.
vocab_tokens = {token: index for index, token in enumerate(vocab)}
# The trailing empty string makes the joined output end with a newline.
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]

vocab_file = os.path.join(DIRNAME, "vocab.json")
merges_file = os.path.join(DIRNAME, "merges.txt")
with open(vocab_file, "w", encoding="utf-8") as fp:
    fp.write(json.dumps(vocab_tokens) + "\n")
with open(merges_file, "w", encoding="utf-8") as fp:
    fp.write("\n".join(merges))
52
+ ```