Thunpitcha committed on
Commit
fc64810
1 Parent(s): a9b7548

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +2 -2
  2. tokenizer_config.json +4 -4
  3. vocab.json +88 -87
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 89,
3
- "<s>": 88
4
  }
 
1
  {
2
+ "</s>": 90,
3
+ "<s>": 89
4
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "86": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "87": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "88": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "89": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "87": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "88": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "89": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "90": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
vocab.json CHANGED
@@ -1,90 +1,91 @@
1
  {
2
- "'": 41,
3
- "[PAD]": 87,
4
- "[UNK]": 86,
5
- "_": 30,
6
- "a": 4,
7
- "c": 39,
8
- "e": 5,
9
- "h": 80,
10
- "i": 40,
11
- "j": 61,
12
- "k": 81,
13
- "l": 63,
14
- "m": 16,
15
- "n": 70,
16
- "o": 44,
17
- "r": 31,
18
- "s": 11,
19
- "t": 45,
20
- "y": 21,
21
- "|": 29,
22
- "~": 23,
23
- "ก": 3,
24
- "ข": 58,
25
- "ค": 36,
26
- "ฆ": 12,
27
- "ง": 37,
28
- "จ": 38,
29
- "ฉ": 75,
30
- "ช": 35,
31
- "ซ": 8,
32
- "ฌ": 74,
33
- "ญ": 73,
34
- "ฎ": 13,
35
- "ฏ": 48,
36
- "ฐ": 85,
37
- "ฑ": 60,
38
- "ฒ": 82,
39
- "ณ": 34,
40
- "ด": 57,
41
- "ต": 28,
42
- "ถ": 72,
43
- "ท": 51,
44
- "ธ": 76,
45
- "น": 6,
46
- "บ": 42,
47
- "ป": 78,
48
- "ผ": 46,
49
- "ฝ": 69,
50
- "พ": 1,
51
- "ฟ": 67,
52
- "ภ": 68,
53
- "ม": 77,
54
- "ย": 26,
55
- "ร": 79,
56
- "ฤ": 71,
57
- "ล": 65,
58
- "ว": 54,
59
- "ศ": 2,
60
- "ษ": 64,
61
- "ส": 56,
62
- "ห": 20,
63
- "ฬ": 55,
64
- "อ": 15,
65
- "ฮ": 49,
66
- "ะ": 83,
67
- "ั": 47,
68
- "า": 10,
69
- "ำ": 19,
70
- "ิ": 27,
71
- "ี": 0,
72
- "ึ": 33,
73
- "ื": 53,
74
- "ุ": 43,
75
  "ู": 52,
76
- "เ": 17,
77
- "แ": 32,
78
- "โ": 9,
79
- "ใ": 59,
80
- "ไ": 18,
81
- "ๅ": 66,
82
- "ๆ": 14,
83
- "็": 62,
84
- "่": 50,
85
- "้": 22,
86
- "๊": 7,
87
- "๋": 25,
88
- "์": 84,
89
- "": 24
 
90
  }
 
1
  {
2
+ "'": 38,
3
+ "[PAD]": 88,
4
+ "[UNK]": 87,
5
+ "_": 53,
6
+ "a": 73,
7
+ "c": 72,
8
+ "e": 83,
9
+ "h": 21,
10
+ "i": 6,
11
+ "j": 18,
12
+ "k": 36,
13
+ "l": 78,
14
+ "m": 70,
15
+ "n": 47,
16
+ "o": 22,
17
+ "r": 82,
18
+ "s": 7,
19
+ "t": 79,
20
+ "y": 65,
21
+ "|": 71,
22
+ "~": 24,
23
+ "ก": 35,
24
+ "ข": 54,
25
+ "ค": 14,
26
+ "ฆ": 10,
27
+ "ง": 76,
28
+ "จ": 28,
29
+ "ฉ": 59,
30
+ "ช": 81,
31
+ "ซ": 33,
32
+ "ฌ": 0,
33
+ "ญ": 23,
34
+ "ฎ": 30,
35
+ "ฏ": 67,
36
+ "ฐ": 12,
37
+ "ฑ": 75,
38
+ "ฒ": 42,
39
+ "ณ": 77,
40
+ "ด": 61,
41
+ "ต": 2,
42
+ "ถ": 57,
43
+ "ท": 37,
44
+ "ธ": 34,
45
+ "น": 69,
46
+ "บ": 29,
47
+ "ป": 32,
48
+ "ผ": 19,
49
+ "ฝ": 55,
50
+ "พ": 20,
51
+ "ฟ": 56,
52
+ "ภ": 26,
53
+ "ม": 49,
54
+ "ย": 8,
55
+ "ร": 50,
56
+ "ฤ": 39,
57
+ "ล": 43,
58
+ "ว": 27,
59
+ "ศ": 9,
60
+ "ษ": 5,
61
+ "ส": 40,
62
+ "ห": 85,
63
+ "ฬ": 64,
64
+ "อ": 1,
65
+ "ฮ": 45,
66
+ "ะ": 15,
67
+ "ั": 25,
68
+ "า": 17,
69
+ "ำ": 51,
70
+ "ิ": 4,
71
+ "ี": 66,
72
+ "ึ": 16,
73
+ "ื": 62,
74
+ "ุ": 3,
75
  "ู": 52,
76
+ "เ": 63,
77
+ "แ": 44,
78
+ "โ": 74,
79
+ "ใ": 41,
80
+ "ไ": 31,
81
+ "ๅ": 46,
82
+ "ๆ": 13,
83
+ "็": 84,
84
+ "่": 60,
85
+ "้": 86,
86
+ "๊": 80,
87
+ "๋": 68,
88
+ "์": 11,
89
+ "": 48,
90
+ "’": 58
91
  }