Initial commit

Browse files

Files changed (9) hide show

README.md +42 -1
config.json +45 -0
opus+bt.spm32k-spm32k.transformer-align.valid1.log +16 -0
pytorch_model.bin +3 -0
source.spm +0 -0
special_tokens_map.json +5 -0
target.spm +0 -0
tokenizer_config.json +14 -0
vocab.json +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,44 @@
 ---
-license: cc-by-nc-sa-4.0
 ---

 ---
+language:
+- en
+- ja
+tags:
+- translation
+- opus-mt-tc
+license: cc-by-4.0
+model-index:
+- name: opus-mt-tc-base-en-ja
+  results:
+  - task:
+      name: Translation eng-jpn
+      type: translation
+      args: eng-jpn
+    dataset:
+      name: tatoeba-test-v2021-08-07
+      type: tatoeba_mt
+      args: eng-jpn
+    metrics:
+    - name: BLEU
+      type: bleu
+      value: 15.2
 ---
+# Opus Tatoeba English-Japanese
+*This model was obtained by running the script [convert_marian_to_pytorch.py](https://github.com/huggingface/transformers/blob/master/src/transformers/models/marian/convert_marian_to_pytorch.py) with the flag `-m eng-jpn`. The original models were trained by [Jörg Tiedemann](https://blogs.helsinki.fi/tiedeman/) using the [MarianNMT](https://marian-nmt.github.io/) library. See all available `MarianMTModel` models on the profile of the [Helsinki NLP](https://huggingface.co/Helsinki-NLP) group.*
+* dataset: opus+bt
+* model: transformer-align
+* source language(s): eng
+* target language(s): jpn
+* model: transformer-align
+* pre-processing: normalization + SentencePiece (spm32k,spm32k)
+* download: [opus+bt-2021-04-10.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-jpn/opus+bt-2021-04-10.zip)
+* test set translations: [opus+bt-2021-04-10.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-jpn/opus+bt-2021-04-10.test.txt)
+* test set scores: [opus+bt-2021-04-10.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-jpn/opus+bt-2021-04-10.eval.txt)
+## Benchmarks
+| testset | BLEU  | chr-F | #sent | #words | BP |
+|---------|-------|-------|-------|--------|----|
+| Tatoeba-test.eng-jpn 	| 15.2 	| 0.258 	| 10000 	| 99206 	| 1.000 |

config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "swish",
+  "architectures": [
+    "MarianMTModel"
+  ],
+  "attention_dropout": 0.0,
+  "bad_words_ids": [
+    [
+      65000
+    ]
+  ],
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 512,
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 2048,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 65000,
+  "decoder_vocab_size": 65001,
+  "dropout": 0.1,
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 0,
+  "forced_eos_token_id": 0,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 512,
+  "max_position_embeddings": 512,
+  "model_type": "marian",
+  "normalize_embedding": false,
+  "num_beams": 6,
+  "num_hidden_layers": 6,
+  "pad_token_id": 65000,
+  "scale_embedding": true,
+  "share_encoder_decoder_embeddings": true,
+  "static_position_embeddings": true,
+  "torch_dtype": "float16",
+  "transformers_version": "4.21.0.dev0",
+  "use_cache": true,
+  "vocab_size": 65001
+}

opus+bt.spm32k-spm32k.transformer-align.valid1.log ADDED Viewed

	@@ -0,0 +1,16 @@

+[2021-03-26 13:28:20] [valid] Ep. 1 : Up. 2500 : perplexity : 12.964 : new best
+[2021-03-26 13:40:07] [valid] Ep. 1 : Up. 5000 : perplexity : 13.8338 : stalled 1 times (last best: 12.964)
+[2021-03-26 13:51:55] [valid] Ep. 1 : Up. 7500 : perplexity : 14.2534 : stalled 2 times (last best: 12.964)
+[2021-03-26 14:03:46] [valid] Ep. 1 : Up. 10000 : perplexity : 14.6025 : stalled 3 times (last best: 12.964)
+[2021-03-26 14:15:34] [valid] Ep. 1 : Up. 12500 : perplexity : 14.756 : stalled 4 times (last best: 12.964)
+[2021-03-26 14:27:21] [valid] Ep. 1 : Up. 15000 : perplexity : 15.0628 : stalled 5 times (last best: 12.964)
+[2021-03-26 14:39:09] [valid] Ep. 1 : Up. 17500 : perplexity : 15.609 : stalled 6 times (last best: 12.964)
+[2021-03-26 14:50:58] [valid] Ep. 1 : Up. 20000 : perplexity : 15.7898 : stalled 7 times (last best: 12.964)
+[2021-03-26 15:02:46] [valid] Ep. 1 : Up. 22500 : perplexity : 15.7109 : stalled 8 times (last best: 12.964)
+[2021-03-26 15:14:39] [valid] Ep. 1 : Up. 25000 : perplexity : 15.7126 : stalled 9 times (last best: 12.964)
+[2021-03-26 15:26:43] [valid] Ep. 2 : Up. 27500 : perplexity : 15.5996 : stalled 10 times (last best: 12.964)
+[2021-03-26 15:38:35] [valid] Ep. 2 : Up. 30000 : perplexity : 15.5451 : stalled 11 times (last best: 12.964)
+[2021-03-26 15:50:25] [valid] Ep. 2 : Up. 32500 : perplexity : 15.4719 : stalled 12 times (last best: 12.964)
+[2021-03-26 16:02:16] [valid] Ep. 2 : Up. 35000 : perplexity : 15.367 : stalled 13 times (last best: 12.964)
+[2021-03-26 16:14:06] [valid] Ep. 2 : Up. 37500 : perplexity : 15.2384 : stalled 14 times (last best: 12.964)
+[2021-03-26 16:25:56] [valid] Ep. 2 : Up. 40000 : perplexity : 15.1453 : stalled 15 times (last best: 12.964)

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5010d7e2ba0b133f0eb52a57780c4006ba266ae9f48a79daf7313cea52e4ab1
+size 221610115

source.spm ADDED Viewed

Binary file (808 kB). View file

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

target.spm ADDED Viewed

Binary file (834 kB). View file

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "eos_token": "</s>",
+  "model_max_length": 512,
+  "name_or_path": ".",
+  "pad_token": "<pad>",
+  "separate_vocabs": false,
+  "source_lang": "eng",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": null,
+  "target_lang": "jpn",
+  "tokenizer_class": "MarianTokenizer",
+  "tokenizer_file": null,
+  "unk_token": "<unk>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff