Upload folder using huggingface_hub

86e38db verified 29 days ago

15.8 kB

	---
	# Model card metadata (YAML front matter) for TinyMistral-248M-v3.
	language:
	- en
	license: apache-2.0
	# Datasets associated with the model (presumably training data; confirm).
	datasets:
	- Locutusque/TM-DATA-V2
	- LLM360/TxT360
	- mlfoundations/dclm-baseline-1.0
	- Skylion007/openwebtext
	- JeanKaddour/minipile
	- eminorhan/gutenberg_en
	# Benchmark results, one `results` entry per benchmark. The values match the
	# "Open LLM Leaderboard Evaluation Results" table at the end of this card.
	model-index:
	- name: TinyMistral-248M-v3
	  results:
	  # IFEval, 0-shot
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: IFEval (0-Shot)
	      type: HuggingFaceH4/ifeval
	      args:
	        num_few_shot: 0
	    metrics:
	    - type: inst_level_strict_acc and prompt_level_strict_acc
	      value: 16.39
	      name: strict accuracy
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
	      name: Open LLM Leaderboard
	  # BBH, 3-shot
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: BBH (3-Shot)
	      type: BBH
	      args:
	        num_few_shot: 3
	    metrics:
	    - type: acc_norm
	      value: 1.78
	      name: normalized accuracy
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
	      name: Open LLM Leaderboard
	  # MATH level 5, 4-shot
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: MATH Lvl 5 (4-Shot)
	      type: hendrycks/competition_math
	      args:
	        num_few_shot: 4
	    metrics:
	    - type: exact_match
	      value: 0.0
	      name: exact match
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
	      name: Open LLM Leaderboard
	  # GPQA, 0-shot
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: GPQA (0-shot)
	      type: Idavidrein/gpqa
	      args:
	        num_few_shot: 0
	    metrics:
	    - type: acc_norm
	      value: 0.0
	      name: acc_norm
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
	      name: Open LLM Leaderboard
	  # MuSR, 0-shot
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: MuSR (0-shot)
	      type: TAUR-Lab/MuSR
	      args:
	        num_few_shot: 0
	    metrics:
	    - type: acc_norm
	      value: 5.15
	      name: acc_norm
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
	      name: Open LLM Leaderboard
	  # MMLU-PRO, 5-shot (the only entry that also pins a dataset config and split)
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: MMLU-PRO (5-shot)
	      type: TIGER-Lab/MMLU-Pro
	      config: main
	      split: test
	      args:
	        num_few_shot: 5
	    metrics:
	    - type: acc
	      value: 1.47
	      name: accuracy
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
	      name: Open LLM Leaderboard
	---

	This model is still in training. It has been trained on roughly 21 billion tokens so far.

	| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr|
	|----------------------------------------|-------|----------------|-----:|-----------|---|------:|---|-----:|
	|Open LLM Leaderboard | N/A| | | | | | | |
	| - arc_challenge | 1|none | 25|acc |↑ | 0.2005|± |0.0117|
	| | |none | 25|acc_norm |↑ | 0.2406|± |0.0125|
	| - gsm8k | 3|flexible-extract| 5|exact_match|↑ | 0.0083|± |0.0025|
	| | |strict-match | 5|exact_match|↑ | 0.0000|± |0.0000|
	| - hellaswag | 1|none | 10|acc |↑ | 0.2724|± |0.0044|
	| | |none | 10|acc_norm |↑ | 0.2838|± |0.0045|
	| - mmlu | 2|none | |acc |↑ | 0.2290|± |0.0035|
	| - humanities | 2|none | |acc |↑ | 0.2380|± |0.0062|
	| - formal_logic | 1|none | 5|acc |↑ | 0.2460|± |0.0385|
	| - high_school_european_history | 1|none | 5|acc |↑ | 0.1818|± |0.0301|
	| - high_school_us_history | 1|none | 5|acc |↑ | 0.2647|± |0.0310|
	| - high_school_world_history | 1|none | 5|acc |↑ | 0.2911|± |0.0296|
	| - international_law | 1|none | 5|acc |↑ | 0.2149|± |0.0375|
	| - jurisprudence | 1|none | 5|acc |↑ | 0.2685|± |0.0428|
	| - logical_fallacies | 1|none | 5|acc |↑ | 0.2209|± |0.0326|
	| - moral_disputes | 1|none | 5|acc |↑ | 0.2457|± |0.0232|
	| - moral_scenarios | 1|none | 5|acc |↑ | 0.2369|± |0.0142|
	| - philosophy | 1|none | 5|acc |↑ | 0.1865|± |0.0221|
	| - prehistory | 1|none | 5|acc |↑ | 0.1975|± |0.0222|
	| - professional_law | 1|none | 5|acc |↑ | 0.2432|± |0.0110|
	| - world_religions | 1|none | 5|acc |↑ | 0.3099|± |0.0355|
	| - other | 2|none | |acc |↑ | 0.2375|± |0.0076|
	| - business_ethics | 1|none | 5|acc |↑ | 0.3200|± |0.0469|
	| - clinical_knowledge | 1|none | 5|acc |↑ | 0.2226|± |0.0256|
	| - college_medicine | 1|none | 5|acc |↑ | 0.1965|± |0.0303|
	| - global_facts | 1|none | 5|acc |↑ | 0.1800|± |0.0386|
	| - human_aging | 1|none | 5|acc |↑ | 0.3004|± |0.0308|
	| - management | 1|none | 5|acc |↑ | 0.1942|± |0.0392|
	| - marketing | 1|none | 5|acc |↑ | 0.2735|± |0.0292|
	| - medical_genetics | 1|none | 5|acc |↑ | 0.3000|± |0.0461|
	| - miscellaneous | 1|none | 5|acc |↑ | 0.2478|± |0.0154|
	| - nutrition | 1|none | 5|acc |↑ | 0.2222|± |0.0238|
	| - professional_accounting | 1|none | 5|acc |↑ | 0.2021|± |0.0240|
	| - professional_medicine | 1|none | 5|acc |↑ | 0.1912|± |0.0239|
	| - virology | 1|none | 5|acc |↑ | 0.2590|± |0.0341|
	| - social sciences | 2|none | |acc |↑ | 0.2203|± |0.0075|
	| - econometrics | 1|none | 5|acc |↑ | 0.2368|± |0.0400|
	| - high_school_geography | 1|none | 5|acc |↑ | 0.2020|± |0.0286|
	| - high_school_government_and_politics| 1|none | 5|acc |↑ | 0.1865|± |0.0281|
	| - high_school_macroeconomics | 1|none | 5|acc |↑ | 0.2205|± |0.0210|
	| - high_school_microeconomics | 1|none | 5|acc |↑ | 0.2143|± |0.0267|
	| - high_school_psychology | 1|none | 5|acc |↑ | 0.1908|± |0.0168|
	| - human_sexuality | 1|none | 5|acc |↑ | 0.2672|± |0.0388|
	| - professional_psychology | 1|none | 5|acc |↑ | 0.2386|± |0.0172|
	| - public_relations | 1|none | 5|acc |↑ | 0.1727|± |0.0362|
	| - security_studies | 1|none | 5|acc |↑ | 0.2367|± |0.0272|
	| - sociology | 1|none | 5|acc |↑ | 0.2488|± |0.0306|
	| - us_foreign_policy | 1|none | 5|acc |↑ | 0.2600|± |0.0441|
	| - stem | 2|none | |acc |↑ | 0.2157|± |0.0073|
	| - abstract_algebra | 1|none | 5|acc |↑ | 0.2200|± |0.0416|
	| - anatomy | 1|none | 5|acc |↑ | 0.1778|± |0.0330|
	| - astronomy | 1|none | 5|acc |↑ | 0.1908|± |0.0320|
	| - college_biology | 1|none | 5|acc |↑ | 0.2778|± |0.0375|
	| - college_chemistry | 1|none | 5|acc |↑ | 0.2200|± |0.0416|
	| - college_computer_science | 1|none | 5|acc |↑ | 0.2100|± |0.0409|
	| - college_mathematics | 1|none | 5|acc |↑ | 0.2100|± |0.0409|
	| - college_physics | 1|none | 5|acc |↑ | 0.2157|± |0.0409|
	| - computer_security | 1|none | 5|acc |↑ | 0.2700|± |0.0446|
	| - conceptual_physics | 1|none | 5|acc |↑ | 0.2638|± |0.0288|
	| - electrical_engineering | 1|none | 5|acc |↑ | 0.2483|± |0.0360|
	| - elementary_mathematics | 1|none | 5|acc |↑ | 0.2037|± |0.0207|
	| - high_school_biology | 1|none | 5|acc |↑ | 0.1774|± |0.0217|
	| - high_school_chemistry | 1|none | 5|acc |↑ | 0.2020|± |0.0282|
	| - high_school_computer_science | 1|none | 5|acc |↑ | 0.2500|± |0.0435|
	| - high_school_mathematics | 1|none | 5|acc |↑ | 0.2148|± |0.0250|
	| - high_school_physics | 1|none | 5|acc |↑ | 0.2053|± |0.0330|
	| - high_school_statistics | 1|none | 5|acc |↑ | 0.1481|± |0.0242|
	| - machine_learning | 1|none | 5|acc |↑ | 0.3125|± |0.0440|
	| - truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.2362|± |0.0149|
	| | |none | 0|bleu_diff |↑ |-1.0138|± |0.2569|
	| | |none | 0|bleu_max |↑ | 7.9522|± |0.4088|
	| | |none | 0|rouge1_acc |↑ | 0.2595|± |0.0153|
	| | |none | 0|rouge1_diff|↑ |-1.9129|± |0.4349|
	| | |none | 0|rouge1_max |↑ |21.7885|± |0.7307|
	| | |none | 0|rouge2_acc |↑ | 0.1200|± |0.0114|
	| | |none | 0|rouge2_diff|↑ |-1.9771|± |0.3475|
	| | |none | 0|rouge2_max |↑ | 9.0199|± |0.5842|
	| | |none | 0|rougeL_acc |↑ | 0.2570|± |0.0153|
	| | |none | 0|rougeL_diff|↑ |-1.8812|± |0.4185|
	| | |none | 0|rougeL_max |↑ |19.6284|± |0.6850|
	| - truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.1983|± |0.0140|
	| - truthfulqa_mc2 | 2|none | 0|acc |↑ | 0.3861|± |0.0147|
	| - winogrande | 1|none | 5|acc |↑ | 0.4972|± |0.0141|

	| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
	|-------------------|------:|------|------|------|---|-----:|---|-----:|
	| - mmlu | 2|none | |acc |↑ |0.2290|± |0.0035|
	| - humanities | 2|none | |acc |↑ |0.2380|± |0.0062|
	| - other | 2|none | |acc |↑ |0.2375|± |0.0076|
	| - social sciences| 2|none | |acc |↑ |0.2203|± |0.0075|
	| - stem | 2|none | |acc |↑ |0.2157|± |0.0073|

	| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
	|---------------------------------|------:|------|-----:|--------|---|-----:|---|-----:|
	|agieval_nous | 0|none | |acc_norm|↑ |0.2133|± |0.0081|
	| - agieval_aqua_rat | 1|none | 0|acc |↑ |0.2047|± |0.0254|
	| | |none | 0|acc_norm|↑ |0.1969|± |0.0250|
	| - agieval_logiqa_en | 1|none | 0|acc |↑ |0.2043|± |0.0158|
	| | |none | 0|acc_norm|↑ |0.2304|± |0.0165|
	| - agieval_lsat_ar | 1|none | 0|acc |↑ |0.1739|± |0.0250|
	| | |none | 0|acc_norm|↑ |0.1957|± |0.0262|
	| - agieval_lsat_lr | 1|none | 0|acc |↑ |0.1549|± |0.0160|
	| | |none | 0|acc_norm|↑ |0.1608|± |0.0163|
	| - agieval_lsat_rc | 1|none | 0|acc |↑ |0.1636|± |0.0226|
	| | |none | 0|acc_norm|↑ |0.2119|± |0.0250|
	| - agieval_sat_en | 1|none | 0|acc |↑ |0.2670|± |0.0309|
	| | |none | 0|acc_norm|↑ |0.2621|± |0.0307|
	| - agieval_sat_en_without_passage| 1|none | 0|acc |↑ |0.2670|± |0.0309|
	| | |none | 0|acc_norm|↑ |0.2621|± |0.0307|
	| - agieval_sat_math | 1|none | 0|acc |↑ |0.2182|± |0.0279|
	| | |none | 0|acc_norm|↑ |0.2318|± |0.0285|
	|arc_challenge | 1|none | 0|acc |↑ |0.1945|± |0.0116|
	| | |none | 0|acc_norm|↑ |0.2372|± |0.0124|
	|truthfulqa_mc2 | 2|none | 0|acc |↑ |0.3861|± |0.0147|

	| Groups |Version|Filter|n-shot| Metric | |Value | |Stderr|
	|------------|------:|------|------|--------|---|-----:|---|-----:|
	|agieval_nous| 0|none | |acc_norm|↑ |0.2133|± |0.0081|

	# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
	Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_M4-ai__TinyMistral-248M-v3)

	| Metric |Value|
	|-------------------|----:|
	|Avg. | 4.13|
	|IFEval (0-Shot) |16.39|
	|BBH (3-Shot) | 1.78|
	|MATH Lvl 5 (4-Shot)| 0.00|
	|GPQA (0-shot) | 0.00|
	|MuSR (0-shot) | 5.15|
	|MMLU-PRO (5-shot) | 1.47|