kiddothe2b commited on
Commit
92daafb
1 Parent(s): 0fed369

Training in progress, step 32000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a82d4998e15048f7276ff7bf21cf172b2b8f99b8e3bce01b447dd4dc2e0f4219
3
  size 745634697
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf3ee108c8f2ba9b23f3060c596f2d7671294eb95fbed83a626ba0411e4518d
3
  size 745634697
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85efd468f59e090bda67c9e694bf55407f51a1a6d9bede51d725c6b288ff9330
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f40a8a210779702f96c649bc9dbed9a90c12a3b8bab107ef98032748d83bd704
3
  size 372832803
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5885efec8b7366a4aa17af5e032d3298449da4e1fd163c7c8437f60c984450c3
3
  size 15523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:307891890a6886a0f2866d18bee92a472e09068da5f61f3406e9a78c6d34b755
3
  size 15523
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac4294ae0275bdf2fd072eb3d13fea356c3c27e1570dc0dcf8759f2decf14230
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c02242982b7083ab1d96e3b6483fb121704f1319dc8785b58239e35376918c1
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4,
5
- "global_step": 25600,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -1578,11 +1578,404 @@
1578
  "eval_samples_per_second": 47.571,
1579
  "eval_steps_per_second": 2.973,
1580
  "step": 25600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1581
  }
1582
  ],
1583
  "max_steps": 64000,
1584
  "num_train_epochs": 9223372036854775807,
1585
- "total_flos": 1.353967057502208e+17,
1586
  "trial_name": null,
1587
  "trial_params": null
1588
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5,
5
+ "global_step": 32000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
1578
  "eval_samples_per_second": 47.571,
1579
  "eval_steps_per_second": 2.973,
1580
  "step": 25600
1581
+ },
1582
+ {
1583
+ "epoch": 0.4,
1584
+ "learning_rate": 0.001,
1585
+ "loss": 8.0132,
1586
+ "step": 25700
1587
+ },
1588
+ {
1589
+ "epoch": 0.4,
1590
+ "learning_rate": 0.001,
1591
+ "loss": 8.0267,
1592
+ "step": 25800
1593
+ },
1594
+ {
1595
+ "epoch": 0.4,
1596
+ "learning_rate": 0.001,
1597
+ "loss": 8.0349,
1598
+ "step": 25900
1599
+ },
1600
+ {
1601
+ "epoch": 0.41,
1602
+ "learning_rate": 0.001,
1603
+ "loss": 8.0377,
1604
+ "step": 26000
1605
+ },
1606
+ {
1607
+ "epoch": 0.41,
1608
+ "learning_rate": 0.001,
1609
+ "loss": 8.0409,
1610
+ "step": 26100
1611
+ },
1612
+ {
1613
+ "epoch": 0.41,
1614
+ "learning_rate": 0.001,
1615
+ "loss": 8.0425,
1616
+ "step": 26200
1617
+ },
1618
+ {
1619
+ "epoch": 0.41,
1620
+ "learning_rate": 0.001,
1621
+ "loss": 8.0298,
1622
+ "step": 26300
1623
+ },
1624
+ {
1625
+ "epoch": 0.41,
1626
+ "learning_rate": 0.001,
1627
+ "loss": 8.0544,
1628
+ "step": 26400
1629
+ },
1630
+ {
1631
+ "epoch": 0.41,
1632
+ "learning_rate": 0.001,
1633
+ "loss": 8.0618,
1634
+ "step": 26500
1635
+ },
1636
+ {
1637
+ "epoch": 0.42,
1638
+ "learning_rate": 0.001,
1639
+ "loss": 8.0472,
1640
+ "step": 26600
1641
+ },
1642
+ {
1643
+ "epoch": 0.42,
1644
+ "learning_rate": 0.001,
1645
+ "loss": 8.0336,
1646
+ "step": 26700
1647
+ },
1648
+ {
1649
+ "epoch": 0.42,
1650
+ "learning_rate": 0.001,
1651
+ "loss": 8.0259,
1652
+ "step": 26800
1653
+ },
1654
+ {
1655
+ "epoch": 0.42,
1656
+ "learning_rate": 0.001,
1657
+ "loss": 8.0586,
1658
+ "step": 26900
1659
+ },
1660
+ {
1661
+ "epoch": 0.42,
1662
+ "learning_rate": 0.001,
1663
+ "loss": 8.0368,
1664
+ "step": 27000
1665
+ },
1666
+ {
1667
+ "epoch": 0.42,
1668
+ "learning_rate": 0.001,
1669
+ "loss": 8.0449,
1670
+ "step": 27100
1671
+ },
1672
+ {
1673
+ "epoch": 0.42,
1674
+ "learning_rate": 0.001,
1675
+ "loss": 8.0363,
1676
+ "step": 27200
1677
+ },
1678
+ {
1679
+ "epoch": 0.43,
1680
+ "learning_rate": 0.001,
1681
+ "loss": 8.0408,
1682
+ "step": 27300
1683
+ },
1684
+ {
1685
+ "epoch": 0.43,
1686
+ "learning_rate": 0.001,
1687
+ "loss": 8.0384,
1688
+ "step": 27400
1689
+ },
1690
+ {
1691
+ "epoch": 0.43,
1692
+ "learning_rate": 0.001,
1693
+ "loss": 8.0441,
1694
+ "step": 27500
1695
+ },
1696
+ {
1697
+ "epoch": 0.43,
1698
+ "learning_rate": 0.001,
1699
+ "loss": 8.0367,
1700
+ "step": 27600
1701
+ },
1702
+ {
1703
+ "epoch": 0.43,
1704
+ "learning_rate": 0.001,
1705
+ "loss": 8.007,
1706
+ "step": 27700
1707
+ },
1708
+ {
1709
+ "epoch": 0.43,
1710
+ "learning_rate": 0.001,
1711
+ "loss": 8.0189,
1712
+ "step": 27800
1713
+ },
1714
+ {
1715
+ "epoch": 0.44,
1716
+ "learning_rate": 0.001,
1717
+ "loss": 8.0445,
1718
+ "step": 27900
1719
+ },
1720
+ {
1721
+ "epoch": 0.44,
1722
+ "learning_rate": 0.001,
1723
+ "loss": 8.0319,
1724
+ "step": 28000
1725
+ },
1726
+ {
1727
+ "epoch": 0.44,
1728
+ "learning_rate": 0.001,
1729
+ "loss": 8.0251,
1730
+ "step": 28100
1731
+ },
1732
+ {
1733
+ "epoch": 0.44,
1734
+ "learning_rate": 0.001,
1735
+ "loss": 8.0329,
1736
+ "step": 28200
1737
+ },
1738
+ {
1739
+ "epoch": 0.44,
1740
+ "learning_rate": 0.001,
1741
+ "loss": 8.0335,
1742
+ "step": 28300
1743
+ },
1744
+ {
1745
+ "epoch": 0.44,
1746
+ "learning_rate": 0.001,
1747
+ "loss": 8.0351,
1748
+ "step": 28400
1749
+ },
1750
+ {
1751
+ "epoch": 0.45,
1752
+ "learning_rate": 0.001,
1753
+ "loss": 8.0346,
1754
+ "step": 28500
1755
+ },
1756
+ {
1757
+ "epoch": 0.45,
1758
+ "learning_rate": 0.001,
1759
+ "loss": 8.0238,
1760
+ "step": 28600
1761
+ },
1762
+ {
1763
+ "epoch": 0.45,
1764
+ "learning_rate": 0.001,
1765
+ "loss": 8.0372,
1766
+ "step": 28700
1767
+ },
1768
+ {
1769
+ "epoch": 0.45,
1770
+ "learning_rate": 0.001,
1771
+ "loss": 8.0329,
1772
+ "step": 28800
1773
+ },
1774
+ {
1775
+ "epoch": 0.45,
1776
+ "learning_rate": 0.001,
1777
+ "loss": 8.0469,
1778
+ "step": 28900
1779
+ },
1780
+ {
1781
+ "epoch": 0.45,
1782
+ "learning_rate": 0.001,
1783
+ "loss": 8.0512,
1784
+ "step": 29000
1785
+ },
1786
+ {
1787
+ "epoch": 0.45,
1788
+ "learning_rate": 0.001,
1789
+ "loss": 8.0712,
1790
+ "step": 29100
1791
+ },
1792
+ {
1793
+ "epoch": 0.46,
1794
+ "learning_rate": 0.001,
1795
+ "loss": 8.0281,
1796
+ "step": 29200
1797
+ },
1798
+ {
1799
+ "epoch": 0.46,
1800
+ "learning_rate": 0.001,
1801
+ "loss": 8.0215,
1802
+ "step": 29300
1803
+ },
1804
+ {
1805
+ "epoch": 0.46,
1806
+ "learning_rate": 0.001,
1807
+ "loss": 8.0279,
1808
+ "step": 29400
1809
+ },
1810
+ {
1811
+ "epoch": 0.46,
1812
+ "learning_rate": 0.001,
1813
+ "loss": 8.0259,
1814
+ "step": 29500
1815
+ },
1816
+ {
1817
+ "epoch": 0.46,
1818
+ "learning_rate": 0.001,
1819
+ "loss": 8.0386,
1820
+ "step": 29600
1821
+ },
1822
+ {
1823
+ "epoch": 0.46,
1824
+ "learning_rate": 0.001,
1825
+ "loss": 8.0274,
1826
+ "step": 29700
1827
+ },
1828
+ {
1829
+ "epoch": 0.47,
1830
+ "learning_rate": 0.001,
1831
+ "loss": 8.0392,
1832
+ "step": 29800
1833
+ },
1834
+ {
1835
+ "epoch": 0.47,
1836
+ "learning_rate": 0.001,
1837
+ "loss": 8.0247,
1838
+ "step": 29900
1839
+ },
1840
+ {
1841
+ "epoch": 0.47,
1842
+ "learning_rate": 0.001,
1843
+ "loss": 8.0488,
1844
+ "step": 30000
1845
+ },
1846
+ {
1847
+ "epoch": 0.47,
1848
+ "learning_rate": 0.001,
1849
+ "loss": 8.0593,
1850
+ "step": 30100
1851
+ },
1852
+ {
1853
+ "epoch": 0.47,
1854
+ "learning_rate": 0.001,
1855
+ "loss": 8.0317,
1856
+ "step": 30200
1857
+ },
1858
+ {
1859
+ "epoch": 0.47,
1860
+ "learning_rate": 0.001,
1861
+ "loss": 8.0359,
1862
+ "step": 30300
1863
+ },
1864
+ {
1865
+ "epoch": 0.47,
1866
+ "learning_rate": 0.001,
1867
+ "loss": 8.0255,
1868
+ "step": 30400
1869
+ },
1870
+ {
1871
+ "epoch": 0.48,
1872
+ "learning_rate": 0.001,
1873
+ "loss": 8.0325,
1874
+ "step": 30500
1875
+ },
1876
+ {
1877
+ "epoch": 0.48,
1878
+ "learning_rate": 0.001,
1879
+ "loss": 8.0467,
1880
+ "step": 30600
1881
+ },
1882
+ {
1883
+ "epoch": 0.48,
1884
+ "learning_rate": 0.001,
1885
+ "loss": 8.0361,
1886
+ "step": 30700
1887
+ },
1888
+ {
1889
+ "epoch": 0.48,
1890
+ "learning_rate": 0.001,
1891
+ "loss": 8.033,
1892
+ "step": 30800
1893
+ },
1894
+ {
1895
+ "epoch": 0.48,
1896
+ "learning_rate": 0.001,
1897
+ "loss": 8.033,
1898
+ "step": 30900
1899
+ },
1900
+ {
1901
+ "epoch": 0.48,
1902
+ "learning_rate": 0.001,
1903
+ "loss": 8.0386,
1904
+ "step": 31000
1905
+ },
1906
+ {
1907
+ "epoch": 0.49,
1908
+ "learning_rate": 0.001,
1909
+ "loss": 8.0326,
1910
+ "step": 31100
1911
+ },
1912
+ {
1913
+ "epoch": 0.49,
1914
+ "learning_rate": 0.001,
1915
+ "loss": 8.0219,
1916
+ "step": 31200
1917
+ },
1918
+ {
1919
+ "epoch": 0.49,
1920
+ "learning_rate": 0.001,
1921
+ "loss": 8.0468,
1922
+ "step": 31300
1923
+ },
1924
+ {
1925
+ "epoch": 0.49,
1926
+ "learning_rate": 0.001,
1927
+ "loss": 8.0328,
1928
+ "step": 31400
1929
+ },
1930
+ {
1931
+ "epoch": 0.49,
1932
+ "learning_rate": 0.001,
1933
+ "loss": 8.0347,
1934
+ "step": 31500
1935
+ },
1936
+ {
1937
+ "epoch": 0.49,
1938
+ "learning_rate": 0.001,
1939
+ "loss": 8.0341,
1940
+ "step": 31600
1941
+ },
1942
+ {
1943
+ "epoch": 0.5,
1944
+ "learning_rate": 0.001,
1945
+ "loss": 8.06,
1946
+ "step": 31700
1947
+ },
1948
+ {
1949
+ "epoch": 0.5,
1950
+ "learning_rate": 0.001,
1951
+ "loss": 8.0331,
1952
+ "step": 31800
1953
+ },
1954
+ {
1955
+ "epoch": 0.5,
1956
+ "learning_rate": 0.001,
1957
+ "loss": 8.052,
1958
+ "step": 31900
1959
+ },
1960
+ {
1961
+ "epoch": 0.5,
1962
+ "learning_rate": 0.001,
1963
+ "loss": 8.0273,
1964
+ "step": 32000
1965
+ },
1966
+ {
1967
+ "epoch": 0.5,
1968
+ "eval_accuracy": 0.033282358556154815,
1969
+ "eval_loss": 8.0352144241333,
1970
+ "eval_runtime": 8285.0113,
1971
+ "eval_samples_per_second": 39.572,
1972
+ "eval_steps_per_second": 2.473,
1973
+ "step": 32000
1974
  }
1975
  ],
1976
  "max_steps": 64000,
1977
  "num_train_epochs": 9223372036854775807,
1978
+ "total_flos": 1.69245882187776e+17,
1979
  "trial_name": null,
1980
  "trial_params": null
1981
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85efd468f59e090bda67c9e694bf55407f51a1a6d9bede51d725c6b288ff9330
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f40a8a210779702f96c649bc9dbed9a90c12a3b8bab107ef98032748d83bd704
3
  size 372832803