Yosegi-0604 / README.md
NovelHack-ja's picture
Upload folder using huggingface_hub
c9aed4a verified
metadata
# NOTE(review): left empty although the merge described below names
# ./Yosegi-0603 as its base — presumably because the inputs were local paths;
# confirm whether hub model IDs should be listed here.
base_model: []
library_name: transformers
tags:
  - mergekit
  - merge

Yosegi-0604 (final_model)

This is a merge of pre-trained language models created using mergekit.

Merge Details

Merge Method

This model was merged using the breadcrumbs_ties merge method, with ./Yosegi-0603 as the base model.

Models Merged

The following models were included in the merge:

  • ./Ninja-2B_JP
  • ./Yosegi-0601

Configuration

The following YAML configuration was used to produce this model:

# Base model of the merge; it also appears, without parameters, as the last
# source of every slice below.
base_model: ./Yosegi-0603
dtype: bfloat16
merge_method: breadcrumbs_ties
# Global merge parameters.
# NOTE(review): both values are written as floats (1.0 / 0.0) but look like
# on/off switches — confirm the intended type against the mergekit docs.
parameters:
  int8_mask: 1.0
  normalize: 0.0
# One entry per two-layer range, from [0, 2] up to [30, 32].
slices:
# Slice for layer_range [0, 2]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [0, 2]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.050387850856855765
      - filter: mlp
        value: -0.17075015661203768
      - value: -0.008041653902986862
      weight:
      - filter: self_attn
        value: 0.0999312941470471
      - filter: mlp
        value: 0.541727762184749
      - value: 0.6837012779994258
  - layer_range: [0, 2]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.8218846237599902
      - filter: mlp
        value: 1.0
      - value: 0.9254078866667358
      gamma:
      - filter: self_attn
        value: -0.11213758231875963
      - filter: mlp
        value: 0.021586098873668948
      - value: -0.12827998218659437
      weight:
      - filter: self_attn
        value: 0.40391646444657003
      - filter: mlp
        value: 0.623121864641881
      - value: 0.5967833694632534
  - layer_range: [0, 2]
    model: ./Yosegi-0603
# Slice for layer_range [2, 4]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [2, 4]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.8079479346300947
      - filter: mlp
        value: 1.0
      - value: 0.710146185559419
      gamma:
      - filter: self_attn
        value: 0.1383609589681566
      - filter: mlp
        value: 0.21188532059635062
      - value: 0.2994723556443468
      weight:
      - filter: self_attn
        value: 0.48107070906079974
      - filter: mlp
        value: 0.5848073552919492
      - value: 0.4583842493359253
  - layer_range: [2, 4]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 0.934378153535579
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.073192612278188
      - filter: mlp
        value: 0.07939126555063317
      - value: -0.06891845030175699
      weight:
      - filter: self_attn
        value: 0.32120386994101
      - filter: mlp
        value: 0.5001108459121922
      - value: 0.9138710221666694
  - layer_range: [2, 4]
    model: ./Yosegi-0603
# Slice for layer_range [4, 6]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [4, 6]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 0.7237519222177541
      - value: 0.776951124863642
      gamma:
      - filter: self_attn
        value: -0.2265121048274062
      - filter: mlp
        value: -0.1757947421960496
      - value: -0.11401593728931929
      weight:
      - filter: self_attn
        value: 0.6448742737026658
      - filter: mlp
        value: 0.13809748641457986
      - value: 0.3950550285769662
  - layer_range: [4, 6]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.9649359194114893
      - filter: mlp
        value: 0.916637032428399
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.16291684846287688
      - filter: mlp
        value: -0.19013548712121703
      - value: 0.038409066391918795
      weight:
      - filter: self_attn
        value: 0.1977358472772336
      - filter: mlp
        value: 0.22661167907612348
      - value: 0.6426575016448257
  - layer_range: [4, 6]
    model: ./Yosegi-0603
# Slice for layer_range [6, 8]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [6, 8]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.8727809666891416
      - filter: mlp
        value: 1.0
      - value: 0.5160677785559116
      gamma:
      - filter: self_attn
        value: 0.14245180617134273
      - filter: mlp
        value: 0.08189992601998919
      - value: -0.1038827997670827
      weight:
      - filter: self_attn
        value: 0.23575676914257698
      - filter: mlp
        value: 0.4047231670507743
      - value: 0.34207794631274374
  - layer_range: [6, 8]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.576775501046583
      - filter: mlp
        value: -0.046028636298718645
      - value: -0.024161321403060265
      weight:
      - filter: self_attn
        value: 0.833089842843994
      - filter: mlp
        value: 0.5434667434613458
      - value: 0.2946693008513797
  - layer_range: [6, 8]
    model: ./Yosegi-0603
# Slice for layer_range [8, 10]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [8, 10]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 0.9930269337531187
      gamma:
      - filter: self_attn
        value: 0.4549980941970383
      - filter: mlp
        value: 0.10362988739411173
      - value: -0.43800391668559174
      weight:
      - filter: self_attn
        value: 0.19663450954683193
      - filter: mlp
        value: 0.16783989984505265
      - value: 0.7465091417598162
  - layer_range: [8, 10]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.797370597380894
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.0665958634205702
      - filter: mlp
        value: -0.058297473060129834
      - value: -0.38206760673090134
      weight:
      - filter: self_attn
        value: 0.7015967347604024
      - filter: mlp
        value: 0.7733694864324641
      - value: 0.7636921732342238
  - layer_range: [8, 10]
    model: ./Yosegi-0603
# Slice for layer_range [10, 12]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [10, 12]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.8047576867589878
      - filter: mlp
        value: 0.8852533319203653
      - value: 0.7707342647603538
      gamma:
      - filter: self_attn
        value: -0.054343999574509694
      - filter: mlp
        value: -0.3465154355167133
      - value: 0.022315854655582765
      weight:
      - filter: self_attn
        value: 0.4396484757291151
      - filter: mlp
        value: 0.34318396468602314
      - value: 0.8236034746664869
  - layer_range: [10, 12]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.9058471193805165
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.1221058588826469
      - filter: mlp
        value: -0.4004985640890659
      - value: 0.3219195440395816
      weight:
      - filter: self_attn
        value: 0.3565443612269864
      - filter: mlp
        value: 0.2817057075232181
      - value: 0.5934890337808251
  - layer_range: [10, 12]
    model: ./Yosegi-0603
# Slice for layer_range [12, 14]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [12, 14]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.027897116191693133
      - filter: mlp
        value: -0.1765379388255607
      - value: 0.09108936063176161
      weight:
      - filter: self_attn
        value: 0.4499753137521779
      - filter: mlp
        value: 0.901296236087911
      - value: 0.3548680126954006
  - layer_range: [12, 14]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.8973815150776497
      - filter: mlp
        value: 0.6029953465961999
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.10393082898402586
      - filter: mlp
        value: 0.15993577688878796
      - value: 0.011410411917833683
      weight:
      - filter: self_attn
        value: 0.2211644023056492
      - filter: mlp
        value: 0.5677387594231849
      - value: 0.1316535663010981
  - layer_range: [12, 14]
    model: ./Yosegi-0603
# Slice for layer_range [14, 16]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [14, 16]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.9584597245055072
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.17789727632680347
      - filter: mlp
        value: 0.2182263440314275
      - value: 0.1449547656126498
      weight:
      - filter: self_attn
        value: 0.4551004762874224
      - filter: mlp
        value: 0.9182082826762857
      - value: 0.3736989395186422
  - layer_range: [14, 16]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.7414465107848625
      - filter: mlp
        value: 1.0
      - value: 0.7894887419395906
      gamma:
      - filter: self_attn
        value: -0.07343933395880992
      - filter: mlp
        value: 0.250800731630588
      - value: -0.2948778134297542
      weight:
      - filter: self_attn
        value: 0.43125199001016495
      - filter: mlp
        value: 0.6182726353394477
      - value: 0.838902157446268
  - layer_range: [14, 16]
    model: ./Yosegi-0603
# Slice for layer_range [16, 18]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [16, 18]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.9474287877268394
      - filter: mlp
        value: 1.0
      - value: 0.9613380133344519
      gamma:
      - filter: self_attn
        value: -0.08608895546593046
      - filter: mlp
        value: -0.07275416053291164
      - value: -0.5796137860399382
      weight:
      - filter: self_attn
        value: 0.5593420897751296
      - filter: mlp
        value: 0.7339447992880666
      - value: 0.5447558586689005
  - layer_range: [16, 18]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.9321536960575384
      - filter: mlp
        value: 1.0
      - value: 0.9613033408813294
      gamma:
      - filter: self_attn
        value: 0.20610728738224296
      - filter: mlp
        value: 0.2002206706624053
      - value: -0.45349278793293785
      weight:
      - filter: self_attn
        value: 0.16162975594196963
      - filter: mlp
        value: 0.21262726992327483
      - value: 0.061213622827234075
  - layer_range: [16, 18]
    model: ./Yosegi-0603
# Slice for layer_range [18, 20]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [18, 20]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.03922456593148313
      - filter: mlp
        value: 0.3318035822806869
      - value: -0.10373990685028205
      weight:
      - filter: self_attn
        value: 0.8254441016674987
      - filter: mlp
        value: 0.4568039342431161
      - value: 0.3152648515747969
  - layer_range: [18, 20]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 0.9807358937293073
      gamma:
      - filter: self_attn
        value: -0.22734036563128657
      - filter: mlp
        value: 0.26113222150270854
      - value: 0.17739039022957015
      weight:
      - filter: self_attn
        value: 0.33759130475641996
      - filter: mlp
        value: 0.616639215544168
      - value: 0.47560658618977714
  - layer_range: [18, 20]
    model: ./Yosegi-0603
# Slice for layer_range [20, 22]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [20, 22]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.9394514442960196
      - filter: mlp
        value: 1.0
      - value: 0.9885037757465567
      gamma:
      - filter: self_attn
        value: -0.17365709450334324
      - filter: mlp
        value: 0.0712279381144505
      - value: 0.11809665485306464
      weight:
      - filter: self_attn
        value: 0.485610337254665
      - filter: mlp
        value: 0.8406593173801935
      - value: 0.5024102481819739
  - layer_range: [20, 22]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.09980202641768818
      - filter: mlp
        value: 0.051454493742856926
      - value: 0.14619126408666103
      weight:
      - filter: self_attn
        value: 0.54772456079406
      - filter: mlp
        value: 0.3440893571099615
      - value: 0.3747271233512448
  - layer_range: [20, 22]
    model: ./Yosegi-0603
# Slice for layer_range [22, 24]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [22, 24]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 0.9474712362889293
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.16020032978118146
      - filter: mlp
        value: -0.025085248873309034
      - value: 0.06046174910893976
      weight:
      - filter: self_attn
        value: 0.8654189362345427
      - filter: mlp
        value: 0.6344956382288498
      - value: 0.6383979001549549
  - layer_range: [22, 24]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.8240762427167851
      - filter: mlp
        value: 1.0
      - value: 0.9004913821398048
      gamma:
      - filter: self_attn
        value: -0.12224186789525764
      - filter: mlp
        value: -0.25877585460700525
      - value: 0.35149388360871714
      weight:
      - filter: self_attn
        value: 0.4294356408713786
      - filter: mlp
        value: 0.3920647298630233
      - value: 0.795891295390721
  - layer_range: [22, 24]
    model: ./Yosegi-0603
# Slice for layer_range [24, 26]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [24, 26]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.16915580088030202
      - filter: mlp
        value: 0.2602652727555053
      - value: 0.16985672723305376
      weight:
      - filter: self_attn
        value: 0.420377024485687
      - filter: mlp
        value: 0.3401141209432324
      - value: 0.4953511256159331
  - layer_range: [24, 26]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.7290652609253236
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.1039167464696765
      - filter: mlp
        value: -0.18476572570059685
      - value: 0.1221387313921081
      weight:
      - filter: self_attn
        value: 0.2925002157134928
      - filter: mlp
        value: 0.3854740639588027
      - value: 0.555448110317977
  - layer_range: [24, 26]
    model: ./Yosegi-0603
# Slice for layer_range [26, 28]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [26, 28]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 1.0
      - filter: mlp
        value: 0.9104496350690235
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.24831264214235005
      - filter: mlp
        value: -0.03903149241855605
      - value: 0.14189425093398259
      weight:
      - filter: self_attn
        value: 0.7685811138035815
      - filter: mlp
        value: 0.06535011571274918
      - value: 0.696502559577317
  - layer_range: [26, 28]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.9236218028490522
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.2451400735890047
      - filter: mlp
        value: -0.21555851418482214
      - value: 0.020418471695148876
      weight:
      - filter: self_attn
        value: 0.451368534421561
      - filter: mlp
        value: 0.27412879847687055
      - value: 0.18339776770537336
  - layer_range: [26, 28]
    model: ./Yosegi-0603
# Slice for layer_range [28, 30]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [28, 30]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.8590812961904566
      - filter: mlp
        value: 1.0
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.06934549536310654
      - filter: mlp
        value: -0.28464693250998063
      - value: -0.0588491947891552
      weight:
      - filter: self_attn
        value: 0.26716389671655294
      - filter: mlp
        value: 0.8228280162386532
      - value: 0.24197568479527135
  - layer_range: [28, 30]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.7277181780542642
      - filter: mlp
        value: 0.74166025738732
      - value: 1.0
      gamma:
      - filter: self_attn
        value: 0.1772650150670655
      - filter: mlp
        value: 0.06545031487123437
      - value: -0.28681451125993446
      weight:
      - filter: self_attn
        value: 0.5781944040541174
      - filter: mlp
        value: 0.2288692970435767
      - value: 0.689751088930503
  - layer_range: [28, 30]
    model: ./Yosegi-0603
# Slice for layer_range [30, 32]: ./Yosegi-0601 and ./Ninja-2B_JP each supply
# density / gamma / weight values per filter (self_attn, mlp) plus one
# unfiltered value; the ./Yosegi-0603 entry carries no parameters.
- sources:
  - layer_range: [30, 32]
    model: ./Yosegi-0601
    parameters:
      density:
      - filter: self_attn
        value: 0.8177341862620365
      - filter: mlp
        value: 0.8875629677599377
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.06572527259889459
      - filter: mlp
        value: -0.18979543285938766
      - value: -0.24122036571646263
      weight:
      - filter: self_attn
        value: 0.5818433594657613
      - filter: mlp
        value: 0.36676821100234736
      - value: 0.3580688869263428
  - layer_range: [30, 32]
    model: ./Ninja-2B_JP
    parameters:
      density:
      - filter: self_attn
        value: 0.8306036003344672
      - filter: mlp
        value: 0.6993970248745297
      - value: 1.0
      gamma:
      - filter: self_attn
        value: -0.20599853236581384
      - filter: mlp
        value: -0.2001187634455465
      - value: -0.07654635090020837
      weight:
      - filter: self_attn
        value: 0.37120677279712305
      - filter: mlp
        value: 0.13105486609905853
      - value: 0.7204857820148367
  - layer_range: [30, 32]
    model: ./Yosegi-0603
# NOTE(review): `union` presumably builds the output tokenizer from the
# combined vocabularies of all source models — confirm against mergekit docs.
tokenizer_source: union