export_hf_checkpoint.py ADDED
@@ -0,0 +1,55 @@
+ import os
+
+ import torch
+ import transformers
+ from peft import PeftModel
+ from transformers import LlamaForCausalLM, LlamaTokenizer  # noqa: F402
+
+ BASE_MODEL = os.environ.get("BASE_MODEL", None)
+ assert (
+     BASE_MODEL
+ ), "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=huggyllama/llama-7b`"  # noqa: E501
+
+ tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
+
+ base_model = LlamaForCausalLM.from_pretrained(
+     BASE_MODEL,
+     load_in_8bit=False,
+     torch_dtype=torch.float16,
+     device_map={"": "cpu"},
+ )
+
+ first_weight = base_model.model.layers[0].self_attn.q_proj.weight
+ first_weight_old = first_weight.clone()
+
+ lora_model = PeftModel.from_pretrained(
+     base_model,
+     "serpdotai/llama-oasst-lora-13B",
+     device_map={"": "cpu"},
+     torch_dtype=torch.float16,
+ )
+
+ lora_weight = lora_model.base_model.model.model.layers[
+     0
+ ].self_attn.q_proj.weight
+
+ assert torch.allclose(first_weight_old, first_weight)
+
+ # merge weights - new merging method from peft
+ lora_model = lora_model.merge_and_unload()
+
+ lora_model.train(False)
+
+ # did we do anything?
+ assert not torch.allclose(first_weight_old, first_weight)
+
+ lora_model_sd = lora_model.state_dict()
+ deloreanized_sd = {
+     k.replace("base_model.model.", ""): v
+     for k, v in lora_model_sd.items()
+     if "lora" not in k
+ }
+
+ LlamaForCausalLM.save_pretrained(
+     base_model, "./hf_ckpt", state_dict=deloreanized_sd, max_shard_size="400MB"
+ )
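For reference, a minimal sketch of how the checkpoint exported to ./hf_ckpt could be reloaded and sanity-checked after running the script above. This is not part of the committed file; note that the script only writes model weights, so the tokenizer is still loaded from the base model.

import os
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Tokenizer comes from the base model (or from the tokenizer files added in this repo).
tokenizer = LlamaTokenizer.from_pretrained(os.environ["BASE_MODEL"])
model = LlamaForCausalLM.from_pretrained("./hf_ckpt", torch_dtype=torch.float16)
model.eval()

# Sanity check: the exported state dict should contain no LoRA tensors.
assert not any("lora" in k for k in model.state_dict())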
merge_percentage.py ADDED
@@ -0,0 +1,112 @@
+ print("Starting script, please wait...")
+
+ import torch
+ import shutil
+ import json
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
+ from tkinter.filedialog import askdirectory, askopenfilename
+
+ # Rubbish experiment by Concedo for KoboldAI usage
+ # Experimenting with the ability to blend weights from 2 LLMs of the same architecture
+ # Both models must have the same architecture, number of parameters, layer counts and types, and use the same vocab.
+
+ #mixer output settings
+ blend_ratio = 0.2 #weight applied to the FIRST model's parameters: 1.0 keeps the first model, 0.0 gives the second model
+ fp16 = False #perform operations in fp16. Saves memory, but CPU inference will not be possible.
+ always_output_fp16 = True #if true, will output fp16 even if operating in fp32
+ max_shard_size = "2000MiB" #set output shard size
+ verbose_info = True #will show model information when loading
+ force_cpu = True #only use cpu
+ load_sharded = True #load both models shard by shard
+
+ #test generation settings, only for fp32
+ deterministic_test = True #determines if outputs are always the same
+ test_prompt = "Test, " #test prompt for generation. only for fp32. set to empty string to skip generating.
+ test_max_length = 32 #test generation length
+
+
+ blend_ratio_b = 1.0 - blend_ratio
+
+ def get_model_info(model):
+     with torch.no_grad():
+         outfo = ""
+         cntent = 0
+         outfo += "\n==============================\n"
+         for name, para in model.named_parameters():
+             cntent += 1
+             outfo += ('{}: {}'.format(name, para.shape)) + "\n"
+         outfo += ("Num Entries: " + str(cntent)) + "\n"
+         outfo += ("==============================\n")
+         return outfo
+
+ def merge_models(model1, model2):
+     with torch.no_grad():
+         tensornum = 0
+         for p1, p2 in zip(model1.parameters(), model2.parameters()):
+             p1 *= blend_ratio
+             p2 *= blend_ratio_b
+             p1 += p2
+             #print(p1)
+             #print(p2)
+             tensornum += 1
+             if verbose_info:
+                 print("Merging tensor " + str(tensornum))
+             pass
+
+ def read_index_filenames(sourcedir):
+     index = json.load(open(sourcedir + '/pytorch_model.bin.index.json', 'rt'))
+     fl = []
+     for k, v in index['weight_map'].items():
+         if v not in fl:
+             fl.append(v)
+     return fl
+
+ print("Opening file dialog, please select FIRST model directory...")
+ model_path1 = askdirectory(title="Select Directory of FIRST model to merge")
+ print("Opening file dialog, please select SECOND model directory...")
+ model_path2 = askdirectory(title="Select Directory of SECOND model to merge")
+ print("Opening file dialog, please select OUTPUT model directory...")
+ model_path3 = askdirectory(title="Select Output Directory of merged model")
+ if not model_path1 or not model_path2:
+     print("\nYou must select two directories containing models to merge and one output directory. Exiting.")
+     exit()
+
+ with torch.no_grad():
+     if fp16:
+         torch.set_default_dtype(torch.float16)
+     else:
+         torch.set_default_dtype(torch.float32)
+
+     device = torch.device("cuda") if (torch.cuda.is_available() and not force_cpu) else torch.device("cpu")
+     print(device)
+
+
+     print("Loading Model 1...")
+     model1 = AutoModelForCausalLM.from_pretrained(model_path1, torch_dtype='auto') #,torch_dtype=torch.float16
+     model1 = model1.to(device)
+     model1.eval()
+     print("Model 1 Loaded. Dtype: " + str(model1.dtype))
+     print("Loading Model 2...")
+     model2 = AutoModelForCausalLM.from_pretrained(model_path2, torch_dtype='auto') #,torch_dtype=torch.float16
+     model2 = model2.to(device)
+     model2.eval()
+     print("Model 2 Loaded. Dtype: " + str(model2.dtype))
+
+     #collect layout info for both models; they must match exactly for a valid merge
+     m1_info = get_model_info(model1)
+     m2_info = get_model_info(model2)
+
+     print("Merging models...")
+     merge_models(model1, model2)
+
+     if model_path3:
+         print("Saving new model...")
+         newsavedpath = model_path3 + "/converted_model"
+         if always_output_fp16 and not fp16:
+             model1.half()
+         model1.save_pretrained(newsavedpath, max_shard_size=max_shard_size)
+         print("\nSaved to: " + newsavedpath)
+     else:
+         print("\nOutput model was not saved as no output path was selected.")
+
+     print("\nScript Completed.")
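A minimal sketch of the linear interpolation that merge_models performs, shown on plain tensors rather than full models; the tensor values are illustrative assumptions, not taken from any real checkpoint.

import torch

blend_ratio = 0.2                    # weight on the first model
blend_ratio_b = 1.0 - blend_ratio    # weight on the second model

p1 = torch.tensor([1.0, 2.0, 3.0])   # stand-in for a parameter of model 1
p2 = torch.tensor([5.0, 6.0, 7.0])   # matching parameter of model 2

merged = blend_ratio * p1 + blend_ratio_b * p2
print(merged)                        # tensor([4.2000, 5.2000, 6.2000])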
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": false,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 2048,
+   "pad_token": null,
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
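A minimal sketch of loading the tokenizer files added in this commit and checking the settings declared above; "path/to/this/repo" is a placeholder for wherever the repo is downloaded, not a real path.

from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("path/to/this/repo")

print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)  # <s> </s> <unk>
print(tokenizer.model_max_length)                                     # 2048
ids = tokenizer("Hello world").input_ids
print(ids[0] == tokenizer.bos_token_id)  # True, since add_bos_token is true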