Committed by KumaTea
Commit: 1136001
Parent: 693e7b1

sync from upstream

Files changed (3):
  1. configuration_chatglm.py  +2 -0
  2. modeling_chatglm.py       +42 -6
  3. quantization.py           +370 -56
configuration_chatglm.py CHANGED

@@ -73,6 +73,7 @@ class ChatGLMConfig(PretrainedConfig):
             inner_hidden_size=16384,
             position_encoding_2d=True,
             quantization_bit=0,
+            quantization_embeddings=False,
             pre_seq_len=None,
             prefix_projection=False,
             **kwargs
@@ -92,6 +93,7 @@ class ChatGLMConfig(PretrainedConfig):
         self.gmask_token_id = gmask_token_id
         self.position_encoding_2d = position_encoding_2d
         self.quantization_bit = quantization_bit
+        self.quantization_embeddings = quantization_embeddings
         self.pre_seq_len = pre_seq_len
         self.prefix_projection = prefix_projection
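The new `quantization_embeddings` flag sits alongside `quantization_bit`, so a checkpoint can record whether its word embeddings (and `lm_head`) were quantized in addition to the linear layers. Below is a minimal sketch of setting both options; it is not part of the commit and assumes this repo's configuration_chatglm.py is importable from the working directory:

    from configuration_chatglm import ChatGLMConfig

    config = ChatGLMConfig(
        quantization_bit=4,            # existing option: 4-bit quantized linear layers
        quantization_embeddings=True,  # new option introduced by this commit
    )
    print(config.quantization_bit, config.quantization_embeddings)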
 
modeling_chatglm.py CHANGED

@@ -32,6 +32,7 @@ from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList

 from .configuration_chatglm import ChatGLMConfig

+
 # flags required to enable jit fusion kernels

 if sys.platform != 'darwin':
@@ -224,7 +225,6 @@ class RotaryEmbedding(torch.nn.Module):
         self.sin_cached = fn(self.sin_cached)
         return super()._apply(fn)

-
 def rotate_half(x):
     x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
     return torch.cat((-x2, x1), dim=x1.ndim - 1)  # dim=-1 triggers a bug in earlier torch versions
@@ -1059,7 +1059,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         self.quantized = False

         if self.config.quantization_bit:
-            self.quantize(self.config.quantization_bit, empty_init=True)
+            self.quantize(self.config.quantization_bit, self.config.quantization_embeddings, use_quantization_cache=True, empty_init=True)

     def get_output_embeddings(self):
         return self.lm_head
@@ -1418,19 +1418,55 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 break
             yield input_ids

-    def quantize(self, bits: int, empty_init=False, **kwargs):
+    def quantize(self, bits: int, quantize_embeddings=False, use_quantization_cache=False, empty_init=False, **kwargs):
         if bits == 0:
             return

-        from .quantization import quantize
+        from .quantization import quantize, QuantizedEmbedding, QuantizedLinear, load_cpu_kernel

         if self.quantized:
-            logger.info("Already quantized.")
+            if self.device == torch.device("cpu"):
+                logger.info("Already quantized, reloading cpu kernel.")
+                load_cpu_kernel(**kwargs)
+            else:
+                logger.info("Already quantized.")
             return self

         self.quantized = True

         self.config.quantization_bit = bits
+        self.config.quantization_embeddings = quantize_embeddings
+
+        self.transformer = quantize(self.transformer, bits, use_quantization_cache=use_quantization_cache, empty_init=empty_init, **kwargs)
+
+        if self.device == torch.device("cpu"):
+            dtype = torch.float32
+        else:
+            dtype = torch.half
+
+        if quantize_embeddings:
+            logger.info("Applying quantization to embeddings")
+            self.transformer.word_embeddings = QuantizedEmbedding(
+                weight_bit_width=bits,
+                weight_tensor=self.transformer.word_embeddings.weight.to(self.device),
+                num_embeddings=self.transformer.word_embeddings.num_embeddings,
+                embedding_dim=self.transformer.word_embeddings.embedding_dim,
+                dtype=dtype,
+                empty_init=empty_init,
+                device=self.transformer.word_embeddings.weight.device,
+            )
+            self.lm_head = QuantizedLinear(
+                weight_bit_width=bits,
+                weight_tensor=self.lm_head.weight.to(self.device),
+                bias_tensor=None,
+                in_features=self.lm_head.in_features,
+                out_features=self.lm_head.out_features,
+                bias=False,
+                quantized_weight=self.transformer.word_embeddings.weight,
+                quantized_weight_scale=self.transformer.word_embeddings.weight_scale,
+                dtype=dtype,
+                empty_init=empty_init,
+                device=self.lm_head.weight.device,
+            )

-        self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
         return self
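With the extended signature above, quantize() can also quantize the embeddings and preallocate a CPU dequantization cache. A usage sketch, not part of the commit; the repository path is a placeholder and loading follows the usual transformers trust_remote_code route:

    from transformers import AutoModel

    # "path/to/this/repo" is a placeholder for this model repository.
    model = AutoModel.from_pretrained("path/to/this/repo", trust_remote_code=True).float()
    model = model.quantize(
        bits=4,                        # weight bit width, as before
        quantize_embeddings=True,      # new: also quantize word_embeddings and lm_head
        use_quantization_cache=True,   # new: reuse one dequantization buffer per layer type on CPU
    )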
quantization.py CHANGED

@@ -1,6 +1,8 @@
-from torch.nn import Linear
+from torch.nn import Linear, Embedding
 from torch.nn.parameter import Parameter
+import torch.nn.functional as F

+import os
 import bz2
 import torch
 import base64
@@ -38,7 +40,7 @@ try:
     )
 except Exception as exception:
     kernels = None
-    logger.warning("Failed to load cpm_kernels:" + str(exception))
+    logger.warning("Failed to load cpm_kernels:", exception)


 class W8A16Linear(torch.autograd.Function):
@@ -64,25 +66,193 @@ class W8A16Linear(torch.autograd.Function):
         return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None


+class W8A16LinearCPU(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width, quantization_cache=None):
+        ctx.inp_shape = inp.size()
+        ctx.weight_bit_width = weight_bit_width
+        out_features = quant_w.size(0)
+        inp = inp.contiguous().view(-1, inp.size(-1))
+        weight = extract_weight_to_float(quant_w, scale_w, weight_bit_width, quantization_cache=quantization_cache)
+        ctx.weight_shape = weight.size()
+        output = inp.mm(weight.t())
+        ctx.save_for_backward(inp, quant_w, scale_w)
+        return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        inp, quant_w, scale_w = ctx.saved_tensors
+        weight = extract_weight_to_float(quant_w, scale_w, ctx.weight_bit_width)
+        grad_output = grad_output.contiguous().view(-1, weight.size(0))
+        grad_input = grad_output.mm(weight)
+        grad_weight = grad_output.t().mm(inp)
+        return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
+
+
+default_cpu_kernel_code_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "quantization_kernels.c")
+default_cpu_kernel_code = "QlpoOTFBWSZTWXLbSoQAAgzbgERwQXxmTwAAr/ff3kABt0Q2oRVT0hpo9RtEAAAAyBEiSQ9EGjQGQAAAwANGhowjJoNGmgMEUplMTNSMJ5TQaDJpsoMyRMj8P4mZzFSVVwqSXG8GG7MlVwiToYEQwVD7noBxMhNfkeZYtYFtbgOBUSIGtIQjhNHCEnPJsadhb3yBmRIOD3TeAtNLSaU5GgvKUBWSNuuOIHmVt0YhW6rsmDMDUjeUJGJ64R1Jm5lrh0Aa0tKjhFwPdWcGogxLDSXPWQUWTM8Sd3Qz1HMYNxx3HMeiNqNo4jeRDEfZ3gUSHIcU/heomq0vEzL1Msz5KKGxH8FrNOYw3KaxdqaEmNHYMxJFgQbR0DyRknL2L4kwUSxKRdhjRpEtUqilVfggFL1klaMS3PPRDfNqbBOPWO7m4JTVGhS9QTBDDJaEbLbrUQNB+IpJSKQbG5SZZ5gkwJEhJ3aYKJipZ/i7kinChIOW2lQg"
+default_cpu_parallel_kernel_code_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "quantization_kernels_parallel.c")
+default_cpu_parallel_kernel_code = "QlpoOTFBWSZTWUzax5EAALXbgERwSX1mTwAAr/ff3kACNyXUbZYwBpoaNGIyAaADQwRSaVP9QoMg0A2oAPU0AEUkU9GaaKMaQB6gA09T1ARRKnpk0niaJkaaNDJ6g0DTIKVKfZ/g6v1Kem5LJLa0WmkukkuCIHUqWbtJGJMsCSQFiPEIYHgBIZDzR8R6REbYxIqD2Cu7lMkFoPu6LmHeOAy0GF83Tc40jgmTs4HnCe60QfJa2bDBZ0Y1lhgbiZjW8SNsAKCk42UOEdjWN3KoiCIYeQUCCKWIyHewhtSoInLKSG22l4jKM2ZDCVKtBm3OTYBl3jsVqMImtj7PQw7xKxLXQzwgJaPPgW1fRhrvPJICl4YFDYfNbkbBh5JDgrazFml50xEQQwQUjxNwE0IDSofLzSg7UNVKn+Rr1KErzBHUxBqdHRlXzqYsIa5K9Y0UuE2ugw3g5KYofm7AaGNTzJSMhcchhxdaU4JZ0F1UNgQ8XcGDguypqYza8yFaEoGgNRcLej+g2t0feGKFE5OY2PFluQ3q4HgycxlfvzHqo0KcM0JI8OKXtzayJFgsqC1NdUQVu8rChnA6FO3MFyGOoC9KO8ITPpYM5pRqTlczFkLES/4u5IpwoSCZtY8i"
+
+cpu_kernels = None
+
+
+class CPUKernel:
+    def __init__(self, kernel_file="", source_code=default_cpu_kernel_code_path, compile_parallel_kernel=None, parallel_num=None):
+        self.load =False
+        self.int8WeightExtractionFloat = None
+        self.int4WeightExtractionFloat = None
+        self.int4WeightCompression = None
+        self.SetNumThreads = lambda x: x
+
+        try:
+            if not os.path.exists(default_cpu_kernel_code_path):
+                with open(default_cpu_kernel_code_path, "w", encoding="utf-8") as file:
+                    code = default_cpu_kernel_code
+                    cpu_quantization_code = bz2.decompress(base64.b64decode(code)).decode()
+                    file.write(cpu_quantization_code)
+
+            if not os.path.exists(default_cpu_parallel_kernel_code_path):
+                with open(default_cpu_parallel_kernel_code_path, "w", encoding="utf-8") as file:
+                    code = default_cpu_parallel_kernel_code
+                    cpu_quantization_code = bz2.decompress(base64.b64decode(code)).decode()
+                    file.write(cpu_quantization_code)
+
+        except Exception as ex:
+            print("Error when generating default cpu kernel code(can be ignored when using custom kernels).")
+
+        if compile_parallel_kernel is None:
+            compile_parallel_kernel = bool(int(os.cpu_count()) >= 4)
+
+        if compile_parallel_kernel and source_code == default_cpu_kernel_code_path:
+            source_code = default_cpu_parallel_kernel_code_path
+
+        kernels = None
+
+        if (not kernel_file) or (not os.path.exists(kernel_file)):
+            print("No compiled kernel found.")
+            try:
+                if os.path.exists(source_code):
+                    print("Compiling kernels :", source_code)
+                    kernel_file = source_code[:-2] + ".so"
+
+                    if compile_parallel_kernel:
+                        compile_command = "gcc -O3 -fPIC -pthread -fopenmp -std=c99 {} -shared -o {}".format(source_code, kernel_file)
+                        print("Compiling", compile_command)
+                        exit_state = os.system(compile_command)
+                        if not exit_state:
+                            try:
+                                kernels = ctypes.cdll.LoadLibrary(kernel_file)
+                                print("Load kernel :", kernel_file)
+                            except:
+                                kernels = None
+                                print("Load parallel cpu kernel failed, using default cpu kernel code:")
+                                import traceback
+                                exception = traceback.format_exc()
+                                print(exception)
+                        else:
+                            print("Compile default cpu kernel failed, using default cpu kernel code.")
+
+                        if kernels is None:  # adjust config, use default cpu kernel
+                            compile_parallel_kernel = False
+                            source_code = default_cpu_kernel_code_path
+                            kernel_file = source_code[:-2] + ".so"
+
+                    if kernels is None:
+                        compile_command = "gcc -O3 -fPIC -std=c99 {} -shared -o {}".format(source_code, kernel_file)
+                        print("Compiling", compile_command)
+                        exit_state = os.system(compile_command)
+                        if not exit_state:
+                            try:
+                                kernels = ctypes.cdll.LoadLibrary(kernel_file)
+                                print("Load kernel :", kernel_file)
+                            except:
+                                kernels = None
+                                print("Load default cpu kernel failed:")
+                                import traceback
+                                exception = traceback.format_exc()
+                                print(exception)
+                        else:
+                            print("Compile default cpu kernel failed.")
+                else:
+                    print("Kernel source code not found.")
+                    return
+            except:
+                print("Failed to build cpu kernel:")
+                import traceback
+                exception = traceback.format_exc()
+                print(exception)
+                return
+        else:
+            try:
+                kernels = ctypes.cdll.LoadLibrary(kernel_file)
+                print("Load kernel :", kernel_file)
+            except:
+                kernels = None
+                print("Load custom cpu kernel failed:")
+                import traceback
+                exception = traceback.format_exc()
+                print(exception)
+
+        if kernels is not None:
+            self.int8WeightExtractionFloat = kernels.extract_int8_weight_to_float
+            self.int4WeightExtractionFloat = kernels.extract_int4_weight_to_float
+            self.int4WeightCompression = kernels.compress_int4_weight
+            if compile_parallel_kernel:
+                try:
+                    self.SetNumThreads = kernels.set_num_threads
+                except:
+                    print("No set_num_threads() found in kernel.")
+            self.load = True
+        else:
+            print("Failed to load kernel.")
+            return
+
+        if compile_parallel_kernel:
+            if parallel_num is None:
+                parallel_num = max(os.cpu_count() // 2, 1)
+            print("Setting CPU quantization kernel threads to", parallel_num)
+            if parallel_num < 4:
+                print("Parallel kernel is not recommended when parallel num < 4.")
+            self.SetNumThreads(parallel_num)
+
+        self.parallel_num = parallel_num
+
+
 def compress_int4_weight(weight: torch.Tensor):  # (n, m)
-    with torch.cuda.device(weight.device):
+    """compress weight on cpu or cuda to int4"""
+    if weight.device == torch.device("cpu"):
+        assert isinstance(cpu_kernels, CPUKernel)
         n, m = weight.size(0), weight.size(1)
         assert m % 2 == 0
         m = m // 2
-        out = torch.empty(n, m, dtype=torch.int8, device="cuda")
-        stream = torch.cuda.current_stream()
-
-        gridDim = (n, 1, 1)
-        blockDim = (min(round_up(m, 32), 1024), 1, 1)
-
-        kernels.int4WeightCompression(
-            gridDim,
-            blockDim,
-            0,
-            stream,
-            [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
+        out = torch.empty(n, m, dtype=torch.int8, device="cpu")
+        cpu_kernels.int4WeightCompression(
+            ctypes.c_void_p(weight.data_ptr()),
+            ctypes.c_void_p(out.data_ptr()),
+            ctypes.c_int32(n),
+            ctypes.c_int32(m)
         )
         return out
+    else:
+        with torch.cuda.device(weight.device):
+            n, m = weight.size(0), weight.size(1)
+            assert m % 2 == 0
+            m = m // 2
+            out = torch.empty(n, m, dtype=torch.int8, device="cuda")
+            stream = torch.cuda.current_stream()
+
+            gridDim = (n, 1, 1)
+            blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+            kernels.int4WeightCompression(
+                gridDim,
+                blockDim,
+                0,
+                stream,
+                [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
+            )
+            return out


 def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
@@ -117,85 +287,229 @@ def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
     return out


+def extract_weight_to_float(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int, quantization_cache=None):
+    """extract weight on cpu to float32"""
+    if source_bit_width == 8:
+        func = cpu_kernels.int8WeightExtractionFloat
+    elif source_bit_width == 4:
+        func = cpu_kernels.int4WeightExtractionFloat
+    else:
+        assert False, "Unsupported bit-width"
+
+    n, m = weight.size(0), weight.size(1)
+
+    if quantization_cache is not None:
+        out = quantization_cache
+        func(
+            ctypes.c_void_p(weight.data_ptr()),
+            ctypes.c_void_p(scale_list.data_ptr()),
+            ctypes.c_void_p(out.data_ptr()),
+            ctypes.c_int32(n),
+            ctypes.c_int32(m)
+        )
+        return out.tensor
+    else:
+        out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.float, device="cpu")
+        func(
+            ctypes.c_void_p(weight.data_ptr()),
+            ctypes.c_void_p(scale_list.data_ptr()),
+            ctypes.c_void_p(out.data_ptr()),
+            ctypes.c_int32(n),
+            ctypes.c_int32(m)
+        )
+        return out
+
+
+class CacheTensor():
+    def __init__(self, *args, **kwargs):
+        self.tensor = torch.empty(*args, **kwargs)
+
+    def to(self, *args, **kwargs):
+        self.tensor = self.tensor.to(*args, **kwargs)
+
+    def data_ptr(self):
+        return self.tensor.data_ptr()
+
+
 class QuantizedLinear(Linear):
-    def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, empty_init=False, *args, **kwargs):
+    def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, quantized_weight=None, quantized_weight_scale=None, quantization_cache=None, empty_init=False, *args, **kwargs):
         super(QuantizedLinear, self).__init__(*args, **kwargs)
         self.weight_bit_width = weight_bit_width
+        self.quantization_cache = quantization_cache

-        shape = self.weight.shape
-        del self.weight
-
-        if weight_tensor is None or empty_init:
-            self.weight = torch.empty(
-                shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"]
-            )
-            self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
+        if (quantized_weight is not None) and (quantized_weight_scale is not None):
+            del self.weight
+            self.weight = Parameter(quantized_weight.to(kwargs["device"]), requires_grad=False)
+            self.weight_scale = Parameter(quantized_weight_scale.to(kwargs["device"]), requires_grad=False)
         else:
-            self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
-            self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
-            if weight_bit_width == 4:
-                self.weight = compress_int4_weight(self.weight)
+            shape = self.weight.shape
+            del self.weight
+
+            if weight_tensor is None or empty_init:
+                self.weight = torch.empty(
+                    shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"]
+                )
+                self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
+            else:
+                self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).to(kwargs["dtype"])
+                self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
+                if weight_bit_width == 4:
+                    self.weight = compress_int4_weight(self.weight)
+
+            self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
+            self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)

-        self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
-        self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
         if bias_tensor is not None:
             self.bias = Parameter(bias_tensor.to(kwargs["device"]), requires_grad=False)
         else:
             self.bias = None

+    def reset_parameters(self):
+        """To accelerate initialization"""
+        pass
+
     def forward(self, input):
-        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+        if self.weight.device == torch.device("cpu"):
+            output = W8A16LinearCPU.apply(input, self.weight, self.weight_scale, self.weight_bit_width, self.quantization_cache)
+        else:
+            output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
         if self.bias is not None:
             output = output + self.bias
         return output

+    def _apply(self, fn):
+        self_obj = super()._apply(fn)
+        if self.quantization_cache is not None:
+            self.quantization_cache.to(self_obj.weight.device)
+            self.quantization_cache.to(self_obj.weight_scale.dtype)
+        return self_obj
+
+
+class QuantizedEmbedding(Embedding):  # TODO: backward, check empty_init
+    def __init__(self, weight_bit_width: int, weight_tensor=None, quantized_weight=None, quantized_weight_scale=None, empty_init=False, *args, **kwargs):
+        super(QuantizedEmbedding, self).__init__(*args, **kwargs)
+        self.weight_bit_width = weight_bit_width
+
+        if (quantized_weight is not None) and (quantized_weight_scale is not None):
+            del self.weight
+            self.weight = Parameter(quantized_weight.to(kwargs["device"]), requires_grad=False)
+            self.weight_scale = Parameter(quantized_weight_scale.to(kwargs["device"]), requires_grad=False)
+        else:
+            shape = self.weight.shape
+            del self.weight
+
+            if weight_tensor is None or empty_init:
+                self.weight = torch.empty(
+                    shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"]
+                )
+                self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
+            else:
+                self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+                self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
+                if weight_bit_width == 4:
+                    self.weight = compress_int4_weight(self.weight)
+
+            self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
+            self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
+
+    def forward(self, input):
+        if self.weight.device == torch.device("cpu"):
+            original_weight = extract_weight_to_float(weight=self.weight, scale_list=self.weight_scale, source_bit_width=self.weight_bit_width)
+        else:
+            original_weight = extract_weight_to_half(weight=self.weight, scale_list=self.weight_scale, source_bit_width=self.weight_bit_width)
+        output = F.embedding(
+            input, original_weight, self.padding_idx, self.max_norm,
+            self.norm_type, self.scale_grad_by_freq, self.sparse
+        )
+        return output
+
+
+def load_cpu_kernel(**kwargs):
+    global cpu_kernels
+    cpu_kernels = CPUKernel(**kwargs)
+    assert cpu_kernels.load
+

-def quantize(model, weight_bit_width, empty_init=False, **kwargs):
+def quantize(model, weight_bit_width, use_quantization_cache=False, empty_init=False, **kwargs):
     """Replace fp16 linear with quantized linear"""

+    query_key_value_quantization_cache = None
+    dense_quantization_cache = None
+    dense_h_to_4h_quantization_cache = None
+    dense_4h_to_h_quantization_cache = None
+
+    try:
+        load_cpu_kernel(**kwargs)
+    except:
+        if kernels is None:  # CUDA kernels failed
+            print("Cannot load cpu or cuda kernel, quantization failed:")
+            assert kernels is not None
+        print("Cannot load cpu kernel, don't use quantized model on cpu.")
+
+    current_device = model.device
+
+    if model.device == torch.device("cpu"):
+        dtype=torch.float32
+    else:
+        dtype = torch.half
+
+    QuantizedLinearWithPara = partial(
+        QuantizedLinear,
+        weight_bit_width=weight_bit_width,
+        bias=True,
+        dtype=dtype,
+        empty_init=empty_init
+    )
+
+    if use_quantization_cache:
+        print("Using quantization cache")
+        layer = model.layers[0]
+        weight = layer.attention.query_key_value.weight
+        n, m = weight.size(0), weight.size(1)
+        query_key_value_quantization_cache = CacheTensor(n, m, dtype=dtype, device=current_device, requires_grad=False)
+        weight = layer.attention.dense.weight
+        n, m = weight.size(0), weight.size(1)
+        dense_quantization_cache = CacheTensor(n, m, dtype=dtype, device=current_device, requires_grad=False)
+        weight = layer.mlp.dense_h_to_4h.weight
+        n, m = weight.size(0), weight.size(1)
+        dense_h_to_4h_quantization_cache = CacheTensor(n, m, dtype=dtype, device=current_device, requires_grad=False)
+        weight = layer.mlp.dense_4h_to_h.weight
+        n, m = weight.size(0), weight.size(1)
+        dense_4h_to_h_quantization_cache = CacheTensor(n, m, dtype=dtype, device=current_device, requires_grad=False)
+
+    print("Applying quantization to glm layers")
+
     for layer in model.layers:
-        layer.attention.query_key_value = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight_tensor=layer.attention.query_key_value.weight.to(torch.cuda.current_device()),
+        layer.attention.query_key_value = QuantizedLinearWithPara(
+            weight_tensor=layer.attention.query_key_value.weight.to(current_device),
             bias_tensor=layer.attention.query_key_value.bias,
             in_features=layer.attention.query_key_value.in_features,
             out_features=layer.attention.query_key_value.out_features,
-            bias=True,
-            dtype=torch.half,
             device=layer.attention.query_key_value.weight.device,
-            empty_init=empty_init
+            quantization_cache=query_key_value_quantization_cache
         )
-        layer.attention.dense = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight_tensor=layer.attention.dense.weight.to(torch.cuda.current_device()),
+        layer.attention.dense = QuantizedLinearWithPara(
+            weight_tensor=layer.attention.dense.weight.to(current_device),
             bias_tensor=layer.attention.dense.bias,
             in_features=layer.attention.dense.in_features,
             out_features=layer.attention.dense.out_features,
-            bias=True,
-            dtype=torch.half,
             device=layer.attention.dense.weight.device,
-            empty_init=empty_init
+            quantization_cache=dense_quantization_cache
         )
-        layer.mlp.dense_h_to_4h = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight_tensor=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+        layer.mlp.dense_h_to_4h = QuantizedLinearWithPara(
+            weight_tensor=layer.mlp.dense_h_to_4h.weight.to(current_device),
             bias_tensor=layer.mlp.dense_h_to_4h.bias,
             in_features=layer.mlp.dense_h_to_4h.in_features,
             out_features=layer.mlp.dense_h_to_4h.out_features,
-            bias=True,
-            dtype=torch.half,
             device=layer.mlp.dense_h_to_4h.weight.device,
-            empty_init=empty_init
+            quantization_cache=dense_h_to_4h_quantization_cache
        )
-        layer.mlp.dense_4h_to_h = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight_tensor=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+        layer.mlp.dense_4h_to_h = QuantizedLinearWithPara(
+            weight_tensor=layer.mlp.dense_4h_to_h.weight.to(current_device),
             bias_tensor=layer.mlp.dense_4h_to_h.bias,
             in_features=layer.mlp.dense_4h_to_h.in_features,
             out_features=layer.mlp.dense_4h_to_h.out_features,
-            bias=True,
-            dtype=torch.half,
             device=layer.mlp.dense_4h_to_h.weight.device,
-            empty_init=empty_init
+            quantization_cache=dense_4h_to_h_quantization_cache
         )
     return model
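Both QuantizedLinear and QuantizedEmbedding rely on the same symmetric per-row scheme: the scale is max(|w|) / (2**(bits - 1) - 1) per output row, and the stored weight is round(w / scale) cast to int8 (packed further for 4-bit). A small illustration of that round trip in plain PyTorch, not part of the commit and without the C/CUDA kernels:

    import torch

    bits = 8
    w = torch.randn(4, 8)                                       # fp weight, one row per output feature
    scale = w.abs().max(dim=-1).values / (2 ** (bits - 1) - 1)  # per-row scale, as in QuantizedLinear
    q = torch.round(w / scale[:, None]).to(torch.int8)          # stored int8 weight
    w_restored = q.to(torch.float32) * scale[:, None]           # what extract_weight_to_float reproduces
    print((w - w_restored).abs().max())                         # small round-off error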