add the config files and code

Browse files

Files changed (13) hide show

README.md +15 -0
cache.py +44 -0
config.json +2 -1
configuration_hyena.py +92 -0
engine.py +346 -0
layers.py +147 -0
model.py +425 -0
modeling_hyena.py +145 -0
special_tokens_map.json +1 -0
tokenizer.json +0 -0
tokenizer_config.json +7 -0
utils.py +89 -0
vocab.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+---
+license: apache-2.0
+language:
+- en
+---
+## StripedHyena-Hessian-7B (SH-7B)
+### Model Architecture
+StripedHyena is a hybrid architecture composed of multi-head, grouped-query attention and gated convolutions arranged in [Hyena](https://arxiv.org/abs/2302.10866) blocks, different from traditional decoder-only Transformers.
+  - Constant memory decoding in Hyena blocks via representation of convolutions as state-space models (modal or canonical form), or as truncated filters.
+  - Lower latency to preprocess long prompts.
+  - Improvements to training and inference compute-optimal scaling laws, compared to Transformers.

cache.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+from torch import Tensor
+from dataclasses import dataclass, field
+from typing import Optional
+# https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py
+@dataclass
+class InferenceParams:
+    """Inference parameters that are passed to the main model in order
+    to efficienly calculate and store the context during inference."""
+    max_seqlen: int
+    max_batch_size: int
+    seqlen_offset: int = 0
+    batch_size_offset: int = 0
+    key_value_memory_dict: dict = field(default_factory=dict)
+    lengths_per_sample: Optional[Tensor] = None
+    def reset(self, max_seqlen, max_batch_size):
+        self.max_seqlen = max_seqlen
+        self.max_batch_size = max_batch_size
+        self.seqlen_offset = 0
+        if self.lengths_per_sample is not None:
+            self.lengths_per_sample.zero_()
+@dataclass
+class RecurrentInferenceParams:
+    """Inference parameters passed to blocks with recurrent mode."""
+    fir_filter_length: int = 3
+    state_dim: int = 16
+    seqlen_offset: int = 0
+    fir_state_dict: dict = field(default_factory=dict)
+    state_dict: dict = field(default_factory=dict)
+    def reset(self):
+        self.fir_filter_length = 3
+        self.state_dim = 16
+        self.seqlen_offset = 0

config.json CHANGED Viewed

@@ -1,6 +1,7 @@
 {
-  "_commit_hash": "521ac0eba9c5f02460319b77a34d2a1a17713d79",
   "_name_or_path": "togethercomputer/StripedHyena-Hessian-7B",
   "architectures": [
     "StripedHyenaModelForCausalLM"
   ],

 {
+  "_commit_hash": "9ae63354fd42cc1e14334bba246276540c8b9017",
   "_name_or_path": "togethercomputer/StripedHyena-Hessian-7B",
+  "model_type": "stripedhyena",
   "architectures": [
     "StripedHyenaModelForCausalLM"
   ],

configuration_hyena.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from transformers import PretrainedConfig
+import json
+class StripedHyenaConfig(PretrainedConfig):
+    model_type = "stripedhyena"
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        num_filters=4096,
+        inner_mlp_size=14336,
+        attn_layer_idxs=[],
+        hyena_layer_idxs=[],
+        num_layers=32,
+        tie_embeddings=False,
+        short_filter_length=3,
+        num_attention_heads=32,
+        proj_groups=4,
+        hyena_filter_groups=1,
+        split_k0=True,
+        column_split_hyena=True,
+        column_split=False,
+        model_parallel_size=1,
+        pipe_parallel_size=1,
+        short_filter_bias=True,
+        mha_out_proj_bias=False,
+        qkv_proj_bias=False,
+        final_norm=True,
+        use_cache=True,
+        use_flash_attention_2=True,
+        use_flash_rmsnorm=True,
+        use_flash_depthwise=False,
+        use_flashfft=False,
+        inference_mode=False,
+        prefill_style="fft",
+        max_seqlen=32768,
+        eps=1e-5,
+        state_size=2,
+        rotary_emb_base=500000,
+        smeared_gqa=False,
+        make_vocab_size_divisible_by=8,
+        log_intermediate_values=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_filters = num_filters
+        self.inner_mlp_size = inner_mlp_size
+        self.attn_layer_idxs = attn_layer_idxs
+        self.hyena_layer_idxs = hyena_layer_idxs
+        self.num_layers = num_layers
+        self.tie_embeddings = tie_embeddings
+        self.short_filter_length = short_filter_length
+        self.num_attention_heads = num_attention_heads
+        self.proj_groups = proj_groups
+        self.hyena_filter_groups = hyena_filter_groups
+        self.split_k0 = split_k0
+        self.column_split_hyena = column_split_hyena
+        self.column_split = column_split
+        self.model_parallel_size = model_parallel_size
+        self.pipe_parallel_size = pipe_parallel_size
+        self.short_filter_bias = short_filter_bias
+        self.mha_out_proj_bias = mha_out_proj_bias
+        self.qkv_proj_bias = qkv_proj_bias
+        self.final_norm = final_norm
+        self.use_cache = use_cache
+        self.use_flash_attention_2 = use_flash_attention_2
+        self.use_flash_rmsnorm = use_flash_rmsnorm
+        self.use_flash_depthwise = use_flash_depthwise
+        self.use_flashfft = use_flashfft
+        self.inference_mode = inference_mode
+        self.prefill_style = prefill_style
+        self.max_seqlen = max_seqlen
+        self.eps = eps
+        self.state_size = state_size
+        self.rotary_emb_base = rotary_emb_base
+        self.smeared_gqa = smeared_gqa
+        self.make_vocab_size_divisible_by = make_vocab_size_divisible_by
+        self.log_intermediate_values = log_intermediate_values
+        super().__init__(**kwargs)
+    def to_dict(self):
+        return {attr: getattr(self, attr) for attr in self.__dict__}
+    @classmethod
+    def from_original_config(cls, config_path, **kwargs):
+        with open(config_path, "r") as f:
+            config = json.load(f)
+        return cls(**config, **kwargs)

engine.py ADDED Viewed

	@@ -0,0 +1,346 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+try:
+    import conv1d_cpp
+except:
+    pass
+from .utils import column_split
+def canonicalize_modal_system(poles, residues):
+    """Canonicalize a modal system.
+    Args:
+        poles (Tensor): The poles of the system.
+        residues (Tensor): The residues of the system.
+    Returns:
+        Tuple[Tensor, Tensor]: The canonicalized poles and residues.
+    """
+    raise NotImplementedError
+# Values accepted for the `iir_prefill_style` argument of `HyenaInferenceEngine`.
+IIR_PREFILL_MODES = [
+    "recurrence",
+    "modal-fft",
+    "hybrid-modal-recurrence",
+    "modal-scan",
+    "canonical-fft",
+    "iir-fir-caching",
+]
+class HyenaInferenceEngine:
+    def __init__(
+        self, fir_fn=None, fftconv_fn=None, iir_prefill_style="modal-fft", layer_idx=None
+    ) -> None:
+        self.fir_fn = fir_fn
+        self.fftconv_fn = fftconv_fn
+        assert (
+            iir_prefill_style in IIR_PREFILL_MODES
+        ), f"iir_prefill_style must be one of {IIR_PREFILL_MODES}"
+        self.iir_prefill_style = iir_prefill_style
+        self.layer_idx = layer_idx
+        self.low_mem_mode = False
+    def parallel_fir(
+        self,
+        fir_fn,
+        u,
+        weight,
+        bias,
+        L,
+        fir_length=3,
+        inference_params=None,
+        prefill_mode=None,
+        padding_mask=None,
+    ):
+        """Compute the output state of the long convolutional filter."""
+        # prepare input layout, dimensions and dispatch to fir kernel
+        if fir_fn != torch.nn.functional.conv1d:
+            z_pre = fir_fn(u)[:, :L]  # B, L, D
+            z_pre = z_pre.permute(0, 2, 1)
+        else:
+            u = u.permute(0, 2, 1)  # B, D, L
+            z_pre = fir_fn(
+                u,
+                weight,
+                bias,
+                stride=1,
+                padding=fir_length - 1,
+                groups=u.shape[1],
+            )[..., :L]
+        # handle padding post fir, the only place with biases
+        if type(padding_mask) == torch.Tensor:
+            z_pre = z_pre * padding_mask[:, None]
+        if inference_params is not None:
+            # handle seqlen last and dim last cases for `u`
+            if fir_fn != torch.nn.functional.conv1d:
+                fir_state = u[:, -fir_length + 1 :].permute(0, 2, 1)
+            else:
+                fir_state = u[..., -fir_length + 1 :]
+        else:
+            fir_state = None
+        return z_pre, fir_state
+    def parallel_iir(
+        self,
+        z_pre,
+        h,
+        D,
+        L,
+        poles,
+        t,
+        dims,
+        layer_idx,
+        inference_params=None,
+        prefill_style="fft",
+        fftconv_fn=None,
+        padding_mask=None,
+        use_flashfft=False,
+        column_split_hyena=False,
+        long_fir_threshold=None,
+    ):
+        """Compute the output state of the short convolutional filter."""
+        fft_size = 2 * L
+        hidden_size, num_attention_heads, hidden_size_per_attention_head, _, _ = dims
+        # Compatibility with training infra that column splits the projections
+        if column_split_hyena:
+            z = z_pre.reshape(
+                z_pre.shape[0],
+                num_attention_heads,
+                3 * hidden_size_per_attention_head,
+                z_pre.shape[2],
+            )
+            x2, x1, v = (
+                z[:, :, :hidden_size_per_attention_head],
+                z[
+                    :,
+                    :,
+                    hidden_size_per_attention_head : 2 * hidden_size_per_attention_head,
+                ],
+                z[:, :, 2 * hidden_size_per_attention_head :],
+            )
+            x2, x1, v = (
+                x2.reshape(x2.shape[0], -1, x2.shape[-1]),
+                x1.reshape(x1.shape[0], -1, x1.shape[-1]),
+                v.reshape(v.shape[0], -1, v.shape[-1]),
+            )
+        else:
+            x2, x1, v = z_pre.split([hidden_size, hidden_size, hidden_size], dim=1)
+        x1v = x1 * v
+        if use_flashfft and (L % 2) == 0:  # only works with even L
+            y = fftconv_fn(
+                x1v.to(dtype=torch.bfloat16).contiguous(),
+                h.to(dtype=torch.float32),
+            )
+            X_s = None
+        elif long_fir_threshold is None:
+            H = torch.fft.rfft(h.to(dtype=torch.float32), n=fft_size) / fft_size
+            X_s = torch.fft.fft(x1v.to(dtype=torch.float32), n=fft_size)
+            X = X_s[..., : H.shape[-1]]
+            if len(z_pre.shape) > 3:
+                H = H.unsqueeze(1)
+            y = torch.fft.irfft(X * H, n=fft_size, norm="forward")[..., :L]
+        else:
+            assert h.shape[0] == 1, "batch size must be 1 for long_fir_threshold"
+            h = h[0][:, None]  # rearrange to d, 1, l for depthwise conv1d
+            h = h[..., :long_fir_threshold]
+            y = F.conv1d(
+                x1v,
+                h.to(dtype=x1v.dtype),
+                stride=1,
+                groups=x1v.shape[1],
+                padding=h.shape[-1] - 1,
+            )[..., :L]
+        y = y.to(dtype=x1v.dtype)
+        y = (y + x1v * D.unsqueeze(-1)) * x2
+        if inference_params is not None:
+            if prefill_style == "fft":
+                self.prefill_via_modal_fft(
+                    inference_params=inference_params,
+                    x1v=x1v,
+                    X_s=X_s,
+                    L=L,
+                    t=t,
+                    poles=poles,
+                    dims=dims,
+                    layer_idx=layer_idx,
+                    use_flashfft=use_flashfft,
+                )
+            elif prefill_style == "recurrence":
+                self.prefill_via_direct_recurrence(
+                    inference_params=inference_params,
+                    x1v=x1v,
+                    L=L,
+                    poles=poles,
+                )
+            else:
+                raise NotImplementedError
+            if self.low_mem_mode:
+                del z_pre, x2, x1, v, x1v, h
+                torch.cuda.empty_cache()
+        return y.permute(0, 2, 1)
+    def step_fir(self, u, fir_state, weight, bias=None):
+        """Step the FIR filter.
+        Note:
+        `fir_state` contains the last `short_filter_length - 1` elements of `u`: `u_(L-2), u_{L-1), ...`
+        We assume dimensions of `short_filter_weight` to be `[d, 1, short_filter_len]` (SISO / multi SISO layout).
+        """
+        h0, h = weight[..., 0, -1], weight[..., 0, :-1]
+        h0, h = h0[None], h[None]
+        y = h0 * u + torch.sum(fir_state * h, dim=-1) + bias
+        # update
+        fir_state = torch.roll(fir_state, -1, dims=2)
+        fir_state[..., -1] = u
+        return y, fir_state
+    def step_iir(self, x2, x1, v, D, residues, poles, iir_state, iir_groups=1):
+        x1v = x1 * v
+        residues, poles = (
+            torch.view_as_complex(residues.to(torch.float32)),
+            torch.view_as_complex(poles.to(torch.float32)),
+        )
+        # squeeze the dummy seqlen dimension
+        # D, state_dim, 1 -> 1, D, state_dim
+        residues, poles = residues[..., 0][None], poles[..., 0][None]
+        iir_state = poles * iir_state + x1v[..., None]
+        res_state = torch.sum(residues * iir_state, dim=-1).real
+        if iir_groups > 1:
+            raise NotImplementedError
+        y = x2 * res_state + D * x1v
+        return y, iir_state
+    def prefill_via_fir_caching(self, u, inference_params, L, *args, **kwargs):
+        """Turns the IIR filter into a FIR and uses a cache for decoding."""
+        raise NotImplementedError(":)")
+    def prefill_via_direct_recurrence(self, inference_params, x1v, L, poles, *args, **kwargs):
+        """
+        Compute the IIR state via explicit SSM recurrence (modal form)
+        """
+        x1v_ = x1v[..., None, None]  # b, d, l, sdim, reim
+        x1v_ = x1v_.repeat(1, 1, 1, 1, 2)  # b, d, l, sdim, reim
+        state = x1v_[:, :, 0]
+        poles = poles[:, :, 0].to(dtype=torch.float32)
+        for i in range(L):
+            state = poles * state + x1v_[:, :, i]
+        inference_params.state_dict[self.layer_idx] = torch.view_as_complex(
+            state.to(dtype=torch.float32)
+        )
+    def prefill_via_hybrid_recurrence(
+        self, inference_params, u, log_poles, x1v_f_a, L, *args, **kwargs
+    ):
+        """
+        Compute the IIR state via hybrid recurrence-convolution over blocks
+        """
+        raise NotImplementedError(":)")
+    def prefill_via_scan(self, u, inference_params=None, *args, **kwargs):
+        raise NotImplementedError
+    def prefill_via_canonical_fft(self, u, inference_params=None, *args, **kwargs):
+        """
+        Compute the IIR state via a single FFT with the denominator of the SSM in companion form.
+        This is the most memory efficient "parallelized" prefilling method for Hyena.
+        From: https://arxiv.org/abs/2310.18780
+        """
+        raise NotImplementedError(":)")
+    def prefill_via_modal_fft(
+        self,
+        inference_params,
+        x1v,
+        L,
+        poles,
+        t,
+        dims,
+        layer_idx,
+        X_s=None,
+        use_flashfft=False,
+        state_dtype=torch.complex64,
+        *args,
+        **kwargs,
+    ):
+        """
+        Compute the IIR state via a single FFT, using the poles of the SSM in modal form.
+        """
+        # When the model has a long convolution derived from a SSM in modal form and prefill_style is "fft",
+        # we split the filter into poles and residues and reuse FFT computation on the input.
+        # This optimization is currently not supported when using flashfftconv.
+        hidden_size, _, _, state_size, hyena_filter_groups = dims
+        if use_flashfft:
+            # using real states
+            poles = poles.squeeze().reshape(poles.shape[0], -1)[..., None]
+            state_s = poles**t
+            if hyena_filter_groups > 1:
+                raise NotImplementedError
+            x1v = x1v[:, :, None].repeat(1, 1, 2 * state_size, 1)
+            x1v = x1v.reshape(x1v.shape[0], -1, x1v.shape[-1])
+            state_s = state_s[None]
+            state = self.fftconv_fn(
+                x1v.contiguous(),
+                state_s.to(dtype=torch.float32),
+            )
+            state = state[..., L - 1].reshape(x1v.shape[0], hidden_size, state_size, 2)
+            state = torch.view_as_complex(state.contiguous())
+            inference_params.state_dict[self.layer_idx] = state.to(dtype=state_dtype)
+        else:
+            assert X_s is not None
+            bs = x1v.shape[0]
+            fft_size = 2 * L
+            poles = torch.view_as_complex(poles.to(torch.float32))
+            state_s = poles**t
+            state_S = torch.fft.fft(state_s, n=fft_size).repeat(
+                bs, 1, 1, 1
+            )  # B, D, state_dim, 2 * L
+            if hyena_filter_groups > 1:
+                state_S = state_S.repeat_interleave(hidden_size // hyena_filter_groups, 1)
+            state = torch.fft.ifft(X_s[..., None, :] * state_S, n=fft_size)
+            inference_params.state_dict[layer_idx] = state[..., L - 1].to(dtype=state_dtype)
+    def _compute_state(self, log_poles, u, t, L, *args, **kwargs):
+        """
+        Compute the IIR state given an input `u` and log_poles of the modal system.
+        """
+        bs = u.shape[0]
+        fft_size = 2 * L
+        U = torch.fft.rfft(u.to(torch.float32), n=fft_size)
+        fft_size = 2 * L
+        x = (log_poles * t).exp()
+        # [batch, hidden_size, state_dim, 2 * seqlen]
+        X = torch.fft.fft(x, n=fft_size).repeat(bs, 1, 1, 1)
+        state = torch.fft.ifft(U[..., None, :] * X, n=fft_size)[..., :L]
+        return state

layers.py ADDED Viewed

	@@ -0,0 +1,147 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+import torch
+from torch import Tensor
+import torch.nn.functional as F
+import torch.nn as nn
+class RMSNorm(torch.nn.Module):
+    def __init__(self, config):
+        super(RMSNorm, self).__init__()
+        self.eps, self.hidden_size = config.eps, config.hidden_size
+        self.scale = torch.nn.Parameter(torch.ones(self.hidden_size))
+        self.register_parameter("scale", self.scale)
+        self.use_flash_rmsnorm = config.get("use_flash_rmsnorm", False)
+        if self.use_flash_rmsnorm:
+            try:
+                from flash_attn.ops.rms_norm import rms_norm as rmsnorm_func
+                self.rmsnorm_func = rmsnorm_func
+            except:
+                raise ImportError(
+                    "For `use_flash_rmsnorm`: `pip install git+https://github.com/HazyResearch/flash-attention.git#subdirectory=csrc/layer_norm`"
+                )
+    def forward(self, x):
+        if self.use_flash_rmsnorm:
+            return self.rmsnorm_func(x, self.scale, self.eps)
+        else:
+            y = x / (x.norm(2, dim=-1, keepdim=True) * self.hidden_size ** (-1.0 / 2) + self.eps)
+            return self.scale * y
+class ParallelGatedMLP(nn.Module):
+    def __init__(
+        self,
+        config,
+    ):
+        super().__init__()
+        multiple_of = config.get("inner_size_multiple_of", 64)
+        self.act = F.silu
+        self.multiple_of = multiple_of * config.model_parallel_size
+        inner_size = int(2 * config.hidden_size * 4 / 3)
+        inner_size = self.multiple_of * ((inner_size + self.multiple_of - 1) // self.multiple_of)
+        # if specified in the config, inner_size will be used instead of the calculated value
+        if config.get("inner_mlp_size", None) is not None:
+            inner_size = config.inner_mlp_size
+        self.l1 = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=inner_size,
+            bias=False,
+        )
+        self.l2 = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=inner_size,
+            bias=False,
+        )
+        self.l3 = nn.Linear(
+            in_features=inner_size,
+            out_features=config.hidden_size,
+            bias=False,
+        )
+    def forward(self, z):
+        z1, z2 = self.l1(z), self.l2(z)
+        if type(z1) == tuple:
+            z1 = z1[0]
+        if type(z2) == tuple:
+            z2 = z2[0]
+        y = self.l3(self.act(z1) * z2)
+        return y[0] if type(y) == tuple else y
+class Embedding(nn.Module):
+    _train_dtype = "bf16"
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
+    def embed(self, input_ids, position_ids=None, tokentype_ids=None):
+        embeddings = self.word_embeddings(input_ids)
+        return embeddings
+    def unembed(self, u):
+        weight = self.word_embeddings.weight
+        return torch.matmul(u, weight)
+class VocabParallelEmbedding(nn.Embedding):
+    "Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/embedding.py"
+    def __init__(self, config):
+        vocab_size, process_group, padding_idx = (
+            config.vocab_size,
+            config.get("process_group", None),
+            config.get("padding_idx", None),
+        )
+        self.process_group = process_group
+        if process_group is not None:
+            world_size = torch.distributed.get_world_size(process_group)
+            if vocab_size % world_size != 0:
+                raise ValueError(
+                    f"vocab_size ({vocab_size}) must be divisible by " f"world_size ({world_size})"
+                )
+            if world_size > 1 and padding_idx is not None:
+                raise RuntimeError("ParallelEmbedding does not support padding_idx")
+        else:
+            world_size = 1
+        super().__init__(
+            vocab_size // world_size,
+            embedding_dim=config.hidden_size,
+            padding_idx=padding_idx,
+        )
+    def embed(self, x: Tensor) -> Tensor:
+        if self.process_group is None:
+            return self.forward(x)
+        else:
+            rank = torch.distributed.get_rank(self.process_group)
+            vocab_size = self.num_embeddings
+            vocab_start_index, vocab_end_index = (
+                rank * vocab_size,
+                (rank + 1) * vocab_size,
+            )
+            # Create a mask of valid vocab ids (1 means it needs to be masked).
+            input_ids_mask = (x < vocab_start_index) | (x >= vocab_end_index)
+            x = x - vocab_start_index
+            x[input_ids_mask] = 0
+            embeddings = self.forward(x)
+            embeddings[input_ids_mask] = 0.0
+            # Reduce to the global process group
+            torch.distributed.all_reduce(embeddings, group=self.process_group)
+            return embeddings
+    def unembed(self, u: Tensor) -> Tensor:
+        if self.process_group is None:
+            return u @ self.weight.T
+        else:
+            raise NotImplementedError

model.py ADDED Viewed

	@@ -0,0 +1,425 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+# Note: MP and PP utilities are removed for ease of use and editing.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .utils import print_rank_0, column_split
+from .cache import InferenceParams, RecurrentInferenceParams
+from .engine import HyenaInferenceEngine
+from .layers import (
+    RMSNorm,
+    ParallelGatedMLP,
+    VocabParallelEmbedding,
+)
+try:
+    from flash_attn.modules.mha import MHA
+except ImportError:
+    "flash_attn not installed"
+class AttentionBlock(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.config = config
+        self.pre_norm, self.post_norm = RMSNorm(config), RMSNorm(config)
+        self.layer_idx = layer_idx
+        self.proj_groups = config.get("proj_groups", 1)
+        dtype = config.get("attn_block_dtype", torch.bfloat16)
+        mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size_per_attention_head = config.hidden_size // config.num_attention_heads
+        self.counter = 0
+        self.inner_mha_cls = MHA(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_heads_kv=config.num_attention_heads // self.proj_groups,
+            rotary_emb_dim=config.hidden_size // config.num_attention_heads,
+            qkv_proj_bias=config.get("qkv_proj_bias", True),
+            rotary_emb_base=config.get("rotary_emb_base", 10000),
+            causal=True,
+            layer_idx=layer_idx,
+            out_proj_bias=config.get("mha_out_proj_bias", True),
+            use_flash_attn=self.config.use_flash_attn,
+        ).to(dtype=dtype)
+        if self.config.get("smeared_gqa", False):
+            self.inner_mha_cls.num_heads_kv = self.inner_mha_cls.num_heads
+        self.inner_mha_cls.rotary_emb.register_buffer(
+            "inv_freq", self.inner_mha_cls.rotary_emb.inv_freq
+        )
+        self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
+    def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+        if (
+            type(padding_mask) == torch.Tensor
+        ):  # workaround for masking bug in FA. This works because Wqkv does not have bias
+            # and attention scores will be also automatically zeroed.
+            u = u * padding_mask[..., None]
+        u = (
+            self.inner_mha_cls(
+                self.pre_norm(u),
+                inference_params=inference_params,
+            )
+            + u
+        )
+        if type(padding_mask) == torch.Tensor:  # guard against bias
+            u = u * padding_mask[..., None]
+        u = self.mlp(self.post_norm(u)) + u
+        return u, None
+class ParallelHyenaFilter(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hyena_filter_groups = config.get("hyena_filter_groups", self.config.hidden_size)
+        self.use_flashfft = config.get("use_flashfft", False)
+        self.state_size = config.state_size
+        self.hidden_size = config.hidden_size
+        self.num_filters = config.num_filters
+        self.inference_mode = config.get("inference_mode", True)
+        self.counter = 0
+        self.column_split_hyena = config.get("column_split_hyena", True)
+        assert self.hidden_size % self.num_filters == 0 and self.num_filters <= self.hidden_size
+        self.D = nn.Parameter(torch.zeros(self.hidden_size))
+        # attention heads are not used except to split post short_filter
+        # projections in the same way as the checkpoint
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
+        # after preprocessing here we can save the new checkpoint
+        self.short_filter_length = config.short_filter_length
+        self.short_filter_weight = nn.Parameter(
+            torch.randn(3 * config.hidden_size, 1, config.short_filter_length)
+        )
+        self.short_filter_bias = (
+            nn.Parameter(torch.randn(3 * config.hidden_size)) if config.short_filter_bias else None
+        )
+        self.engine = HyenaInferenceEngine(layer_idx=layer_idx)
+        self.use_flash_depthwise = config.get("use_flash_depthwise", False)
+        self.data_dtype = None
+        if self.use_flash_depthwise:
+            self.fir_fn = FlashDepthwiseConv1d(
+                channels=3 * self.hidden_size,
+                kernel_size=self.short_filter_length,
+                padding=self.short_filter_length - 1,
+                weights=self.short_filter_weight,
+                bias=self.short_filter_bias,
+                device=None,
+                dtype=self.config.get("depthwise_dtype", torch.bfloat16),
+            )
+        else:
+            self.fir_fn = F.conv1d
+        self.fftconv_fn = None
+        self.long_fir_threshold = config.get("long_fir_threshold", None)
+        if self.long_fir_threshold is not None:
+            assert (
+                self.use_flashfft is False
+            ), "long_fir_threshold not compatible with fused flashfft"
+        self.num_systems = self.hidden_size // self.hyena_filter_groups
+        self.poles = nn.Parameter(torch.randn(self.num_systems, self.state_size, 1, 2))
+        self.residues = nn.Parameter(torch.randn(self.num_systems, self.state_size, 1, 2))
+        self.h = None
+    def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+        if (
+            inference_params is not None
+            and self.layer_idx in inference_params.fir_state_dict.keys()
+        ):
+            return self.sequential_forward(u, inference_params)
+        else:
+            return self.parallel_forward(u, inference_params, padding_mask)
+    def parallel_forward(self, u, inference_params=None, padding_mask=None):
+        L = u.shape[1]
+        z_pre, fir_state = self.engine.parallel_fir(
+            self.fir_fn,
+            u,
+            self.short_filter_weight,
+            self.short_filter_bias,
+            L,
+            fir_length=self.short_filter_length,
+            inference_params=inference_params,
+            padding_mask=padding_mask,
+        )
+        if inference_params:
+            inference_params.fir_state_dict[self.layer_idx] = fir_state
+        if self.h is None:
+            h, filter_dtype, poles, residues = self.compute_filter(L, u.device)
+        else:
+            h = self.h
+            filter_dtype = self.h.dtype
+        if self.hyena_filter_groups > 1:
+            h = h.repeat_interleave(self.hidden_size // self.hyena_filter_groups, 1)
+        # if inference_params is not None, we plan to perform generation:
+        # prefilling for the IIR portion of the filter is handled by the engine.
+        dims = (
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size_per_attention_head,
+            self.state_size,
+            self.hyena_filter_groups,
+        )
+        y = self.engine.parallel_iir(
+            z_pre,
+            h,
+            self.D,
+            L,
+            t=self.t,
+            poles=self.poles,
+            dims=dims,
+            inference_params=inference_params,
+            layer_idx=self.layer_idx,
+            prefill_style=self.config.get("prefill_style", "fft"),
+            use_flashfft=self.use_flashfft,
+            fftconv_fn=self.fftconv_fn,
+            column_split_hyena=self.column_split_hyena,
+            long_fir_threshold=self.long_fir_threshold,
+            padding_mask=padding_mask,
+        )
+        return y, inference_params
+    def sequential_forward(self, u, inference_params):
+        if self.data_dtype is None:
+            self.data_dtype = u.dtype
+        if len(u.shape) > 2:
+            u = u[:, -1]
+        fir_state, iir_state = (
+            inference_params.fir_state_dict[self.layer_idx],
+            inference_params.state_dict[self.layer_idx],
+        )
+        z_pre, fir_state = self.engine.step_fir(
+            u, fir_state, weight=self.short_filter_weight, bias=self.short_filter_bias
+        )
+        x2, x1, v = (
+            column_split(z_pre, self.num_attention_heads, self.hidden_size_per_attention_head)
+            if self.column_split_hyena
+            else z_pre.split([self.hidden_size, self.hidden_size, self.hidden_size], dim=1)
+        )
+        y, iir_state = self.engine.step_iir(
+            x2,
+            x1,
+            v,
+            self.D,
+            self.residues,
+            self.poles,
+            iir_state,
+            iir_groups=self.hyena_filter_groups,
+        )
+        inference_params.fir_state_dict[self.layer_idx] = fir_state
+        inference_params.state_dict[self.layer_idx] = iir_state
+        y = y.to(dtype=self.data_dtype)
+        return y[:, None], inference_params
+    def update_time(self, L, device):
+        """
+        Set [0, 1, ..., L-1] where L is the length of the current batch of inputs.
+        If L is greater than the length of the previous batch, then the time vector is
+        reinitialized. Otherwise, the time vector is truncated from cache.
+        """
+        if not hasattr(self, "t"):
+            self.t = torch.arange(L, device=device)[None, None]
+        elif self.t.shape[-1] < L:
+            self.t = torch.arange(L, device=device)[None, None]
+        else:
+            self.t = self.t[..., :L]
+    def compute_filter(self, L, device):
+        self.update_time(L, device)
+        filter_dtype = torch.float32
+        residues, log_poles = (
+            torch.view_as_complex(self.residues.to(filter_dtype)),
+            torch.view_as_complex(self.poles.to(filter_dtype)).log(),
+        )
+        h = (residues * (log_poles * self.t).exp()).real.sum(1)[None]
+        return h, filter_dtype, log_poles, residues
+class ParallelGatedConvBlock(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        dtype = config.get("hyena_block_dtype", torch.float32)
+        mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
+        self.pre_norm, self.post_norm = RMSNorm(config).to(dtype=dtype), RMSNorm(config).to(
+            dtype=dtype
+        )
+        self.filter = ParallelHyenaFilter(config, layer_idx).to(dtype=dtype)
+        self.projections = nn.Linear(config.hidden_size, 3 * config.hidden_size)
+        self.out_filter_dense = nn.Linear(config.hidden_size, config.hidden_size).to(dtype)
+        self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
+    def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+        z = self.projections(self.pre_norm(u))
+        if type(padding_mask) == torch.Tensor:  # guard against bias
+            z = z * padding_mask[..., None]
+        z, inference_params = self.filter(
+            z, inference_params=inference_params, padding_mask=padding_mask
+        )
+        u = self.out_filter_dense(z) + u
+        if type(padding_mask) == torch.Tensor:  # guard against bias
+            u = u * padding_mask[..., None]
+        u = self.mlp(self.post_norm(u)) + u
+        return u, inference_params
+def get_block(config, layer_idx, flash_fft=None):
+    if layer_idx in config.attn_layer_idxs:
+        return AttentionBlock(config, layer_idx)
+    elif layer_idx in config.hyena_layer_idxs:
+        block = ParallelGatedConvBlock(config, layer_idx)
+        if config.get("use_flashfft", "False"):
+            block.filter.fftconv_fn = flash_fft
+        return block
+    else:
+        raise NotImplementedError
+class StripedHyena(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embedding_layer = VocabParallelEmbedding(config)
+        self.norm = RMSNorm(config) if config.get("final_norm", True) else None
+        self.unembed = self.emb if config.tie_embeddings else VocabParallelEmbedding(config)
+        self.gradient_checkpointing = False
+        if config.get("use_flashfft", "False"):
+            raise NotImplementedError("Please use standalone SH code for other custom kernels")
+        else:
+            self.flash_fft = None
+        self.blocks = nn.ModuleList(
+            get_block(config, layer_idx, flash_fft=self.flash_fft)
+            for layer_idx in range(config.num_layers)
+        )
+    def forward(self, x, inference_params_dict=None, padding_mask=None):
+        L = x.shape[1]
+        x = self.embedding_layer.embed(x)
+        if inference_params_dict is not None:
+            x, inference_params_dict_out = self.stateful_forward(
+                x,
+                inference_params_dict=inference_params_dict,
+            )
+        else:
+            x, inference_params_dict_out = self.stateless_forward(x, padding_mask=padding_mask)
+        x = self.norm(x)
+        x = self.unembed.unembed(x)
+        return x, inference_params_dict_out
+    def stateful_forward(self, x, inference_params_dict=None):
+        for block_idx, block in enumerate(self.blocks):
+            block_name = "mha" if block_idx in self.config.attn_layer_idxs else "hyena"
+            inference_params = inference_params_dict[block_name]
+            x, _ = block(x, inference_params=inference_params)
+        return x, inference_params_dict
+    def stateless_forward(self, x, padding_mask=None):
+        if type(padding_mask) == torch.Tensor:
+            x = x * padding_mask[..., None]
+        for block_idx, block in enumerate(self.blocks):
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, inference_params=None, padding_mask=padding_mask)
+                    return custom_forward
+                x, _ = checkpoint(create_custom_forward(block), x, use_reentrant=False)
+            else:
+                x, _ = block(x, inference_params=None, padding_mask=padding_mask)
+        return x, None
+    def initialize_inference_params(self):
+        print_rank_0("Initializing inference params...")
+        inference_params_dict = {
+            "mha": InferenceParams(
+                max_seqlen=self.config.get("max_seqlen", 8192),
+                max_batch_size=self.config.get("max_batch_size", 1),
+                seqlen_offset=0,
+            ),
+            "hyena": RecurrentInferenceParams(
+                fir_filter_length=self.config.short_filter_length,
+                state_dim=self.config.state_size,
+                seqlen_offset=0,
+            ),
+        }
+        return inference_params_dict
+    def precompute_filters(self, L, device):
+        for block_idx, block in enumerate(self.blocks):
+            if type(block) == ParallelGatedConvBlock:
+                if type(block.filter) == ParallelHyenaFilter:
+                    L = block.filter.long_fir_threshold or L
+                    print_rank_0(f"Precomputing filters, L={L}...")
+                    filter_dtype = torch.float16 if L >= 2048 else torch.float32
+                    block.filter._set_time(L, device)
+                    residues, poles = (
+                        torch.view_as_complex(block.filter.residues.to(torch.float16)),
+                        torch.view_as_complex(block.filter.poles.to(torch.float16)),
+                    )
+                    block.filter.h = (residues * poles**block.filter.t).real.sum(1)[None]
+                    block.filter.h = block.filter.h.to(dtype=filter_dtype)
+    def load_poles_residues(self, path):
+        "Load different poles and residues for each layer."
+        for block_idx, block in enumerate(self.blocks):
+            if type(block) == ParallelGatedConvBlock:
+                if type(block.filter) == ParallelHyenaFilter:
+                    print(f"Loading poles and residues for block {block_idx}")
+                    poles = torch.load(path + f"/approx_poles_{block_idx+1}.pt", map_location="cpu")
+                    poles = torch.view_as_real(poles)
+                    residues = torch.load(
+                        path + f"/approx_residues_{block_idx+1}.pt", map_location="cpu"
+                    )
+                    residues = torch.view_as_real(residues)
+                    poles = poles.permute(1, 0, 2).unsqueeze(-2)
+                    residues = residues.permute(1, 0, 2).unsqueeze(-2)
+                    block.filter.poles = nn.Parameter(poles)
+                    block.filter.residues = nn.Parameter(residues)
+    def to_bfloat16_except_poles_residues(self):
+        """Convert all parameters to bfloat16 except for the poles and residues.
+        Particularly important for longer prompts.
+        """
+        for k, p in self.named_parameters():
+            if "poles" not in k and "residues" not in k:
+                p.data = p.data.to(torch.bfloat16)

modeling_hyena.py ADDED Viewed

	@@ -0,0 +1,145 @@

+# -*- coding: utf-8 -*-
+"""StripedHyena custom code port for the Hugging Face Hub"""
+import torch
+from torch.nn import functional as F
+from .configuration_hyena import StripedHyenaConfig
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
+from transformers.utils import logging
+from typing import Optional, Tuple, Union
+from .model import StripedHyena
+from .utils import dotdict
+from .cache import InferenceParams
+from .engine import HyenaInferenceEngine
+from .layers import RMSNorm
+from .utils import dotdict, column_split
+logger = logging.get_logger(__name__)
+class StripedHyenaPreTrainedModel(PreTrainedModel):
+    config_class = StripedHyenaConfig
+    base_model_prefix = "sh"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["AttentionBlock", "ParallelGatedConvBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _keys_to_ignore_on_load_missing = [r"freq"]
+    _keys_to_ignore_on_load_unexpected = [r"fftconv", r"twiddle_factors"]
+    _supports_flash_attn_2 = True
+class StripedHyenaModelForCausalLM(StripedHyenaPreTrainedModel):
+    supports_gradient_checkpointing = True
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+        model_config = dotdict(config.to_dict())
+        self.backbone = StripedHyena(model_config)
+        self.backbone.gradient_checkpointing = False
+        self.config = config
+        vocab_size = config.vocab_size
+        if vocab_size % config.make_vocab_size_divisible_by != 0:
+            vocab_size += config.make_vocab_size_divisible_by - (
+                vocab_size % config.make_vocab_size_divisible_by
+            )
+        self.vocab_size = vocab_size
+        self.post_init()
+        self.force_dtype()
+    def force_dtype(self):
+        self.backbone.to_bfloat16_except_poles_residues()
+    def _set_gradient_checkpointing(self, enable, gradient_checkpointing_func):
+        self.backbone.gradient_checkpointing = enable
+    def get_input_embeddings(self):
+        return self.backbone.embedding_layer
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        past_key_values=None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if use_cache:
+            if self.backbone.gradient_checkpointing and self.backbone.training:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+            elif labels is not None:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with loss calculation. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        inputs = input_ids
+        if use_cache:
+            if past_key_values is None:
+                past_key_values = self.backbone.initialize_inference_params()
+                batch_size = input_ids.shape[0]
+                past_key_values["mha"].max_batch_size = batch_size
+                past_key_values["hyena"].max_batch_size = batch_size
+            else:
+                seqlen_offset = past_key_values["mha"].seqlen_offset
+                if seqlen_offset == 0:
+                    # second loop through generate will have prompt_len + 1 as seqlen
+                    seqlen_offset = input_ids.shape[-1] - 1
+                    past_key_values["hyena"].seqlen_offset = seqlen_offset
+                    past_key_values["mha"].seqlen_offset = seqlen_offset
+                else:
+                    past_key_values["mha"].seqlen_offset += 1
+                    past_key_values["hyena"].seqlen_offset += 1
+                inputs = input_ids[
+                    :,
+                    -1:,
+                ]
+        logits, past_key_values = self.backbone(
+            inputs,
+            padding_mask=attention_mask,
+            inference_params_dict=past_key_values if use_cache else None,
+        )
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = F.cross_entropy(shift_logits, shift_labels)
+        if return_dict:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                hidden_states=None,
+                past_key_values=past_key_values if use_cache else None,
+                loss=loss,
+            )
+        else:
+            return logits
+    @classmethod
+    def can_generate(cls) -> bool:
+        return True
+    def prepare_inputs_for_generation(
+        self, input_ids, attention_mask=None, past_key_values=None, **kwargs
+    ):
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+        }

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "</s>"}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "unk_token": "</s>",
+    "bos_token": "<s>",
+    "eos_token": "</s>",
+    "add_prefix_space": false,
+    "tokenizer_class": "LlamaTokenizer"
+}

utils.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import torch
+def column_split(x, num_heads, head_size):
+    """Split a tensor with `num_heads` alongside the head dimension, instead of
+    across heads. Fixed to three projections
+    """
+    x_reshaped = x.reshape(
+        x.shape[0],
+        num_heads,
+        3 * head_size,
+    )
+    x2, x1, v = (
+        x_reshaped[:, :, :head_size],
+        x_reshaped[
+            :,
+            :,
+            head_size : 2 * head_size,
+        ],
+        x_reshaped[:, :, 2 * head_size :],
+    )
+    x2, x1, v = (
+        x2.reshape(x2.shape[0], -1),
+        x1.reshape(x1.shape[0], -1),
+        v.reshape(v.shape[0], -1),
+    )
+    return x2, x1, v
+def get_init_from_string(init_str):
+    if type(init_str) == str:
+        if init_str == "torch.nn.init.zeros_":
+            return torch.nn.init.zeros_
+        elif init_str == "torch.nn.init.xavier_uniform_":
+            return torch.nn.init.xavier_uniform_
+        elif init_str == "torch.nn.init.xavier_normal_":
+            return torch.nn.init.xavier_normal_
+        else:
+            raise ValueError(f"Unrecognized init {init_str}")
+def print_rank_0(message, debug=False, end="\n"):
+    """Print from rank 0 only."""
+    if torch.distributed.is_initialized():
+        if torch.distributed.get_rank() == 0:
+            print(message, flush=True, end=end)
+    else:
+        print(message, flush=True, end=end)
+class dotdict(dict):
+    """dot.notation access to dictionary attributes"""
+    # attribute reads that normal lookup cannot resolve fall back to dict.get, so a
+    # missing key yields None instead of raising AttributeError
+    __getattr__ = dict.get
+    # attribute assignment stores the value as a dictionary item
+    __setattr__ = dict.__setitem__
+    # attribute deletion removes the item; a missing key raises KeyError (from dict.__delitem__)
+    __delattr__ = dict.__delitem__
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks amd return the
+    first and last index of the vocabulary belonging to the `rank`
+    partition: Note that indices in [first, last]"""
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size
+        )

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff