Fix integration with Hugging Face

#2
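
This lines up the custom-code registration with what `config.json` declares, so that `AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True)` can resolve the Cerule classes: `CeruleGemmaConfig` is collapsed into a single direct subclass of `PretrainedConfig`, the registered `model_type` is switched to `"phi-msft"` everywhere it appears (config class, both `AutoConfig.register` calls, and `config.json`), the modeling code references `CeruleGemmaConfig` throughout, a `requirements.txt` is added, and the README now documents how to load the model.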
README.md CHANGED
@@ -39,6 +39,16 @@ The training setup was `4xA100's 80GB` and took ~6 hours to pretrain and ~13 hou
 | ![extreme_ironing](examples/extreme_ironing.jpg) | **What's funny about this image?**<br>The image is quite humorous as it depicts a man ironing clothes on the back of a yellow taxi cab. This is not a typical sight you'd expect to see in everyday life. |
 ---
 
+## Loading the model
+
+```
+pip install -qr https://huggingface.co/Tensoic/Cerule-v0.1/resolve/main/requirements.txt
+```
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained("Tensoic/Cerule-v0.1", trust_remote_code=True)
+```
 
 ## Training:
 We will release the training code in some time.
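
Review note: the README snippet above is the minimal path. A slightly fuller sketch of the same load, for anyone testing this PR locally (the `torch_dtype` and `.eval()` lines are my additions, not part of the README):

```python
import torch
from transformers import AutoModelForCausalLM

# trust_remote_code=True is required: it tells transformers to import the
# CeruleGemmaConfig / CeruleGemmaForCausalLM classes shipped in this repo.
model = AutoModelForCausalLM.from_pretrained(
    "Tensoic/Cerule-v0.1",
    torch_dtype=torch.float16,  # assumption: a GPU with enough memory for fp16
    trust_remote_code=True,
)
model.eval()
```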
__init__.py CHANGED
@@ -3,5 +3,5 @@ from .modeling_cerule_gemma import CeruleGemmaForCausalLM
 
 from transformers import AutoConfig, AutoModelForCausalLM
 
-AutoConfig.register("cerule-gemma", CeruleGemmaConfig)
+AutoConfig.register("phi-msft", CeruleGemmaConfig)
 AutoModelForCausalLM.register(CeruleGemmaConfig, CeruleGemmaForCausalLM)
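
Review note on the mechanics here, since this is the heart of the fix: the string passed to `AutoConfig.register` must agree both with the config class's `model_type` attribute and with the `"model_type"` field in `config.json`; `AutoModelForCausalLM.register` then keys the model class off the config class. A minimal sketch with hypothetical names (`MyConfig` is illustrative, not part of this repo):

```python
from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig

class MyConfig(PretrainedConfig):  # hypothetical stand-in for CeruleGemmaConfig
    model_type = "my-model"        # must equal the string registered below
                                   # and config.json's "model_type"

# register() raises a ValueError if the string disagrees with
# MyConfig.model_type, or if a built-in model type already claims it.
AutoConfig.register("my-model", MyConfig)
# The causal-LM class is then keyed off the config class, e.g.:
# AutoModelForCausalLM.register(MyConfig, MyModelForCausalLM)
```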
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "Tensoic/Cerule",
+  "_name_or_path": "Tensoic/Cerule-v0.1",
   "architectures": [
     "CeruleGemmaForCausalLM"
   ],
@@ -23,7 +23,7 @@
   "mm_projector_lr": null,
   "mm_projector_type": "mlp2x_gelu",
   "mm_vision_tower": "google/siglip-so400m-patch14-384",
-  "model_type": "cerule-gemma",
+  "model_type": "phi-msft",
   "num_attention_heads": 8,
   "num_hidden_layers": 18,
   "num_key_value_heads": 1,
configuration_gemma.py CHANGED
@@ -25,8 +25,8 @@ GEMMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 
 
-class GemmaConfig(PretrainedConfig):
-    model_type = "gemma"
+class CeruleGemmaConfig(PretrainedConfig):
+    model_type = "phi-msft"
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
@@ -162,10 +162,3 @@ class SigLipVisionConfig(PretrainedConfig):
 
         return cls.from_dict(config_dict, **kwargs)
 
-
-class CeruleGemmaConfig(GemmaConfig):
-    model_type = "cerule-gemma"
-
-    def __init__(self, **kwargs):
-        self.gemma_config = GemmaConfig(**kwargs)
-        super().__init__(**kwargs)
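Worth noting for review: the old two-class arrangement constructed `GemmaConfig` twice per config (once as the nested `self.gemma_config`, once through `super().__init__`), and the nested config object would likely not serialize cleanly. The flattened single class is the usual remote-code pattern; a minimal runnable sketch of its shape (abridged: the real `__init__` keeps all the Gemma hyperparameters from the diff above):

```python
from transformers import PretrainedConfig

class CeruleGemmaConfigSketch(PretrainedConfig):  # abridged illustration
    model_type = "phi-msft"
    keys_to_ignore_at_inference = ["past_key_values"]

cfg = CeruleGemmaConfigSketch()
assert cfg.to_dict()["model_type"] == "phi-msft"  # serializes consistently
```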
modeling_cerule_gemma.py CHANGED
@@ -853,7 +853,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_gemma import GemmaConfig
+from .configuration_gemma import CeruleGemmaConfig
 
 
 if is_flash_attn_2_available():
@@ -872,7 +872,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "GemmaConfig"
+_CONFIG_FOR_DOC = "CeruleGemmaConfig"
 
 
 def _get_unpad_data(attention_mask):
@@ -1003,7 +1003,7 @@ class GemmaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     # Ignore copy
-    def __init__(self, config: GemmaConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: CeruleGemmaConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -1396,7 +1396,7 @@ GEMMA_ATTENTION_CLASSES = {
 
 # Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with LLAMA->GEMMA,Llama->Gemma
 class GemmaDecoderLayer(nn.Module):
-    def __init__(self, config: GemmaConfig, layer_idx: int):
+    def __init__(self, config: CeruleGemmaConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1480,7 +1480,7 @@ GEMMA_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`GemmaConfig`]):
+        config ([`CeruleGemmaConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1492,7 +1492,7 @@ GEMMA_START_DOCSTRING = r"""
     GEMMA_START_DOCSTRING,
 )
 class GemmaPreTrainedModel(PreTrainedModel):
-    config_class = GemmaConfig
+    config_class = CeruleGemmaConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _keep_in_fp32_modules = ["inv_freq", "rotary_emb", "cos_cached", "sin_cached"]
@@ -1618,7 +1618,7 @@ class GemmaModel(GemmaPreTrainedModel):
         config: GemmaConfig
     """
 
-    def __init__(self, config: GemmaConfig):
+    def __init__(self, config: CeruleGemmaConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -2155,7 +2155,7 @@ from .configuration_gemma import CeruleGemmaConfig
 class CeruleGemmaModel(CeruleMetaModel, GemmaModel):
     config_class = CeruleGemmaConfig
 
-    def __init__(self, config: GemmaConfig):
+    def __init__(self, config: CeruleGemmaConfig):
         super(CeruleGemmaModel, self).__init__(config)
 
 
@@ -2264,5 +2264,5 @@ class CeruleGemmaForCausalLM(GemmaForCausalLM, CeruleMetaForCausalLM):
         return new_images
 
 
-AutoConfig.register("cerule-gemma", CeruleGemmaConfig)
+AutoConfig.register("phi-msft", CeruleGemmaConfig)
 AutoModelForCausalLM.register(CeruleGemmaConfig, CeruleGemmaForCausalLM)
requirements.txt ADDED
@@ -0,0 +1,2 @@
+flash_attn
+transformers>=4.39.1
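
Two caveats on the new requirements, for anyone reproducing (the check below is my sketch, not part of the PR): `flash_attn` compiles CUDA extensions, so it needs an NVIDIA GPU and a matching CUDA toolchain at install time, and the `transformers` floor can be verified before loading:

```python
import torch
from importlib.metadata import version
from packaging.version import Version  # packaging is already a transformers dependency

assert torch.cuda.is_available(), "flash_attn needs a CUDA-capable GPU"
assert Version(version("transformers")) >= Version("4.39.1"), "upgrade transformers"
```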