""" NLLB-CLIP model configuration"""

import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union


if TYPE_CHECKING:
    from transformers.processing_utils import ProcessorMixin
    from transformers.utils import TensorType

from transformers import CLIPVisionConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.onnx import OnnxConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class NLLBCLIPTextConfig(PretrainedConfig):
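    """
    Configuration for the text tower of NLLB-CLIP. It mirrors the encoder half of the NLLB (M2M-100) configuration
    and maps `hidden_size`/`num_attention_heads` onto `d_model`/`encoder_attention_heads` via `attribute_map` so
    CLIP-style code can read them.

    Example (a minimal usage sketch; the values shown are the defaults from `__init__` below):

    ```python
    >>> config = NLLBCLIPTextConfig(vocab_size=128112, d_model=1024, encoder_layers=12)
    >>> config.hidden_size  # resolved to `d_model` through `attribute_map`
    1024
    ```
    """
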
    model_type = "clip_text_model"
    attribute_map = {
        "num_attention_heads": "encoder_attention_heads",
        "hidden_size": "d_model",
    }

    def __init__(
        self,
        vocab_size=128112,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        encoder_layerdrop=0.05,
        use_cache=True,
        activation_function="relu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        init_std=0.02,
        scale_embedding=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        layer_norm_eps=1e-5,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding
        self.layer_norm_eps = layer_norm_eps

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> "PretrainedConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["text_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class NLLBCLIPConfig(PretrainedConfig):
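    """
    Composite configuration for NLLB-CLIP: an NLLB text encoder (`NLLBCLIPTextConfig`) paired with a CLIP vision
    encoder (`CLIPVisionConfig`), plus the shared projection and logit-scale settings.

    Example (a minimal sketch of the two supported construction paths; all values are illustrative):

    ```python
    >>> # from plain dicts (defaults are used for anything omitted)
    >>> config = NLLBCLIPConfig(text_config={"encoder_layers": 12}, vision_config={}, projection_dim=512)

    >>> # from already-built sub-configs
    >>> config = NLLBCLIPConfig.from_text_vision_configs(NLLBCLIPTextConfig(), CLIPVisionConfig())
    >>> config.projection_dim
    512
    ```
    """
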
    model_type = "clip"

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=512,
        logit_scale_init_value=2.6592,
        **kwargs,
    ):
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)
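
        # Note (added for clarity): `text_config_dict` / `vision_config_dict` mirror the backward-compatibility path
        # in `transformers`' `CLIPConfig`. When one of them is given, it is used to build the sub-config, and any key
        # that also appears in the corresponding `text_config` / `vision_config` with a different value is overridden
        # by the `*_config_dict` value, with a warning logged below.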
        if text_config_dict is not None:
            if text_config is None:
                text_config = {}

            # The values in `text_config_dict` take precedence over `text_config`.
            _text_config_dict = NLLBCLIPTextConfig(**text_config_dict).to_dict()

            for key, value in _text_config_dict.items():
                if (
                    key in text_config
                    and value != text_config[key]
                    and key not in ["transformers_version"]
                ):
                    if key in text_config_dict:
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                            f'The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    else:
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `NLLBCLIPTextConfig`. "
                            f'The value `text_config["{key}"]` will be overridden.'
                        )
                    logger.warning(message)

            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            if vision_config is None:
                vision_config = {}

            # The values in `vision_config_dict` take precedence over `vision_config`.
            _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()

            # Convert `id2label` keys to strings so the comparison below matches the serialized form.
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value
                    for key, value in _vision_config_dict["id2label"].items()
                }

            for key, value in _vision_config_dict.items():
                if (
                    key in vision_config
                    and value != vision_config[key]
                    and key not in ["transformers_version"]
                ):
                    if key in vision_config_dict:
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    else:
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                            f'The value `vision_config["{key}"]` will be overridden.'
                        )
                    logger.warning(message)

            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info(
                "`text_config` is `None`. Initializing the `NLLBCLIPTextConfig` with default values."
            )

        if vision_config is None:
            vision_config = {}
            logger.info(
                "`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values."
            )

        self.text_config = NLLBCLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(
        cls, text_config: NLLBCLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs
    ):
        r"""
        Instantiate a [`NLLBCLIPConfig`] (or a derived class) from an NLLB-CLIP text model configuration and a CLIP
        vision model configuration.

        Returns:
            [`NLLBCLIPConfig`]: An instance of a configuration object
        """
        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )


class CLIPOnnxConfig(OnnxConfig):
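    """
    ONNX export configuration for NLLB-CLIP. It declares dynamic axes for the text inputs (`input_ids`,
    `attention_mask`) and the image input (`pixel_values`), the exported CLIP-style outputs, and the opset and
    validation tolerance used for this model.

    A hedged usage sketch (assumes an `NLLBCLIPConfig` instance; the export call itself is outside this file):

    ```python
    >>> onnx_config = CLIPOnnxConfig(NLLBCLIPConfig())
    >>> list(onnx_config.inputs.keys())
    ['input_ids', 'attention_mask', 'pixel_values']
    ```
    """
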
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
                (
                    "pixel_values",
                    {0: "batch", 1: "num_channels", 2: "height", 3: "width"},
                ),
            ]
        )

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),
                ("logits_per_text", {0: "batch"}),
                ("text_embeds", {0: "batch"}),
                ("image_embeds", {0: "batch"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        # Build text dummies with the tokenizer and image dummies with the image processor, then merge them.
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer,
            batch_size=batch_size,
            seq_length=seq_length,
            framework=framework,
        )
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_input_dict, **image_input_dict}

    @property
    def default_onnx_opset(self) -> int:
        return 14