# mindformers/configs/clip/model_config/clip_vit_l_14@336.yaml

# Model definition for the `clip_vit_l_14@336` checkpoint.
model:
  arch:
    type: CLIPModel

  model_config:
    type: CLIPConfig

    # Text-side sub-configuration.
    text_config:
      type: CLIPTextConfig
      vocab_size: 49408
      hidden_size: 768
      # NOTE(review): 2048 is not 4 x hidden_size (3072) -- confirm whether
      # the consumer actually reads this key or derives the MLP width itself.
      intermediate_size: 2048
      num_hidden_layers: 12
      num_attention_heads: 12
      max_position_embeddings: 77
      hidden_act: 'quick_gelu'
      attention_dropout: 0.0
      # NOTE(review): spelled `drop_out` here but `dropout` in vision_config;
      # presumably each matches its config class's parameter name -- verify.
      drop_out: 0.0
      initializer_range: 0.02
      initializer_factor: 1.0

    # Vision-side sub-configuration.
    vision_config:
      type: CLIPVisionConfig
      hidden_size: 1024
      # NOTE(review): 3072 is not 4 x hidden_size (4096) -- confirm whether
      # the consumer actually reads this key.
      intermediate_size: 3072
      num_hidden_layers: 24
      num_attention_heads: 16
      # 336 / 14 = 24, so presumably a 24 x 24 patch grid -- TODO confirm.
      image_size: 336
      patch_size: 14
      hidden_act: 'quick_gelu'
      dropout: 0.0
      attention_dropout: 0.0
      initializer_range: 0.02
      initializer_factor: 1.0

    # Tensor data type.
    dtype: float16
    # Identifies which model type is loaded.
    checkpoint_name_or_path: 'clip_vit_l_14@336'
    # Feature dimensions.
    projection_dim: 768
    logit_scale_init_value: 2.6592

# Processor section: groups the image processor and tokenizer settings.
processor:
  type: CLIPProcessor

  image_processor:
    type: CLIPImageProcessor
    # Input image size.
    image_resolution: 336

  tokenizer:
    type: CLIPTokenizer
    # '!' corresponds to token id 0. It must stay quoted: a leading `!`
    # is a YAML tag indicator.
    pad_token: '!'