from typing import Union
import torch
from .internvl import load_internvl
from .japanese_clip import load_japanese_clip
from .open_clip import load_open_clip
# loading function must return (model, transform, tokenizer)
TYPE2FUNC = {
'open_clip': load_open_clip,
'ja_clip': load_japanese_clip,
'internvl': load_internvl,
}
MODEL_TYPES = list(TYPE2FUNC.keys())
def load_clip(
model_type: str,
model_name: str,
pretrained: str,
cache_dir: str,
device: Union[str, torch.device] = 'cuda'
):
assert model_type in MODEL_TYPES, f'model_type={model_type} is invalid!'
load_func = TYPE2FUNC[model_type]
return load_func(model_name=model_name, pretrained=pretrained, cache_dir=cache_dir, device=device)
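# Usage sketch, not part of the benchmark code: every registered loader above returns the
# (model, transform, tokenizer) triple noted in the comment. The model_name and checkpoint
# path below are placeholders for illustration only.
if __name__ == '__main__':
    model, transform, tokenizer = load_clip(
        model_type='internvl',
        model_name='internvl_c_retrieval_hf',   # assumed name; see the internvl loader below
        pretrained='/path/to/checkpoint',       # placeholder checkpoint path
        cache_dir='./cache',
        device='cuda',
    )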
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class InternVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
instantiate a vision encoder according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_channels (`int`, *optional*, defaults to 3):
Number of color channels in the input images (e.g., 3 for RGB).
patch_size (`int`, *optional*, defaults to 14):
The size (resolution) of each patch.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
qkv_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the queries, keys and values in the self-attention layers.
hidden_size (`int`, *optional*, defaults to 3200):
Dimensionality of the encoder layers and the pooler layer.
num_attention_heads (`int`, *optional*, defaults to 25):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 12800):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
qk_normalization (`bool`, *optional*, defaults to `True`):
Whether to normalize the queries and keys in the self-attention layers.
num_hidden_layers (`int`, *optional*, defaults to 48):
Number of hidden layers in the Transformer encoder.
use_flash_attn (`bool`, *optional*, defaults to `True`):
Whether to use the flash attention mechanism.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
drop_path_rate (`float`, *optional*, defaults to 0.0):
Dropout rate for stochastic depth.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 0.1):
A factor for layer scale.
"""
model_type = 'intern_vit_6b'
def __init__(
self,
num_channels=3,
patch_size=14,
image_size=224,
qkv_bias=False,
hidden_size=3200,
num_attention_heads=25,
intermediate_size=12800,
qk_normalization=True,
num_hidden_layers=48,
use_flash_attn=True,
hidden_act='gelu',
layer_norm_eps=1e-6,
dropout=0.0,
drop_path_rate=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.drop_path_rate = drop_path_rate
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
self.qk_normalization = qk_normalization
self.use_flash_attn = use_flash_attn
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if 'vision_config' in config_dict:
config_dict = config_dict['vision_config']
if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
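# Minimal sketch: building the config with its defaults and overriding two fields.
# The override values are illustrative only and do not correspond to a released checkpoint.
if __name__ == '__main__':
    config = InternVisionConfig(image_size=448, drop_path_rate=0.1)
    print(config.hidden_size, config.num_hidden_layers)  # 3200 48 (defaults kept)
    print(config.image_size, config.drop_path_rate)      # 448 0.1 (overridden)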
import torch
import torch.nn as nn
from einops import rearrange
try: # v1
from flash_attn.flash_attn_interface import \
flash_attn_unpadded_qkvpacked_func
except ImportError:  # v2
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
from flash_attn.bert_padding import pad_input, unpad_input
class FlashAttention(nn.Module):
"""Implement the scaled dot product attention with softmax.
Arguments
---------
softmax_scale: The temperature to use for the softmax attention.
(default: 1/sqrt(d_keys) where d_keys is computed at
runtime)
attention_dropout: The dropout rate to apply to the attention
(default: 0.0)
"""
def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
super().__init__()
self.softmax_scale = softmax_scale
self.dropout_p = attention_dropout
def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
max_s=None, need_weights=False):
"""Implements the multihead softmax attention.
Arguments
---------
qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
if unpadded: (nnz, 3, h, d)
key_padding_mask: a bool tensor of shape (B, S)
"""
assert not need_weights
assert qkv.dtype in [torch.float16, torch.bfloat16]
assert qkv.is_cuda
if cu_seqlens is None:
batch_size = qkv.shape[0]
seqlen = qkv.shape[1]
if key_padding_mask is None:
qkv = rearrange(qkv, 'b s ... -> (b s) ...')
max_s = seqlen
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
device=qkv.device)
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
else:
nheads = qkv.shape[-2]
x = rearrange(qkv, 'b s three h d -> b s (three h d)')
x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
output_unpad = flash_attn_unpadded_qkvpacked_func(
x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
indices, batch_size, seqlen),
'b s (h d) -> b s h d', h=nheads)
else:
assert max_s is not None
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
return output, None
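# Rough usage sketch for the packed-QKV path above. FlashAttention requires a CUDA tensor in
# fp16/bf16 (see the asserts in forward); the shapes follow the (B, S, 3, H, D) layout.
if __name__ == '__main__' and torch.cuda.is_available():
    attn = FlashAttention(attention_dropout=0.0).eval()
    qkv = torch.randn(2, 257, 3, 25, 128, dtype=torch.float16, device='cuda')
    out, _ = attn(qkv)
    print(out.shape)  # torch.Size([2, 257, 25, 128])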
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from timm.models.layers import DropPath
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (BaseModelOutput,
BaseModelOutputWithPooling)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_intern_vit import InternVisionConfig
try:
from .flash_attention import FlashAttention
has_flash_attn = True
except ImportError:
print('FlashAttention is not installed.')
has_flash_attn = False
logger = logging.get_logger(__name__)
class InternRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
try:
from apex.normalization import FusedRMSNorm
InternRMSNorm = FusedRMSNorm # noqa
logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
except ImportError:
# using the normal InternRMSNorm
pass
except Exception:
logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
pass
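# Quick sanity sketch for the RMSNorm above (only meaningful when the pure-PyTorch
# InternRMSNorm is in use, i.e. apex is absent): with the default all-ones weight,
# each token of the output has a root-mean-square close to 1.
def _rmsnorm_unit_rms_check():
    norm = InternRMSNorm(8)
    x = torch.randn(2, 4, 8)
    y = norm(x)
    return y.pow(2).mean(-1)  # values close to 1.0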
class InternVisionEmbeddings(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.class_embedding = nn.Parameter(
torch.randn(1, 1, self.embed_dim),
)
self.patch_embedding = nn.Conv2d(
in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
embeddings = embeddings + self.position_embedding.to(target_dtype)
return embeddings
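# Shape sketch for the embedding module: with the 224/14 defaults there are
# (224 // 14) ** 2 = 256 patches plus one class token, i.e. 257 positions.
# The tiny hidden_size below is illustrative only.
def _embedding_shape_check():
    emb = InternVisionEmbeddings(InternVisionConfig(hidden_size=64))
    out = emb(torch.randn(1, 3, 224, 224))
    return out.shape  # torch.Size([1, 257, 64])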
class InternAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.use_flash_attn = config.use_flash_attn and has_flash_attn
if config.use_flash_attn and not has_flash_attn:
print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
f' {self.num_heads}).'
)
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
self.attn_drop = nn.Dropout(config.attention_dropout)
self.proj_drop = nn.Dropout(config.dropout)
self.qk_normalization = config.qk_normalization
if self.qk_normalization:
self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
if self.use_flash_attn:
self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
self.proj = nn.Linear(self.embed_dim, self.embed_dim)
def _naive_attn(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
if self.qk_normalization:
B_, H_, N_, D_ = q.shape
q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
attn = ((q * self.scale) @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
qkv = self.qkv(x)
qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
if self.qk_normalization:
q, k, v = qkv.unbind(2)
q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
qkv = torch.stack([q, k, v], dim=2)
context, _ = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
)
outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
outs = self.proj_drop(outs)
return outs
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
return x
class InternMLP(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.act = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class InternVisionEncoderLayer(nn.Module):
def __init__(self, config: InternVisionConfig, drop_path_rate: float):
super().__init__()
self.embed_dim = config.hidden_size
self.intermediate_size = config.intermediate_size
self.attn = InternAttention(config)
self.mlp = InternMLP(config)
self.norm1 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.norm2 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
def forward(
self,
hidden_states: torch.Tensor,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
"""
hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
return hidden_states
class InternVisionEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InternEncoderLayer`].
Args:
config (`InternConfig`):
The corresponding vision configuration for the `InternEncoder`.
"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
self.layers = nn.ModuleList([
InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
self.gradient_checkpointing = True
def forward(
self,
inputs_embeds,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
encoder_layer,
hidden_states)
else:
layer_outputs = encoder_layer(
hidden_states,
)
hidden_states = layer_outputs
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states
)
class InternVisionModel(PreTrainedModel):
main_input_name = 'pixel_values'
config_class = InternVisionConfig
def __init__(self, config: InternVisionConfig):
super().__init__(config)
self.config = config
self.embeddings = InternVisionEmbeddings(config)
self.encoder = InternVisionEncoder(config)
def resize_pos_embeddings(self, old_size, new_size, patch_size):
pos_emb = self.embeddings.position_embedding
_, num_positions, embed_dim = pos_emb.shape
cls_emb = pos_emb[:, :1, :]
pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
self.embeddings.position_embedding = nn.Parameter(pos_emb)
logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
def get_input_embeddings(self):
return self.embeddings
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_embeds: Optional[torch.FloatTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None and pixel_embeds is None:
raise ValueError('You have to specify pixel_values or pixel_embeds')
if pixel_embeds is not None:
hidden_states = pixel_embeds
else:
if len(pixel_values.shape) == 4:
hidden_states = self.embeddings(pixel_values)
else:
raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs.last_hidden_state
pooled_output = last_hidden_state[:, 0, :]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
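# Minimal forward-pass sketch with a deliberately tiny config (not the released
# InternViT-6B sizes), just to illustrate the expected input/output shapes.
if __name__ == '__main__':
    tiny_cfg = InternVisionConfig(hidden_size=64, intermediate_size=128,
                                  num_attention_heads=4, num_hidden_layers=2,
                                  use_flash_attn=False)
    vit = InternVisionModel(tiny_cfg).eval()
    with torch.no_grad():
        out = vit(pixel_values=torch.randn(1, 3, 224, 224))
    print(out.last_hidden_state.shape, out.pooler_output.shape)  # (1, 257, 64) (1, 64)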
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .internvl_c_pytorch import load_internvl_c_pytorch
from .internvl_huggingface import (load_internvl_c_huggingface,
load_internvl_g_huggingface)
def load_internvl(model_name, pretrained, cache_dir, device):
if model_name == 'internvl_c_classification':
return load_internvl_c_pytorch(pretrained, device, 'classification')
elif model_name == 'internvl_c_retrieval':
return load_internvl_c_pytorch(pretrained, device, 'retrieval')
elif model_name == 'internvl_c_classification_hf':
return load_internvl_c_huggingface(pretrained, device, 'classification')
elif model_name == 'internvl_c_retrieval_hf':
return load_internvl_c_huggingface(pretrained, device, 'retrieval')
elif model_name == 'internvl_g_classification_hf':
return load_internvl_g_huggingface(pretrained, device, 'classification')
elif model_name == 'internvl_g_retrieval_hf':
return load_internvl_g_huggingface(pretrained, device, 'retrieval')
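# Dispatch sketch (the checkpoint path is a placeholder): model_name picks either the
# plain-PyTorch loader or one of the HuggingFace-format loaders registered above.
if __name__ == '__main__':
    model, transform, tokenizer = load_internvl(
        model_name='internvl_c_retrieval_hf',
        pretrained='/path/to/InternVL-14B-224px',  # placeholder checkpoint directory
        cache_dir='./cache',                        # accepted but unused by these loaders
        device='cuda',
    )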
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import torch
import torchvision.transforms as T
from torch import nn
from torchvision.transforms import InterpolationMode
from transformers import LlamaTokenizer
from .internvl_c import InternVL_C
try:
from .flash_attention import FlashAttention
except ImportError:
print('FlashAttention is not installed.')
class InternVLTokenizer(nn.Module):
def __init__(self, model_path):
super(InternVLTokenizer, self).__init__()
self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
self.tokenizer.pad_token = ' ' # allow padding
self.tokenizer.add_eos_token = True
def forward(self, text, prefix='summarize:'):
if type(text) == str:
text = prefix + text
elif type(text) == list:
text = [prefix + item for item in text]
text = self.tokenizer(text, return_tensors='pt', max_length=80, truncation=True, padding=True).input_ids
return text
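# Usage sketch for the tokenizer wrapper above. The path is a placeholder for a directory
# containing a LLaMA tokenizer; forward prepends 'summarize:' to each input text.
def _tokenizer_usage_sketch(model_path='/path/to/chinese_alpaca_lora_7b'):
    tok = InternVLTokenizer(model_path)
    ids = tok(['a photo of a cat', 'a photo of a dog'])
    return ids.shape  # (2, padded_length), truncated at max_length=80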
def build_transform(task, image_size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
if task == 'retrieval':
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=mean, std=std)])
else:
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
T.CenterCrop(image_size),
T.ToTensor(),
T.Normalize(mean=mean, std=std)])
return transform
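# Transform sketch: the 'retrieval' branch resizes directly to a square of side image_size,
# any other task resizes the shorter side and center-crops; both normalize with the
# ImageNet mean/std defaults above.
def _transform_shape_sketch():
    from PIL import Image
    transform = build_transform('retrieval', image_size=224)
    pixel_values = transform(Image.new('RGB', (640, 480))).unsqueeze(0)
    return pixel_values.shape  # torch.Size([1, 3, 224, 224])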
def get_model_and_transform(task, image_size, device):
llm_path = os.path.split(os.path.realpath(__file__))[0]
llm_path = os.path.join(llm_path, 'chinese_alpaca_lora_7b')
model = InternVL_C(img_size=image_size, layerscale_force_fp32=True, llm_path=llm_path)
model = model.to(torch.float16).to(device)
transform = build_transform(task, image_size)
return model, transform
def load_internvl_c_pytorch(ckpt_path, device, task, image_size=224):
llm_path = os.path.split(os.path.realpath(__file__))[0]
llm_path = os.path.join(llm_path, 'chinese_alpaca_lora_7b')
tokenizer = InternVLTokenizer(llm_path)
model, transform = get_model_and_transform(task=task, image_size=image_size, device=device)
ckpt = torch.load(ckpt_path, map_location='cpu')
model.load_state_dict(ckpt, strict=False)
return model, transform, tokenizer
# https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
import torch
import torch.nn as nn
from einops import rearrange
try: # v1
from flash_attn.flash_attn_interface import \
flash_attn_unpadded_qkvpacked_func
except ImportError:  # v2
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
from flash_attn.bert_padding import pad_input, unpad_input
class FlashAttention(nn.Module):
"""Implement the scaled dot product attention with softmax.
Arguments
---------
softmax_scale: The temperature to use for the softmax attention.
(default: 1/sqrt(d_keys) where d_keys is computed at
runtime)
attention_dropout: The dropout rate to apply to the attention
(default: 0.0)
"""
def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
super().__init__()
self.softmax_scale = softmax_scale
self.dropout_p = attention_dropout
def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
max_s=None, need_weights=False):
"""Implements the multihead softmax attention.
Arguments
---------
qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
if unpadded: (nnz, 3, h, d)
key_padding_mask: a bool tensor of shape (B, S)
"""
assert not need_weights
assert qkv.dtype in [torch.float16, torch.bfloat16]
assert qkv.is_cuda
if cu_seqlens is None:
batch_size = qkv.shape[0]
seqlen = qkv.shape[1]
if key_padding_mask is None:
qkv = rearrange(qkv, 'b s ... -> (b s) ...')
max_s = seqlen
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
device=qkv.device)
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
else:
nheads = qkv.shape[-2]
x = rearrange(qkv, 'b s three h d -> b s (three h d)')
x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
output_unpad = flash_attn_unpadded_qkvpacked_func(
x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
indices, batch_size, seqlen),
'b s (h d) -> b s h d', h=nheads)
else:
assert max_s is not None
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
return output, None
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from functools import partial
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from einops import rearrange
from timm.models.layers import DropPath, to_2tuple
from torch import nn
from transformers import LlamaConfig, LlamaForCausalLM
try:
from .flash_attention import FlashAttention
has_flash_attn = True
except ImportError:
print('FlashAttention is not installed.')
has_flash_attn = False
class CrossAttention(nn.Module):
def __init__(
self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
proj_drop=0., attn_head_dim=None, out_dim=None):
super().__init__()
if out_dim is None:
out_dim = dim
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim ** -0.5
assert all_head_dim == dim
self.q = nn.Linear(dim, all_head_dim, bias=False)
self.k = nn.Linear(dim, all_head_dim, bias=False)
self.v = nn.Linear(dim, all_head_dim, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.k_bias = None
self.v_bias = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, out_dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, k=None, v=None):
B, N, C = x.shape
N_k = k.shape[1]
N_v = v.shape[1]
q_bias, k_bias, v_bias = None, None, None
if self.q_bias is not None:
q_bias = self.q_bias
k_bias = self.k_bias
v_bias = self.v_bias
q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) # (B, N_head, N_q, dim)
k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
q = q * self.scale
attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
class AttentiveBlock(nn.Module):
def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm, attn_head_dim=None, out_dim=None):
super().__init__()
self.norm1_q = norm_layer(dim)
self.norm1_k = norm_layer(dim)
self.norm1_v = norm_layer(dim)
self.cross_attn = CrossAttention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop, attn_head_dim=attn_head_dim, out_dim=out_dim)
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None):
x_q = self.norm1_q(x_q + pos_q)
x_k = self.norm1_k(x_kv + pos_k)
x_v = self.norm1_v(x_kv)
x = self.cross_attn(x_q, k=x_k, v=x_v)
return x
class AttentionPoolingBlock(AttentiveBlock):
def forward(self, x):
x_q = x.mean(1, keepdim=True)
x_kv, pos_q, pos_k = x, 0, 0
x = super().forward(x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None)
x = x.squeeze(1)
return x
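# Pooling sketch: AttentionPoolingBlock turns the mean token into a single query that
# cross-attends over all tokens, collapsing (B, N, dim) to (B, out_dim). The small
# dimensions below are illustrative only.
def _attention_pooling_shape_sketch():
    pool = AttentionPoolingBlock(dim=64, num_heads=4, qkv_bias=True, qk_scale=None,
                                 drop=0., attn_drop=0., norm_layer=nn.LayerNorm, out_dim=32)
    return pool(torch.randn(2, 50, 64)).shape  # torch.Size([2, 32])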
class RMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
try:
from apex.normalization import FusedRMSNorm
RMSNorm = FusedRMSNorm # noqa
print('Discovered apex.normalization.FusedRMSNorm - will use it instead of RMSNorm')
except ImportError:
# using the normal RMSNorm
pass
except Exception:
print('discovered apex but it failed to load, falling back to RMSNorm')
pass
class LayerScale(nn.Module):
def __init__(self, dim, init_values=1e-5, inplace=False, force_fp32=False):
super().__init__()
self.inplace = inplace
self.gamma = nn.Parameter(init_values * torch.ones(dim))
self.force_fp32 = force_fp32
@torch.cuda.amp.autocast(enabled=False)
def forward(self, x):
if self.force_fp32:
output_type = x.dtype
out = x.float().mul_(self.gamma.float()) if self.inplace else x.float() * self.gamma.float()
return out.to(dtype=output_type)
else:
out = x.mul_(self.gamma) if self.inplace else x * self.gamma
return out
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., use_flash_attn=False,
causal=False, norm_layer=nn.LayerNorm, qk_normalization=False):
super().__init__()
assert dim % num_heads == 0, 'dim should be divisible by num_heads'
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.use_flash_attn = use_flash_attn
if use_flash_attn:
self.causal = causal
self.inner_attn = FlashAttention(attention_dropout=attn_drop)
self.qk_normalization = qk_normalization
self.q_norm = norm_layer(dim) if qk_normalization else nn.Identity()
self.k_norm = norm_layer(dim) if qk_normalization else nn.Identity()
def _naive_attn(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
if self.qk_normalization:
B_, H_, N_, D_ = q.shape
q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
attn = ((q * self.scale) @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
qkv = self.qkv(x)
qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
if self.qk_normalization:
q, k, v = qkv.unbind(2)
q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
qkv = torch.stack([q, k, v], dim=2)
context, _ = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=self.causal
)
outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
outs = self.proj_drop(outs)
return outs
def forward(self, x):
x = self._naive_attn(x) if not self.use_flash_attn else self._flash_attn(x)
return x
class Mlp(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU,
bias=True, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
bias = to_2tuple(bias)
drop_probs = to_2tuple(drop)
self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
self.act = act_layer()
self.drop1 = nn.Dropout(drop_probs[0])
self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
self.drop2 = nn.Dropout(drop_probs[1])
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop1(x)
x = self.fc2(x)
x = self.drop2(x)
return x
class Block(nn.Module):
def __init__(
self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., init_values=None,
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_flash_attn=False, with_cp=False,
qk_normalization=False, layerscale_force_fp32=False):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
use_flash_attn=use_flash_attn, causal=False, norm_layer=norm_layer,
qk_normalization=qk_normalization)
self.ls1 = LayerScale(dim, init_values=init_values,
force_fp32=layerscale_force_fp32) if init_values else nn.Identity()
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
self.ls2 = LayerScale(dim, init_values=init_values,
force_fp32=layerscale_force_fp32) if init_values else nn.Identity()
self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.with_cp = with_cp
def forward(self, x):
def _inner_forward(x):
x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
return x
if self.with_cp:
return checkpoint.checkpoint(_inner_forward, x)
else:
return _inner_forward(x)
class PatchEmbed(nn.Module):
""" 2D Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.flatten = flatten
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x, **kwargs):
x = self.proj(x)
_, _, H, W = x.shape
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x, H, W
class InternVL_C(nn.Module):
def __init__(self, in_chans=3, patch_size=14, img_size=224, qkv_bias=False, drop_path_rate=0.0,
embed_dim=3200, num_heads=25, mlp_ratio=4, init_values=0.1, qk_normalization=True, depth=48,
use_flash_attn=True, with_cp=True, layerscale_force_fp32=False, context_length: int = 80,
transformer_width=4096, llm_path=None, attn_pool_num_heads=16, clip_embed_dim=768):
super().__init__()
use_flash_attn = use_flash_attn and has_flash_attn
if use_flash_attn and not has_flash_attn:
print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
self.use_flash_attn = use_flash_attn
self.context_length = context_length
self.embed_dim = embed_dim
self.transformer_width = transformer_width
""" text encoder of InternVL """
llama_config = LlamaConfig.from_pretrained(llm_path)
model = LlamaForCausalLM(llama_config)
self.transformer = model.model
self.transformer.gradient_checkpointing = True
self.text_projection = nn.Parameter(torch.empty(transformer_width, clip_embed_dim))
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
""" image encoder of InternVL """
norm_layer_for_blocks = partial(RMSNorm, eps=1e-6)
self.norm_layer_for_blocks = norm_layer_for_blocks
self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
num_patches = self.patch_embed.num_patches
self.num_patches = num_patches
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
self.blocks = nn.ModuleList([
Block(embed_dim, num_heads, mlp_ratio, qkv_bias=qkv_bias,
norm_layer=norm_layer_for_blocks,
drop_path=dpr[i], init_values=init_values, attn_drop=0.,
use_flash_attn=use_flash_attn,
with_cp=with_cp,
qk_normalization=qk_normalization,
layerscale_force_fp32=layerscale_force_fp32)
for i in range(depth)])
self.clip_projector = AttentionPoolingBlock(
dim=embed_dim, num_heads=attn_pool_num_heads, qkv_bias=True, qk_scale=None,
drop=0., attn_drop=0., norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=clip_embed_dim)
@property
def dtype(self):
return self.patch_embed.proj.weight.dtype
def forward_features(self, x):
x, _, _ = self.patch_embed(x.type(self.dtype))
batch_size, seq_len, _ = x.size()
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x = x + self.pos_embed
for idx, blk in enumerate(self.blocks):
x = blk(x)
return x
def encode_image(self, image):
x = self.forward_features(image)
x = self.clip_projector(x)
return x
def encode_text(self, text):
text_key_padding_mask = text > 0
x = self.transformer(input_ids=text, attention_mask=text_key_padding_mask).last_hidden_state
x = x[torch.arange(x.shape[0]), text_key_padding_mask.sum(1) - 1]
x = x @ self.text_projection
return x
def forward(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)
# normalized features
image_features = image_features / image_features.norm(dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)
# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()
return logits_per_image, logits_per_text
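# Contrastive-head sketch: forward() above L2-normalizes both embeddings and scales the
# cosine similarities by exp(logit_scale); the same arithmetic on dummy features looks like this.
if __name__ == '__main__':
    image_features = F.normalize(torch.randn(4, 768), dim=1)
    text_features = F.normalize(torch.randn(4, 768), dim=1)
    logit_scale = torch.tensor(np.log(1 / 0.07)).exp()
    logits_per_image = logit_scale * image_features @ text_features.t()
    print(logits_per_image.shape)  # torch.Size([4, 4]); diagonal entries are the matched pairs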
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
from transformers import LlamaTokenizer
from .configuration_intern_vit import InternVisionConfig
from .configuration_internvl import InternVLConfig
from .modeling_intern_vit import InternVisionModel
from .modeling_internvl import InternVL_C, InternVL_G, InternVLModel
__all__ = ['InternVisionConfig', 'InternVisionModel', 'InternVLConfig',
'InternVLModel', 'InternVL_C', 'InternVL_G']
# Prefix the text "summarize:"
class InternVLTokenizer(nn.Module):
def __init__(self, model_path):
super(InternVLTokenizer, self).__init__()
self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
self.tokenizer.pad_token = ' ' # allow padding
self.tokenizer.add_eos_token = True
def forward(self, text, prefix='summarize:'):
if type(text) == str:
text = prefix + text
elif type(text) == list:
text = [prefix + item for item in text]
text = self.tokenizer(text, return_tensors='pt', max_length=80, truncation=True, padding='max_length').input_ids
return text
def build_transform(task, image_size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
if task == 'retrieval':
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=mean, std=std)])
else:
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
T.CenterCrop(image_size),
T.ToTensor(),
T.Normalize(mean=mean, std=std)])
return transform
def load_internvl_c_huggingface(ckpt_path, device, task):
model = InternVL_C.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
if model.config.use_backbone_lora:
model.vision_model.merge_and_unload()
model.vision_model = model.vision_model.model
if model.config.use_qllama_lora:
model.qllama.merge_and_unload()
model.qllama = model.qllama.model
if model.config.force_image_size is not None:
image_size = model.config.force_image_size
else:
image_size = model.config.vision_config.image_size
transform = build_transform(task, image_size)
tokenizer = InternVLTokenizer(ckpt_path)
return model, transform, tokenizer
def load_internvl_g_huggingface(ckpt_path, device, task):
model = InternVL_G.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
if model.config.use_backbone_lora:
model.vision_model.merge_and_unload()
model.vision_model = model.vision_model.model
if model.config.use_qllama_lora:
model.qllama.merge_and_unload()
model.qllama = model.qllama.model
if model.config.force_image_size is not None:
image_size = model.config.force_image_size
else:
image_size = model.config.vision_config.image_size
transform = build_transform(task, image_size)
tokenizer = InternVLTokenizer(ckpt_path)
return model, transform, tokenizer
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class InternVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
instantiate a vision encoder according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_channels (`int`, *optional*, defaults to 3):
Number of color channels in the input images (e.g., 3 for RGB).
patch_size (`int`, *optional*, defaults to 14):
The size (resolution) of each patch.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
qkv_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the queries, keys and values in the self-attention layers.
hidden_size (`int`, *optional*, defaults to 3200):
Dimensionality of the encoder layers and the pooler layer.
num_attention_heads (`int`, *optional*, defaults to 25):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 12800):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
qk_normalization (`bool`, *optional*, defaults to `True`):
Whether to normalize the queries and keys in the self-attention layers.
num_hidden_layers (`int`, *optional*, defaults to 48):
Number of hidden layers in the Transformer encoder.
use_flash_attn (`bool`, *optional*, defaults to `True`):
Whether to use the flash attention mechanism.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
drop_path_rate (`float`, *optional*, defaults to 0.0):
Dropout rate for stochastic depth.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 0.1):
A factor for layer scale.
"""
model_type = 'intern_vit_6b'
def __init__(
self,
num_channels=3,
patch_size=14,
image_size=224,
qkv_bias=False,
hidden_size=3200,
num_attention_heads=25,
intermediate_size=12800,
qk_normalization=True,
num_hidden_layers=48,
use_flash_attn=True,
hidden_act='gelu',
layer_norm_eps=1e-6,
dropout=0.0,
drop_path_rate=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.drop_path_rate = drop_path_rate
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
self.qk_normalization = qk_normalization
self.use_flash_attn = use_flash_attn
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if 'vision_config' in config_dict:
config_dict = config_dict['vision_config']
if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import copy
from transformers import LlamaConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from .configuration_intern_vit import InternVisionConfig
logger = logging.get_logger(__name__)
class InternVLConfig(PretrainedConfig):
r"""
[`InternVLConfig`] is the configuration class to store the configuration of a
[`InternVLModel`]. It is used to instantiate an InternVLModel according to the specified
arguments, defining the InternViT-6B and QLLaMA configs. Instantiating a configuration with
the defaults will yield a similar configuration to that of the InternVL architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`InternVisionConfig`].
qllama_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`LLaMAConfig`].
clip_embed_dim (`int`, *optional*, defaults to 768):
Size of the embeddings from the CLIP model.
attn_pool_num_heads (`int`, *optional*, defaults to 16):
Number of attention heads used in the attention pooling layers.
num_query_token (`int`, *optional*, defaults to 96):
Number of query tokens used in the transformer.
label_smoothing (`float`, *optional*, defaults to 0.0):
The amount of label smoothing to apply.
cross_attention_frequency (`int`, *optional*, defaults to 2):
The frequency of cross-attention layers in the model.
use_backbone_lora (`int`, *optional*, defaults to 0):
If non-zero, indicates the use of LoRA in the backbone of the model.
use_qllama_lora (`int`, *optional*, defaults to 0):
If non-zero, indicates the use of LoRA in the QLLaMA of the model.
force_image_size (`int` or `None`, *optional*):
If not None, forces the model to use this specific image size.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
kwargs (*optional*):
Dictionary of additional keyword arguments.
"""
model_type = 'internvl'
is_composition = True
def __init__(
self,
vision_config=None,
qllama_config=None,
clip_embed_dim=768,
attn_pool_num_heads=16,
num_query_token=96,
label_smoothing=0.0,
cross_attention_frequency=2,
use_backbone_lora=0,
use_qllama_lora=0,
force_image_size=None,
initializer_range=0.02,
**kwargs):
super().__init__(**kwargs)
if vision_config is None:
vision_config = {}
logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
if qllama_config is None:
qllama_config = {}
logger.info(
'qllama_config is None. Initializing the qllama_config with default values (`LlamaConfig`).')
self.vision_config = InternVisionConfig(**vision_config)
self.qllama_config = LlamaConfig(**qllama_config)
self.qllama_config.num_query_token = num_query_token
self.qllama_config.cross_attention_frequency = cross_attention_frequency
self.hidden_size = self.qllama_config.hidden_size
self.clip_embed_dim = clip_embed_dim
self.attn_pool_num_heads = attn_pool_num_heads
self.num_query_token = num_query_token
self.label_smoothing = label_smoothing
self.use_backbone_lora = use_backbone_lora
self.use_qllama_lora = use_qllama_lora
self.force_image_size = force_image_size
self.initializer_range = initializer_range
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
Returns:
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output['vision_config'] = self.vision_config.to_dict()
output['qllama_config'] = self.qllama_config.to_dict()
output['model_type'] = self.__class__.model_type
return output
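# Composition sketch: with no sub-configs supplied, the vision and QLLaMA parts fall back
# to their own defaults, and to_dict() re-nests them into the serialized dictionary.
if __name__ == '__main__':
    cfg = InternVLConfig(clip_embed_dim=768, num_query_token=96)
    d = cfg.to_dict()
    print(d['model_type'])                    # 'internvl'
    print(d['vision_config']['hidden_size'])  # 3200 (InternVisionConfig default)
    print(d['qllama_config']['hidden_size'])  # 4096 (LlamaConfig default)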
# https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
import torch
import torch.nn as nn
from einops import rearrange
try: # v1
from flash_attn.flash_attn_interface import \
flash_attn_unpadded_qkvpacked_func
except ImportError:  # v2
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
from flash_attn.bert_padding import pad_input, unpad_input
class FlashAttention(nn.Module):
"""Implement the scaled dot product attention with softmax.
Arguments
---------
softmax_scale: The temperature to use for the softmax attention.
(default: 1/sqrt(d_keys) where d_keys is computed at
runtime)
attention_dropout: The dropout rate to apply to the attention
(default: 0.0)
"""
def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
super().__init__()
self.softmax_scale = softmax_scale
self.dropout_p = attention_dropout
def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
max_s=None, need_weights=False):
"""Implements the multihead softmax attention.
Arguments
---------
qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
if unpadded: (nnz, 3, h, d)
key_padding_mask: a bool tensor of shape (B, S)
"""
assert not need_weights
assert qkv.dtype in [torch.float16, torch.bfloat16]
assert qkv.is_cuda
if cu_seqlens is None:
batch_size = qkv.shape[0]
seqlen = qkv.shape[1]
if key_padding_mask is None:
qkv = rearrange(qkv, 'b s ... -> (b s) ...')
max_s = seqlen
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
device=qkv.device)
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
else:
nheads = qkv.shape[-2]
x = rearrange(qkv, 'b s three h d -> b s (three h d)')
x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
output_unpad = flash_attn_unpadded_qkvpacked_func(
x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
indices, batch_size, seqlen),
'b s (h d) -> b s h d', h=nheads)
else:
assert max_s is not None
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
softmax_scale=self.softmax_scale, causal=causal
)
return output, None
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from timm.models.layers import DropPath
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (BaseModelOutput,
BaseModelOutputWithPooling)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_intern_vit import InternVisionConfig
try:
from .flash_attention import FlashAttention
has_flash_attn = True
except ImportError:
print('FlashAttention is not installed.')
has_flash_attn = False
logger = logging.get_logger(__name__)
class InternRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
try:
from apex.normalization import FusedRMSNorm
InternRMSNorm = FusedRMSNorm # noqa
logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
except ImportError:
# using the normal InternRMSNorm
pass
except Exception:
logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
pass
class InternVisionEmbeddings(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.class_embedding = nn.Parameter(
torch.randn(1, 1, self.embed_dim),
)
self.patch_embedding = nn.Conv2d(
in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
embeddings = embeddings + self.position_embedding.to(target_dtype)
return embeddings
class InternAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.use_flash_attn = config.use_flash_attn and has_flash_attn
if config.use_flash_attn and not has_flash_attn:
print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
f' {self.num_heads}).'
)
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
self.attn_drop = nn.Dropout(config.attention_dropout)
self.proj_drop = nn.Dropout(config.dropout)
self.qk_normalization = config.qk_normalization
if self.qk_normalization:
self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
if self.use_flash_attn:
self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
self.proj = nn.Linear(self.embed_dim, self.embed_dim)
def _naive_attn(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
if self.qk_normalization:
B_, H_, N_, D_ = q.shape
q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
attn = ((q * self.scale) @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
qkv = self.qkv(x)
qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
if self.qk_normalization:
q, k, v = qkv.unbind(2)
q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
qkv = torch.stack([q, k, v], dim=2)
context, _ = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
)
outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
outs = self.proj_drop(outs)
return outs
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
return x
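# Note (illustrative): both attention paths share the same forward interface. The flash path packs
# q/k/v as (b, s, 3, h, d) for FlashAttention, while the naive path materialises the full
# (B, heads, N, N) attention matrix. Assuming `config` is an InternVisionConfig:
#
#     attn = InternAttention(config)
#     y = attn(torch.randn(2, 257, config.hidden_size))  # -> (2, 257, config.hidden_size)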
class InternMLP(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.act = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class InternVisionEncoderLayer(nn.Module):
def __init__(self, config: InternVisionConfig, drop_path_rate: float):
super().__init__()
self.embed_dim = config.hidden_size
self.intermediate_size = config.intermediate_size
self.attn = InternAttention(config)
self.mlp = InternMLP(config)
self.norm1 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.norm2 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
def forward(
self,
hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`.
        """
hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
return hidden_states
class InternVisionEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InternEncoderLayer`].
Args:
config (`InternConfig`):
The corresponding vision configuration for the `InternEncoder`.
"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
self.layers = nn.ModuleList([
InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
self.gradient_checkpointing = True
def forward(
self,
inputs_embeds,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
encoder_layer,
hidden_states)
else:
layer_outputs = encoder_layer(
hidden_states,
)
hidden_states = layer_outputs
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states
)
class InternVisionModel(PreTrainedModel):
main_input_name = 'pixel_values'
config_class = InternVisionConfig
def __init__(self, config: InternVisionConfig):
super().__init__(config)
self.config = config
self.embeddings = InternVisionEmbeddings(config)
self.encoder = InternVisionEncoder(config)
def resize_pos_embeddings(self, old_size, new_size, patch_size):
pos_emb = self.embeddings.position_embedding
_, num_positions, embed_dim = pos_emb.shape
cls_emb = pos_emb[:, :1, :]
pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
self.embeddings.position_embedding = nn.Parameter(pos_emb)
logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
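    # Illustrative sketch: to run a checkpoint trained at 224x224 on 448x448 inputs, the grid of
    # position embeddings is bicubically interpolated while the class-token embedding is kept, e.g.
    #
    #     model.resize_pos_embeddings(old_size=224, new_size=448, patch_size=14)
    #     # position_embedding goes from (1, 1 + 16*16, C) to (1, 1 + 32*32, C)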
def get_input_embeddings(self):
return self.embeddings
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_embeds: Optional[torch.FloatTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None and pixel_embeds is None:
raise ValueError('You have to specify pixel_values or pixel_embeds')
if pixel_embeds is not None:
hidden_states = pixel_embeds
else:
if len(pixel_values.shape) == 4:
hidden_states = self.embeddings(pixel_values)
else:
raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
        last_hidden_state = encoder_outputs[0]  # index access works for both tuple and ModelOutput returns
pooled_output = last_hidden_state[:, 0, :]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
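# Usage sketch for the vision tower alone (illustrative; config values follow the InternVisionConfig
# defaults rather than any specific released checkpoint):
#
#     config = InternVisionConfig()
#     model = InternVisionModel(config).eval()
#     with torch.no_grad():
#         out = model(pixel_values=torch.randn(1, 3, config.image_size, config.image_size))
#     out.last_hidden_state.shape  # (1, 1 + (image_size // patch_size) ** 2, hidden_size)
#     out.pooler_output.shape      # (1, hidden_size) -- the class-token embedding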
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from functools import partial
from typing import Optional
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from peft import LoraConfig, get_peft_model
from timm.models.layers import DropPath
from torch import nn
from transformers import GenerationConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_internvl import InternVLConfig
from .modeling_intern_vit import (InternVisionEmbeddings, InternVisionEncoder,
InternVisionModel)
from .modeling_qllama import LlamaForCausalLM, _expand_mask, _make_causal_mask
try:
from .flash_attention import FlashAttention # v1/v2
except ImportError:
print('FlashAttention is not installed.')
logger = logging.get_logger(__name__)
class InternVLPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = InternVLConfig
base_model_prefix = 'internvl'
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [
r'position_ids',
]
_no_split_modules = ['InternAttention', 'LlamaDecoderLayer', 'LlamaForCausalLM']
_skip_keys_device_placement = 'past_key_values'
_keep_in_fp32_modules = ['wo']
def _init_weights(self, module):
"""Initialize the weights"""
factor = self.config.initializer_range
if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=factor)
if hasattr(module, 'bias') and module.bias is not None:
module.bias.data.zero_()
if isinstance(module, InternVisionEmbeddings):
if hasattr(self.config, 'vision_config'):
factor = self.config.vision_config.initializer_range
nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, InternVisionModel):
module.gradient_checkpointing = value
if isinstance(module, InternVisionEncoder):
module.gradient_checkpointing = value
class CrossAttention(nn.Module):
def __init__(
self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
proj_drop=0., attn_head_dim=None, out_dim=None):
super().__init__()
if out_dim is None:
out_dim = dim
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim ** -0.5
assert all_head_dim == dim
self.q = nn.Linear(dim, all_head_dim, bias=False)
self.k = nn.Linear(dim, all_head_dim, bias=False)
self.v = nn.Linear(dim, all_head_dim, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.k_bias = None
self.v_bias = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, out_dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, k=None, v=None):
B, N, C = x.shape
N_k = k.shape[1]
N_v = v.shape[1]
q_bias, k_bias, v_bias = None, None, None
if self.q_bias is not None:
q_bias = self.q_bias
k_bias = self.k_bias
v_bias = self.v_bias
q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) # (B, N_head, N_q, dim)
k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
q = q * self.scale
attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
class AttentiveBlock(nn.Module):
def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm, attn_head_dim=None, out_dim=None):
super().__init__()
self.norm1_q = norm_layer(dim)
self.norm1_k = norm_layer(dim)
self.norm1_v = norm_layer(dim)
self.cross_attn = CrossAttention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop, attn_head_dim=attn_head_dim, out_dim=out_dim)
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None):
x_q = self.norm1_q(x_q + pos_q)
x_k = self.norm1_k(x_kv + pos_k)
x_v = self.norm1_v(x_kv)
x = self.cross_attn(x_q, k=x_k, v=x_v)
return x
class AttentionPoolingBlock(AttentiveBlock):
def forward(self, x):
x_q = x.mean(1, keepdim=True)
x_kv, pos_q, pos_k = x, 0, 0
x = super().forward(x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None)
x = x.squeeze(1)
return x
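# Illustrative sketch: AttentionPoolingBlock pools a token sequence into a single vector by letting
# the mean token cross-attend to all tokens. With hypothetical sizes dim=1024, out_dim=512:
#
#     pool = AttentionPoolingBlock(dim=1024, num_heads=16, qkv_bias=True, qk_scale=None,
#                                  drop=0., attn_drop=0., norm_layer=nn.LayerNorm, out_dim=512)
#     pool(torch.randn(2, 257, 1024)).shape  # -> (2, 512)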
class InternVLModel(InternVLPreTrainedModel):
config_class = InternVLConfig
main_input_name = 'pixel_values'
def __init__(self, config: InternVLConfig):
super().__init__(config)
text_hidden_size = config.qllama_config.hidden_size
vision_hidden_size = config.vision_config.hidden_size
clip_embed_dim = config.clip_embed_dim
attn_pool_num_heads = config.attn_pool_num_heads
config.qllama_config.num_query_token = config.num_query_token
self.num_query_token = config.num_query_token
self.label_smoothing = config.label_smoothing
self.vision_model = InternVisionModel(config.vision_config) # frozen
self.qllama = LlamaForCausalLM(config.qllama_config) # frozen
self.query_tokens = nn.Parameter( # trainable
torch.zeros(1, config.num_query_token, text_hidden_size)
)
self.text_projection = nn.Parameter(torch.empty(text_hidden_size, clip_embed_dim)) # frozen
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) # trainable
self.clip_projector = AttentionPoolingBlock( # frozen
dim=vision_hidden_size, num_heads=attn_pool_num_heads, qkv_bias=True, qk_scale=None,
drop=0., attn_drop=0., norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=clip_embed_dim)
self.clip_projector2 = AttentionPoolingBlock( # trainable
dim=text_hidden_size, num_heads=attn_pool_num_heads, qkv_bias=True, qk_scale=None,
drop=0., attn_drop=0., norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=clip_embed_dim)
self.itm_head = nn.Linear(text_hidden_size, 2) # trainable
self.gradient_checkpointing = True
# Initialize weights and apply final processing
# self.post_init()
if config.use_backbone_lora:
self.wrap_backbone_lora(r=config.use_backbone_lora)
if config.use_qllama_lora:
self.wrap_qllama_lora(r=config.use_qllama_lora)
if config.force_image_size:
self.vision_model.resize_pos_embeddings(
old_size=config.vision_config.image_size,
new_size=config.force_image_size,
patch_size=config.vision_config.patch_size
)
def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
lora_config = LoraConfig(
r=r,
target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'],
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
)
self.vision_model = get_peft_model(self.vision_model, lora_config)
self.vision_model.print_trainable_parameters()
def wrap_qllama_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
lora_config = LoraConfig(
r=r,
target_modules=['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj'],
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
)
self.qllama = get_peft_model(self.qllama, lora_config)
self.qllama.print_trainable_parameters()
def get_input_embeddings(self):
return self.qllama.get_input_embeddings()
def set_input_embeddings(self, value):
self.qllama.set_input_embeddings(value)
def set_output_embeddings(self, new_embeddings):
self.qllama.set_output_embeddings(new_embeddings)
def get_output_embeddings(self) -> nn.Module:
return self.qllama.get_output_embeddings()
@torch.no_grad()
def generate(
self,
pixel_values: torch.FloatTensor,
input_ids: torch.FloatTensor,
attention_mask: torch.LongTensor,
generation_config: Optional[GenerationConfig] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**generate_kwargs,
) -> torch.LongTensor:
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict)
image_embeds = vision_outputs[0]
batch_size = image_embeds.shape[0]
input_embeds = self.get_input_embeddings()(input_ids)
query_tokens = self.query_tokens.repeat(batch_size, 1, 1)
input_embeds = torch.cat([query_tokens, input_embeds], dim=1)
image_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
attention_mask = torch.cat([image_attention_mask, attention_mask], dim=1)
outputs = self.qllama.generate(
inputs_embeds=input_embeds,
attention_mask=attention_mask,
vision_hidden_states=image_embeds,
generation_config=generation_config,
use_zero_attention_mask=True,
**generate_kwargs,
)
return outputs
def get_text_features(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
r"""
Returns:
text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that
contains the language model logits, the past key values and the hidden states if
`output_hidden_states=True`.
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
input_embeds = self.get_input_embeddings()(input_ids)
attention_mask = _expand_mask(attention_mask, input_embeds.dtype).to(
input_embeds.device) # [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask += _make_causal_mask(
(attention_mask.shape[0], attention_mask.shape[2]),
input_embeds.dtype,
device=input_embeds.device
)
        # when qllama has been wrapped with LoRA, self.qllama.model is the underlying
        # LlamaForCausalLM and the base LlamaModel sits one level deeper
        if type(self.qllama.model) == LlamaForCausalLM:
outputs = self.qllama.model.model.forward_train(
inputs_embeds=input_embeds,
vision_hidden_states=None,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
).last_hidden_state
else:
outputs = self.qllama.model.forward_train(
inputs_embeds=input_embeds,
vision_hidden_states=None,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
).last_hidden_state
return outputs
def get_image_features(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict)
image_embeds = vision_outputs[0]
backbone_embeds = image_embeds
batch_size = image_embeds.shape[0]
input_embeds = self.query_tokens.repeat(batch_size, 1, 1)
attention_mask = torch.ones(input_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
attention_mask = _expand_mask(attention_mask, input_embeds.dtype).to(
input_embeds.device) # [bsz, 1, tgt_seq_len, src_seq_len]
        # see the note in get_text_features: unwrap one extra level when qllama is LoRA-wrapped
        if type(self.qllama.model) == LlamaForCausalLM:
outputs = self.qllama.model.model.forward_train(
inputs_embeds=input_embeds,
vision_hidden_states=image_embeds,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
).last_hidden_state
else:
outputs = self.qllama.model.forward_train(
inputs_embeds=input_embeds,
vision_hidden_states=image_embeds,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
).last_hidden_state
return backbone_embeds, outputs
def encode_image(self, image, mode):
if mode == 'InternVL-C':
vision_outputs = self.vision_model(
pixel_values=image,
output_hidden_states=False,
return_dict=True)
image_embeds = vision_outputs[0]
image_embeds = self.clip_projector(image_embeds)
elif mode == 'InternVL-G':
backbone_embeds, image_embeds = self.get_image_features(
pixel_values=image,
output_hidden_states=False,
return_dict=True,
)
backbone_embeds = self.clip_projector(backbone_embeds)
image_embeds = self.clip_projector2(image_embeds)
# ensemble
backbone_embeds = backbone_embeds / backbone_embeds.norm(dim=1, keepdim=True)
image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True)
image_embeds = image_embeds + backbone_embeds
else:
raise NotImplementedError
return image_embeds
def encode_text(self, text):
attention_mask = text > 0
text_embeds = self.get_text_features(
input_ids=text,
attention_mask=attention_mask,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
)
text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
text_embeds = text_embeds @ self.text_projection
return text_embeds
def forward(self, image, text, mode='InternVL-C'):
assert mode in ['InternVL-C', 'InternVL-G'], 'mode must be InternVL-C or InternVL-G'
image_features = self.encode_image(image, mode)
text_features = self.encode_text(text)
# normalized features
image_features = image_features / image_features.norm(dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)
# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()
return logits_per_image, logits_per_text
class InternVL_C(InternVLModel):
def encode_image(self, image):
vision_outputs = self.vision_model(
pixel_values=image,
output_hidden_states=False,
return_dict=True)
image_embeds = vision_outputs[0]
image_embeds = self.clip_projector(image_embeds)
return image_embeds
def encode_text(self, text):
attention_mask = text > 0
text_embeds = self.get_text_features(
input_ids=text,
attention_mask=attention_mask,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
)
text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
text_embeds = text_embeds @ self.text_projection
return text_embeds
def forward(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)
# normalized features
image_features = image_features / image_features.norm(dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)
# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()
return logits_per_image, logits_per_text
class InternVL_G(InternVLModel):
def encode_image(self, image):
backbone_embeds, image_embeds = self.get_image_features(
pixel_values=image,
output_hidden_states=False,
return_dict=True,
)
backbone_embeds = self.clip_projector(backbone_embeds)
image_embeds = self.clip_projector2(image_embeds)
# ensemble
backbone_embeds = backbone_embeds / backbone_embeds.norm(dim=1, keepdim=True)
image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True)
image_embeds = image_embeds + backbone_embeds
return image_embeds
def encode_text(self, text):
attention_mask = text > 0
text_embeds = self.get_text_features(
input_ids=text,
attention_mask=attention_mask,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
)
text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
text_embeds = text_embeds @ self.text_projection
return text_embeds
def forward(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)
# normalized features
image_features = image_features / image_features.norm(dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)
# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()
return logits_per_image, logits_per_text
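# Usage sketch (illustrative; tokenization and image preprocessing are handled elsewhere in this
# repo, and `pixel_values` / `input_ids` are placeholder names). The retrieval heads behave like
# CLIP: image and text features are L2-normalised and combined into a temperature-scaled
# similarity matrix:
#
#     model = InternVLModel(config).eval()                        # config: InternVLConfig
#     logits_per_image, logits_per_text = model(pixel_values, input_ids, mode='InternVL-C')
#     probs = logits_per_image.softmax(dim=-1)                    # image-to-text retrieval scores
#
# 'InternVL-C' scores images with the frozen clip_projector only, while 'InternVL-G' additionally
# runs the query tokens through QLLaMA and adds the two normalised embeddings as an ensemble.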
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch QLLaMA model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import LlamaConfig
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (BaseModelOutputWithPast,
CausalLMOutputWithPast)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (add_start_docstrings,
add_start_docstrings_to_model_forward, logging,
replace_return_docstrings)
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = 'LlamaConfig'
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
"""
    Make causal mask used for uni-directional (causal) self-attention.
"""
bsz, tgt_len = input_ids_shape
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
mask_cond = torch.arange(mask.size(-1), device=device)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
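# Illustrative example: for tgt_len=3 and no cached key/values the mask (masked entries shown as
# -inf; the code uses torch.finfo(dtype).min) looks like
#
#     [[0, -inf, -inf],
#      [0,    0, -inf],
#      [0,    0,    0]]
#
# i.e. position i may only attend to positions <= i; cached positions (past_key_values_length)
# are prepended as zero columns so they stay visible to every query.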
# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
class LlamaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
LlamaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
# convert into half-precision if necessary
if self.weight.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.to(self.weight.dtype)
return self.weight * hidden_states
try:
from functools import partial
from apex.normalization import FusedRMSNorm
LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa
print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm')
except ImportError:
# using the normal LlamaRMSNorm
pass
except Exception:
print('discovered apex but it failed to load, falling back to LlamaRMSNorm')
pass
class LlamaRotaryEmbedding(torch.nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
self.register_buffer('inv_freq', inv_freq)
# Build here to make `torch.jit.trace` work.
self.max_seq_len_cached = max_position_embeddings
t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
freqs = torch.einsum('i,j->ij', t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
# This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
if seq_len > self.max_seq_len_cached:
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
freqs = torch.einsum('i,j->ij', t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
return (
self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
)
class FixedLlamaRotaryEmbedding(torch.nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
freqs = torch.outer(t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
)
LlamaRotaryEmbedding = FixedLlamaRotaryEmbedding
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1]
gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
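# Illustrative sketch: cos/sin come from LlamaRotaryEmbedding as (1, 1, seq_len, head_dim) tensors
# and are gathered per position before being applied to q and k, e.g.
#
#     rope = LlamaRotaryEmbedding(dim=128)
#     q = k = torch.randn(2, 32, 16, 128)                      # (bsz, heads, seq_len, head_dim)
#     cos, sin = rope(q, seq_len=16)
#     position_ids = torch.arange(16).unsqueeze(0).expand(2, -1)
#     q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)  # same shapes as q, k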
class LlamaMLP(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
):
super().__init__()
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
self.act_fn = ACT2FN[hidden_act]
def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
class LlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: LlamaConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.max_position_embeddings = config.max_position_embeddings
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
f' and `num_heads`: {self.num_heads}).'
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
# [bsz, nh, t, hd]
if past_key_value is not None:
# reuse k, v, self_attention
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states) if use_cache else None
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
                f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
f' {attn_weights.size()}'
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
f' {attn_output.size()}'
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class LlamaCrossAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: LlamaConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.max_position_embeddings = config.max_position_embeddings
self.vision_hidden_size = 3200
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
f' and `num_heads`: {self.num_heads}).'
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self.norm1 = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.k_proj = nn.Linear(self.vision_hidden_size, self.num_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.vision_hidden_size, self.num_heads * self.head_dim, bias=False)
self.norm2 = LlamaRMSNorm(self.vision_hidden_size, eps=config.rms_norm_eps)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
vision_hidden_states: torch.Tensor,
repeat_time: int = 1,
attention_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
hidden_states = self.norm1(hidden_states)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
vision_hidden_states = self.norm2(vision_hidden_states)
bs_v, kv_len, _ = vision_hidden_states.size()
key_states = self.k_proj(vision_hidden_states).view(
bs_v, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = self.v_proj(vision_hidden_states).view(
bs_v, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.repeat(repeat_time, 1, 1, 1)
value_states = value_states.repeat(repeat_time, 1, 1, 1)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
if past_key_value is not None:
# reuse k, v, self_attention
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states) if use_cache else None
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
                f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
f' {attn_weights.size()}'
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
f' {attn_output.size()}'
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class LlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig, use_cross_attn: bool):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = LlamaAttention(config=config)
self.cross_attn = LlamaCrossAttention(config=config) if use_cross_attn else None
self.mlp = LlamaMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
)
        self.num_query_token = 96  # hard-coded number of learnable query tokens prepended to the text sequence
self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
vision_hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
repeat_time: int = 1,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            vision_hidden_states (`torch.FloatTensor`, *optional*): visual features that the query tokens cross-attend to.
            repeat_time (`int`, *optional*, defaults to 1): number of times the vision keys/values are repeated along the batch dimension.
        """
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
        # during cached generation the incremental hidden_states has sequence length 1 (shorter than
        # num_query_token), so the query-token cross attention is skipped
if self.cross_attn is not None and hidden_states.size(1) >= self.num_query_token \
and vision_hidden_states is not None:
query_feats = hidden_states[:, :self.num_query_token, :]
text_feats = hidden_states[:, self.num_query_token:, :]
residual = query_feats
query_feats, _, _ = self.cross_attn(
hidden_states=query_feats,
vision_hidden_states=vision_hidden_states,
attention_mask=None, # not use attention mask in cross attention
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
repeat_time=repeat_time,
)
query_feats = residual + query_feats
hidden_states = torch.cat([query_feats, text_feats], dim=1)
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
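# Illustrative note: in the cross-attention layers the sequence is laid out as
# [query tokens | text tokens]. Only the first num_query_token positions attend to the vision
# features; the text part passes through untouched before the two halves are re-concatenated:
#
#     hidden_states = torch.cat([query_feats, text_feats], dim=1)  # (bsz, 96 + text_len, hidden_size)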
LLAMA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`LlamaConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
'The bare LLaMA Model outputting raw hidden-states without any specific head on top.',
LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
config_class = LlamaConfig
base_model_prefix = 'model'
supports_gradient_checkpointing = True
_no_split_modules = ['LlamaDecoderLayer']
_keys_to_ignore_on_load_unexpected = [r'decoder\.version']
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, LlamaModel):
module.gradient_checkpointing = value
if isinstance(module, LlamaDecoderLayer):
module.gradient_checkpointing = value
LLAMA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
'The bare LLaMA Model outputting raw hidden-states without any specific head on top.',
LLAMA_START_DOCSTRING,
)
class LlamaModel(LlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.cross_attention_frequency = config.cross_attention_frequency
self.num_query_token = config.num_query_token
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
use_cross_attn = [idx % self.cross_attention_frequency == 0 for idx in range(config.num_hidden_layers)]
self.layers = nn.ModuleList(
[LlamaDecoderLayer(config, use_cross_attn[idx]) for idx in range(config.num_hidden_layers)])
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
# self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
device=inputs_embeds.device,
past_key_values_length=past_key_values_length,
)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
inputs_embeds.device
)
combined_attention_mask = (
expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_hidden_states: Optional[torch.FloatTensor] = None,
repeat_time: Optional[int] = 1,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
use_zero_attention_mask: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError('You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time')
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError('You have to specify either decoder_input_ids or decoder_inputs_embeds')
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = seq_length_with_past + past_key_values_length
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
if attention_mask is None:
attention_mask = torch.ones(
(batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
)
attention_mask = self._prepare_decoder_attention_mask(
attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
)
if use_zero_attention_mask:
attention_mask[:, :, :self.num_query_token, :self.num_query_token] = 0
hidden_states = inputs_embeds
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
'`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
)
use_cache = False
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = past_key_values[idx] if past_key_values is not None else None
layer_outputs = decoder_layer(
hidden_states,
vision_hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
repeat_time=repeat_time,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
def forward_train(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_hidden_states: Optional[torch.FloatTensor] = None,
repeat_time: Optional[int] = 1,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError('You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time')
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError('You have to specify either decoder_input_ids or decoder_inputs_embeds')
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = seq_length_with_past + past_key_values_length
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
# if attention_mask is None:
# attention_mask = torch.ones(
# (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
# )
# attention_mask = self._prepare_decoder_attention_mask(
# attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
# )
hidden_states = inputs_embeds
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
'`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
)
use_cache = False
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = past_key_values[idx] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, output_attentions, None, repeat_time)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
vision_hidden_states,
attention_mask,
position_ids,
None,
)
else:
layer_outputs = decoder_layer(
hidden_states,
vision_hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
repeat_time=repeat_time,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class LlamaForCausalLM(LlamaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.model = LlamaModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
# self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_hidden_states: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
use_zero_attention_mask: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example:
```python
>>> from transformers import AutoTokenizer, LlamaForCausalLM
>>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
vision_hidden_states=vision_hidden_states,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
use_zero_attention_mask=use_zero_attention_mask,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None,
vision_hidden_states=None, use_zero_attention_mask=None, **kwargs
):
if past_key_values:
input_ids = input_ids[:, -1:]
position_ids = kwargs.get('position_ids', None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {'inputs_embeds': inputs_embeds}
else:
model_inputs = {'input_ids': input_ids}
model_inputs.update(
{
'position_ids': position_ids,
'past_key_values': past_key_values,
'use_cache': kwargs.get('use_cache'),
'attention_mask': attention_mask,
'vision_hidden_states': vision_hidden_states,
'use_zero_attention_mask': use_zero_attention_mask,
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past
from typing import Dict
import torch
class DictTensor:
"""
    Wrap a dict of tensors so that `tokenizer(texts).to(device)` works as it does for a plain tensor.
"""
def __init__(self, d: Dict[str, torch.Tensor]):
self.d = d
def to(self, device):
return {k: v.to(device) for k, v in self.d.items()}
class JaCLIPForBenchmark:
"""
    Adapt the rinna model to the benchmark interface: `encode_text(dict_tensor)` and `encode_image(image)`.
"""
def __init__(self, model):
self.model = model
def encode_text(self, dict_tensor):
return self.model.get_text_features(**dict_tensor)
def encode_image(self, image):
return self.model.get_image_features(image)
def load_japanese_clip(pretrained: str, device='cpu', **kwargs):
"""
Load Japanese CLIP/CLOOB by rinna (https://github.com/rinnakk/japanese-clip)
Remarks:
        - `model.encode_text()` only works correctly when attention_mask and position_ids are passed alongside input_ids.
"""
try:
import japanese_clip as ja_clip
except ImportError:
raise ImportError('Install `japanese_clip` by `pip install git+https://github.com/rinnakk/japanese-clip.git`')
cache_dir = kwargs.pop('cache_dir', None)
model, transform = ja_clip.load(pretrained, device=device, cache_dir=cache_dir)
class JaTokenizerForBenchmark:
        def __init__(self):
self.tokenizer = ja_clip.load_tokenizer()
def __call__(self, texts) -> Dict[str, torch.Tensor]:
inputs = ja_clip.tokenize(texts, tokenizer=self.tokenizer, device='cpu')
return DictTensor(inputs)
def __len__(self):
return len(self.tokenizer)
return JaCLIPForBenchmark(model), transform, JaTokenizerForBenchmark()
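# Minimal usage sketch for the wrappers above. The checkpoint name
# 'rinna/japanese-clip-vit-b-16' and the 224x224 dummy image are illustrative assumptions,
# not values pinned by this file. DictTensor lets the tokenizer output be moved like a
# plain tensor, and JaCLIPForBenchmark.encode_text forwards input_ids, attention_mask and
# position_ids together, as the remark in load_japanese_clip requires.
if __name__ == '__main__':
    from PIL import Image

    model, transform, tokenizer = load_japanese_clip('rinna/japanese-clip-vit-b-16', device='cpu')
    text_inputs = tokenizer(['犬の写真']).to('cpu')  # "a photo of a dog"
    image_input = transform(Image.new('RGB', (224, 224))).unsqueeze(0)
    with torch.no_grad():
        print(model.encode_text(text_inputs).shape, model.encode_image(image_input).shape)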
import open_clip
def load_open_clip(model_name: str = 'ViT-B-32-quickgelu', pretrained: str = 'laion400m_e32', cache_dir: str = None,
device='cpu'):
model, _, transform = open_clip.create_model_and_transforms(model_name, pretrained=pretrained, cache_dir=cache_dir)
model = model.to(device)
tokenizer = open_clip.get_tokenizer(model_name)
return model, transform, tokenizer
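# Minimal usage sketch, assuming the default ViT-B-32-quickgelu / laion400m_e32 weights are
# already cached or can be downloaded; it just encodes a small text batch with the returned
# (model, transform, tokenizer) triple.
if __name__ == '__main__':
    import torch

    model, transform, tokenizer = load_open_clip(device='cpu')
    tokens = tokenizer(['a photo of a dog', 'a photo of a cat'])
    with torch.no_grad():
        text_features = model.encode_text(tokens)
    print(text_features.shape)  # ViT-B-32 text embeddings are 512-dimensional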
# Convert CLIP_benchmark datasets to webdataset format
import argparse
import io
import os
import sys
import torch
import torch.utils.data
import webdataset
from tqdm import tqdm
from .datasets.builder import build_dataset
def get_parser_args():
parser = argparse.ArgumentParser(description="""
Convert a CLIP_benchmark dataset to the webdataset format (TAR files).
Datasets can be uploaded to the Huggingface Hub to allow CLIP model
evaluation from anywhere with an Internet connection.
To convert other image classification datasets, use the Python API:
>>> import clip_benchmark.webdataset_builder
>>> help(clip_benchmark.webdataset_builder.convert_dataset)
""")
# Main arguments
parser.add_argument('--dataset', '-d', required=True, type=str,
help='CLIP_benchmark compatible dataset for conversion')
parser.add_argument('--split', '-s', default='test', type=str,
help='Dataset split to use')
parser.add_argument('--dataset-root', '-r', default='data', type=str,
help='Root directory for input data')
parser.add_argument('--output', '-o', required=True, type=str,
help='Root directory for output data')
# Special dataset types
parser_special = parser.add_mutually_exclusive_group()
parser_special.add_argument('--retrieval', action='store_true',
help='Flag to signal retrieval dataset (text captions instead of classes)')
parser_special.add_argument('--multilabel', action='store_true',
help='Flag to signal multilabel classification dataset')
# Additional parameters
parser.add_argument('--image-format', default='webp', type=str,
help='Image extension for saving: (lossless) webp, png, or jpg (Default: webp)')
parser.add_argument('--max-count', default=10_000, type=int,
help='Maximum number of images per TAR shard (Default: 10_000)')
parser.add_argument('--max-size', default=1_000_000_000, type=int,
help='Maximum size in bytes per TAR shard (Default: 1_000_000_000)')
args = parser.parse_args()
return args
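# Example invocation (a sketch: 'cifar10' and the output path are placeholders, and the
# module path follows the Python API mentioned in the parser description above):
#   python -m clip_benchmark.webdataset_builder \
#       --dataset cifar10 --split test --dataset-root data --output wds/cifar10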
def main():
args = get_parser_args()
run(args)
def run(args):
# Setup dataset folder
os.makedirs(os.path.join(args.output, args.split), exist_ok=True)
# Load original dataset
dataset = build_dataset(
dataset_name=args.dataset,
root=args.dataset_root,
split=args.split,
transform=PIL_to_bytes(args.image_format),
download=True,
)
# Run conversion
if args.retrieval:
convert_retrieval_dataset(
dataset,
args.split,
args.output,
transform=None,
image_format=args.image_format,
max_count=args.max_count,
max_size=args.max_size
)
else:
convert_dataset(
dataset,
args.split,
args.output,
transform=None,
image_format=args.image_format,
max_count=args.max_count,
max_size=args.max_size,
multilabel=args.multilabel,
)
def PIL_to_bytes(image_format):
OPTIONS = {
'webp': dict(format='webp', lossless=True),
'png': dict(format='png'),
'jpg': dict(format='jpeg'),
}
def transform(image):
bytestream = io.BytesIO()
image.save(bytestream, **OPTIONS[image_format])
return bytestream.getvalue()
return transform
def path_to_bytes(filepath):
with open(filepath, 'rb') as fp:
return fp.read()
def convert_dataset(dataset, split, output_folder, *, transform=None,
image_format='webp', max_count=10_000, max_size=1_000_000_000,
multilabel=False, verbose=True):
"""
Convert an iterable `dataset` of (image, label) pairs to webdataset (.tar) format, and store in `output_folder/split`.
Images may be passed in as either:
* File paths: pass in `transform=path_to_bytes`;
* PIL images: pass in `transform=PIL_to_bytes(image_format)` where `image_format` is e.g. "webp"; or
* Raw binary data: use a PyTorch `Dataset` that supports `transform=PIL_to_bytes(image_format)`, and pass in `transform=None` here.
Be sure that the transform is not applied twice.
Copying image files directly or writing raw binary data is fastest since it allows multiprocessing;
passing in PIL images will be slower, but should work for any format of dataset.
Labels must be zero-indexed integers (for multilabel datasets, labels must be arrays/tensors).
Classnames and zero-shot classification templates can be provided as attributes of the dataset (`.classes` and `.templates`)
or filled in manually afterward. `dataset.classes` should be a list of strings indexed by the labels,
and `dataset.templates` should be a list of strings containing `{c}` to specify where classnames are to be inserted.
"""
# Create output directory
os.makedirs(os.path.join(output_folder, split), exist_ok=True)
# Multiprocessed dataloader, should work with Dataset or list
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=1,
num_workers=8,
collate_fn=lambda batch: batch[0] # No collate, only for multiprocessing
)
if verbose:
try:
print(f'Dataset size: {len(dataset)}')
except TypeError:
print('IterableDataset has no len()')
# Save classnames
if hasattr(dataset, 'classes') and dataset.classes:
classnames_fname = os.path.join(output_folder, 'classnames.txt')
with open(classnames_fname, 'w') as classnames_file:
print(*dataset.classes, sep='\n', end='\n', file=classnames_file)
if verbose:
print("Saved class names to '%s'" % classnames_fname)
elif verbose:
print('WARNING: No class names found')
# Save zeroshot templates
if hasattr(dataset, 'templates') and dataset.templates:
templates_fname = os.path.join(output_folder, 'zeroshot_classification_templates.txt')
with open(templates_fname, 'w') as templates_file:
print(*dataset.templates, sep='\n', end='\n', file=templates_file)
if verbose:
print("Saved class names to '%s'" % templates_fname)
elif verbose:
print('WARNING: No zeroshot classification templates found')
# Save dataset type
if multilabel:
type_fname = os.path.join(output_folder, 'dataset_type.txt')
with open(type_fname, 'w') as type_file:
print('multilabel', end='\n', file=type_file)
if verbose:
print("Saved dataset type to '%s'" % type_fname)
# Write to TAR files
data_fname = os.path.join(output_folder, split, r'%d.tar')
sink = webdataset.ShardWriter(
data_fname,
maxcount=max_count,
maxsize=max_size
)
nsamples = 0
label_type = 'npy' if multilabel else 'cls'
for index, (input, output) in enumerate(tqdm(dataloader, desc='Converting')):
nsamples += 1
if isinstance(input, str) and transform is path_to_bytes:
# If copying file, determine image format from extension
extension = os.path.splitext(input)[1].replace('.', '').lower().replace('jpeg', 'jpg') or image_format
else:
extension = image_format
# Convert label if necessary
if isinstance(output, torch.Tensor):
if multilabel:
output = output.detach().cpu().numpy()
else:
output = output.item()
# Write example
sink.write({
'__key__': 's%07d' % index,
extension: transform(input) if transform else input,
label_type: output,
})
num_shards = sink.shard
sink.close()
if verbose:
print("Saved dataset to '%s'" % data_fname.replace(r'%d', '{0..%d}' % (num_shards - 1)))
# Save number of shards
nshards_fname = os.path.join(output_folder, split, 'nshards.txt')
with open(nshards_fname, 'w') as nshards_file:
print(num_shards, end='\n', file=nshards_file)
if verbose:
print("Saved number of shards = %d to '%s'" % (num_shards, nshards_fname))
print('Final dataset size:', nsamples)
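# Minimal sketch of the file-path route described in the docstring above: a plain list of
# (filepath, label) pairs is a valid `dataset`, because the dataloader's collate_fn just
# unwraps single-item batches. The paths, labels, and output folder are placeholders.
def _example_convert_from_files():
    samples = [
        ('images/cat/0001.jpg', 0),  # hypothetical files on disk
        ('images/dog/0001.jpg', 1),
    ]
    convert_dataset(samples, split='test', output_folder='wds/example',
                    transform=path_to_bytes)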
def convert_retrieval_dataset(dataset, split, output_folder, *, transform=None, image_format='webp', max_count=10_000,
max_size=1_000_000_000, verbose=True):
"""
Convert an iterable `dataset` of (image, [caption1, caption2, ...]) pairs to webdataset (.tar) format, and store in `output_folder/split`.
Labels must be lists of strings, with no newlines.
Read the documentation of `convert_dataset` for more information.
"""
# Create output directory
os.makedirs(os.path.join(output_folder, split), exist_ok=True)
# Multiprocessed dataloader, should work with Dataset or list
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=1,
num_workers=8,
collate_fn=lambda batch: batch[0] # No collate, only for multiprocessing
)
if verbose:
try:
print(f'Dataset size: {len(dataset)}')
except TypeError:
print('IterableDataset has no len()')
# No classnames
# No zeroshot templates
# Save dataset type
type_fname = os.path.join(output_folder, 'dataset_type.txt')
with open(type_fname, 'w') as type_file:
print('retrieval', end='\n', file=type_file)
if verbose:
print("Saved dataset type to '%s'" % type_fname)
# Write to TAR files
data_fname = os.path.join(output_folder, split, r'%d.tar')
sink = webdataset.ShardWriter(
data_fname,
maxcount=max_count,
maxsize=max_size
)
nsamples = 0
for index, (input, output) in enumerate(tqdm(dataloader, desc='Converting')):
nsamples += 1
if isinstance(input, str) and transform is path_to_bytes:
# If copying file, determine image format from extension
extension = os.path.splitext(input)[1].replace('.', '').lower().replace('jpeg', 'jpg') or image_format
else:
extension = image_format
sink.write({
'__key__': 's%07d' % index,
extension: transform(input) if transform else input,
'txt': '\n'.join(caption.replace('\n', r'\n') for caption in output),
})
num_shards = sink.shard
sink.close()
if verbose:
print("Saved dataset to '%s'" % data_fname.replace(r'%d', '{0..%d}' % (num_shards - 1)))
# Save number of shards
nshards_fname = os.path.join(output_folder, split, 'nshards.txt')
with open(nshards_fname, 'w') as nshards_file:
print(num_shards, end='\n', file=nshards_file)
if verbose:
print("Saved number of shards = %d to '%s'" % (num_shards, nshards_fname))
print('Final dataset size:', nsamples)
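# Companion sketch for the retrieval route: each item is (image, [caption, ...]) and the
# captions end up newline-joined in the 'txt' field. The in-memory PIL image and output
# folder below are placeholders; `PIL_to_bytes('webp')` serializes the image as lossless WebP.
def _example_convert_retrieval():
    from PIL import Image
    samples = [(Image.new('RGB', (64, 64)), ['an all-black square', 'nothing to see here'])]
    convert_retrieval_dataset(samples, split='test', output_folder='wds/retrieval_example',
                              transform=PIL_to_bytes('webp'))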
if __name__ == '__main__':
sys.exit(main())
path
Coopers_Hawk/0561.jpg
Coopers_Hawk/0629.jpg
Coopers_Hawk/0717.jpg
Coopers_Hawk/1847.jpg
Northern_Goshawk/2629.jpg
Northern_Goshawk/3329.jpg
Northern_Goshawk/3387.jpg
Northern_Goshawk/3413.jpg
Northern_Goshawk/3616.jpg
Sharp_shinned_Hawk/3785.jpg
Sharp_shinned_Hawk/3786.jpg
Sharp_shinned_Hawk/4940.jpg
Sharp_shinned_Hawk/5003.jpg
Golden_Eagle/5479.jpg
Golden_Eagle/5696.jpg
Golden_Eagle/5978.jpg
Golden_Eagle/7006.jpg
White_tailed_Hawk/7839.jpg
White_tailed_Hawk/7844.jpg
White_tailed_Hawk/8062.jpg
White_tailed_Hawk/8164.jpg
Zone_tailed_Hawk/8306.jpg
Red_tailed_Hawk/9193.jpg
Red_tailed_Hawk/9201.jpg
Red_tailed_Hawk/9924.jpg
Rough_legged_Hawk/11186.jpg
Rough_legged_Hawk/11336.jpg
Rough_legged_Hawk/11897.jpg
Rough_legged_Hawk/12960.jpg
Rough_legged_Hawk/13233.jpg
Red_shouldered_Hawk/13423.jpg
Red_shouldered_Hawk/14132.jpg
Red_shouldered_Hawk/15476.jpg
Red_shouldered_Hawk/15484.jpg
Broad_winged_Hawk/15781.jpg
Broad_winged_Hawk/15914.jpg
Broad_winged_Hawk/16173.jpg
Broad_winged_Hawk/16557.jpg
Broad_winged_Hawk/16558.jpg
Swainsons_Hawk/18130.jpg
Swainsons_Hawk/18135.jpg
Swainsons_Hawk/18592.jpg
Common_Black_Hawk/20559.jpg
Common_Black_Hawk/20611.jpg
Common_Black_Hawk/20699.jpg
Common_Black_Hawk/20705.jpg
Northern_Harrier/20822.jpg
Northern_Harrier/21126.jpg
Northern_Harrier/21396.jpg
Northern_Harrier/21604.jpg
Northern_Harrier/21799.jpg
Swallow_tailed_Kite/23481.jpg
Swallow_tailed_Kite/23501.jpg
Swallow_tailed_Kite/23770.jpg
Swallow_tailed_Kite/23778.jpg
White_tailed_Kite/24002.jpg
White_tailed_Kite/24118.jpg
White_tailed_Kite/25257.jpg
White_tailed_Kite/25367.jpg
White_tailed_Kite/25517.jpg
Bald_Eagle/25676.jpg
Bald_Eagle/25922.jpg
Bald_Eagle/26131.jpg
Bald_Eagle/26336.jpg
Mississippi_Kite/28319.jpg
Mississippi_Kite/28504.jpg
Mississippi_Kite/28582.jpg
Harriss_Hawk/28755.jpg
Harriss_Hawk/30467.jpg
Snail_Kite/31366.jpg
Snail_Kite/31384.jpg
Snail_Kite/31551.jpg
Snail_Kite/31705.jpg
Snail_Kite/31847.jpg
Bushtit/32084.jpg
Bushtit/32514.jpg
Bushtit/32653.jpg
Bushtit/32768.jpg
Horned_Lark/33214.jpg
Horned_Lark/33365.jpg
Horned_Lark/33590.jpg
Horned_Lark/33863.jpg
Belted_Kingfisher/34170.jpg
Belted_Kingfisher/34307.jpg
Belted_Kingfisher/34460.jpg
Belted_Kingfisher/34974.jpg
Belted_Kingfisher/35226.jpg
Pigeon_Guillemot/35557.jpg
Pigeon_Guillemot/35777.jpg
Black_Guillemot/35966.jpg
Black_Guillemot/35978.jpg
Black_Guillemot/36346.jpg
Black_Guillemot/36741.jpg
Common_Murre/37641.jpg
Common_Murre/38269.jpg
Common_Murre/38418.jpg
Northern_Pintail/42026.jpg
Northern_Pintail/42339.jpg
Northern_Pintail/42874.jpg
Northern_Pintail/43275.jpg
American_Wigeon/43939.jpg
American_Wigeon/44572.jpg
American_Wigeon/45880.jpg
American_Wigeon/45881.jpg
Green_winged_Teal/49197.jpg
Green_winged_Teal/49207.jpg
Cinnamon_Teal/51293.jpg
Cinnamon_Teal/51300.jpg
Cinnamon_Teal/51356.jpg
Cinnamon_Teal/51906.jpg
Cinnamon_Teal/51987.jpg
Blue_winged_Teal/52323.jpg
Blue_winged_Teal/53186.jpg
Blue_winged_Teal/53522.jpg
Mottled_Duck/53971.jpg
Mottled_Duck/54152.jpg
Mottled_Duck/54243.jpg
Eurasian_Wigeon/54388.jpg
Eurasian_Wigeon/54676.jpg
Eurasian_Wigeon/55464.jpg
Mallard/56726.jpg
Mallard/57203.jpg
Mallard/57322.jpg
Mallard/58681.jpg
American_Black_Duck/59245.jpg
American_Black_Duck/59256.jpg
American_Black_Duck/59286.jpg
American_Black_Duck/59713.jpg
Gadwall/59958.jpg
Gadwall/61533.jpg
Gadwall/62294.jpg
Gadwall/62311.jpg
Lesser_Scaup/62421.jpg
Lesser_Scaup/62593.jpg
Lesser_Scaup/63628.jpg
Redhead/63952.jpg
Redhead/64063.jpg
Redhead/64730.jpg
Ring_necked_Duck/64909.jpg
Ring_necked_Duck/64935.jpg
Ring_necked_Duck/65894.jpg
Ring_necked_Duck/66469.jpg
Greater_Scaup/66983.jpg
Greater_Scaup/66991.jpg
Greater_Scaup/67058.jpg
Canvasback/68553.jpg
Canvasback/69131.jpg
Canvasback/69181.jpg
Canvasback/69198.jpg
Bufflehead/69347.jpg
Bufflehead/69648.jpg
Bufflehead/69917.jpg
Common_Goldeneye/71721.jpg
Common_Goldeneye/72137.jpg
Common_Goldeneye/72704.jpg
Barrows_Goldeneye/73549.jpg
Barrows_Goldeneye/73593.jpg
Barrows_Goldeneye/73720.jpg
Barrows_Goldeneye/73802.jpg
Muscovy_Duck/75092.jpg
Muscovy_Duck/75804.jpg
Long_tailed_Duck/77250.jpg
Long_tailed_Duck/77429.jpg
Long_tailed_Duck/78640.jpg
Hooded_Merganser/80811.jpg
Hooded_Merganser/81438.jpg
Hooded_Merganser/81601.jpg
Hooded_Merganser/82812.jpg
Black_Scoter/83217.jpg
Black_Scoter/83219.jpg
White_winged_Scoter/83255.jpg
White_winged_Scoter/83338.jpg
White_winged_Scoter/83359.jpg
White_winged_Scoter/83487.jpg
White_winged_Scoter/83616.jpg
Surf_Scoter/83789.jpg
Surf_Scoter/83977.jpg
Surf_Scoter/83980.jpg
Surf_Scoter/84003.jpg
Common_Merganser/84984.jpg
Common_Merganser/85272.jpg
Common_Merganser/85681.jpg
Common_Merganser/86615.jpg
Common_Merganser/86750.jpg
Red_breasted_Merganser/87916.jpg
Red_breasted_Merganser/89049.jpg
Red_breasted_Merganser/89453.jpg
Ruddy_Duck/90492.jpg
Ruddy_Duck/90720.jpg
Ruddy_Duck/90907.jpg
Common_Eider/91152.jpg
Common_Eider/91585.jpg
Common_Eider/92543.jpg
Common_Eider/92807.jpg
Greater_White_fronted_Goose/93924.jpg
Greater_White_fronted_Goose/95002.jpg
Brant/95943.jpg
Brant/97845.jpg
Canada_Goose/99598.jpg
Canada_Goose/99607.jpg
Canada_Goose/99882.jpg
Canada_Goose/100020.jpg
Canada_Goose/100172.jpg
Cackling_Goose/100553.jpg
Cackling_Goose/100631.jpg
Snow_Goose/102677.jpg
Rosss_Goose/103958.jpg
Trumpeter_Swan/104768.jpg
Trumpeter_Swan/105885.jpg
Trumpeter_Swan/106043.jpg
Tundra_Swan/106805.jpg
Tundra_Swan/107380.jpg
Tundra_Swan/107872.jpg
Mute_Swan/108235.jpg
Mute_Swan/108298.jpg
Mute_Swan/108300.jpg
Mute_Swan/109158.jpg
Mute_Swan/110279.jpg
Fulvous_Whistling_Duck/112390.jpg
Fulvous_Whistling_Duck/112464.jpg
Fulvous_Whistling_Duck/112722.jpg
Fulvous_Whistling_Duck/112873.jpg
Fulvous_Whistling_Duck/112945.jpg
Anhinga/113184.jpg
Anhinga/113972.jpg
Anhinga/114408.jpg
Anhinga/115393.jpg
Chimney_Swift/115602.jpg
Chimney_Swift/115603.jpg
Limpkin/116454.jpg
Limpkin/116742.jpg
Limpkin/116859.jpg
Great_Egret/117822.jpg
Great_Egret/117886.jpg
Great_Egret/118110.jpg
Great_Egret/118708.jpg
Great_Blue_Heron/119576.jpg
Great_Blue_Heron/119926.jpg
Great_Blue_Heron/120868.jpg
Great_Blue_Heron/121118.jpg
Great_Blue_Heron/121470.jpg
American_Bittern/122455.jpg
American_Bittern/122457.jpg
American_Bittern/122699.jpg
American_Bittern/122825.jpg
American_Bittern/123020.jpg
Cattle_Egret/123693.jpg
Cattle_Egret/123753.jpg
Cattle_Egret/124183.jpg
Cattle_Egret/124364.jpg
Cattle_Egret/124797.jpg
Little_Blue_Heron/129249.jpg
Little_Blue_Heron/129505.jpg
Little_Blue_Heron/130092.jpg
Little_Blue_Heron/130285.jpg
Reddish_Egret/131049.jpg
Reddish_Egret/131057.jpg
Reddish_Egret/131530.jpg
Reddish_Egret/131540.jpg
Reddish_Egret/132577.jpg
Snowy_Egret/133251.jpg
Snowy_Egret/133284.jpg
Snowy_Egret/134285.jpg
Snowy_Egret/134588.jpg
Tricolored_Heron/136511.jpg
Tricolored_Heron/136547.jpg
Tricolored_Heron/136977.jpg
Tricolored_Heron/137040.jpg
Cedar_Waxwing/137780.jpg
Cedar_Waxwing/138149.jpg
Cedar_Waxwing/139511.jpg
Bohemian_Waxwing/139598.jpg
Bohemian_Waxwing/139646.jpg
Bohemian_Waxwing/140047.jpg
Bohemian_Waxwing/140775.jpg
Bohemian_Waxwing/141717.jpg
Lapland_Longspur/142113.jpg
Lapland_Longspur/142292.jpg
Lapland_Longspur/142705.jpg
Chestnut_collared_Longspur/142766.jpg
Chestnut_collared_Longspur/142767.jpg
Chestnut_collared_Longspur/142802.jpg
Chestnut_collared_Longspur/142809.jpg
Snow_Bunting/143535.jpg
Snow_Bunting/143737.jpg
Snow_Bunting/143819.jpg
Lesser_Nighthawk/145457.jpg
Lesser_Nighthawk/145458.jpg
Lesser_Nighthawk/145521.jpg
Lesser_Nighthawk/145545.jpg
Common_Nighthawk/145652.jpg
Common_Nighthawk/145744.jpg
Common_Nighthawk/145751.jpg
Common_Nighthawk/145939.jpg
Northern_Cardinal/146223.jpg
Northern_Cardinal/146529.jpg
Northern_Cardinal/147174.jpg
Northern_Cardinal/148340.jpg
Northern_Cardinal/148486.jpg
Pyrrhuloxia/148798.jpg
Pyrrhuloxia/148908.jpg
Pyrrhuloxia/148939.jpg
Pyrrhuloxia/148955.jpg
Lazuli_Bunting/149043.jpg
Lazuli_Bunting/149250.jpg
Lazuli_Bunting/149463.jpg
Lazuli_Bunting/149543.jpg
Lazuli_Bunting/149553.jpg
Blue_Grosbeak/149697.jpg
Blue_Grosbeak/149825.jpg
Blue_Grosbeak/149978.jpg
Blue_Grosbeak/150042.jpg
Painted_Bunting/150219.jpg
Painted_Bunting/150476.jpg
Painted_Bunting/151060.jpg
Painted_Bunting/151235.jpg
Indigo_Bunting/151346.jpg
Indigo_Bunting/151895.jpg
Indigo_Bunting/152329.jpg
Indigo_Bunting/152535.jpg
Indigo_Bunting/152582.jpg
Rose_breasted_Grosbeak/153072.jpg
Rose_breasted_Grosbeak/153316.jpg
Rose_breasted_Grosbeak/153342.jpg
Rose_breasted_Grosbeak/153480.jpg
Black_headed_Grosbeak/154248.jpg
Black_headed_Grosbeak/154666.jpg
Black_headed_Grosbeak/154738.jpg
Black_headed_Grosbeak/154851.jpg
Hepatic_Tanager/155281.jpg
Hepatic_Tanager/155321.jpg
Hepatic_Tanager/155424.jpg
Hepatic_Tanager/155426.jpg
Hepatic_Tanager/155548.jpg
Western_Tanager/155609.jpg
Western_Tanager/155669.jpg
Western_Tanager/155698.jpg
Western_Tanager/156249.jpg
Western_Tanager/156307.jpg
Scarlet_Tanager/156490.jpg
Scarlet_Tanager/156719.jpg
Scarlet_Tanager/156837.jpg
Scarlet_Tanager/157181.jpg
Scarlet_Tanager/157221.jpg
Summer_Tanager/157498.jpg
Summer_Tanager/158027.jpg
Summer_Tanager/158373.jpg
Dickcissel/158550.jpg
Dickcissel/158681.jpg
Dickcissel/158752.jpg
Dickcissel/158753.jpg
Dickcissel/158755.jpg
Turkey_Vulture/159500.jpg
Turkey_Vulture/160502.jpg
Turkey_Vulture/161163.jpg
Turkey_Vulture/161552.jpg
Black_Vulture/161825.jpg
Black_Vulture/162244.jpg
Black_Vulture/163234.jpg
Black_Vulture/163235.jpg
Black_Vulture/163883.jpg
Brown_Creeper/164068.jpg
Brown_Creeper/164076.jpg
Brown_Creeper/164125.jpg
Brown_Creeper/164195.jpg
Piping_Plover/165060.jpg
Piping_Plover/165262.jpg
Piping_Plover/165946.jpg
Snowy_Plover/166309.jpg
Snowy_Plover/166357.jpg
Snowy_Plover/166358.jpg
Snowy_Plover/166463.jpg
Wilsons_Plover/166524.jpg
Wilsons_Plover/166558.jpg
Wilsons_Plover/166662.jpg
Wilsons_Plover/166874.jpg
American_Golden_Plover/166930.jpg
American_Golden_Plover/167058.jpg
American_Golden_Plover/167144.jpg
American_Golden_Plover/167343.jpg
American_Golden_Plover/167436.jpg
Pacific_Golden_Plover/168113.jpg
Pacific_Golden_Plover/168233.jpg
Pacific_Golden_Plover/168302.jpg
Black_bellied_Plover/169674.jpg
Black_bellied_Plover/170018.jpg
Black_bellied_Plover/170231.jpg
Wood_Stork/170871.jpg
Wood_Stork/170893.jpg
Wood_Stork/171841.jpg
Wood_Stork/172104.jpg
American_Dipper/172861.jpg
American_Dipper/172956.jpg
American_Dipper/173154.jpg
American_Dipper/173319.jpg
Rock_Pigeon/173531.jpg
Rock_Pigeon/174265.jpg
Rock_Pigeon/174297.jpg
Rock_Pigeon/174727.jpg
Rock_Pigeon/175183.jpg
Inca_Dove/176099.jpg
Inca_Dove/176142.jpg
Inca_Dove/176184.jpg
Inca_Dove/176392.jpg
Common_Ground_Dove/176439.jpg
Common_Ground_Dove/176477.jpg
Common_Ground_Dove/176480.jpg
Common_Ground_Dove/176583.jpg
Common_Ground_Dove/176676.jpg
Band_tailed_Pigeon/176837.jpg
Band_tailed_Pigeon/176880.jpg
Band_tailed_Pigeon/176891.jpg
Band_tailed_Pigeon/176986.jpg
Eurasian_Collared_Dove/177265.jpg
Eurasian_Collared_Dove/177603.jpg
Eurasian_Collared_Dove/179004.jpg
Eurasian_Collared_Dove/179224.jpg
White_winged_Dove/179532.jpg
White_winged_Dove/179945.jpg
White_winged_Dove/180042.jpg
White_winged_Dove/180180.jpg
White_winged_Dove/180341.jpg
Mourning_Dove/180862.jpg
Mourning_Dove/181425.jpg
Mourning_Dove/181459.jpg
Mourning_Dove/181533.jpg
Mourning_Dove/181615.jpg
Western_Scrub_Jay/182984.jpg
Western_Scrub_Jay/183344.jpg
Western_Scrub_Jay/183553.jpg
Western_Scrub_Jay/184809.jpg
Florida_Scrub_Jay/185368.jpg
Florida_Scrub_Jay/185523.jpg
Florida_Scrub_Jay/185625.jpg
American_Crow/185934.jpg
American_Crow/186786.jpg
American_Crow/186936.jpg
Common_Raven/189259.jpg
Common_Raven/189302.jpg
Common_Raven/189474.jpg
Common_Raven/190593.jpg
Chihuahuan_Raven/191301.jpg
Fish_Crow/191337.jpg
Fish_Crow/191446.jpg
Fish_Crow/191590.jpg
Blue_Jay/191761.jpg
Blue_Jay/191858.jpg
Blue_Jay/193440.jpg
Blue_Jay/193650.jpg
Blue_Jay/193947.jpg
Stellers_Jay/194900.jpg
Stellers_Jay/195166.jpg
Stellers_Jay/195417.jpg
Stellers_Jay/195993.jpg
Stellers_Jay/196018.jpg
Green_Jay/196466.jpg
Green_Jay/196814.jpg
Green_Jay/197144.jpg
Green_Jay/197160.jpg
Green_Jay/197233.jpg
Clarks_Nutcracker/197461.jpg
Clarks_Nutcracker/197770.jpg
Clarks_Nutcracker/197799.jpg
Gray_Jay/198031.jpg
Gray_Jay/198039.jpg
Gray_Jay/198840.jpg
Gray_Jay/198971.jpg
Black_billed_Magpie/199002.jpg
Black_billed_Magpie/199079.jpg
Black_billed_Magpie/199173.jpg
Black_billed_Magpie/199340.jpg
Yellow_billed_Magpie/199569.jpg
Yellow_billed_Magpie/199694.jpg
Groove_billed_Ani/199826.jpg
Groove_billed_Ani/199903.jpg
Groove_billed_Ani/200046.jpg
Groove_billed_Ani/200063.jpg
Groove_billed_Ani/200085.jpg
Yellow_billed_Cuckoo/200296.jpg
Yellow_billed_Cuckoo/200386.jpg
Yellow_billed_Cuckoo/200392.jpg
Yellow_billed_Cuckoo/200524.jpg
Yellow_billed_Cuckoo/200618.jpg
Black_billed_Cuckoo/200734.jpg
Black_billed_Cuckoo/200763.jpg
Black_billed_Cuckoo/200764.jpg
Greater_Roadrunner/201620.jpg
Greater_Roadrunner/201686.jpg
Greater_Roadrunner/201752.jpg
Greater_Roadrunner/201885.jpg
Rufous_crowned_Sparrow/202169.jpg
Rufous_crowned_Sparrow/202211.jpg
Rufous_crowned_Sparrow/202277.jpg
Saltmarsh_Sparrow/202530.jpg
Saltmarsh_Sparrow/202582.jpg
Henslows_Sparrow/202647.jpg
Henslows_Sparrow/202648.jpg
Le_Contes_Sparrow/202924.jpg
Le_Contes_Sparrow/202929.jpg
Le_Contes_Sparrow/202985.jpg
Seaside_Sparrow/203056.jpg
Seaside_Sparrow/203074.jpg
Seaside_Sparrow/203076.jpg
Seaside_Sparrow/203077.jpg
Seaside_Sparrow/203139.jpg
Nelsons_Sparrow/203238.jpg
Nelsons_Sparrow/203239.jpg
Nelsons_Sparrow/203240.jpg
Nelsons_Sparrow/203386.jpg
Grasshopper_Sparrow/203422.jpg
Grasshopper_Sparrow/203430.jpg
Grasshopper_Sparrow/203435.jpg
Grasshopper_Sparrow/203755.jpg
Grasshopper_Sparrow/203848.jpg
Black_throated_Sparrow/204020.jpg
Black_throated_Sparrow/204083.jpg
Black_throated_Sparrow/204088.jpg
Black_throated_Sparrow/204185.jpg
Black_throated_Sparrow/204241.jpg
Olive_Sparrow/204349.jpg
Olive_Sparrow/204368.jpg
Olive_Sparrow/204403.jpg
Lark_Bunting/204573.jpg
Lark_Bunting/204592.jpg
Lark_Bunting/204611.jpg
Lark_Bunting/204612.jpg
Lark_Sparrow/204909.jpg
Lark_Sparrow/204953.jpg
Lark_Sparrow/205114.jpg
Lark_Sparrow/205179.jpg
Dark_eyed_Junco/205462.jpg
Dark_eyed_Junco/206254.jpg
Dark_eyed_Junco/207033.jpg
Dark_eyed_Junco/207035.jpg
Dark_eyed_Junco/207719.jpg
Yellow_eyed_Junco/207760.jpg
Yellow_eyed_Junco/207831.jpg
Yellow_eyed_Junco/207856.jpg
Yellow_eyed_Junco/207860.jpg
Swamp_Sparrow/207984.jpg
Swamp_Sparrow/208132.jpg
Swamp_Sparrow/208138.jpg
Swamp_Sparrow/208233.jpg
Swamp_Sparrow/208316.jpg
Lincolns_Sparrow/208564.jpg
Lincolns_Sparrow/208945.jpg
Lincolns_Sparrow/208974.jpg
Song_Sparrow/209217.jpg
Song_Sparrow/209334.jpg
Song_Sparrow/210050.jpg
Song_Sparrow/211069.jpg
California_Towhee/211708.jpg
California_Towhee/211817.jpg
California_Towhee/211832.jpg
Canyon_Towhee/211861.jpg
Canyon_Towhee/211886.jpg
Fox_Sparrow/214179.jpg
Fox_Sparrow/214288.jpg
Fox_Sparrow/214681.jpg
Green_tailed_Towhee/214916.jpg
Green_tailed_Towhee/214994.jpg
Green_tailed_Towhee/215106.jpg
Eastern_Towhee/215267.jpg
Eastern_Towhee/215631.jpg
Eastern_Towhee/215956.jpg
Eastern_Towhee/216088.jpg
Spotted_Towhee/216360.jpg
Spotted_Towhee/216432.jpg
Spotted_Towhee/216677.jpg
Spotted_Towhee/217197.jpg
Vesper_Sparrow/217587.jpg
Vesper_Sparrow/217701.jpg
Vesper_Sparrow/217799.jpg
Vesper_Sparrow/217803.jpg
American_Tree_Sparrow/218041.jpg
American_Tree_Sparrow/218577.jpg
American_Tree_Sparrow/218675.jpg
American_Tree_Sparrow/218729.jpg
Black_chinned_Sparrow/218798.jpg
Brewers_Sparrow/218868.jpg
Brewers_Sparrow/218969.jpg
Brewers_Sparrow/219004.jpg
Clay_colored_Sparrow/219059.jpg
Clay_colored_Sparrow/219170.jpg
Clay_colored_Sparrow/219177.jpg
Chipping_Sparrow/219491.png
Chipping_Sparrow/219658.jpg
Chipping_Sparrow/220495.jpg
Chipping_Sparrow/220942.png
Field_Sparrow/221325.jpg
Field_Sparrow/221370.jpg
Field_Sparrow/221426.jpg
Field_Sparrow/221499.jpg
White_throated_Sparrow/221714.jpg
White_throated_Sparrow/222006.jpg
White_throated_Sparrow/223076.jpg
White_throated_Sparrow/223163.jpg
Golden_crowned_Sparrow/223833.jpg
Golden_crowned_Sparrow/224106.jpg
Golden_crowned_Sparrow/224269.jpg
Golden_crowned_Sparrow/224780.jpg
White_crowned_Sparrow/224824.jpg
White_crowned_Sparrow/224828.jpg
White_crowned_Sparrow/226522.jpg
White_crowned_Sparrow/226803.jpg
White_crowned_Sparrow/227265.jpg
Harriss_Sparrow/227350.jpg
Harriss_Sparrow/227512.jpg
Harriss_Sparrow/227527.jpg
Harriss_Sparrow/227577.jpg
Crested_Caracara/228024.jpg
Crested_Caracara/228385.jpg
Crested_Caracara/228877.jpg
Crested_Caracara/228903.jpg
Merlin/229233.jpg
Merlin/230057.jpg
Merlin/230144.jpg
Merlin/230534.jpg
Merlin/230575.jpg
Prairie_Falcon/231302.jpg
Prairie_Falcon/231323.jpg
Prairie_Falcon/231435.jpg
Prairie_Falcon/231455.jpg
Peregrine_Falcon/233275.jpg
Peregrine_Falcon/234091.jpg
American_Kestrel/235246.jpg
American_Kestrel/235253.jpg
American_Kestrel/235264.jpg
American_Kestrel/235660.jpg
American_Kestrel/236214.jpg
Magnificent_Frigatebird/236812.jpg
Magnificent_Frigatebird/237448.jpg
Magnificent_Frigatebird/237859.jpg
Magnificent_Frigatebird/238061.jpg
Magnificent_Frigatebird/238207.jpg
Common_Redpoll/238739.jpg
Common_Redpoll/239046.jpg
Common_Redpoll/239082.jpg
Common_Redpoll/239106.jpg
Common_Redpoll/239201.jpg
Hoary_Redpoll/239309.jpg
Hoary_Redpoll/239353.jpg
Hoary_Redpoll/239354.jpg
Hoary_Redpoll/239355.jpg
Evening_Grosbeak/239568.jpg
Evening_Grosbeak/239571.jpg
Evening_Grosbeak/239578.jpg
Evening_Grosbeak/239610.jpg
House_Finch/240714.jpg
House_Finch/240759.jpg
Purple_Finch/240897.jpg
Black_Rosy_Finch/240946.jpg
Black_Rosy_Finch/240955.jpg
Brown_capped_Rosy_Finch/241033.jpg
Brown_capped_Rosy_Finch/241063.jpg
Brown_capped_Rosy_Finch/241090.jpg
Brown_capped_Rosy_Finch/241092.jpg
Gray_crowned_Rosy_Finch/241263.jpg
Gray_crowned_Rosy_Finch/241395.jpg
Red_Crossbill/241564.jpg
Red_Crossbill/241937.jpg
Red_Crossbill/242149.jpg
White_winged_Crossbill/243991.jpg
White_winged_Crossbill/244623.jpg
Pine_Grosbeak/244873.jpg
Pine_Grosbeak/244874.jpg
Pine_Grosbeak/244929.jpg
Pine_Grosbeak/245002.jpg
Pine_Grosbeak/245550.jpg
Pine_Siskin/245770.jpg
Pine_Siskin/245933.jpg
Pine_Siskin/246014.jpg
Pine_Siskin/246023.jpg
Lesser_Goldfinch/246204.jpg
Lesser_Goldfinch/246306.jpg
Lesser_Goldfinch/246307.jpg
American_Goldfinch/246429.jpg
American_Goldfinch/247079.jpg
American_Goldfinch/247294.jpg
Common_Loon/248105.jpg
Common_Loon/248123.jpg
Common_Loon/248592.jpg
Common_Loon/249061.jpg
Pacific_Loon/250187.jpg
Pacific_Loon/250226.jpg
Pacific_Loon/250348.jpg
Red_throated_Loon/250714.jpg
Red_throated_Loon/250825.jpg
Red_throated_Loon/251261.jpg
Red_throated_Loon/251871.jpg
Sandhill_Crane/252873.jpg
Sandhill_Crane/253177.jpg
Sandhill_Crane/253349.jpg
Black_Oystercatcher/254613.jpg
Black_Oystercatcher/255370.jpg
Black_Oystercatcher/255850.jpg
Black_Oystercatcher/255860.jpg
American_Oystercatcher/256363.jpg
American_Oystercatcher/256433.jpg
American_Oystercatcher/256780.jpg
American_Oystercatcher/256989.jpg
American_Oystercatcher/257279.jpg
Barn_Swallow/257707.jpg
Barn_Swallow/257866.jpg
Barn_Swallow/258072.jpg
Barn_Swallow/258226.jpg
Barn_Swallow/258520.jpg
Cave_Swallow/260085.jpg
Northern_Rough_winged_Swallow/262657.jpg
Northern_Rough_winged_Swallow/262844.jpg
Northern_Rough_winged_Swallow/262854.jpg
Northern_Rough_winged_Swallow/263110.jpg
Tree_Swallow/263642.jpg
Tree_Swallow/263670.jpg
Tree_Swallow/263864.jpg
Tree_Swallow/264472.jpg
Tree_Swallow/264479.jpg
Violet_green_Swallow/265705.jpg
Violet_green_Swallow/265721.jpg
Violet_green_Swallow/266002.jpg
Violet_green_Swallow/266109.jpg
Red_winged_Blackbird/268018.jpg
Red_winged_Blackbird/268129.jpg
Red_winged_Blackbird/268500.jpg
Bobolink/268779.jpg
Bobolink/268810.jpg
Bobolink/269087.jpg
Bobolink/269217.jpg
Rusty_Blackbird/269395.jpg
Rusty_Blackbird/269425.jpg
Rusty_Blackbird/269510.jpg
Rusty_Blackbird/269565.jpg
Brewers_Blackbird/269878.jpg
Brewers_Blackbird/270185.jpg
Brewers_Blackbird/270204.jpg
Brewers_Blackbird/270263.jpg
Bullocks_Oriole/270945.jpg
Bullocks_Oriole/270969.jpg
Bullocks_Oriole/271000.jpg
Bullocks_Oriole/271423.jpg
Hooded_Oriole/271756.jpg
Hooded_Oriole/271757.jpg
Hooded_Oriole/271923.jpg
Hooded_Oriole/272024.jpg
Hooded_Oriole/272035.jpg
Baltimore_Oriole/272800.jpg
Baltimore_Oriole/273114.jpg
Baltimore_Oriole/273561.jpg
Baltimore_Oriole/273846.jpg
Audubons_Oriole/274001.jpg
Altamira_Oriole/274122.jpg
Altamira_Oriole/274270.jpg
Altamira_Oriole/274278.jpg
Scotts_Oriole/274349.jpg
Orchard_Oriole/274565.jpg
Orchard_Oriole/274791.jpg
Orchard_Oriole/274977.jpg
Orchard_Oriole/274982.jpg
Orchard_Oriole/274990.jpg
Bronzed_Cowbird/275320.jpg
Bronzed_Cowbird/275324.jpg
Brown_headed_Cowbird/275656.jpg
Brown_headed_Cowbird/275817.jpg
Brown_headed_Cowbird/275883.jpg
Brown_headed_Cowbird/275912.jpg
Brown_headed_Cowbird/276861.jpg
Boat_tailed_Grackle/277001.jpg
Boat_tailed_Grackle/277173.jpg
Boat_tailed_Grackle/277653.jpg
Boat_tailed_Grackle/277707.jpg
Great_tailed_Grackle/277971.jpg
Great_tailed_Grackle/278434.jpg
Great_tailed_Grackle/278496.jpg
Great_tailed_Grackle/278682.jpg
Great_tailed_Grackle/278789.jpg
Common_Grackle/279408.jpg
Common_Grackle/279504.jpg
Common_Grackle/279834.jpg
Common_Grackle/280459.jpg
Common_Grackle/280958.jpg
Eastern_Meadowlark/281454.jpg
Eastern_Meadowlark/281491.jpg
Eastern_Meadowlark/281734.jpg
Western_Meadowlark/282126.jpg
Western_Meadowlark/282293.jpg
Western_Meadowlark/282423.jpg
Western_Meadowlark/282900.jpg
Yellow_headed_Blackbird/283179.jpg
Yellow_headed_Blackbird/283858.jpg
Yellow_headed_Blackbird/283950.jpg
Yellow_headed_Blackbird/284113.jpg
Yellow_headed_Blackbird/284120.jpg
Northern_Shrike/284500.jpg
Northern_Shrike/285490.jpg
Northern_Shrike/285916.jpg
Loggerhead_Shrike/286064.jpg
Loggerhead_Shrike/286153.jpg
Loggerhead_Shrike/286240.jpg
Loggerhead_Shrike/286499.jpg
Bonapartes_Gull/287155.jpg
Bonapartes_Gull/287412.jpg
Bonapartes_Gull/287517.jpg
Herring_Gull/287902.jpg
Herring_Gull/288527.jpg
Herring_Gull/289004.jpg
Herring_Gull/290007.jpg
California_Gull/290151.jpg
California_Gull/290179.jpg
California_Gull/290267.jpg
California_Gull/290628.jpg
Mew_Gull/291093.jpg
Mew_Gull/291094.jpg
Mew_Gull/291317.jpg
Mew_Gull/293136.jpg
Mew_Gull/293328.jpg
Ring_billed_Gull/293666.jpg
Ring_billed_Gull/294021.jpg
Ring_billed_Gull/294083.jpg
Ring_billed_Gull/294306.jpg
Ring_billed_Gull/294857.jpg
Glaucous_winged_Gull/298433.jpg
Glaucous_winged_Gull/298616.jpg
Glaucous_winged_Gull/298626.jpg
Glaucous_winged_Gull/298801.jpg
Glaucous_winged_Gull/298833.jpg
Iceland_Gull/299515.jpg
Iceland_Gull/299516.jpg
Iceland_Gull/300417.jpg
Iceland_Gull/300418.jpg
Heermanns_Gull/300938.jpg
Heermanns_Gull/301030.jpg
Heermanns_Gull/301078.jpg
Heermanns_Gull/301104.jpg
Glaucous_Gull/301812.jpg
Glaucous_Gull/301827.jpg
Glaucous_Gull/302184.jpg
Glaucous_Gull/302567.jpg
Glaucous_Gull/302692.jpg
Great_Black_backed_Gull/303095.jpg
Great_Black_backed_Gull/303234.jpg
Great_Black_backed_Gull/303477.jpg
Great_Black_backed_Gull/303516.jpg
Great_Black_backed_Gull/304523.jpg
Western_Gull/305899.jpg
Western_Gull/306080.jpg
Western_Gull/306586.jpg
Western_Gull/306893.jpg
Western_Gull/306950.jpg
Thayers_Gull/307716.jpg
Thayers_Gull/307779.jpg
Thayers_Gull/307818.jpg
Thayers_Gull/307822.jpg
Laughing_Gull/308022.jpg
Laughing_Gull/308164.jpg
Laughing_Gull/308266.jpg
Franklins_Gull/309043.jpg
Franklins_Gull/309135.jpg
Franklins_Gull/309193.jpg
Black_legged_Kittiwake/309259.jpg
Black_legged_Kittiwake/309334.jpg
Black_legged_Kittiwake/309654.jpg
Black_legged_Kittiwake/310481.jpg
Black_Skimmer/311960.jpg
Black_Skimmer/312079.jpg
Black_Skimmer/312167.jpg
Black_Skimmer/312512.jpg
Black_Skimmer/312679.jpg
Black_Tern/314133.jpg
Black_Tern/314970.jpg
Gull_billed_Tern/315286.jpg
Gull_billed_Tern/315294.jpg
Gull_billed_Tern/315416.jpg
Gull_billed_Tern/315435.jpg
Caspian_Tern/315661.jpg
Caspian_Tern/315662.jpg
Caspian_Tern/315688.jpg
Caspian_Tern/315968.jpg
Caspian_Tern/316252.jpg
Roseate_Tern/316924.jpg
Roseate_Tern/316925.jpg
Roseate_Tern/316941.jpg
Forsters_Tern/317495.jpg
Forsters_Tern/318187.jpg
Forsters_Tern/318546.jpg
Common_Tern/318688.jpg
Common_Tern/318803.jpg
Common_Tern/319029.jpg
Common_Tern/319099.jpg
Common_Tern/319411.jpg
Arctic_Tern/320674.jpg
Arctic_Tern/321528.jpg
Arctic_Tern/322024.jpg
Arctic_Tern/322181.jpg
Arctic_Tern/322922.jpg
Least_Tern/323045.jpg
Least_Tern/323052.jpg
Least_Tern/323472.jpg
Least_Tern/323483.jpg
Royal_Tern/323513.jpg
Royal_Tern/323783.jpg
Sandwich_Tern/324321.jpg
Sandwich_Tern/324443.jpg
Sandwich_Tern/324599.jpg
Sandwich_Tern/324756.jpg
Sandwich_Tern/324769.jpg
Gray_Catbird/324859.jpg
Gray_Catbird/324927.jpg
Gray_Catbird/324971.jpg
Gray_Catbird/325400.jpg
Gray_Catbird/325934.jpg
Northern_Mockingbird/326436.jpg
Northern_Mockingbird/326445.jpg
Northern_Mockingbird/328143.jpg
Northern_Mockingbird/328292.jpg
Northern_Mockingbird/328612.jpg
Sage_Thrasher/328718.jpg
Sage_Thrasher/328775.jpg
Sage_Thrasher/328832.jpg
Sage_Thrasher/328868.jpg
Curve_billed_Thrasher/328914.jpg
Curve_billed_Thrasher/329153.jpg
Curve_billed_Thrasher/329176.jpg
Curve_billed_Thrasher/329191.jpg
Long_billed_Thrasher/329560.jpg
Long_billed_Thrasher/329561.jpg
Long_billed_Thrasher/329562.jpg
Long_billed_Thrasher/329615.jpg
California_Thrasher/329757.jpg
California_Thrasher/329794.jpg
California_Thrasher/329830.jpg
Brown_Thrasher/330163.jpg
Brown_Thrasher/330284.jpg
Brown_Thrasher/330291.jpg
Brown_Thrasher/330538.jpg
Brown_Thrasher/330580.jpg
American_Pipit/331265.jpg
American_Pipit/331545.jpg
American_Pipit/331566.jpg
American_Pipit/331569.jpg
California_Quail/331714.jpg
California_Quail/331810.jpg
California_Quail/332297.jpg
California_Quail/333262.jpg
Scaled_Quail/333395.jpg
Northern_Bobwhite/333619.jpg
Northern_Bobwhite/333770.jpg
Northern_Bobwhite/333790.jpg
Osprey/334503.jpg
Osprey/334536.jpg
Osprey/334638.jpg
Osprey/335020.jpg
Osprey/336288.jpg
Black_crested_Titmouse/336421.jpg
Black_crested_Titmouse/336453.jpg
Black_crested_Titmouse/336499.jpg
Black_crested_Titmouse/336543.jpg
Black_crested_Titmouse/336575.jpg
Tufted_Titmouse/336977.jpg
Tufted_Titmouse/338353.jpg
Tufted_Titmouse/338568.jpg
Oak_Titmouse/338683.jpg
Oak_Titmouse/338693.jpg
Bridled_Titmouse/338900.jpg
Black_capped_Chickadee/339253.jpg
Black_capped_Chickadee/339446.jpg
Black_capped_Chickadee/339713.jpg
Black_capped_Chickadee/341033.jpg
Carolina_Chickadee/341523.jpg
Carolina_Chickadee/341851.jpg
Carolina_Chickadee/341860.jpg
Carolina_Chickadee/342293.jpg
Mountain_Chickadee/342457.jpg
Mountain_Chickadee/342534.jpg
Mountain_Chickadee/342563.jpg
Mountain_Chickadee/342571.jpg
Mountain_Chickadee/342697.jpg
Boreal_Chickadee/342835.jpg
Boreal_Chickadee/342836.jpg
Boreal_Chickadee/342847.jpg
Boreal_Chickadee/342953.jpg
Boreal_Chickadee/342967.jpg
Chestnut_backed_Chickadee/343229.jpg
Chestnut_backed_Chickadee/343361.jpg
Chestnut_backed_Chickadee/343455.jpg
Chestnut_backed_Chickadee/343684.jpg
Chestnut_backed_Chickadee/343685.jpg
Canada_Warbler/343789.jpg
Canada_Warbler/343790.jpg
Wilsons_Warbler/343994.jpg
Wilsons_Warbler/343996.jpg
Wilsons_Warbler/344021.jpg
Mourning_Warbler/344176.jpg
Common_Yellowthroat/344414.jpg
Common_Yellowthroat/345188.jpg
Common_Yellowthroat/345325.jpg
Common_Yellowthroat/345451.jpg
Common_Yellowthroat/345523.jpg
Worm_eating_Warbler/346416.jpg
Yellow_breasted_Chat/346442.jpg
Yellow_breasted_Chat/346698.jpg
Yellow_breasted_Chat/346703.jpg
Yellow_breasted_Chat/346747.jpg
Yellow_breasted_Chat/346759.jpg
Black_and_white_Warbler/346908.jpg
Black_and_white_Warbler/346913.jpg
Black_and_white_Warbler/347317.jpg
Black_and_white_Warbler/347492.jpg
Black_and_white_Warbler/347648.jpg
Painted_Redstart/347978.jpg
Painted_Redstart/347980.jpg
Painted_Redstart/347982.jpg
Connecticut_Warbler/348035.jpg
Connecticut_Warbler/348036.jpg
Connecticut_Warbler/348037.jpg
Connecticut_Warbler/348067.jpg
Connecticut_Warbler/348090.jpg
Orange_crowned_Warbler/348166.jpg
Orange_crowned_Warbler/348253.jpg
Orange_crowned_Warbler/348381.jpg
Tennessee_Warbler/348538.jpg
Tennessee_Warbler/348584.jpg
Tennessee_Warbler/348598.jpg
Tennessee_Warbler/348616.jpg
Tennessee_Warbler/348638.jpg
Nashville_Warbler/348685.jpg
Nashville_Warbler/348691.jpg
Nashville_Warbler/348693.jpg
Nashville_Warbler/348695.jpg
Nashville_Warbler/348842.jpg
Louisiana_Waterthrush/348992.jpg
Northern_Waterthrush/349047.jpg
Northern_Waterthrush/349108.jpg
Northern_Waterthrush/349120.jpg
Northern_Waterthrush/349137.jpg
Prothonotary_Warbler/349311.jpg
Prothonotary_Warbler/349367.jpg
Prothonotary_Warbler/349487.jpg
Prothonotary_Warbler/349507.jpg
Prothonotary_Warbler/349511.jpg
Ovenbird/350013.jpg
Ovenbird/350075.jpg
Ovenbird/350278.jpg
Northern_Parula/350582.jpg
Northern_Parula/350585.jpg
Black_throated_Blue_Warbler/350737.jpg
Bay_breasted_Warbler/350969.jpg
Cerulean_Warbler/351071.jpg
Hooded_Warbler/351187.jpg
Hooded_Warbler/351214.jpg
Hooded_Warbler/351215.jpg
Yellow_rumped_Warbler/351333.jpg
Yellow_rumped_Warbler/351557.jpg
Yellow_rumped_Warbler/351619.jpg
Yellow_rumped_Warbler/351773.jpg
Yellow_rumped_Warbler/351788.jpg
Prairie_Warbler/352276.jpg
Prairie_Warbler/352277.jpg
Prairie_Warbler/352287.jpg
Prairie_Warbler/352288.jpg
Blackburnian_Warbler/352648.jpg
Blackburnian_Warbler/352651.jpg
Blackburnian_Warbler/352667.jpg
Magnolia_Warbler/352840.jpg
Magnolia_Warbler/352869.jpg
Magnolia_Warbler/352985.jpg
Palm_Warbler/353311.jpg
Palm_Warbler/353356.jpg
Palm_Warbler/353511.jpg
Palm_Warbler/353548.jpg
Chestnut_sided_Warbler/353600.jpg
Chestnut_sided_Warbler/353670.jpg
Chestnut_sided_Warbler/353726.jpg
Chestnut_sided_Warbler/353737.jpg
Yellow_Warbler/353938.jpg
Yellow_Warbler/354182.jpg
Yellow_Warbler/354253.jpg
Yellow_Warbler/354323.jpg
Pine_Warbler/354426.jpg
Pine_Warbler/354536.jpg
American_Redstart/354739.jpg
American_Redstart/354765.jpg
American_Redstart/354880.jpg
American_Redstart/354932.jpg
American_Redstart/355579.jpg
Blackpoll_Warbler/355744.jpg
Blackpoll_Warbler/355800.jpg
Blackpoll_Warbler/355856.jpg
Blackpoll_Warbler/355862.jpg
Cape_May_Warbler/355894.jpg
Cape_May_Warbler/355999.jpg
Townsends_Warbler/356218.jpg
Townsends_Warbler/356223.jpg
Townsends_Warbler/356228.jpg
Black_throated_Green_Warbler/356502.jpg
Black_throated_Green_Warbler/356558.jpg
Black_throated_Green_Warbler/356584.jpg
Golden_winged_Warbler/356684.jpg
Golden_winged_Warbler/356752.jpg
Golden_winged_Warbler/356806.jpg
Golden_winged_Warbler/356812.jpg
Blue_winged_Warbler/356962.jpg
Blue_winged_Warbler/356969.jpg
Blue_winged_Warbler/356971.jpg
House_Sparrow/358126.jpg
House_Sparrow/358818.jpg
House_Sparrow/359320.jpg
American_White_Pelican/359935.jpg
American_White_Pelican/360045.jpg
American_White_Pelican/360046.jpg
Brown_Pelican/362301.jpg
Brown_Pelican/363467.jpg
Brown_Pelican/363763.jpg
Brown_Pelican/363813.jpg
Brown_Pelican/364363.jpg
Double_crested_Cormorant/364541.jpg
Double_crested_Cormorant/364685.jpg
Double_crested_Cormorant/364764.jpg
Double_crested_Cormorant/365382.jpg
Neotropic_Cormorant/366868.jpg
Neotropic_Cormorant/367584.jpg
Neotropic_Cormorant/367993.jpg
Neotropic_Cormorant/368157.jpg
Neotropic_Cormorant/368195.jpg
Great_Cormorant/370495.jpg
Great_Cormorant/370500.jpg
Great_Cormorant/370516.jpg
Great_Cormorant/370524.jpg
Brandts_Cormorant/371769.jpg
Brandts_Cormorant/371880.jpg
Brandts_Cormorant/371924.jpg
Brandts_Cormorant/372203.jpg
Wild_Turkey/372322.jpg
Wild_Turkey/373031.jpg
Wild_Turkey/374274.jpg
Ring_necked_Pheasant/375137.jpg
Ring_necked_Pheasant/375192.jpg
Ring_necked_Pheasant/375875.jpg
Ring_necked_Pheasant/376251.jpg
Ring_necked_Pheasant/376882.jpg
Ruffed_Grouse/377185.jpg
Ruffed_Grouse/377286.jpg
Ruffed_Grouse/377297.jpg
Ruffed_Grouse/377354.jpg
Greater_Sage_Grouse/377841.jpg
Greater_Sage_Grouse/378031.jpg
Sooty_Grouse/378509.jpg
Dusky_Grouse/378823.jpg
Dusky_Grouse/378838.jpg
Dusky_Grouse/378865.jpg
Spruce_Grouse/379038.jpg
Spruce_Grouse/379048.jpg
Spruce_Grouse/379166.jpg
Spruce_Grouse/379230.jpg
Willow_Ptarmigan/379441.jpg
Willow_Ptarmigan/379771.jpg
Willow_Ptarmigan/380051.jpg
Rock_Ptarmigan/381514.jpg
Rock_Ptarmigan/381648.jpg
Rock_Ptarmigan/381666.jpg
Rock_Ptarmigan/381680.jpg
Greater_Prairie_Chicken/382034.jpg
Greater_Prairie_Chicken/382063.jpg
Greater_Prairie_Chicken/382089.jpg
Greater_Prairie_Chicken/382176.jpg
Sharp_tailed_Grouse/382288.jpg
Sharp_tailed_Grouse/382463.jpg
Northern_Flicker/382710.jpg
Northern_Flicker/382764.jpg
Northern_Flicker/383173.jpg
Northern_Flicker/383289.jpg
Northern_Flicker/383640.jpg
Pileated_Woodpecker/385340.jpg
Pileated_Woodpecker/386642.jpg
Pileated_Woodpecker/387058.jpg
Pileated_Woodpecker/387106.jpg
Pileated_Woodpecker/387470.jpg
Golden_fronted_Woodpecker/387841.jpg
Golden_fronted_Woodpecker/387974.jpg
Golden_fronted_Woodpecker/387982.jpg
Golden_fronted_Woodpecker/387991.jpg
Golden_fronted_Woodpecker/388025.jpg
Red_bellied_Woodpecker/388109.jpg
Red_bellied_Woodpecker/389064.jpg
Red_bellied_Woodpecker/389565.jpg
Red_bellied_Woodpecker/390104.jpg
Red_headed_Woodpecker/390678.jpg
Red_headed_Woodpecker/390766.jpg
Red_headed_Woodpecker/390955.jpg
Red_headed_Woodpecker/391283.jpg
Acorn_Woodpecker/391501.jpg
Acorn_Woodpecker/391509.jpg
Acorn_Woodpecker/391909.jpg
Acorn_Woodpecker/392295.jpg
Acorn_Woodpecker/392614.jpg
Lewiss_Woodpecker/392895.jpg
Lewiss_Woodpecker/392981.jpg
Lewiss_Woodpecker/393052.jpg
Gila_Woodpecker/393141.jpg
Gila_Woodpecker/393176.jpg
Gila_Woodpecker/393367.jpg
Gila_Woodpecker/393376.jpg
Gila_Woodpecker/393398.jpg
White_headed_Woodpecker/393523.jpg
White_headed_Woodpecker/393581.jpg
White_headed_Woodpecker/393584.jpg
White_headed_Woodpecker/393585.jpg
White_headed_Woodpecker/393590.jpg
Black_backed_Woodpecker/393627.jpg
Black_backed_Woodpecker/393667.jpg
Black_backed_Woodpecker/393725.jpg
Black_backed_Woodpecker/393726.jpg
Black_backed_Woodpecker/393744.jpg
Red_cockaded_Woodpecker/393775.jpg
Red_cockaded_Woodpecker/393829.jpg
American_Three_toed_Woodpecker/394079.jpg
American_Three_toed_Woodpecker/394085.jpg
Nuttalls_Woodpecker/394236.jpg
Nuttalls_Woodpecker/394276.jpg
Nuttalls_Woodpecker/394435.jpg
Downy_Woodpecker/394729.jpg
Downy_Woodpecker/396687.jpg
Downy_Woodpecker/396876.jpg
Ladder_backed_Woodpecker/397082.jpg
Ladder_backed_Woodpecker/397092.jpg
Ladder_backed_Woodpecker/397154.jpg
Ladder_backed_Woodpecker/397230.jpg
Ladder_backed_Woodpecker/397337.jpg
Hairy_Woodpecker/397433.jpg
Hairy_Woodpecker/397887.jpg
Hairy_Woodpecker/398004.jpg
Red_naped_Sapsucker/399031.jpg
Red_naped_Sapsucker/399042.jpg
Red_breasted_Sapsucker/399170.jpg
Red_breasted_Sapsucker/399187.jpg
Red_breasted_Sapsucker/399215.jpg
Red_breasted_Sapsucker/399229.jpg
Yellow_bellied_Sapsucker/400004.jpg
Yellow_bellied_Sapsucker/400215.jpg
Yellow_bellied_Sapsucker/400433.jpg
Yellow_bellied_Sapsucker/400435.jpg
Yellow_bellied_Sapsucker/400560.jpg
Clarks_Grebe/400990.jpg
Clarks_Grebe/401042.jpg
Clarks_Grebe/401081.jpg
Western_Grebe/401381.jpg
Western_Grebe/401437.jpg
Western_Grebe/401632.jpg
Western_Grebe/401639.jpg
Western_Grebe/402231.jpg
Horned_Grebe/402305.jpg
Horned_Grebe/402593.jpg
Horned_Grebe/403263.jpg
Horned_Grebe/403484.jpg
Horned_Grebe/404628.jpg
Red_necked_Grebe/404744.jpg
Red_necked_Grebe/404889.jpg
Red_necked_Grebe/405520.jpg
Red_necked_Grebe/405942.jpg
Eared_Grebe/406004.jpg
Eared_Grebe/406704.jpg
Eared_Grebe/407707.jpg
Pied_billed_Grebe/408163.jpg
Pied_billed_Grebe/408312.jpg
Pied_billed_Grebe/409307.jpg
Least_Grebe/410481.jpg
Least_Grebe/410673.jpg
Least_Grebe/410801.jpg
Least_Grebe/410806.jpg
Blue_gray_Gnatcatcher/410976.jpg
Blue_gray_Gnatcatcher/411244.jpg
Blue_gray_Gnatcatcher/411840.jpg
Blue_gray_Gnatcatcher/411896.jpg
Blue_gray_Gnatcatcher/411916.jpg
Monk_Parakeet/412097.jpg
Monk_Parakeet/412721.jpg
Monk_Parakeet/412815.jpg
Monk_Parakeet/413168.jpg
Monk_Parakeet/413275.jpg
Phainopepla/413538.jpg
Phainopepla/413718.jpg
Phainopepla/413726.jpg
Phainopepla/413842.jpg
American_Coot/414142.jpg
American_Coot/414618.jpg
American_Coot/416377.jpg
Common_Gallinule/416452.jpg
Common_Gallinule/416746.jpg
Common_Gallinule/416803.jpg
Purple_Gallinule/416853.jpg
Sora/417163.jpg
Sora/417702.jpg
Sora/417757.jpg
Sora/417815.jpg
King_Rail/417966.jpg
King_Rail/418026.jpg
King_Rail/418039.jpg
Virginia_Rail/418069.jpg
Virginia_Rail/418346.jpg
Virginia_Rail/418436.jpg
Virginia_Rail/418655.jpg
Clapper_Rail/418753.jpg
Clapper_Rail/419250.jpg
Clapper_Rail/419266.jpg
Black_necked_Stilt/419808.jpg
Black_necked_Stilt/419827.jpg
Black_necked_Stilt/420599.jpg
Black_necked_Stilt/420857.jpg
Black_necked_Stilt/420954.jpg
American_Avocet/421867.jpg
American_Avocet/422760.jpg
American_Avocet/422997.jpg
Ruby_crowned_Kinglet/424348.jpg
Ruby_crowned_Kinglet/425199.jpg
Ruby_crowned_Kinglet/425446.jpg
Ruby_crowned_Kinglet/425736.jpg
Ruby_crowned_Kinglet/425745.jpg
Golden_crowned_Kinglet/426168.jpg
Golden_crowned_Kinglet/426371.jpg
Golden_crowned_Kinglet/426373.jpg
Golden_crowned_Kinglet/426970.jpg
Verdin/427125.jpg
Verdin/427265.jpg
Verdin/427318.jpg
Verdin/427507.jpg
Red_Phalarope/427723.jpg
Red_Phalarope/428179.jpg
Red_Phalarope/428250.jpg
Red_necked_Phalarope/428313.jpg
Red_necked_Phalarope/428339.jpg
Red_necked_Phalarope/429019.jpg
Red_necked_Phalarope/429706.jpg
Wilsons_Phalarope/430014.jpg
Wilsons_Phalarope/430094.jpg
Wilsons_Phalarope/430255.jpg
Wilsons_Phalarope/430720.jpg
Spotted_Sandpiper/430789.jpg
Spotted_Sandpiper/431158.jpg
Spotted_Sandpiper/431252.jpg
Spotted_Sandpiper/431385.jpg
Surfbird/431576.jpg
Surfbird/431795.jpg
Surfbird/431933.jpg
Surfbird/431937.jpg
Ruddy_Turnstone/432289.jpg
Ruddy_Turnstone/432347.jpg
Ruddy_Turnstone/432697.jpg
Ruddy_Turnstone/433773.jpg
Black_Turnstone/434622.jpg
Black_Turnstone/434623.jpg
Black_Turnstone/434717.jpg
Black_Turnstone/434941.jpg
Upland_Sandpiper/435235.jpg
Sanderling/435563.jpg
Sanderling/435911.jpg
Sanderling/436010.jpg
Sanderling/436970.jpg
Dunlin/438912.jpg
Dunlin/438914.jpg
Dunlin/439094.jpg
Dunlin/439097.jpg
Bairds_Sandpiper/440405.jpg
Bairds_Sandpiper/440515.jpg
Bairds_Sandpiper/440709.jpg
Bairds_Sandpiper/440765.jpg
Red_Knot/442020.jpg
Red_Knot/442561.jpg
White_rumped_Sandpiper/442664.jpg
White_rumped_Sandpiper/442705.jpg
White_rumped_Sandpiper/442860.jpg
White_rumped_Sandpiper/443099.jpg
Stilt_Sandpiper/443281.jpg
Stilt_Sandpiper/443292.jpg
Purple_Sandpiper/443665.jpg
Purple_Sandpiper/444437.jpg
Purple_Sandpiper/444689.jpg
Purple_Sandpiper/444788.jpg
Purple_Sandpiper/445192.jpg
Western_Sandpiper/445386.jpg
Western_Sandpiper/445806.jpg
Western_Sandpiper/445822.jpg
Western_Sandpiper/445914.jpg
Pectoral_Sandpiper/446424.jpg
Pectoral_Sandpiper/446445.jpg
Pectoral_Sandpiper/446768.jpg
Pectoral_Sandpiper/447006.jpg
Pectoral_Sandpiper/447096.jpg
Least_Sandpiper/447382.jpg
Least_Sandpiper/447460.jpg
Least_Sandpiper/447787.jpg
Least_Sandpiper/448614.jpg
Semipalmated_Sandpiper/449188.jpg
Semipalmated_Sandpiper/449207.jpg
Semipalmated_Sandpiper/449214.jpg
Semipalmated_Sandpiper/449301.jpg
Wilsons_Snipe/450184.jpg
Wilsons_Snipe/450191.jpg
Wilsons_Snipe/450612.jpg
Wilsons_Snipe/450618.jpg
Short_billed_Dowitcher/450781.jpg
Short_billed_Dowitcher/450814.jpg
Short_billed_Dowitcher/450915.jpg
Short_billed_Dowitcher/451095.jpg
Long_billed_Dowitcher/451775.jpg
Long_billed_Dowitcher/452093.jpg
Long_billed_Dowitcher/452263.jpg
Long_billed_Dowitcher/452585.jpg
Marbled_Godwit/452956.jpg
Marbled_Godwit/453132.jpg
Marbled_Godwit/453768.jpg
Marbled_Godwit/453805.jpg
Marbled_Godwit/453989.jpg
Long_billed_Curlew/455138.jpg
Long_billed_Curlew/455589.jpg
Long_billed_Curlew/455670.jpg
Long_billed_Curlew/455718.jpg
Whimbrel/455809.jpg
Whimbrel/455822.jpg
Whimbrel/457287.jpg
Whimbrel/457964.jpg
Whimbrel/458080.jpg
American_Woodcock/458261.jpg
American_Woodcock/458297.jpg
Lesser_Yellowlegs/458548.jpg
Lesser_Yellowlegs/458703.jpg
Lesser_Yellowlegs/458991.jpg
Lesser_Yellowlegs/459484.jpg
Greater_Yellowlegs/460514.jpg
Greater_Yellowlegs/460579.jpg
Greater_Yellowlegs/460643.jpg
Greater_Yellowlegs/460906.jpg
Greater_Yellowlegs/461187.jpg
Willet/462408.jpg
Willet/462527.jpg
Willet/463224.jpg
Willet/463255.jpg
Willet/463496.jpg
Solitary_Sandpiper/463747.jpg
Solitary_Sandpiper/463973.jpg
Solitary_Sandpiper/463974.jpg
Solitary_Sandpiper/463977.jpg
Solitary_Sandpiper/463978.jpg
Red_breasted_Nuthatch/464738.jpg
Red_breasted_Nuthatch/464903.jpg
Red_breasted_Nuthatch/464930.jpg
Red_breasted_Nuthatch/464962.jpg
White_breasted_Nuthatch/467836.jpg
White_breasted_Nuthatch/467951.jpg
Brown_headed_Nuthatch/468871.jpg
Brown_headed_Nuthatch/468946.jpg
Brown_headed_Nuthatch/468988.jpg
Brown_headed_Nuthatch/469111.jpg
Pygmy_Nuthatch/469253.jpg
Pygmy_Nuthatch/469417.jpg
Pygmy_Nuthatch/469445.jpg
Northern_Saw_whet_Owl/469559.jpg
Northern_Saw_whet_Owl/469679.jpg
Northern_Saw_whet_Owl/469897.jpg
Northern_Saw_whet_Owl/469903.jpg
Northern_Saw_whet_Owl/470023.jpg
Short_eared_Owl/470608.jpg
Short_eared_Owl/470683.jpg
Short_eared_Owl/472205.jpg
Short_eared_Owl/472433.jpg
Long_eared_Owl/472902.jpg
Long_eared_Owl/474433.jpg
Long_eared_Owl/474456.jpg
Burrowing_Owl/475008.jpg
Burrowing_Owl/475704.jpg
Burrowing_Owl/475986.jpg
Burrowing_Owl/477034.jpg
Snowy_Owl/477601.jpg
Snowy_Owl/478223.jpg
Snowy_Owl/478432.jpg
Snowy_Owl/478923.jpg
Great_Horned_Owl/479973.jpg
Great_Horned_Owl/480079.jpg
Great_Horned_Owl/480903.jpg
Great_Horned_Owl/481305.jpg
Ferruginous_Pygmy_Owl/482387.jpg
Ferruginous_Pygmy_Owl/482443.jpg
Ferruginous_Pygmy_Owl/482596.jpg
Ferruginous_Pygmy_Owl/482669.jpg
Ferruginous_Pygmy_Owl/482698.jpg
Eastern_Screech_Owl/483406.jpg
Eastern_Screech_Owl/483482.jpg
Eastern_Screech_Owl/484269.jpg
Western_Screech_Owl/484466.jpg
Western_Screech_Owl/484467.jpg
Western_Screech_Owl/484469.jpg
Western_Screech_Owl/484533.jpg
Western_Screech_Owl/484572.jpg
Great_Gray_Owl/484839.jpg
Great_Gray_Owl/484870.jpg
Great_Gray_Owl/485472.jpg
Great_Gray_Owl/485725.jpg
Spotted_Owl/487290.jpg
Spotted_Owl/487364.jpg
Spotted_Owl/487416.jpg
Barred_Owl/487599.jpg
Barred_Owl/487606.jpg
Barred_Owl/488241.jpg
Barred_Owl/488479.jpg
Barred_Owl/488798.jpg
Northern_Hawk_Owl/490395.jpg
Northern_Hawk_Owl/490794.jpg
Northern_Hawk_Owl/490806.jpg
Northern_Hawk_Owl/491165.jpg
European_Starling/491764.jpg
European_Starling/493026.jpg
Northern_Gannet/494414.jpg
Northern_Gannet/495305.jpg
Northern_Gannet/495823.jpg
Northern_Gannet/495857.jpg
Northern_Gannet/495973.jpg
Wrentit/496257.jpg
Wrentit/496305.jpg
Wrentit/496322.jpg
Wrentit/496329.jpg
White_Ibis/498847.jpg
White_Ibis/498940.jpg
White_Ibis/499040.jpg
White_Ibis/499563.jpg
White_Ibis/499705.jpg
White_faced_Ibis/500101.jpg
White_faced_Ibis/500119.jpg
White_faced_Ibis/500360.jpg
Glossy_Ibis/500796.jpg
Glossy_Ibis/501570.jpg
Black_chinned_Hummingbird/503009.jpg
Black_chinned_Hummingbird/503286.jpg
Black_chinned_Hummingbird/503441.jpg
Ruby_throated_Hummingbird/503987.jpg
Ruby_throated_Hummingbird/505437.jpg
Ruby_throated_Hummingbird/506319.jpg
Ruby_throated_Hummingbird/506320.jpg
Ruby_throated_Hummingbird/506321.jpg
Annas_Hummingbird/506429.jpg
Annas_Hummingbird/506555.jpg
Annas_Hummingbird/507567.jpg
Annas_Hummingbird/507907.jpg
Costas_Hummingbird/509138.jpg
Costas_Hummingbird/509198.jpg
Costas_Hummingbird/509465.jpg
Broad_billed_Hummingbird/509907.jpg
Broad_billed_Hummingbird/509969.jpg
Broad_billed_Hummingbird/510069.jpg
Broad_tailed_Hummingbird/510498.jpg
Broad_tailed_Hummingbird/510627.jpg
Broad_tailed_Hummingbird/510665.jpg
Rufous_Hummingbird/511449.jpg
Rufous_Hummingbird/511958.jpg
Rufous_Hummingbird/512223.jpg
Rufous_Hummingbird/512857.jpg
Rufous_Hummingbird/512892.jpg
Allens_Hummingbird/513139.jpg
Allens_Hummingbird/513386.jpg
Cactus_Wren/514254.jpg
Cactus_Wren/514376.jpg
Cactus_Wren/514461.jpg
Cactus_Wren/514547.jpg
Canyon_Wren/514600.jpg
Canyon_Wren/514678.jpg
Canyon_Wren/514705.jpg
Canyon_Wren/514759.jpg
Marsh_Wren/514957.jpg
Marsh_Wren/515112.jpg
Marsh_Wren/515137.jpg
Marsh_Wren/515204.jpg
Marsh_Wren/515772.jpg
Sedge_Wren/515883.jpg
Sedge_Wren/516102.jpg
Rock_Wren/516341.jpg
Rock_Wren/516346.jpg
Rock_Wren/516526.jpg
Bewicks_Wren/516922.jpg
Bewicks_Wren/516971.jpg
Bewicks_Wren/517189.jpg
Carolina_Wren/517360.jpg
Carolina_Wren/517369.jpg
Carolina_Wren/517782.jpg
Carolina_Wren/518318.jpg
Carolina_Wren/518595.jpg
House_Wren/518835.jpg
House_Wren/518972.jpg
House_Wren/520400.jpg
Winter_Wren/520753.jpg
Winter_Wren/520768.jpg
Pacific_Wren/520920.jpg
Elegant_Trogon/521173.jpg
Elegant_Trogon/521208.jpg
Elegant_Trogon/521209.jpg
Veery/521269.jpg
Veery/521489.jpg
Veery/521523.jpg
Hermit_Thrush/521695.jpg
Hermit_Thrush/521730.jpg
Hermit_Thrush/522360.jpg
Hermit_Thrush/522386.jpg
Gray_cheeked_Thrush/522620.jpg
Gray_cheeked_Thrush/522623.jpg
Swainsons_Thrush/522961.jpg
Swainsons_Thrush/522984.jpg
Swainsons_Thrush/523160.jpg
Swainsons_Thrush/523260.jpg
Wood_Thrush/523354.jpg
Wood_Thrush/523364.jpg
Wood_Thrush/523379.jpg
Wood_Thrush/523541.jpg
Varied_Thrush/523597.jpg
Varied_Thrush/523805.jpg
Varied_Thrush/523985.jpg
Varied_Thrush/524094.jpg
Varied_Thrush/524099.jpg
Mountain_Bluebird/524360.jpg
Mountain_Bluebird/524561.jpg
Mountain_Bluebird/524908.jpg
Mountain_Bluebird/524983.jpg
Western_Bluebird/525428.jpg
Western_Bluebird/526007.jpg
Western_Bluebird/526459.jpg
Western_Bluebird/526460.jpg
Eastern_Bluebird/526765.jpg
Eastern_Bluebird/526852.jpg
Eastern_Bluebird/526976.jpg
Eastern_Bluebird/527918.jpg
Eastern_Bluebird/528766.jpg
American_Robin/529670.jpg
American_Robin/530040.jpg
American_Robin/530041.jpg
American_Robin/530839.jpg
Olive_sided_Flycatcher/531191.jpg
Olive_sided_Flycatcher/531208.jpg
Olive_sided_Flycatcher/531333.jpg
Olive_sided_Flycatcher/531334.jpg
Western_Wood_Pewee/531459.jpg
Western_Wood_Pewee/531481.jpg
Western_Wood_Pewee/531632.jpg
Western_Wood_Pewee/531755.jpg
Western_Wood_Pewee/531757.jpg
Eastern_Wood_Pewee/531868.jpg
Eastern_Wood_Pewee/531884.jpg
Eastern_Wood_Pewee/531987.jpg
Eastern_Wood_Pewee/532039.jpg
Eastern_Wood_Pewee/532201.jpg
Pacific_slope_Flycatcher/532532.jpg
Pacific_slope_Flycatcher/532677.jpg
Yellow_bellied_Flycatcher/532714.jpg
Yellow_bellied_Flycatcher/532819.jpg
Yellow_bellied_Flycatcher/532822.jpg
Yellow_bellied_Flycatcher/532827.jpg
Hammonds_Flycatcher/533074.jpg
Hammonds_Flycatcher/533076.jpg
Hammonds_Flycatcher/533077.jpg
Least_Flycatcher/533233.jpg
Least_Flycatcher/533340.jpg
Least_Flycatcher/533345.jpg
Least_Flycatcher/533353.jpg
Least_Flycatcher/533400.jpg
Dusky_Flycatcher/533523.jpg
Dusky_Flycatcher/533553.jpg
Dusky_Flycatcher/533558.jpg
Dusky_Flycatcher/533568.jpg
Cordilleran_Flycatcher/533658.jpg
Willow_Flycatcher/533773.jpg
Willow_Flycatcher/533774.jpg
Willow_Flycatcher/533810.jpg
Willow_Flycatcher/534012.jpg
Acadian_Flycatcher/534090.jpg
Acadian_Flycatcher/534091.jpg
Acadian_Flycatcher/534108.jpg
Gray_Flycatcher/534274.jpg
Gray_Flycatcher/534310.jpg
Gray_Flycatcher/534326.jpg
Gray_Flycatcher/534358.jpg
Gray_Flycatcher/534370.jpg
Vermilion_Flycatcher/534690.jpg
Vermilion_Flycatcher/534784.jpg
Vermilion_Flycatcher/534879.jpg
Vermilion_Flycatcher/535302.jpg
Vermilion_Flycatcher/536084.jpg
Black_Phoebe/536955.jpg
Black_Phoebe/537242.jpg
Black_Phoebe/537318.jpg
Black_Phoebe/537701.jpg
Black_Phoebe/537925.jpg
Eastern_Phoebe/538056.jpg
Eastern_Phoebe/538589.jpg
Eastern_Phoebe/539360.jpg
Eastern_Phoebe/539606.jpg
Says_Phoebe/539893.jpg
Says_Phoebe/540039.jpg
Ash_throated_Flycatcher/540459.jpg
Ash_throated_Flycatcher/540522.jpg
Ash_throated_Flycatcher/540581.jpg
Ash_throated_Flycatcher/540657.jpg
Great_Crested_Flycatcher/540705.jpg
Great_Crested_Flycatcher/540780.jpg
Great_Crested_Flycatcher/541314.gif
Great_Crested_Flycatcher/541362.jpg
Brown_crested_Flycatcher/541516.jpg
Brown_crested_Flycatcher/541638.jpg
Brown_crested_Flycatcher/541706.jpg
Brown_crested_Flycatcher/541728.jpg
Brown_crested_Flycatcher/541737.jpg
Great_Kiskadee/541839.jpg
Great_Kiskadee/542304.jpg
Great_Kiskadee/542798.jpg
Great_Kiskadee/542834.jpg
Couchs_Kingbird/544352.jpg
Couchs_Kingbird/544378.jpg
Couchs_Kingbird/544404.jpg
Gray_Kingbird/544466.jpg
Gray_Kingbird/544514.jpg
Gray_Kingbird/544558.jpg
Gray_Kingbird/544761.jpg
Gray_Kingbird/544765.jpg
Scissor_tailed_Flycatcher/544968.jpg
Scissor_tailed_Flycatcher/545061.jpg
Scissor_tailed_Flycatcher/545177.jpg
Scissor_tailed_Flycatcher/545474.jpg
Scissor_tailed_Flycatcher/545643.jpg
Tropical_Kingbird/546721.jpg
Tropical_Kingbird/546749.jpg
Tropical_Kingbird/546852.jpg
Tropical_Kingbird/547217.jpg
Eastern_Kingbird/547441.jpg
Eastern_Kingbird/547483.jpg
Eastern_Kingbird/547490.jpg
Western_Kingbird/549474.jpg
Western_Kingbird/549508.jpg
Western_Kingbird/549678.jpg
Cassins_Kingbird/550030.jpg
Bells_Vireo/552110.jpg
Cassins_Vireo/552260.jpg
Cassins_Vireo/552309.jpg
Yellow_throated_Vireo/552392.jpg
Yellow_throated_Vireo/552416.jpg
Yellow_throated_Vireo/552479.jpg
Yellow_throated_Vireo/552522.jpg
Warbling_Vireo/552597.jpg
Warbling_Vireo/552975.jpg
Warbling_Vireo/553059.jpg
Warbling_Vireo/553061.jpg
White_eyed_Vireo/553507.jpg
White_eyed_Vireo/553558.jpg
White_eyed_Vireo/553605.jpg
Huttons_Vireo/553730.jpg
Huttons_Vireo/553856.jpg
Huttons_Vireo/553888.jpg
Huttons_Vireo/553909.jpg
Red_eyed_Vireo/553944.jpg
Red_eyed_Vireo/554276.jpg
Red_eyed_Vireo/554283.jpg
Red_eyed_Vireo/554471.jpg
Red_eyed_Vireo/554535.jpg
Philadelphia_Vireo/554697.jpg
Philadelphia_Vireo/554715.jpg
Philadelphia_Vireo/554788.jpg
Philadelphia_Vireo/554792.jpg
Philadelphia_Vireo/554796.jpg
Plumbeous_Vireo/554899.jpg
Plumbeous_Vireo/554902.jpg
Blue_headed_Vireo/555157.jpg
Blue_headed_Vireo/555225.jpg
Blue_headed_Vireo/555232.jpg
Blue_headed_Vireo/555243.jpg
Black_crowned_Night_Heron/555822.jpg
Black_crowned_Night_Heron/556956.jpg
Black_crowned_Night_Heron/556959.jpg
Black_crowned_Night_Heron/557614.jpg
Semipalmated_Plover/558727.jpg
Semipalmated_Plover/558847.jpg
Semipalmated_Plover/558858.jpg
Semipalmated_Plover/559447.jpg
Killdeer/560046.jpg
Killdeer/560054.jpg
Killdeer/561534.jpg
Killdeer/562435.jpg
Cliff_Swallow/563192.jpg
Purple_Martin/563409.jpg
Townsends_Solitaire/564382.jpg
Townsends_Solitaire/564447.jpg
Townsends_Solitaire/564517.jpg
Least_Bittern/564964.jpg
Least_Bittern/564973.jpg
Yellow_crowned_Night_Heron/565315.jpg
Yellow_crowned_Night_Heron/566085.jpg
Yellow_crowned_Night_Heron/566509.jpg
Yellow_crowned_Night_Heron/566959.jpg
Cassins_Finch/567594.jpg
Cassins_Finch/567757.jpg
Gambels_Quail/567993.jpg
Gambels_Quail/568019.jpg
Gambels_Quail/568106.jpg
Gambels_Quail/568222.jpg
Zone_tailed_Hawk/568344.jpg
Chestnut_collared_Longspur/571975.jpg
Mountain_Plover/572074.jpg
Chihuahuan_Raven/575435.jpg
Saltmarsh_Sparrow/575563.jpg
Olive_Sparrow/576347.jpg
Olive_Sparrow/576482.jpg
Yellow_eyed_Junco/577170.jpg
Canyon_Towhee/577927.jpg
Cassins_Sparrow/578258.jpg
Cassins_Sparrow/578360.jpg
Cassins_Sparrow/578405.jpg
Cassins_Sparrow/578424.jpg
Cassins_Sparrow/578499.jpg
Black_chinned_Sparrow/578606.jpg
Black_chinned_Sparrow/578742.jpg
Hoary_Redpoll/579565.jpg
Purple_Finch/580791.jpg
Purple_Finch/581206.jpg
Brown_capped_Rosy_Finch/582121.jpg
Audubons_Oriole/582203.jpg
Audubons_Oriole/582216.jpg
Audubons_Oriole/582279.jpg
Audubons_Oriole/582394.jpg
Scaled_Quail/582711.jpg
Scaled_Quail/582714.jpg
Scaled_Quail/582715.jpg
Bridled_Titmouse/582901.jpg
Bridled_Titmouse/582935.jpg
Mourning_Warbler/585528.jpg
Mourning_Warbler/586285.jpg
Mourning_Warbler/586542.jpg
Worm_eating_Warbler/587083.jpg
Worm_eating_Warbler/587348.jpg
Worm_eating_Warbler/587371.jpg
Louisiana_Waterthrush/588428.jpg
Louisiana_Waterthrush/588764.jpg
Louisiana_Waterthrush/589167.jpg
Cerulean_Warbler/590344.jpg
Hooded_Warbler/590876.jpg
White_tailed_Ptarmigan/592628.jpg
Rock_Sandpiper/596107.jpg
Rock_Sandpiper/596409.jpg
Elf_Owl/597046.jpg
Elf_Owl/597072.jpg
Dusky_Flycatcher/599261.jpg
Cordilleran_Flycatcher/599594.jpg
Plumbeous_Vireo/600257.jpg
Plumbeous_Vireo/600264.jpg
Plumbeous_Vireo/600295.jpg