"""PyTorch ALIGN model."""

import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    BaseModelOutputWithPoolingAndNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_align import AlignConfig, AlignTextConfig, AlignVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class AlignVisionModelOutput(ModelOutput):
    r"""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r    r$   r$   W/var/www/auris/lib/python3.10/site-packages/transformers/models/align/modeling_align.pyr   )   s
   
 r   c                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )AlignTextModelOutputa  
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Ntext_embedsr   r   
attentions)r   r   r   r    r'   r   r!   r"   r#   r   r   r   r(   r$   r$   r$   r%   r&   ?   s   
 r&   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )AlignOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The output of [`AlignVisionModel`].
        text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
            The output of the [`AlignTextModel`].
        vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
            The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr'   r   text_model_outputvision_model_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r-   r.   N)getattrto_tuple).0kselfr$   r%   	<genexpr>{   s
    
z'AlignOutput.to_tuple.<locals>.<genexpr>)tuplekeysr4   r$   r4   r%   r1   z   s   zAlignOutput.to_tuple)r   r   r   r    r*   r   r!   r"   r#   r+   r,   r'   r   r-   r   r.   r   r   r   r1   r$   r$   r$   r%   r)   \   s   
 r)   logitsr/   c                 C   s"   t jj| tjt| | jdddS )Ndeviceg?)Zlabel_smoothing)r   
functionalZcross_entropyr!   arangelenr;   )r9   r$   r$   r%   contrastive_loss   s   "r?   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r?   t)r@   Zcaption_lossZ
image_lossr$   r$   r%   
align_loss   s   rB   confignum_channelsc                 C   sJ   | j }|| j9 }t|t||d  | | }|d| k r!||7 }t|S )z<
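

# ---------------------------------------------------------------------------
# Hedged sketch (not part of the upstream module): a self-contained sanity
# check of `align_loss`. On a similarity matrix with a strong diagonal the
# matched pairs line up, so the loss should be clearly lower than on the same
# matrix with its rows permuted. `_align_loss_sanity_check` is an illustrative
# helper introduced here, not an upstream function.
def _align_loss_sanity_check() -> None:
    # Simulated logits for 4 matched image-text pairs: +5 on the diagonal, -5 elsewhere.
    good = torch.full((4, 4), -5.0) + 10.0 * torch.eye(4)
    # The same logits with rows explicitly permuted, breaking every pairing.
    bad = good[[1, 2, 3, 0]]
    assert align_loss(good) < align_loss(bad)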


def round_filters(config: AlignVisionConfig, num_channels: int):
    r"""
    Round number of filters based on depth multiplier.
    """
    divisor = config.depth_divisor
    num_channels *= config.width_coefficient
    new_dim = max(divisor, int(num_channels + divisor / 2) // divisor * divisor)

    # Make sure that rounding down does not drop below 90% of the scaled channel count.
    if new_dim < 0.9 * num_channels:
        new_dim += divisor

    return int(new_dim)


def correct_pad(kernel_size: Union[int, Tuple], adjust: bool = True):
    r"""
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    """
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size, kernel_size)

    correct = (kernel_size[0] // 2, kernel_size[1] // 2)
    if adjust:
        return (correct[1] - 1, correct[1], correct[0] - 1, correct[0])
    else:
        return (correct[1], correct[1], correct[0], correct[0])
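

# ---------------------------------------------------------------------------
# Hedged worked example (not part of the upstream module), assuming a config
# with `width_coefficient=2.0` and `depth_divisor=8`: `round_filters` scales a
# base of 32 channels to 64 (already a multiple of 8), while `correct_pad`
# returns the `nn.ZeroPad2d` tuple (left, right, top, bottom) applied before
# stride-2 depthwise convolutions. `_padding_example` is illustrative only.
def _padding_example() -> None:
    # kernel 3 -> half-size 1; `adjust=True` pads one pixel less on left/top.
    assert correct_pad(3, adjust=True) == (0, 1, 0, 1)
    # kernel 5 -> symmetric padding of 2 on every side when `adjust=False`.
    assert correct_pad(5, adjust=False) == (2, 2, 2, 2)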


class AlignVisionEmbeddings(nn.Module):
    r"""
    A module that corresponds to the stem module of the original work.
    """

    def __init__(self, config: AlignVisionConfig):
        super().__init__()

        self.out_dim = round_filters(config, 32)
        self.padding = nn.ZeroPad2d(padding=(0, 1, 0, 1))
        self.convolution = nn.Conv2d(
            config.num_channels, self.out_dim, kernel_size=3, stride=2, padding="valid", bias=False
        )
        self.batchnorm = nn.BatchNorm2d(self.out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        features = self.padding(pixel_values)
        features = self.convolution(features)
        features = self.batchnorm(features)
        features = self.activation(features)

        return features


class AlignVisionDepthwiseConv2d(nn.Conv2d):
    def __init__(
        self,
        in_channels,
        depth_multiplier=1,
        kernel_size=3,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
        padding_mode="zeros",
    ):
        # A depthwise convolution is a grouped convolution with one group per input channel.
        out_channels = in_channels * depth_multiplier
        super().__init__(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
            padding_mode=padding_mode,
        )


class AlignVisionExpansionLayer(nn.Module):
    r"""
    This corresponds to the expansion phase of each block in the original implementation.
    """

    def __init__(self, config: AlignVisionConfig, in_dim: int, out_dim: int, stride: int):
        super().__init__()
        self.expand_conv = nn.Conv2d(in_channels=in_dim, out_channels=out_dim, kernel_size=1, padding="same", bias=False)
        self.expand_bn = nn.BatchNorm2d(num_features=out_dim, eps=config.batch_norm_eps)
        self.expand_act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        # Expand phase: a 1x1 convolution widens the channel dimension by the expand ratio.
        hidden_states = self.expand_conv(hidden_states)
        hidden_states = self.expand_bn(hidden_states)
        hidden_states = self.expand_act(hidden_states)

        return hidden_states


class AlignVisionDepthwiseLayer(nn.Module):
    r"""
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    """

    def __init__(self, config: AlignVisionConfig, in_dim: int, stride: int, kernel_size: int, adjust_padding: bool):
        super().__init__()
        self.stride = stride
        conv_pad = "valid" if self.stride == 2 else "same"
        padding = correct_pad(kernel_size, adjust=adjust_padding)

        self.depthwise_conv_pad = nn.ZeroPad2d(padding=padding)
        self.depthwise_conv = AlignVisionDepthwiseConv2d(
            in_dim, kernel_size=kernel_size, stride=stride, padding=conv_pad, bias=False
        )
        self.depthwise_norm = nn.BatchNorm2d(
            num_features=in_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
        )
        self.depthwise_act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        # Explicit zero-padding is only needed in the downsampling (stride 2) case.
        if self.stride == 2:
            hidden_states = self.depthwise_conv_pad(hidden_states)

        hidden_states = self.depthwise_conv(hidden_states)
        hidden_states = self.depthwise_norm(hidden_states)
        hidden_states = self.depthwise_act(hidden_states)

        return hidden_states


class AlignVisionSqueezeExciteLayer(nn.Module):
    r"""
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    """

    def __init__(self, config: AlignVisionConfig, in_dim: int, expand_dim: int, expand: bool = False):
        super().__init__()
        self.dim = expand_dim if expand else in_dim
        self.dim_se = max(1, int(in_dim * config.squeeze_expansion_ratio))

        self.squeeze = nn.AdaptiveAvgPool2d(output_size=1)
        self.reduce = nn.Conv2d(in_channels=self.dim, out_channels=self.dim_se, kernel_size=1, padding="same")
        self.expand = nn.Conv2d(in_channels=self.dim_se, out_channels=self.dim, kernel_size=1, padding="same")
        self.act_reduce = ACT2FN[config.hidden_act]
        self.act_expand = nn.Sigmoid()

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        inputs = hidden_states
        # Squeeze: global average pooling to a per-channel descriptor, then a bottleneck projection.
        hidden_states = self.squeeze(hidden_states)
        hidden_states = self.reduce(hidden_states)
        hidden_states = self.act_reduce(hidden_states)

        # Excite: sigmoid gates in [0, 1] rescale the input feature map channel-wise.
        hidden_states = self.expand(hidden_states)
        hidden_states = self.act_expand(hidden_states)
        hidden_states = torch.mul(inputs, hidden_states)

        return hidden_states


class AlignVisionFinalBlockLayer(nn.Module):
    r"""
    This corresponds to the final phase of each block in the original implementation.
    """

    def __init__(self, config: AlignVisionConfig, in_dim: int, out_dim: int, stride: int, drop_rate: float, id_skip: bool):
        super().__init__()
        self.apply_dropout = stride == 1 and not id_skip
        self.project_conv = nn.Conv2d(in_channels=in_dim, out_channels=out_dim, kernel_size=1, padding="same", bias=False)
        self.project_bn = nn.BatchNorm2d(
            num_features=out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
        )
        self.dropout = nn.Dropout(p=drop_rate)

    def forward(self, embeddings: torch.FloatTensor, hidden_states: torch.FloatTensor) -> torch.Tensor:
        hidden_states = self.project_conv(hidden_states)
        hidden_states = self.project_bn(hidden_states)

        # Stochastic depth: drop the residual branch, then add the block input back in.
        if self.apply_dropout:
            hidden_states = self.dropout(hidden_states)
            hidden_states = hidden_states + embeddings

        return hidden_states


class AlignVisionBlock(nn.Module):
    r"""
    This corresponds to the block module of the original EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    """

    def __init__(
        self,
        config: AlignVisionConfig,
        in_dim: int,
        out_dim: int,
        stride: int,
        expand_ratio: int,
        kernel_size: int,
        drop_rate: float,
        id_skip: bool,
        adjust_padding: bool,
    ):
        super().__init__()
        self.expand_ratio = expand_ratio
        self.expand = True if self.expand_ratio != 1 else False
        expand_in_dim = in_dim * expand_ratio

        if self.expand:
            self.expansion = AlignVisionExpansionLayer(config=config, in_dim=in_dim, out_dim=expand_in_dim, stride=stride)

        self.depthwise_conv = AlignVisionDepthwiseLayer(
            config=config,
            in_dim=expand_in_dim if self.expand else in_dim,
            stride=stride,
            kernel_size=kernel_size,
            adjust_padding=adjust_padding,
        )
        self.squeeze_excite = AlignVisionSqueezeExciteLayer(
            config=config, in_dim=in_dim, expand_dim=expand_in_dim, expand=self.expand
        )
        self.projection = AlignVisionFinalBlockLayer(
            config=config,
            in_dim=expand_in_dim if self.expand else in_dim,
            out_dim=out_dim,
            stride=stride,
            drop_rate=drop_rate,
            id_skip=id_skip,
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        embeddings = hidden_states
        # Expansion and depthwise convolution phase
        if self.expand_ratio != 1:
            hidden_states = self.expansion(hidden_states)
        hidden_states = self.depthwise_conv(hidden_states)

        # Squeeze and excite phase
        hidden_states = self.squeeze_excite(hidden_states)
        hidden_states = self.projection(embeddings, hidden_states)
        return hidden_states


class AlignVisionEncoder(nn.Module):
    r"""
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    """

    def __init__(self, config: AlignVisionConfig):
        super().__init__()
        self.depth_coefficient = config.depth_coefficient

        def round_repeats(repeats):
            # Round number of block repeats based on depth multiplier.
            return int(math.ceil(self.depth_coefficient * repeats))

        num_base_blocks = len(config.in_channels)
        num_blocks = sum(round_repeats(n) for n in config.num_block_repeats)

        curr_block_num = 0
        blocks = []
        for i in range(num_base_blocks):
            in_dim = round_filters(config, config.in_channels[i])
            out_dim = round_filters(config, config.out_channels[i])
            stride = config.strides[i]
            kernel_size = config.kernel_sizes[i]
            expand_ratio = config.expand_ratios[i]

            for j in range(round_repeats(config.num_block_repeats[i])):
                # Only the first block of a stage changes resolution and width; repeats keep stride 1.
                id_skip = True if j == 0 else False
                stride = 1 if j > 0 else stride
                in_dim = out_dim if j > 0 else in_dim
                adjust_padding = False if curr_block_num in config.depthwise_padding else True
                drop_rate = config.drop_connect_rate * curr_block_num / num_blocks

                block = AlignVisionBlock(
                    config=config,
                    in_dim=in_dim,
                    out_dim=out_dim,
                    stride=stride,
                    kernel_size=kernel_size,
                    expand_ratio=expand_ratio,
                    drop_rate=drop_rate,
                    id_skip=id_skip,
                    adjust_padding=adjust_padding,
                )
                blocks.append(block)
                curr_block_num += 1

        self.blocks = nn.ModuleList(blocks)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> BaseModelOutputWithNoAttention:
        all_hidden_states = (hidden_states,) if output_hidden_states else None

        for block in self.blocks:
            hidden_states = block(hidden_states)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)


class AlignTextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with the TensorFlow variable name and allow
        # loading TensorFlow checkpoints.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Default to the all-zero buffered token_type_ids so callers that omit them still work.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class AlignTextSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys and values come from an
        # encoder; the attention mask needs to be such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k, v, cross_attentions
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            # Cache the key/value states so cross-attention and decoding can reuse them.
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the AlignTextModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs


class AlignTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


ALIGN_TEXT_SELF_ATTENTION_CLASSES = {
    "eager": AlignTextSelfAttention,
}


class AlignTextAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = ALIGN_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type
        )
        self.output = AlignTextSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class AlignTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class AlignTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class AlignTextLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = AlignTextAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = AlignTextAttention(config, position_embedding_type="absolute")
        self.intermediate = AlignTextIntermediate(config)
        self.output = AlignTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is a tuple of self-attn cache
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights

            # add cross-attn cache to positions 3,4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class AlignTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([AlignTextLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class AlignTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class AlignPreTrainedModel(PreTrainedModel):
    config_class = AlignConfig
    base_model_prefix = "align"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, AlignModel):
            nn.init.xavier_uniform_(module.text_projection.weight)
            module.text_projection.bias.data.zero_()
            module.text_projection._is_hf_initialized = True
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
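

# ---------------------------------------------------------------------------
# Hedged sketch (not part of the upstream module): a standalone mirror of the
# "relative_key" branch in `AlignTextSelfAttention`, showing the shapes
# involved — scores come out as (batch, heads, query_len, key_len).
# `_relative_key_scores` is an illustrative helper only.
def _relative_key_scores(
    query_layer: torch.Tensor,  # (batch, heads, seq_len, head_dim)
    distance_embedding: nn.Embedding,  # vocabulary of 2 * max_position_embeddings - 1 distances
    max_position_embeddings: int,
) -> torch.Tensor:
    seq_len = query_layer.shape[2]
    position_ids_l = torch.arange(seq_len, device=query_layer.device).view(-1, 1)
    position_ids_r = torch.arange(seq_len, device=query_layer.device).view(1, -1)
    # Shift signed distances into the non-negative embedding index range.
    positional_embedding = distance_embedding(position_ids_l - position_ids_r + max_position_embeddings - 1)
    return torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding.to(query_layer.dtype))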


@auto_docstring(
    custom_intro="""
    The text model from ALIGN without any head or projection on top.
    """
)
class AlignTextModel(AlignPreTrainedModel):
    config_class = AlignTextConfig
    _no_split_modules = ["AlignTextEmbeddings"]

    def __init__(self, config: AlignTextConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = AlignTextEmbeddings(config)
        self.encoder = AlignTextEncoder(config)

        self.pooler = AlignTextPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Make the 2D attention mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
        # for all encoder layers at once.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
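

# ---------------------------------------------------------------------------
# Hedged sketch (not part of the upstream module): `pooler_output` is the
# first-token hidden state passed through a tanh-activated dense layer, so it
# always has shape `(batch_size, hidden_size)` regardless of sequence length.
# `_pooled_text_state` is an illustrative helper only.
def _pooled_text_state(model: AlignTextModel, input_ids: torch.LongTensor) -> torch.Tensor:
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
    assert outputs.pooler_output.shape == (input_ids.shape[0], model.config.hidden_size)
    return outputs.pooler_output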


@auto_docstring(
    custom_intro="""
    The vision model from ALIGN without any head or projection on top.
    """
)
class AlignVisionModel(AlignPreTrainedModel):
    config_class = AlignVisionConfig
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False

    def __init__(self, config: AlignVisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = AlignVisionEmbeddings(config)
        self.encoder = AlignVisionEncoder(config)

        # Final pooling layer
        if config.pooling_type == "mean":
            self.pooler = nn.AvgPool2d(config.hidden_dim, ceil_mode=True)
        elif config.pooling_type == "max":
            self.pooler = nn.MaxPool2d(config.hidden_dim, ceil_mode=True)
        else:
            raise ValueError(f"config.pooling must be one of ['mean', 'max'] got {config.pooling}")

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.convolution

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)
        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Apply pooling
        last_hidden_state = encoder_outputs[0]
        pooled_output = self.pooler(last_hidden_state)
        # Reshape (batch_size, projection_dim, 1, 1) -> (batch_size, projection_dim)
        pooled_output = pooled_output.reshape(pooled_output.shape[:2])

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@auto_docstring
class AlignModel(AlignPreTrainedModel):
    config_class = AlignConfig

    def __init__(self, config: AlignConfig):
        super().__init__(config)

        if not isinstance(config.text_config, AlignTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type AlignTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, AlignVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type AlignVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size

        self.text_model = AlignTextModel(text_config)
        self.vision_model = AlignVisionModel(vision_config)

        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim)
        self.temperature = nn.Parameter(torch.tensor(self.config.temperature_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AlignTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # ALIGN projects the first-token hidden state, not the tanh pooler output.
        last_hidden_state = text_outputs[0][:, 0, :]
        text_features = self.text_projection(last_hidden_state)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the pooling layer to the output of [`AlignVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_features = vision_outputs[1]  # pooled_output

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, AlignOutput]:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        text_embeds = text_outputs[0][:, 0, :]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits, scaled by the learned temperature
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) / self.temperature
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = align_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return AlignOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = ["AlignPreTrainedModel", "AlignTextModel", "AlignVisionModel", "AlignModel"]