
    eTh                     P   S r SSKrSSKJr  SSKJrJrJrJr  SSK	r	SSK
r	SSK	Jr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJrJrJr  SSKJrJrJr  SSKJrJrJ r   \RB                  " \"5      r#\ " S S\5      5       r$\ " S S\5      5       r%\ " S S\5      5       r&S\	RN                  S\	RN                  4S jr(S\	RN                  S\	RN                  4S jr)S\ S\*4S jr+SNS\\*\4   S\,4S jjr- " S S \R\                  5      r/ " S! S"\R`                  5      r1 " S# S$\R\                  5      r2 " S% S&\R\                  5      r3 " S' S(\R\                  5      r4 " S) S*\R\                  5      r5 " S+ S,\R\                  5      r6 " S- S.\R\                  5      r7 " S/ S0\R\                  5      r8 " S1 S2\R\                  5      r9 " S3 S4\R\                  5      r:S5\90r; " S6 S7\R\                  5      r< " S8 S9\R\                  5      r= " S: S;\R\                  5      r> " S< S=\R\                  5      r? " S> S?\R\                  5      r@ " S@ SA\R\                  5      rA\ " SB SC\5      5       rB\" SDSE9 " SF SG\B5      5       rC\" SHSE9 " SI SJ\B5      5       rD\ " SK SL\B5      5       rE/ SMQrFg)OzPyTorch ALIGN model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN)BaseModelOutputWithNoAttention)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions(BaseModelOutputWithPoolingAndNoAttention)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )AlignConfigAlignTextConfigAlignVisionConfigc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Srg)AlignVisionModelOutput)   a  
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nimage_embedslast_hidden_statehidden_states )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   __static_attributes__r        `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/align/modeling_align.pyr   r   )   sN     15L(5,,-459x 1 1298<M8E%"3"345<r*   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	AlignTextModelOutput?   a  
Base class for text model's outputs that also contains a pooling of the last hidden states.

Args:
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Ntext_embedsr   r   
attentionsr    )r!   r"   r#   r$   r%   r/   r   r&   r'   r(   r   r   r   r0   r)   r    r*   r+   r-   r-   ?   sh    * 04K%++,359x 1 1298<M8E%"3"345<59Ju00129r*   r-   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)AlignOutput\   a  
Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
        The output of the [`AlignTextModel`].
    vision_model_output(`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
Nlosslogits_per_imagelogits_per_textr/   r   text_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r7   r8   N)getattrto_tuple).0kselfs     r+   	<genexpr>'AlignOutput.to_tuple.<locals>.<genexpr>{   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysr@   s   `r+   r=   AlignOutput.to_tuplez   s#     
YY[
 
 	
r*   r    )r!   r"   r#   r$   r%   r4   r   r&   r'   r(   r5   r6   r/   r   r7   r   r8   r   r   r   r=   r)   r    r*   r+   r2   r2   \   s    ( )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-4FJCJDHAH
%* 
r*   r2   logitsr9   c                     [         R                  R                  U [        R                  " [        U 5      U R                  S9SS9$ )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr&   arangelenrJ   )rG   s    r+   contrastive_lossrP      s5    ==&&vu||CKPVP]P]/^ps&ttr*   
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)rP   t)rQ   caption_loss
image_losss      r+   
align_lossrV      s*    #J/L!*,,.1J%,,r*   confignum_channelsc                     U R                   nXR                  -  n[        U[        XS-  -   5      U-  U-  5      nUSU-  :  a  X2-  n[        U5      $ )z4
Round number of filters based on depth multiplier.
   g?)depth_divisorwidth_coefficientmaxint)rW   rX   divisornew_dims       r+   round_filtersra      s`     ""G,,,L'3|k9:gEOPG |##w<r*   kernel_sizeadjustc                     [        U [        5      (       a  X 4n U S   S-  U S   S-  4nU(       a  US   S-
  US   US   S-
  US   4$ US   US   US   US   4$ )a.  
Utility function to get the tuple padding value for the depthwise convolution.

Args:
    kernel_size (`int` or `tuple`):
        Kernel size of the convolution layers.
    adjust (`bool`, *optional*, defaults to `True`):
        Adjusts padding value to apply to right and bottom sides of the input.
r   rZ   r   )
isinstancer^   )rb   rc   corrects      r+   correct_padrg      s~     +s##"01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r*   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	AlignVisionEmbeddings   zD
A module that corresponds to the stem module of the original work.
rW   c           	      |  > [         TU ]  5         [        US5      U l        [        R
                  " SS9U l        [        R                  " UR                  U R                  SSSSS9U l	        [        R                  " U R                  UR                  UR                  S	9U l        [        UR                     U l        g )
N    )r   r   r   r   paddingr	   rZ   validFrb   stridern   bias)epsmomentum)super__init__ra   out_dimr   	ZeroPad2drn   Conv2drX   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr
   
hidden_act
activationr@   rW   	__class__s     r+   rv   AlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r*   pixel_valuesr9   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)rn   rz   r~   r   )r@   r   featuress      r+   forwardAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r*   )r   r~   rz   rw   rn   )r!   r"   r#   r$   r%   r   rv   r&   Tensorr   r)   __classcell__r   s   @r+   ri   ri      s5    	40 	4ELL U\\  r*   ri   c                   :   ^  \ rS rSr       SU 4S jjrSrU =r$ )AlignVisionDepthwiseConv2d   c	                 8   > X-  n	[         T
U ]  UU	UUUUUUUS9	  g )N)	in_channelsout_channelsrb   rq   rn   dilationgroupsrr   padding_mode)ru   rv   )r@   r   depth_multiplierrb   rq   rn   r   rr   r   r   r   s             r+   rv   #AlignVisionDepthwiseConv2d.__init__   s:     #5#%#% 	 
	
r*   r    )r   r	   r   r   r   Tzeros)r!   r"   r#   r$   rv   r)   r   r   s   @r+   r   r      s$     
 
r*   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jrSrU =r$ )AlignVisionExpansionLayer   zW
This corresponds to the expansion phase of each block in the original implementation.
rW   in_dimrw   rq   c                    > [         TU ]  5         [        R                  " UUSSSS9U l        [        R
                  " X1R                  S9U l        [        UR                     U l
        g )Nr   sameFr   r   rb   rn   rr   )num_featuresrs   )ru   rv   r   ry   expand_convr{   r|   	expand_bnr
   r   
expand_act)r@   rW   r   rw   rq   r   s        r+   rv   "AlignVisionExpansionLayer.__init__   sX    99 
 WBWBWX !2!23r*   r   r9   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r@   r   s     r+   r   !AlignVisionExpansionLayer.forward   s4    ((7}56r*   )r   r   r   )r!   r"   r#   r$   r%   r   r^   rv   r&   r'   r   r   r)   r   r   s   @r+   r   r      sM    
40 
4# 
4 
4UX 
4U%6%6 5<<  r*   r   c            
       ~   ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjrS	\	R                  S
\	R                  4S jrSrU =r$ )AlignVisionDepthwiseLayeri  zc
This corresponds to the depthwise convolution phase of each block in the original implementation.
rW   r   rq   rb   adjust_paddingc                 F  > [         TU ]  5         X0l        U R                  S:X  a  SOSn[        XES9n[        R
                  " US9U l        [        X$X6SS9U l        [        R                  " X!R                  UR                  S9U l        [        UR                     U l        g )	NrZ   ro   r   )rc   rm   Frp   r   rs   rt   )ru   rv   rq   rg   r   rx   depthwise_conv_padr   depthwise_convr{   r|   r}   depthwise_normr
   r   depthwise_act)	r@   rW   r   rq   rb   r   conv_padrn   r   s	           r+   rv   "AlignVisionDepthwiseLayer.__init__  s     	"kkQ.7FkA"$,,w"?8FSX
 !nn%:%:VE_E_
 $F$5$56r*   r   r9   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ )NrZ   )rq   r   r   r   r   r   s     r+   r   !AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r*   )r   r   r   r   rq   r!   r"   r#   r$   r%   r   r^   boolrv   r&   r'   r   r   r)   r   r   s   @r+   r   r     s_    7!7 7 	7
 7 7,	U%6%6 	5<< 	 	r*   r   c            	       ~   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )AlignVisionSqueezeExciteLayeri+  zd
This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
rW   r   
expand_dimexpandc                   > [         TU ]  5         U(       a  UOUU l        [        S[	        X!R
                  -  5      5      U l        [        R                  " SS9U l	        [        R                  " U R                  U R                  SSS9U l        [        R                  " U R                  U R                  SSS9U l        [        UR                     U l        [        R                   " 5       U l        g )Nr   )output_sizer   )r   r   rb   rn   )ru   rv   dimr]   r^   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezery   reducer   r
   r   
act_reduceSigmoid
act_expand)r@   rW   r   r   r   r   s        r+   rv   &AlignVisionSqueezeExciteLayer.__init__0  s    !':V!S*H*H!HIJ++:ii	
 ii	
 !!2!23**,r*   r   r9   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      n[
        R                  " X!5      nU$ r   )r   r   r   r   r   r&   mul)r@   r   inputss      r+   r   %AlignVisionSqueezeExciteLayer.forwardE  sa    ]3M26M26		&8r*   )r   r   r   r   r   r   r   )Fr   r   s   @r+   r   r   +  sR    '0 '# '3 'X\ ' '*
U%6%6 
5<< 
 
r*   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\4U 4S	 jjr	S
\
R                  S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionFinalBlockLayeriR  zS
This corresponds to the final phase of each block in the original implementation.
rW   r   rw   rq   	drop_rateid_skipc                   > [         TU ]  5         US:H  =(       a    U(       + U l        [        R                  " UUSSSS9U l        [        R                  " X1R                  UR                  S9U l	        [        R                  " US9U l        g )Nr   r   Fr   r   )p)ru   rv   apply_dropoutr   ry   project_convr{   r|   r}   
project_bnDropoutdropout)r@   rW   r   rw   rq   r   r   r   s          r+   rv   #AlignVisionFinalBlockLayer.__init__W  sx     	#q[8[II 
 .. &;&;fF`F`
 zzI.r*   
embeddingsr   r9   c                     U R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX!-   nU$ r   )r   r   r   r   )r@   r   r   s      r+   r   "AlignVisionFinalBlockLayer.forwardh  sE    ))-86 LL7M)6Mr*   )r   r   r   r   r!   r"   r#   r$   r%   r   r^   floatr   rv   r&   r'   r   r   r)   r   r   s   @r+   r   r   R  so    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf  r*   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\S	\S
\S\4U 4S jjr	S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionBlockis  a1  
This corresponds to the block module of original the EfficientNet vision encoder implementation.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
    in_dim (`int`):
        Number of input channels.
    out_dim (`int`):
        Number of output channels.
    stride (`int`):
        Stride size to be used in convolution layers.
    expand_ratio (`int`):
        Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
    kernel_size (`int`):
        Kernel size for the depthwise convolution layer.
    drop_rate (`float`):
        Dropout rate to be used in the final phase of each block.
    id_skip (`bool`):
        Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
        of each block. Set to `True` for the first block of each stage.
    adjust_padding (`bool`):
        Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
        operation, set to `True` for inputs with odd input sizes.
rW   r   rw   rq   expand_ratiorb   r   r   r   c
           	      p  > [         TU ]  5         XPl        U R                  S:w  a  SOSU l        X%-  n
U R                  (       a  [	        XXS9U l        [        UU R                  (       a  U
OUUUU	S9U l        [        XXR                  S9U l	        [        UU R                  (       a  U
OUUUUUS9U l        g )Nr   TF)rW   r   rw   rq   )rW   r   rq   rb   r   )rW   r   r   r   )rW   r   rw   rq   r   r   )ru   rv   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)r@   rW   r   rw   rq   r   rb   r   r   r   expand_in_dimr   s              r+   rv   AlignVisionBlock.__init__  s     	("//14d%-;;6mDN 8$(KK=V#)
 <];;
 5$(KK=V
r*   r   r9   c                     UnU R                   S:w  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  X!5      nU$ )Nr   )r   r   r   r   r   )r@   r   r   s      r+   r   AlignVisionBlock.forward  sY    "
! NN=9M++M: ++M:
Br*   )r   r   r   r   r   r   r   r   s   @r+   r   r   s  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
 
r*   r   c            	       v   ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\
   S\	\
   S\4S	 jjrS
rU =r$ )AlignVisionEncoderi  z
Forward propagates the embeddings through each vision encoder (EfficientNet) block.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
rW   c                   >^ ^ [         TT ]  5         UR                  T l        U 4S jm[        UR                  5      n[        U4S jUR                   5       5      nSn/ n[        U5       H  n[        XR                  U   5      n[        XR                  U   5      nUR                  U   n	UR                  U   n
UR                  U   n[        T" UR                  U   5      5       Hm  nUS:X  a  SOSnUS:  a  SOU	n	US:  a  UOUnXAR                  ;   a  SOSnUR                  U-  U-  n[        UUUU	U
UUUUS9	nUR!                  U5        US-  nMo     M     ["        R$                  " U5      T l        g )Nc                 \   > [        [        R                  " TR                  U -  5      5      $ r   )r^   mathceildepth_coefficient)repeatsr@   s    r+   round_repeats2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr*   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r    )r>   nr   s     r+   rA   .AlignVisionEncoder.__init__.<locals>.<genexpr>  s     L3Kaq))3Ks   r   TFr   )	rW   r   rw   rq   rb   r   r   r   r   )ru   rv   r   rO   r   sumnum_block_repeatsrangera   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)r@   rW   num_base_blocks
num_blockscurr_block_numr   ir   rw   rq   rb   r   jr   r   r   blockr   r   s   `                @r+   rv   AlignVisionEncoder.__init__  sy   !'!9!9	D f001L63K3KLL
'A"6+=+=a+@AF#F,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEF"#q&$e!e$%Ev*8<T<T*TZ^"44~E
R	(!!#! +!-'##1
 e$!#' G (8 mmF+r*   r   output_hidden_statesreturn_dictr9   c                     U(       a  U4OS nU R                    H  nU" U5      nU(       d  M  XA4-  nM     U(       d  [        S X4 5       5      $ [        UUS9$ )Nc              3   .   #    U  H  oc  M  Uv   M     g 7fr   r    r>   vs     r+   rA   -AlignVisionEncoder.forward.<locals>.<genexpr>  s     X$Fq$Fs   	)r   r   )r   rC   r   )r@   r   r  r  all_hidden_statesr  s         r+   r   AlignVisionEncoder.forward  sh     1E],$[[E!-0M##!%55! !
 X]$FXXX-++
 	
r*   )r   r   )FT)r!   r"   r#   r$   r%   r   rv   r&   r'   r   r   r   r   r)   r   r   s   @r+   r   r     s\    ),0 ),\ 05&*	
((
 'tn
 d^	

 
2
 
r*   r   c                      ^  \ rS rSrSrU 4S jr     SS\\R                     S\\R                     S\\R                     S\\R                     S\
S	\R                  4S
 jjrSrU =r$ )AlignTextEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  g )N)padding_idxrs   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_idsdtype)ru   rv   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   r<   r  register_bufferr&   rN   r   r   r  sizelongr   s     r+   rv   AlignTextEmbeddings.__init__  s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r*   	input_idsr  r  inputs_embedspast_key_values_lengthr9   c                 d   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XWU-   24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      n	U	nO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R                  U5      nU R                  U5      nU$ )Nr  r   r  r   r  rJ   r  )r)  r  hasattrr  r   r&   r   r*  rJ   r   r$  r  r"  r%  r   )r@   r,  r  r  r-  r.  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr$  r   r"  s                r+   r   AlignTextEmbeddings.forward%  sC     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r*   )r%  r   r  r"  r$  r   )NNNNr   )r!   r"   r#   r$   r%   rv   r   r&   
LongTensorr'   r^   r   r   r)   r   r   s   @r+   r  r    s    Q
* 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
' 'r*   r  c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
\
\R                           S\\   S\
\R                     4S jjrSrU =r$ )AlignTextSelfAttentioniP  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r  r  relative_keyrelative_key_queryrZ   r   )ru   rv   r  num_attention_headsr1  
ValueErrorr^   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   r<   r  r!  r  distance_embedding
is_decoderr@   rW   r  r   s      r+   rv   AlignTextSelfAttention.__init__Q  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r*   xr9   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr  r   rZ   r   r	   )r)  r?  rA  viewpermute)r@   rL  new_x_shapes      r+   transpose_for_scores+AlignTextSelfAttention.transpose_for_scoresk  sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r*   r   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 V   U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nUS LnU R                  (       a  X4n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U
R                  S   nnU(       aB  [        R                  " US-
  [        R                  UR                  S	9R                  SS5      nO>[        R                  " U[        R                  UR                  S	9R                  SS5      n[        R                  " U[        R                  UR                  S	9R                  SS5      nUU-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S
9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nU[*        R,                  " U R.                  5      -  nUb  X-   n[0        R2                  R5                  USS9nU R7                  U5      nUb  UU-  n[        R                  " UU5      nUR9                  SSSS5      R;                  5       nUR=                  5       S S U R>                  4-   nUR                  U5      nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r   rZ   r   r  r=  r>  r0  r  zbhld,lrd->bhlrzbhrd,lrd->bhlrr	   ) rD  rQ  rE  rF  r&   catrI  matmul	transposer  shapetensorr*  rJ   rN  rN   rH  r!  tor  einsumr   sqrtrA  r   rL   softmaxr   rO  
contiguousr)  rB  )r@   r   rS  rT  rU  rV  rW  rX  mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r+   r   AlignTextSelfAttention.forwardp  s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr*   )rB  rA  rH  r   rI  rE  r!  r?  r  rD  rF  r   NNNNNF)r!   r"   r#   r$   rv   r&   r   rQ  r   r'   r   r   r   r)   r   r   s   @r+   r9  r9  P  s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	c cr*   r9  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr  )ru   rv   r   rC  r  denser%  r&  r   r'  r   r   s     r+   rv   AlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r*   r   input_tensorr9   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r  r   r%  r@   r   r  s      r+   r   AlignTextSelfOutput.forward  5    

=1]3}'CDr*   r%  r  r   
r!   r"   r#   r$   rv   r&   r   r   r)   r   r   s   @r+   r}  r}    6    >U\\  RWR^R^  r*   r}  eagerc                   .  ^  \ rS rSrSU 4S jjrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )AlignTextAttentioni  c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )Nr  )	ru   rv   !ALIGN_TEXT_SELF_ATTENTION_CLASSES_attn_implementationr@   r}  outputsetpruned_headsrJ  s      r+   rv   AlignTextAttention.__init__  s@    5f6Q6QR
	 *&1Er*   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   rZ  )rO   r   r@   r?  rA  r  r   rD  rE  rF  r  r  rB  union)r@   headsindexs      r+   prune_headsAlignTextAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r*   r   rS  rT  rU  rV  rW  rX  r9   c           	      p    U R                  UUUUUUU5      nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r   )r@   r  )r@   r   rS  rT  rU  rV  rW  rX  self_outputsattention_outputry  s              r+   r   AlignTextAttention.forward  sW     yy!"
  ;;|AF#%QR(88r*   )r  r  r@   r   r{  )r!   r"   r#   r$   rv   r  r&   r   r   r'   r   r   r   r)   r   r   s   @r+   r  r    s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 r*   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )ru   rv   r   rC  r  intermediate_sizer  re   r   strr
   intermediate_act_fnr   s     r+   rv   AlignTextIntermediate.__init__   s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r*   r   r9   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r  r   s     r+   r   AlignTextIntermediate.forward(  s&    

=100?r*   r  r  r   s   @r+   r  r    s(    9U\\ ell  r*   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextOutputi/  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )ru   rv   r   rC  r  r  r  r%  r&  r   r'  r   r   s     r+   rv   AlignTextOutput.__init__0  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r*   r   r  r9   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r  r  s      r+   r   AlignTextOutput.forward6  r  r*   r  r  r   s   @r+   r  r  /  r  r*   r  c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )AlignTextLayeri>  c                 t  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is addedr  r  )ru   rv   chunk_size_feed_forwardseq_len_dimr  	attentionrI  add_cross_attentionr@  crossattentionr  intermediater  r  r   s     r+   rv   AlignTextLayer.__init__?  s    '-'E'E$+F3 ++#)#=#= ##?? D6)g!hii"4VU_"`D1&9%f-r*   r   rS  rT  rU  rV  rW  rX  r9   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
NrZ   )rX  rW  r   r   r  r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r[  )	r  rI  r1  r@  r  r   feed_forward_chunkr  r  )r@   r   rS  rT  rU  rV  rW  rX  self_attn_past_key_valueself_attention_outputsr  ry  present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r+   r   AlignTextLayer.forwardM  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr*   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r  )r@   r  intermediate_outputr  s       r+   r  !AlignTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir*   )r  r  r  r  r  rI  r  r  r{  )r!   r"   r#   r$   rv   r&   r   r   r'   r   r   r   r  r)   r   r   s   @r+   r  r  >  s    ." 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?B r*   r  c                   R  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\\
   S\\
   S\\
   S\\	\R                     \4   4S jjrSrU =r$ )AlignTextEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
ru   rv   rW   r   r   r   num_hidden_layersr  layergradient_checkpointing)r@   rW   _r   s      r+   rv   AlignTextEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&r   rS  rT  rU  rV  past_key_valuesrk  rX  r  r  r9   c                 8   U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  SOS n[        U R                  5       H  u  nnU	(       a  X4-   nUb  X?   OS nUb  Xo   OS nU R                  (       a4  U R                  (       a#  U R                  UR                  UUUUUUU5      nOU" UUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nU R                   R                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
Nr    zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r  r   rZ   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r    r
  s     r+   rA   +AlignTextEncoder.forward.<locals>.<genexpr>  s"      
A  s   	)r   r  r   r0   cross_attentions)rW   r  r  trainingloggerwarning_once	enumerater  _gradient_checkpointing_func__call__rC   r   )r@   r   rS  rT  rU  rV  r  rk  rX  r  r  r  all_self_attentionsall_cross_attentionsnext_decoder_cacher  layer_modulelayer_head_maskrW  layer_outputss                       r+   r   AlignTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4OA|#$58H$H!.7.CilO3B3N_/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(G  5J   14D D 
 "&%'(
 
 
 9+.+*1
 	
r*   )rW   r  r  )	NNNNNNFFT)r!   r"   r#   r$   rv   r&   r   r   r'   r   r   r   r   r   r)   r   r   s   @r+   r  r    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
 S
r*   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )ru   rv   r   rC  r  r  Tanhr   r   s     r+   rv   AlignTextPooler.__init__  s9    YYv1163E3EF
'')r*   r   r9   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r  r   )r@   r   first_token_tensorpooled_outputs       r+   r   AlignTextPooler.forward  s6     +1a40

#566r*   )r   r  r  r   s   @r+   r  r    s(    $
U\\ ell  r*   r  c                   &    \ rS rSr\rSrSrS rSr	g)AlignPreTrainedModeli  alignTc                 D   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b$  UR                  R
                  R                  5         GO[        U[        5      (       as  [        R                  R                  UR                  R                  5        UR                  R                  R
                  R                  5         SUR                  l        O[        U[        R                   5      (       av  UR                  R
                  R                  SU R                  R                  S9  UR"                  b1  UR                  R
                  UR"                     R                  5         [        U[        R$                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R'                  S5        gg)zInitialize the weightsg        )meanstdNTg      ?)re   r   rC  ry   weightdatanormal_rW   initializer_rangerr   zero_
AlignModelinitxavier_uniform_text_projection_is_hf_initializedr  r  r%  fill_)r@   modules     r+   _init_weights"AlignPreTrainedModel._init_weights  sd   fryy"))455MM&&CT[[5R5R&S{{&  &&(
++GG##F$:$:$A$AB""'',,2248<F""5--MM&&CT[[5R5R&S!!-""6#5#56<<>fbll++KK""$MM$$S) ,r*   r    N)
r!   r"   r#   r$   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r)   r    r*   r+   r  r    s    L&*#*r*   r  zJ
    The text model from ALIGN without any head or projection on top.
    )custom_introc                   X  ^  \ rS rSr\rS/rSS\S\4U 4S jjjrS r	S r
\         SS\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )AlignTextModeli  r  rW   add_pooling_layerc                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
ru   rv   rW   r  r   r  encoderr  pooler	post_init)r@   rW   r  r   s      r+   rv   AlignTextModel.__init__$  sK    
 	 -f5'/1Bof- 	r*   c                 .    U R                   R                  $ r   r   r   rE   s    r+   get_input_embeddings#AlignTextModel.get_input_embeddings4  s    ...r*   c                 $    XR                   l        g r   r  )r@   rF  s     r+   set_input_embeddings#AlignTextModel.set_input_embeddings7  s    */'r*   r,  rS  r  r  rT  r-  rX  r  r  r9   c
           	      (   Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       SS n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  SS2SU24   nUR                  X5      nUnO$[        R                  " U
[        R                  US9nU R!                  X*5      nU R#                  XPR                   R$                  5      nU R                  UUUUS9nU R'                  UUUUUU	S	9nUS
   nU R(                  b  U R)                  U5      OSnU	(       d
  UU4USS -   $ [+        UUUR,                  UR.                  UR0                  S9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, AlignTextModel

>>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer  z5You have to specify either input_ids or inputs_embedsrI   r  r0  )r,  r  r  r-  )rS  rT  rX  r  r  r   r   )r   pooler_outputr   r0   r  )rW   rX  r  use_return_dictr@  %warn_if_padding_and_no_attention_maskr)  rJ   r&   onesr1  r   r  r   r   r*  get_extended_attention_maskget_head_maskr  r  r  r   r   r0   r  )r@   r,  rS  r  r  rT  r-  rX  r  r  r2  
batch_sizer3  rJ   r4  r5  extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                        r+   r   AlignTextModel.forward:  s1   8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m &&y++2O2OP	??%)'	 + 
 ,,2/!5# ' 
 *!,8<8OO4UY#]3oab6III;-')77&11,==
 	
r*   rW   r   r  r  T	NNNNNNNNN)r!   r"   r#   r$   r   r  _no_split_modulesr   rv   r  r  r   r   r&   r   r   r   r   r   r)   r   r   s   @r+   r  r    s!    #L./ 4   /0  -11515/3,004,0/3&*^
ELL)^
 !.^
 !.	^

 u||,^
 ELL)^
  -^
 $D>^
 'tn^
 d^^
 
uBB	C^
 ^
r*   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                      ^  \ rS rSr\rSrSrS\4U 4S jjrS\	R                  4S jr\   SS\\R                     S\\   S	\\   S\\\4   4S
 jj5       rSrU =r$ )AlignVisionModeli  r   FrW   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:X  a%  [        R                  " UR                  SS9U l        OMUR                  S:X  a%  [        R                  " UR                  SS9U l        O[        SUR                   35      eU R                  5         g )Nr  T)	ceil_moder]   z2config.pooling must be one of ['mean', 'max'] got )ru   rv   rW   ri   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2dr@  poolingr  r   s     r+   rv   AlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r*   r9   c                 B    U R                   R                  R                  $ r   )vision_modelr   rz   rE   s    r+   r  %AlignVisionModel.get_input_embeddings  s      ++777r*   r  r  c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUUS9nUS   nU R                  U5      nUR                  UR                  SS 5      nU(       d	  Xg4USS -   $ [        UUUR                  S9$ )a\  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AlignVisionModel

>>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```Nz You have to specify pixel_values)r  r  r   rZ   r   )r   r  r   )rW   r  r  r@  r   r  r  reshaper_  r   r   )r@   r   r  r  r  r  r   r  s           r+   r   AlignVisionModel.forward  s    8 %9$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,!5# ' 
 ,A.$56%--m.A.A"1.EF%58KKK7/')77
 	
r*   r  NNN)r!   r"   r#   r$   r   r  main_input_namer   rv   r   Moduler  r   r   r&   r'   r   r   r   r   r   r)   r   r   s   @r+   r#  r#    s     %L$O&+#0 "8bii 8  59/3&*	5
u0015
 'tn5
 d^	5

 
u>>	?5
 5
r*   r#  c                     ^  \ rS rSr\rS\4U 4S jjr\         SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\	R                     S
\\   S\\   S\\   S\	R                  4S jj5       r\   SS\\	R                     S\\   S\\   S\	R                  4S jj5       r\           SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\	R                     S\\   S
\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )r  i  rW   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l        [        U5      U l        [        U5      U l        [         R"                  " U R                  U R                  5      U l        [         R&                  " [(        R*                  " U R,                  R.                  5      5      U l        U R3                  5         g )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )ru   rv   re   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr#  r-  r   rC  r  	Parameterr&   r`  rW   temperature_init_valuetemperaturer  )r@   rW   r8  r;  r   s       r+   rv   AlignModel.__init__  s)    &,,o>>++,-Q0 
 &..0ABB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r*   r,  rS  r  r  rT  r-  rX  r  r  r9   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U R	                  UUUUUUUUU	S9	n
U
S   SS2SSS24   nU R                  U5      nU$ )a7  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`AlignTextModel`].

Examples:

```python
>>> from transformers import AutoTokenizer, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```N	r,  rS  r  r  rT  r-  rX  r  r  r   )rW   rX  r  r  r>  r  )r@   r,  rS  r  r  rT  r-  rX  r  r  text_outputsr   text_featuress                r+   get_text_featuresAlignModel.get_text_features  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%'/!5# ' 

 )OAq!G4,,->?r*   r   c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  UUUS9nUS   nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`AlignVisionModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> image_features = model.get_image_features(**inputs)
```r   r  r  r   )rW   r  r  r-  )r@   r   r  r  vision_outputsimage_featuress         r+   get_image_featuresAlignModel.get_image_featuresJ  sf    > %9$D $++JjJj 	 &1%<k$++B]B]**%!5# + 
 (*r*   return_lossc                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R	                  UU
US9nU R                  UUUUUUU	U
US9	nUS   nUS   SS2SSS24   nU R                  U5      nXR                  SSSS	9-  nXR                  SSSS	9-  n[        R                  " XR                  5       5      U R                  -  nUR                  5       nSnU(       a  [        U5      nU(       d  UUXX4nUb  U4U-   $ U$ [        UUUUUUUS
9$ )aA  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```NrJ  rD  r   r   rZ   r  T)r   r   keepdim)r4   r5   r6   r/   r   r7   r8   )rW   rX  r  r  r-  r>  r  normr&   r]  rS   rA  rV   r2   )r@   r,  r   rS  r  r  rT  r-  rO  rX  r  r  rK  rE  r   r/   r6   r5   r4   r  s                       r+   r   AlignModel.forwardw  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%!5# + 
 ))%'/!5# ' 

 &a("1oaAg.**;7 $&7&7!T&7&RR!$4$4qb$$4$OO  ,,{NN4DEHXHXX*,,.o.D&T`qF)-)9TGf$EvE-+#%* .
 	
r*   )r<  rA  r=  r>  r  r-  r   r2  )NNNNNNNNNNN)r!   r"   r#   r$   r   r  rv   r   r   r&   r   r   r'   rG  rM  r7  r   r   r2   r   r)   r   r   s   @r+   r  r    sg   L{ <  -11515/3,004,0/3&*2ELL)2 !.2 !.	2
 u||,2 ELL)2  -2 $D>2 'tn2 d^2 
		2 2h  59/3&*	*u001* 'tn* d^	*
 
		* *X  15481515/3,004&*,0/3&*\
E,,-\
 u001\
 !.	\

 !.\
 u||,\
 ELL)\
  -\
 d^\
 $D>\
 'tn\
 d^\
 
uk!	"\
 \
r*   r  )r  r  r#  r  r  )Gr%   r   dataclassesr   typingr   r   r   r   r&   torch.utils.checkpointr   activationsr
   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_alignr   r   r   
get_loggerr!   r  r   r-   r2   r   rP   rV   r^   ra   r   rg   r4  ri   ry   r   r   r   r   r   r   r   r  r9  r}  r  r  r  r  r  r  r  r  r  r#  r  __all__r    r*   r+   <module>r_     s     ! . .    !  . l l 9 9 P P 
		H	% =[ = =* :; : :8 !
+ !
 !
LuU\\ uell u-5<< -ELL -+ 3  @U3:. @ @*BII 4
 
6		 6$		 $P$BII $N BNryy NbG
 G
V=")) =BCRYY CN"))  #% !0 0hBII  bii SRYY SnZ
ryy Z
|bii  *? * *0 
y
) y

y
x 
O
+ O

O
d `
% `
 `
F Wr*   