"""PyTorch BridgeTower Model"""

import math
from collections import OrderedDict
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN, QuickGELUActivation
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging, torch_int
from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig


logger = logging.get_logger(__name__)

_TOKENIZER_FOR_DOC = "RobertaTokenizer"


@dataclass
class BridgeTowerModelOutput(ModelOutput):
    r"""
    Output type of [`BridgeTowerModel`].

    Args:
        text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
            Sequence of hidden-states at the text output of the last layer of the model.
        image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
            Sequence of hidden-states at the image output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
            Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
            token), respectively, after further processing through layers used for auxiliary pretraining tasks.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    text_features: Optional[torch.FloatTensor] = None
    image_features: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
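
# A minimal sketch (not part of the upstream file) of how the fields of
# `BridgeTowerModelOutput` line up: `text_features` and `image_features` keep
# their own sequence lengths, while `pooler_output` concatenates the pooled
# text and image [CLS] states, so its width is twice the hidden size. The
# sizes used below are illustrative assumptions, not values from a checkpoint.
def _example_bridgetower_model_output():
    batch_size, text_len, image_len, hidden_size = 2, 16, 325, 768
    output = BridgeTowerModelOutput(
        text_features=torch.zeros(batch_size, text_len, hidden_size),
        image_features=torch.zeros(batch_size, image_len, hidden_size),
        pooler_output=torch.zeros(batch_size, hidden_size * 2),
    )
    # ModelOutput subclasses support both attribute and tuple-style access.
    assert output[0] is output.text_features
    assert output.pooler_output.shape == (batch_size, hidden_size * 2)
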
Ntext_featuresimage_featurespooler_outputhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r"   r   torchFloatTensor__annotations__r#   r$   r%   r   r&   __static_attributes__r'       l/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/bridgetower/modeling_bridgetower.pyr    r    .   s|    . 26M8E--.526NHU../615M8E--.58<M8E%"3"345<59Ju00129r1   r    c                   P   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Srg)BridgeTowerContrastiveOutputN   a
  
Output type of ['BridgeTowerForContrastiveLearning']

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`:
        Image-text contrastive loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    cross_embeds  (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
        the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
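
# A minimal sketch (not part of the upstream file) of how the embedding fields
# of `BridgeTowerContrastiveOutput` are typically consumed: each is an
# L2-normalized vector per example, and pairwise cosine similarities between
# them form the logits of an image-text contrastive loss. The shapes and the
# temperature value are illustrative assumptions.
def _example_contrastive_similarity():
    batch_size, contrastive_dim = 4, 512
    temperature = 0.07
    text_embeds = nn.functional.normalize(torch.randn(batch_size, contrastive_dim), dim=-1)
    image_embeds = nn.functional.normalize(torch.randn(batch_size, contrastive_dim), dim=-1)
    logits = text_embeds @ image_embeds.t() / temperature  # (batch_size, batch_size)
    # Matching pairs sit on the diagonal, so the target is simply arange(batch_size).
    loss = nn.functional.cross_entropy(logits, torch.arange(batch_size))
    assert logits.shape == (batch_size, batch_size) and loss.ndim == 0
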
Nlosslogitstext_embedsimage_embedscross_embedsr%   r&   r'   )r(   r)   r*   r+   r,   r6   r   r-   r.   r/   r7   r8   r   r9   r:   r%   r&   r0   r'   r1   r2   r4   r4   N   s    . )-D(5$$
%,*.FHU&&'.6:K% 1 123:7;L(5!2!234;7;L(5!2!234;8<M8E%"3"345<59Ju00129r1   r4   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSS\R                  S\\R                     4S jjr	Sr
U =r$ )	BridgeTowerResidualAttentionp   c                 h  > [         TU ]  5         [        R                  " UR                  UR                  S-  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " [        S[        R                  " UR                  UR                  S-  5      4S[        5       4S[        R                  " UR                  S-  UR                  5      4/5      5      U l        [        R                  " UR                  UR                  S9U l        S U l        g )N@   epsc_fc   geluc_proj)super__init__r	   MultiheadAttentionhidden_sizeattn	LayerNormlayer_norm_epsln_1
ModuleDictr   Linearr   mlpln_2	attn_maskselfconfig	__class__s     r2   rG   %BridgeTowerResidualAttention.__init__q   s    ))&*<*<f>P>PTV>VW	LL!3!39N9NO	==RYYv'9'96;M;MPQ;QRS023ryy););a)?ASASTU
 LL!3!39N9NO	r1   hidden_stateattention_maskc           	         Ub(  UR                  [        R                  UR                  S9nU R                  b.  U R                  R                  UR
                  UR                  S9OS U l        U R                  UUUSU R                  US9S   $ )NdtypedeviceF)need_weightsrR   key_padding_maskr   )tor-   boolr]   rR   r\   rJ   )rT   rX   rY   s      r2   	attention&BridgeTowerResidualAttention.attention   s    %+..UZZH[H[.\N ~~) NNL$6$6|?R?RS 	
 yynn+  
  	r1   c                     XR                  U R                  U5      U5      -   nU R                  U5      nU R                  R	                  5        H  u  pEU" U5      nM     X1-   nU$ N)rb   rM   rQ   rP   items)rT   rX   rY   residual_state_layers         r2   forward$BridgeTowerResidualAttention.forward   s\    %tyy7NP^(__yy0(HA .L )%4r1   )rJ   rR   rM   rQ   rP   re   )r(   r)   r*   r+   rG   r-   Tensorrb   r   rj   r0   __classcell__rV   s   @r2   r<   r<   p   sH    "ell ELL "ELL (5<<BX  r1   r<   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\\R                     4S jjrSr	U =r
$ )BridgeTowerTransformer   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  (       aL  [
        R                  " [        U R                  S-
  5       Vs/ s H  n[        U5      PM     sn5      U l	        OH[
        R                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l	        UR                  U l
        g s  snf s  snf Nr   )rF   rG   rI   num_hidden_layersremove_last_layerr	   
ModuleListranger<   	resblocksstop_gradientrT   rU   rh   rV   s      r2   rG   BridgeTowerTransformer.__init__   s    !--!'!9!9##]]?DTE[E[^_E_?`a?`!-f5?`aDN  ]]?DTE[E[?\]?\!-f5?\]DN $11 b ^s   -C)6C.rX   rY   c                     / nU R                    HN  nU" X5      nU R                  (       a!  UR                  UR                  5       5        M=  UR                  U5        MP     U$ re   )rx   ry   appenddetach)rT   rX   rY   r%   blocks        r2   rj   BridgeTowerTransformer.forward   sU    ^^E >L!!$$\%8%8%:;$$\2 $ r1   )rI   rt   rx   ry   re   )r(   r)   r*   r+   rG   r-   rl   r   rj   r0   rm   rn   s   @r2   rp   rp      s-    2ELL (5<<BX  r1   rp   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )BridgeTowerVisionEmbeddings   rU   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)rF   rG   rU   rI   	embed_dim
image_size
patch_sizer	   	Parameterr-   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandrS   s     r2   rG   $BridgeTowerVisionEmbeddings.__init__   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr1   
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr         ?r   r   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer-   jit
is_tracingr   r   r   reshapepermuter	   
functionalinterpolateviewcat)rT   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r2   interpolate_pos_encoding4BridgeTowerVisionEmbeddings.interpolate_pos_encoding   si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr1   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
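
# A minimal sketch (not part of the upstream file) of the shape bookkeeping in
# `BridgeTowerVisionEmbeddings.forward`: a Conv2d with kernel_size == stride ==
# patch_size turns an image into a grid of patch embeddings, which is
# flattened and prepended with a class token. The 288-pixel image and 16-pixel
# patch sizes are illustrative assumptions.
def _example_vision_embedding_shapes():
    image_size, patch_size, embed_dim = 288, 16, 64
    patch_embedding = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size, bias=False)
    pixel_values = torch.randn(1, 3, image_size, image_size)
    patch_embeds = patch_embedding(pixel_values)            # (1, embed_dim, 18, 18)
    patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # (1, 324, embed_dim)
    class_embeds = torch.randn(1, 1, embed_dim)
    embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
    assert embeddings.shape == (1, (image_size // patch_size) ** 2 + 1, embed_dim)
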
NzInput image size (*z) doesn't match model (z).r\   r   r   r   r   )r   r   
ValueErrorr   r   r\   r`   flatten	transposer   r   r-   r   r   r   r   )rT   r   r   
batch_sizerh   r   r   target_dtypepatch_embedsclass_embedsr   s              r2   rj   #BridgeTowerVisionEmbeddings.forward   s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr1   )	r   rU   r   r   r   r   r   r   r   F)r(   r)   r*   r+   r   rG   r-   rl   intr   r.   rj   r0   rm   rn   s   @r2   r   r      sj    q6 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r1   r   c                      ^  \ rS rSrU 4S jr S
S\R                  S\4S jjr S
S\R                  S\4S jjr	S\R                  4S jr
S	rU =r$ )BridgeTowerVisionTransformeri
  c           
      6  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        U5      U l
        [        R
                  " UR                  UR                  S9U l        UR                  U l        UR                  (       dg  [        R                  " [        UR                  5       Vs/ s H,  n[        R
                  " UR                  UR                  S9PM.     sn5      U l        g g s  snf Nr@   )rF   rG   r   r   r	   rK   rI   rL   ln_prerp   transformerln_postshare_layernormrv   rw   rt   ln_separaterz   s      r2   rG   %BridgeTowerVisionTransformer.__init__  s    5f=ll6#5#56;P;PQ1&9||F$6$6F<Q<QR%55%%!}}V[\b\t\tVuvVuQRf00f6K6KLVuv D &vs   3Dr   r   c                    U R                  X5      nU R                  U5      nUR                  SSS5      nU R                  XB5      n[        R
                  " USS9nUR                  SSSS5      nU R                  (       a  U R                  U5      nU$ / n[        X@R                  5       H  u  pFU" U5      nUR                  U5        M      [        R
                  " USS9nU$ )Nr   r   r   r   r   )r   r   r   r   r-   stackr   r   zipr   r}   )rT   r   rY   r   r%   hidden_states_stacklns          r2   rj   $BridgeTowerVisionTransformer.forward  s     OM2%--aA6((GMq9%--aAq9 LL7M  #%%(8H8H%I! "= 1#**=9 &J "KK(;CMr1   c                 l    U R                  XS9nU R                  U5      nUR                  SSS5      nU$ )Nr   r   r   r   )r   r   r   )rT   r   r   r%   s       r2   forward_pre(BridgeTowerVisionTransformer.forward_pre3  s<    
 hM2%--aA6r1   rX   c                 N    UR                  SSS5      nU R                  U5      nU$ )Nr   r   r   )r   r   )rT   rX   visual_output_posts      r2   forward_post)BridgeTowerVisionTransformer.forward_post>  s-    )11!Q:!\\*<=!!r1   )r   r   r   r   r   r   r   )r(   r)   r*   r+   rG   r-   rl   ra   rj   r   r   r0   rm   rn   s   @r2   r   r   
  s]    " */	ll #'	< */	ll	 #'	" " "r1   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerLinkToweriD  c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  S;   a  UR                  S:X  a0  [        R
                  " [        R                  " S5      5      U l        O?UR                  S:X  a/  [        R
                  " [        R                  " S5      5      U l	        [        R                  " U R                  UR                  S9U l
        g [        SUR                   S35      e)	N)add
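
# A minimal sketch (not part of the upstream file) of the three fusion rules
# implemented by `BridgeTowerLinkTower`, written out on plain tensors. `beta`
# and `scaled_factor` are the learnable scalars from the module above; the
# values used here are just their initial settings.
def _example_link_tower_rules():
    hidden = torch.randn(2, 5, 8)
    cross = torch.randn(2, 5, 8)
    scaled_factor = torch.tensor(1.0)
    beta = torch.tensor(0.5)
    add = hidden + cross
    scaled_add = hidden * scaled_factor + cross
    interpolate = hidden * (1 - beta) + cross * beta
    # Each result is then layer-normalized; with the initial scalars,
    # scaled_add coincides with plain add.
    assert torch.allclose(add, scaled_add)
    assert interpolate.shape == hidden.shape
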
class BridgeTowerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BridgeTowerIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BridgeTowerOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BridgeTowerPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BridgeTowerSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of all previous
            # decoder key/value_states.
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs


BRIDGE_TOWER_SELF_ATTENTION_CLASSES = {
    "eager": BridgeTowerSelfAttention,
}


class BridgeTowerAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = BRIDGE_TOWER_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type
        )
        self.output = BridgeTowerSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BridgeTowerBertCrossLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BridgeTowerAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        self.crossattention = BridgeTowerAttention(config)
        self.intermediate = BridgeTowerIntermediate(config)
        self.output = BridgeTowerOutput(config)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # Self-attention over the current modality; no cache is used here.
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=None,
            output_attentions=output_attentions,
            past_key_value=None,
        )
        attention_output = self_attention_outputs[0]

        # add self attentions if we output attention weights
        outputs = self_attention_outputs[1:]

        cross_attention_outputs = self.crossattention(
            attention_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )
        attention_output = cross_attention_outputs[0]
        # add cross attentions if we output attention weights
        outputs = outputs + cross_attention_outputs[1:]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BridgeTowerTextLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BridgeTowerAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = BridgeTowerAttention(config, position_embedding_type="absolute")
        self.intermediate = BridgeTowerIntermediate(config)
        self.output = BridgeTowerOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is a tuple of self-attn cache
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention"
                    " layers by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights

            # add cross-attn cache to positions 3,4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BridgeTowerTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BridgeTowerTextLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
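
# A minimal sketch (not part of the upstream file) of the relative-position
# term used by `BridgeTowerSelfAttention` above when `position_embedding_type`
# is "relative_key": each query attends to a learned embedding of the clipped
# signed distance to every key, via the `bhld,lrd->bhlr` einsum. The sizes are
# illustrative assumptions.
def _example_relative_key_scores():
    batch, heads, seq_len, head_dim, max_pos = 1, 2, 4, 8, 16
    query_layer = torch.randn(batch, heads, seq_len, head_dim)
    distance_embedding = nn.Embedding(2 * max_pos - 1, head_dim)
    position_ids = torch.arange(seq_len)
    distance = position_ids.view(-1, 1) - position_ids.view(1, -1)  # signed query-key distances
    positional_embedding = distance_embedding(distance + max_pos - 1)  # (seq_len, seq_len, head_dim)
    scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
    assert scores.shape == (batch, heads, seq_len, seq_len)
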
class BridgeTowerTextEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in the constructor, where it is all zeros, helps users
        # when tracing the model without passing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor
        padding_idx: int

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


@auto_docstring
class BridgeTowerPreTrainedModel(PreTrainedModel):
    config_class = BridgeTowerConfig
    base_model_prefix = "bridgetower"
    supports_gradient_checkpointing = False
    _no_split_modules = ["BridgeTowerSelfAttention", "BridgeTowerResidualAttention"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        if isinstance(module, BridgeTowerVisionModel):
            proj_std = (module.visual.transformer.hidden_size**-0.5) * (
                (2 * module.visual.transformer.num_hidden_layers) ** -0.5
            )
            attn_std = module.visual.transformer.hidden_size**-0.5
            fc_std = (2 * module.visual.transformer.hidden_size) ** -0.5
            for block in module.visual.transformer.resblocks:
                nn.init.normal_(block.attn.in_proj_weight, std=attn_std * self.config.initializer_factor)
                nn.init.normal_(block.attn.out_proj.weight, std=proj_std * self.config.initializer_factor)
                nn.init.normal_(block.mlp.c_fc.weight, std=fc_std * self.config.initializer_factor)
                nn.init.normal_(block.mlp.c_proj.weight, std=proj_std * self.config.initializer_factor)

            nn.init.normal_(module.visual.embeddings.class_embedding, std=attn_std * self.config.initializer_factor)
            nn.init.normal_(
                module.visual.embeddings.position_embedding.weight, std=attn_std * self.config.initializer_factor
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.05 * self.config.initializer_factor)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class BridgeTowerVisionModel(BridgeTowerPreTrainedModel):
    config_class = BridgeTowerVisionConfig

    def __init__(self, config):
        super().__init__(config)
        self.visual = BridgeTowerVisionTransformer(config)

    @property
    def dtype(self):
        return self.visual.embeddings.patch_embedding.weight.dtype

    def forward(self, image, image_mask=None, interpolate_pos_encoding=False):
        return self.visual(image.type(self.dtype), image_mask, interpolate_pos_encoding)
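
# A minimal sketch (not part of the upstream file) showing
# `create_position_ids_from_input_ids`, defined above, on a padded batch:
# positions start at padding_idx + 1 and padding tokens keep padding_idx
# itself, matching RoBERTa's convention (padding_idx = 1 is the usual RoBERTa
# value, assumed here for illustration).
def _example_create_position_ids():
    padding_idx = 1
    input_ids = torch.tensor([[0, 9, 9, 2, 1, 1]])  # last two tokens are padding
    position_ids = create_position_ids_from_input_ids(input_ids, padding_idx)
    assert position_ids.tolist() == [[2, 3, 4, 5, 1, 1]]
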
@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument
    and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward
    pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """
)
class BridgeTowerTextModel(BridgeTowerPreTrainedModel):
    config_class = BridgeTowerTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = BridgeTowerTextEmbeddings(config)
        self.encoder = BridgeTowerTextEncoder(config)

        self.pooler = BridgeTowerPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length]
        # ourselves, in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
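
# A minimal sketch (not part of the upstream file) of what
# `get_extended_attention_mask`, inherited from `PreTrainedModel` and used in
# the text encoder above, produces: a padding mask of shape (batch, seq_len)
# is broadcast to (batch, 1, 1, seq_len) and turned additive, so masked
# positions receive a large negative value before the attention softmax.
def _example_extended_attention_mask():
    attention_mask = torch.tensor([[1, 1, 1, 0]])
    extended = attention_mask[:, None, None, :].to(torch.float32)
    extended = (1.0 - extended) * torch.finfo(torch.float32).min
    assert extended.shape == (1, 1, 1, 4)
    assert extended[0, 0, 0, -1] < -1e30  # the padded position is masked out
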
@auto_docstring(
    custom_intro="""
    The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on
    top.
    """
)
class BridgeTowerModel(BridgeTowerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        vision_config = config.vision_config
        text_config = config.text_config

        if config.share_cross_modal_transformer_layers:
            self.cross_modal_text_transform = nn.Linear(text_config.hidden_size, config.hidden_size)
            self.cross_modal_image_transform = nn.Linear(vision_config.hidden_size, config.hidden_size)
        else:
            self.cross_modal_text_transform = nn.ModuleList(
                [nn.Linear(text_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)]
            )
            self.cross_modal_image_transform = nn.ModuleList(
                [nn.Linear(vision_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)]
            )

        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)

        self.vision_model = BridgeTowerVisionModel(vision_config)

        self.text_model = BridgeTowerTextModel(text_config)

        if not vision_config.share_layernorm and config.init_layernorm_from_vision_encoder:
            for ln in self.vision_model.visual.ln_separate:
                ln.weight.data = self.vision_model.visual.ln_post.weight.data
                ln.bias.data = self.vision_model.visual.ln_post.bias.data

        self.cross_modal_image_layers = nn.ModuleList(
            [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)]
        )
        self.cross_modal_text_layers = nn.ModuleList(
            [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)]
        )

        # Class token => Linear => Tanh
        self.cross_modal_image_pooler = BridgeTowerPooler(config)
        self.cross_modal_text_pooler = BridgeTowerPooler(config)

        # Initialize BridgeTower Components
        self.cross_modal_text_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.cross_modal_image_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if config.share_link_tower_layers:
            self.cross_modal_text_link_tower = BridgeTowerLinkTower(config)
            self.cross_modal_image_link_tower = BridgeTowerLinkTower(config)
        else:
            self.cross_modal_text_link_tower = nn.ModuleList(
                [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)]
            )
            self.cross_modal_image_link_tower = nn.ModuleList(
                [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)]
            )

        self.post_init()

    def get_input_embeddings(self):
        return self.text_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.text_model.set_input_embeddings(value)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        image_token_type_idx: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        all_hidden_states_text = () if output_hidden_states else None
        all_hidden_states_image = () if output_hidden_states else None
        all_hidden_states_cross = () if output_hidden_states else None
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if inputs_embeds is not None and input_ids is None:
            raise NotImplementedError(
                "BridgeTowerModel does not use `inputs_embeds`.  Make sure to pass in `input_ids` instead."
            )

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        image_token_type_idx = image_token_type_idx if image_token_type_idx else 1
        input_shape = input_ids.size()
        text_embeds = self.text_model.embeddings(input_ids=input_ids)

        if output_hidden_states:
            all_hidden_states_text += (text_embeds,)

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, dtype=torch.long, device=input_ids.device)
        extend_text_masks = self.text_model.get_extended_attention_mask(attention_mask, input_shape).to(
            input_ids.device
        )

        # The split_index determines how many layers of the uni-modal encoder are applied before the cross-modal encoder
        split_index = len(self.text_model.encoder.layer) - self.config.num_hidden_layers + 1
OU R                   R                  n
Ub  UOU R                   R                  nU(       a  SOSnU(       a  SOSnU(       a  SOSnU(       a  SOSnU
(       a  SOSnUb  Uc  [        S5      eUb  UOU R                   R                  nU	(       a  U	OSn	UR                  5       nU R                  R                  US9nU(       a  UU4-  nUc.  [        R                  " U[        R                  UR                  S9nU R                  R                  UU5      R                  UR                  5      n[        U R                  R                  R                   5      U R                   R"                  -
  S-   nU R                  R                  R                   SU  H  nU" UU5      S   nU(       d  M  UU4-  nM      UcH  U R$                  R&                  R)                  UR+                  U R$                  R,                  5      US9nOUR/                  SSS	5      nU(       a  UU4-  nU R$                  R&                  R0                  R2                  SU  H  nU" U5      nU(       d  M  UU4-  nM     U R$                  R&                  R5                  UR+                  U R$                  R,                  5      5      nU R7                  U5      nU R9                  [        R:                  " S[        R                  UR                  S95      R=                  U5      nU R?                  UU-   5      nU RA                  U5      nU R9                  [        RB                  " S
U	[        R                  UR                  S95      R=                  U5      nUU-   nU RE                  U5      n[        R                  " UR                  S5      UR                  S5      4[        R                  UR                  S9nU R                  R                  XUR                  5       5      R                  UR                  5      nU RF                  S   " UUUUU
S9n U S   n!U RH                  S   " UUUUU
S9n"U"S   n#U(       a  UU!U#44-  nU
(       a  UU S   U"S   44-  nSn$[K        U[        U R                  R                  R                   5      5       GHx  n%U R                  R                  R                   U%   " UU5      S   nU R$                  R&                  R0                  R2                  U%   " U5      R+                  U R$                  R,                  5      nU RA                  U R$                  R&                  R5                  U5      5      U-   nU RL                  U$   n&U RN                  U$   n'U&" U R7                  U5      U-   U!U5      n(U'" UU#U5      n)U RF                  U$S-      " U(U)UUU
S9n U S   n!U RH                  U$S-      " U)U(UUU
S9n"U"S   n#U$S-  n$U(       a  UU4-  nUU4-  nUU!U#44-  nU
(       d  GMj  UU S   U"S   44-  nGM{     U!U#n+n*U RQ                  U*U+5      n,U(       a  UUU4nU(       d  [S        S U*U+U,UU4 5       5      $ [U        U*U+U,UUS9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
image_token_type_idx (`int`, *optional*):
    - The token type ids for images.
output_hidden_states (`bool`, *optional*):
    If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and
    cross-modal components respectively, i.e. `(hidden_states_text, hidden_states_image,
    hidden_states_cross_modal)`, where each element is a list of the hidden states of the corresponding
    modality. `hidden_states_text`/`hidden_states_image` are lists of tensors holding the unimodal hidden
    states, and `hidden_states_cross_modal` is a list of tuples containing `cross_modal_text_hidden_states`
    and `cross_modal_image_hidden_states` of each bridge layer.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels are currently not supported.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerModel
>>> from PIL import Image
>>> import requests

>>> # prepare image and text
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "hello world"
>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
>>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

>>> inputs = processor(image, text, return_tensors="pt")
>>> outputs = model(**inputs)
>>> outputs.keys()
odict_keys(['text_features', 'image_features', 'pooler_output'])
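
>>> # A minimal sketch (illustrative names) of unpacking the per-tower hidden states
>>> outputs = model(**inputs, output_hidden_states=True)
>>> text_states, image_states, cross_states = outputs.hidden_states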
```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        all_hidden_states_text = () if output_hidden_states else None
        all_hidden_states_image = () if output_hidden_states else None
        all_hidden_states_cross = () if output_hidden_states else None
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if inputs_embeds is not None and input_ids is None:
            raise NotImplementedError(
                "BridgeTowerModel does not use `inputs_embeds`. Make sure to pass in `input_ids` instead."
            )

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        image_token_type_idx = image_token_type_idx if image_token_type_idx else 1
        input_shape = input_ids.size()
        text_embeds = self.text_model.embeddings(input_ids=input_ids)

        if output_hidden_states:
            all_hidden_states_text += (text_embeds,)

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, dtype=torch.long, device=input_ids.device)
        extend_text_masks = self.text_model.get_extended_attention_mask(attention_mask, input_shape).to(
            input_ids.device
        )

        # The split_index determines how many layers of the uni-modal encoder are applied before the
        # cross-modal encoder
        split_index = len(self.text_model.encoder.layer) - self.config.num_hidden_layers + 1

        # Run the first 'split_index' layers of the textual encoder
        for layer in self.text_model.encoder.layer[:split_index]:
            text_embeds = layer(text_embeds, extend_text_masks)[0]

            if output_hidden_states:
                all_hidden_states_text += (text_embeds,)

        if image_embeds is None:
            image_embeds = self.vision_model.visual.forward_pre(
                pixel_values.type(self.vision_model.dtype), interpolate_pos_encoding=interpolate_pos_encoding
            )
        else:
            # Permute as BridgeTowerResidualAttention has batch_first=False
            image_embeds = image_embeds.permute(1, 0, 2)

        if output_hidden_states:
            all_hidden_states_image += (image_embeds,)

        # Run the first 'split_index' layers of the visual encoder
        for block in self.vision_model.visual.transformer.resblocks[:split_index]:
            image_embeds = block(image_embeds)
            if output_hidden_states:
                all_hidden_states_image += (image_embeds,)

        image_embeds_with_ln = self.vision_model.visual.forward_post(image_embeds.type(self.vision_model.dtype))

        # first layer is a special case because we don't have the output from the cross-encoder yet
        cross_modal_text = self.cross_modal_text_transform(text_embeds)

        text_token_type_embeddings = self.token_type_embeddings(
            torch.zeros(1, dtype=torch.long, device=input_ids.device)
        ).expand_as(cross_modal_text)

        cross_modal_text = self.cross_modal_text_layernorm(cross_modal_text + text_token_type_embeddings)

        image_embeds_with_ln = self.cross_modal_image_transform(image_embeds_with_ln)
        image_token_type_embeddings = self.token_type_embeddings(
            torch.full((1,), image_token_type_idx, dtype=torch.long, device=input_ids.device)
        ).expand_as(image_embeds_with_ln)

        image_embeds_with_ln = image_embeds_with_ln + image_token_type_embeddings
        cross_modal_image = self.cross_modal_image_layernorm(image_embeds_with_ln)

        pixel_mask = torch.ones(
            (cross_modal_image.size(0), cross_modal_image.size(1)),
            dtype=torch.long,
            device=input_ids.device,
        )
        extend_image_masks = self.text_model.get_extended_attention_mask(pixel_mask, pixel_mask.size()).to(
            input_ids.device
        )

        layer_outputs_text = self.cross_modal_text_layers[0](
            cross_modal_text,
            cross_modal_image,
            attention_mask=extend_text_masks,
            encoder_attention_mask=extend_image_masks,
            output_attentions=output_attentions,
        )
        cross_text_features = layer_outputs_text[0]

        layer_outputs_image = self.cross_modal_image_layers[0](
            cross_modal_image,
            cross_modal_text,
            attention_mask=extend_image_masks,
            encoder_attention_mask=extend_text_masks,
            output_attentions=output_attentions,
        )
        cross_image_features = layer_outputs_image[0]

        if output_hidden_states:
            all_hidden_states_cross += ((cross_text_features, cross_image_features),)

        if output_attentions:
            all_self_attentions += ((layer_outputs_text[1], layer_outputs_image[1]),)

        link_layer_index = 0

        # Each of the top layers of the visual and textual encoders ([split_index:]) is connected to each
        # layer of the cross-modal encoder via bridge layers, which brings bottom-up alignment and fusion to
        # the cross-modal encoder.
        for i in range(split_index, len(self.text_model.encoder.layer)):
            text_embeds = self.text_model.encoder.layer[i](text_embeds, extend_text_masks)[0]
            image_embeds = self.vision_model.visual.transformer.resblocks[i](image_embeds).type(
                self.vision_model.dtype
            )
            image_embeds_with_ln = (
                self.cross_modal_image_transform(self.vision_model.visual.forward_post(image_embeds))
                + image_token_type_embeddings
            )

            text_link_tower = self.cross_modal_text_link_tower[link_layer_index]
            image_link_tower = self.cross_modal_image_link_tower[link_layer_index]

            # Bridge layers for textual and visual encoders
            cross_text_features_ = text_link_tower(
                self.cross_modal_text_transform(text_embeds) + text_token_type_embeddings,
                cross_text_features,
                extend_text_masks,
            )
            cross_image_features_ = image_link_tower(image_embeds_with_ln, cross_image_features, extend_image_masks)

            # Cross-modal encoder via bridge layers of textual and visual encoders
            layer_outputs_text = self.cross_modal_text_layers[link_layer_index + 1](
                cross_text_features_,
                cross_image_features_,
                attention_mask=extend_text_masks,
                encoder_attention_mask=extend_image_masks,
                output_attentions=output_attentions,
            )
            cross_text_features = layer_outputs_text[0]

            layer_outputs_image = self.cross_modal_image_layers[link_layer_index + 1](
                cross_image_features_,
                cross_text_features_,
                attention_mask=extend_image_masks,
                encoder_attention_mask=extend_text_masks,
                output_attentions=output_attentions,
            )
            cross_image_features = layer_outputs_image[0]

            link_layer_index += 1

            if output_hidden_states:
                all_hidden_states_text += (text_embeds,)
                all_hidden_states_image += (image_embeds,)
                all_hidden_states_cross += ((cross_text_features, cross_image_features),)

            if output_attentions:
                all_self_attentions += ((layer_outputs_text[1], layer_outputs_image[1]),)

        # Concatenate the cls token of the text and image features to get the final representation
        text_features, image_features = cross_text_features, cross_image_features
        cls_features = self.get_cls_features(text_features, image_features)

        if output_hidden_states:
            all_hidden_states = (all_hidden_states_text, all_hidden_states_image, all_hidden_states_cross)

        if not return_dict:
            return tuple(
                v
                for v in [text_features, image_features, cls_features, all_hidden_states, all_self_attentions]
                if v is not None
            )

        return BridgeTowerModelOutput(
            text_features=text_features,
            image_features=image_features,
            pooler_output=cls_features,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def get_cls_features(self, text_features, image_features):
        cls_features_text = self.cross_modal_text_pooler(text_features)
        cls_features_image = self.cross_modal_image_pooler(image_features)
        return torch.cat([cls_features_text, cls_features_image], dim=-1)


# Copied from transformers.models.vilt.modeling_vilt.ViltPredictionHeadTransform with Vilt->BridgeTower
class BridgeTowerPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BridgeTowerMLMHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        self.transform = BridgeTowerPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.text_config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.text_config.vocab_size))
        if weight is not None:
            self.decoder.weight = weight

    def forward(self, x):
        mlm_score = self.transform(x)
        mlm_score = self.decoder(mlm_score) + self.bias
        return mlm_score


class BridgeTowerITMHead(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        itm_score = self.fc(x)
        return itm_score


@auto_docstring(
    custom_intro="""
    BridgeTower Model with a language modeling head on top as done during pretraining.
    """
)
class BridgeTowerForMaskedLM(BridgeTowerPreTrainedModel):
    _tied_weights_keys = ["mlm_score.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.bridgetower = BridgeTowerModel(config)
        self.mlm_score = BridgeTowerMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        self.mlm_score.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
        r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
>>> text = "a <mask> looking out of the window"

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # prepare inputs
>>> encoding = processor(image, text, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**encoding)

>>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())

>>> print(results)
.a cat looking out of the window.
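
>>> # A minimal sketch of the training path (illustrative; any aligned label ids work)
>>> outputs = model(**encoding, labels=encoding.input_ids)
>>> loss = outputs.loss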
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bridgetower(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        mlm_logits = self.mlm_score(outputs.text_features if return_dict else outputs[0])
        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token

            labels = labels.to(mlm_logits.device)
            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.text_config.vocab_size), labels.view(-1))

        if not return_dict:
            output = tuple(mlm_logits)
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
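

# The retrieval model below scores a pair with `BridgeTowerITMHead` applied to the concatenated
# text/image pooler output, so the head's input width is twice the joint hidden size.
# A minimal shape sketch (tensor names assumed, not taken from a checkpoint):
#
#     head = BridgeTowerITMHead(hidden_size * 2)
#     itm_logits = head(pooler_output)   # (batch_size, 2): no-match vs. match scores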


@auto_docstring(
    custom_intro="""
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of
    the [CLS] token) for image-to-text matching.
    """
)
class BridgeTowerForImageAndTextRetrieval(BridgeTowerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bridgetower = BridgeTowerModel(config)
        self.itm_score = BridgeTowerITMHead(config.hidden_size * 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
    Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
    The pairs with 0 will be skipped for calculation.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
>>> import requests
>>> from PIL import Image

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # forward pass
>>> scores = dict()
>>> for text in texts:
...     # prepare inputs
...     encoding = processor(image, text, return_tensors="pt")
...     outputs = model(**encoding)
...     scores[text] = outputs.logits[0, 1].item()
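
>>> # A minimal sketch of ranking the candidate texts (illustrative)
>>> best_text = max(scores, key=scores.get)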
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bridgetower(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[2]

        logits = self.itm_score(pooler_output)

        itm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()

            labels = labels.to(logits.device)
            itm_loss = loss_fct(logits, labels)

        if not return_dict:
            output = tuple(logits)
            return ((itm_loss,) + output) if itm_loss is not None else output

        return SequenceClassifierOutput(
            loss=itm_loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class BridgeTowerContrastiveHead(nn.Module):
    def __init__(self, hidden_size, embed_size):
        super().__init__()
        self.fc = nn.Linear(hidden_size, embed_size)

    def forward(self, x):
        x = self.fc(x)
        return x


@auto_docstring(
    custom_intro="""
    BridgeTower Model with an image-text contrastive head on top computing image-text contrastive loss.
    """
)
class BridgeTowerForContrastiveLearning(BridgeTowerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bridgetower = BridgeTowerModel(config)

        self.itc_text_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size)
        self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size)
        self.itc_cross_modal_head = BridgeTowerContrastiveHead(config.hidden_size * 2, config.contrastive_hidden_size)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = True,
        return_dict: Optional[bool] = None,
        return_loss: Optional[bool] = None,
    ) -> Union[BridgeTowerContrastiveOutput, Tuple[torch.FloatTensor]]:
        r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
>>> import requests
>>> from PIL import Image
>>> import torch

>>> image_urls = [
...     "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg",
...     "http://images.cocodataset.org/val2017/000000039769.jpg",
... ]
>>> texts = ["two dogs in a car", "two cats sleeping on a couch"]
>>> images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
>>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

>>> inputs = processor(images, texts, padding=True, return_tensors="pt")
>>> loss = model(**inputs, return_loss=True).loss

>>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt")
>>> loss_swapped = model(**inputs, return_loss=True).loss

>>> print("Loss", round(loss.item(), 4))
Loss 0.0019

>>> print("Loss with swapped images", round(loss_swapped.item(), 4))
Loss with swapped images 2.126
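
>>> # A minimal sketch of reading back the three normalized embeddings (illustrative)
>>> text_emb, image_emb, cross_emb = model(**inputs).logits.unbind(dim=-2)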
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bridgetower(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[2]
        hidden_states_txt, hidden_states_img, hidden_states_cross_modal = (
            outputs.hidden_states if return_dict else outputs[3]
        )

        text_embeds = hidden_states_txt[-1]
        image_embeds = hidden_states_img[-1]

        image_embeds_with_ln = self.bridgetower.vision_model.visual.forward_post(image_embeds)
        image_token_type_embeddings = self.bridgetower.token_type_embeddings(
            torch.full((1,), 1, dtype=torch.long, device=self.bridgetower.token_type_embeddings.weight.device)
        ).expand_as(image_embeds_with_ln)

        image_embeds_with_ln = (
            self.bridgetower.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings
        )

        # normalized features
        text_embeds = nn.functional.normalize(self.itc_text_head(text_embeds[:, 0, :]), dim=-1, p=2)
        image_embeds = nn.functional.normalize(self.itc_image_head(image_embeds_with_ln[:, 0, :]), dim=-1, p=2).to(
            device=text_embeds.device
        )
        cross_embeds = nn.functional.normalize(self.itc_cross_modal_head(pooler_output), dim=-1, p=2).to(
            device=text_embeds.device
        )

        logits = torch.stack([text_embeds, image_embeds, cross_embeds], dim=-2)

        logit_scale = self.logit_scale.exp().to(device=text_embeds.device)
        logits_text_to_image = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_text_to_cross = torch.matmul(text_embeds, cross_embeds.t()) * logit_scale
        logits_image_to_cross = torch.matmul(image_embeds, cross_embeds.t()) * logit_scale

        itc_loss = None

        if return_loss:
            labels = torch.arange(len(logits), device=logits.device)
            text_to_image_loss = nn.functional.cross_entropy(logits_text_to_image, labels)
            text_to_cross_loss = nn.functional.cross_entropy(logits_text_to_cross, labels)
            image_to_cross_loss = nn.functional.cross_entropy(logits_image_to_cross, labels)
            itc_loss = (text_to_image_loss + text_to_cross_loss + image_to_cross_loss) / 3.0

        if not return_dict:
            output = (logits, text_embeds, image_embeds, cross_embeds) + outputs[3:]
            return ((itc_loss,) + output) if itc_loss is not None else output

        return BridgeTowerContrastiveOutput(
            loss=itc_loss,
            logits=logits,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            cross_embeds=cross_embeds,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "BridgeTowerForContrastiveLearning",
    "BridgeTowerForImageAndTextRetrieval",
    "BridgeTowerForMaskedLM",
    "BridgeTowerModel",
    "BridgeTowerPreTrainedModel",
]