
    eTh                    8   S r SSKrSSKJr  SSKJrJrJrJrJ	r	J
r
  SSKrSSKJr  SSKrSSKJr  SSKJrJrJrJrJr  SSKJrJr  SS	KJrJrJr  SS
KJrJ r J!r!J"r"  SSK#J$r$J%r%J&r&  \!RN                  " \(5      r)S\RT                  S\RT                  4S jr+S\RT                  S\RT                  4S jr,\ " S S\5      5       r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S S\R\                  5      r1S\00r2 " S S\R\                  5      r3 " S S\R\                  5      r4 " S S \R\                  5      r5 " S! S"\R\                  5      r6 " S# S$\R\                  5      r7 " S% S&\R\                  5      r8 SIS'\R\                  S(\RT                  S)\RT                  S*\RT                  S+\\RT                     S,\9S-\94S. jjr: " S/ S0\R\                  5      r; " S1 S2\R\                  5      r< " S3 S4\R\                  5      r= " S5 S6\R\                  5      r> " S7 S8\R\                  5      r?\  " S9 S:\5      5       r@ " S; S<\R\                  5      rA " S= S>\@5      rB\ " S?S@9 " SA SB\@5      5       rC " SC SD\@5      rD " SE SF\@5      rESJSG jrF/ SHQrGg)KzPyTorch AltCLIP model.    N)	dataclass)AnyCallableListOptionalTupleUnion   )ACT2FN)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfiglogitsreturnc                     [         R                  R                  U [        R                  " [        U 5      U R                  S95      $ )Ndevice)nn
functionalcross_entropytorcharangelenr"   )r   s    d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/altclip/modeling_altclip.pycontrastive_lossr*   ,   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)r*   t)r,   caption_loss
image_losss      r)   	clip_lossr1   0   s*    #J/L!*,,.1J%,,r+   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)AltCLIPOutput6   a  
Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r:   r;   N)getattrto_tuple).0kselfs     r)   	<genexpr>)AltCLIPOutput.to_tuple.<locals>.<genexpr>V   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysrB   s   `r)   r?   AltCLIPOutput.to_tupleU   s#     
YY[
 
 	
r+    )__name__
__module____qualname____firstlineno____doc__r5   r   r&   FloatTensor__annotations__r6   r7   r8   r9   r:   r   r;   r   r   r?   __static_attributes__rI   r+   r)   r3   r3   6   s    ( )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r+   r3   c                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )AltRobertaEmbeddings]   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistenttoken_type_idsdtype)super__init__r#   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutr>   rY   register_bufferr&   r'   expandzerosr[   sizelongrV   rB   config	__class__s     r)   rd   AltRobertaEmbeddings.__init__c   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
 r+   c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr]   r   r`   r   rb   r"   rZ   )"create_position_ids_from_input_idsrV   &create_position_ids_from_inputs_embedsrv   hasattrr`   rt   r&   ru   rw   r[   r"   ri   rm   rY   rk   rn   rr   )rB   	input_idsr`   r[   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrm   
embeddingsrk   s                r)   forwardAltRobertaEmbeddings.forward|   sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r+   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr]   r   r}   r   )rv   r&   r'   rV   rw   r"   	unsqueezert   )rB   r   r   sequence_lengthr[   s        r)   r   ;AltRobertaEmbeddings.create_position_ids_from_inputs_embeds   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r+   )rn   rr   rV   rY   rk   rm   ri   )NNNNr   )
rJ   rK   rL   rM   rN   rd   r   r   rQ   __classcell__rz   s   @r)   rS   rS   ]   s$    

4 rs&P= =r+   rS   c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
\
\R                           S\\   S\
\R                     4S jjrSrU =r$ )AltRobertaSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rY   rZ   relative_keyrelative_key_query   r   )rc   rd   rg   num_attention_headsr   
ValueErrorintattention_head_sizeall_head_sizer#   Linearquerykeyvaluerp   attention_probs_dropout_probrr   r>   rY   rj   re   distance_embedding
is_decoderrB   ry   rY   rz   s      r)   rd    AltRobertaSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r+   xr   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr]   r   r   r   r
   )rv   r   r   viewpermute)rB   r   new_x_shapes      r)   transpose_for_scores,AltRobertaSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r+   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 V   U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nUS LnU R                  (       a  X4n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U
R                  S   nnU(       aB  [        R                  " US-
  [        R                  UR                  S	9R                  SS5      nO>[        R                  " U[        R                  UR                  S	9R                  SS5      n[        R                  " U[        R                  UR                  S	9R                  SS5      nUU-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S
9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nU[*        R,                  " U R.                  5      -  nUb  X-   n[0        R2                  R5                  USS9nU R7                  U5      nUb  UU-  n[        R                  " UU5      nUR9                  SSSS5      R;                  5       nUR=                  5       S S U R>                  4-   nUR                  U5      nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r   r   dimr]   r   r   r}   ra   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   ) r   r   r   r   r&   catr   matmul	transposerY   shapetensorrw   r"   r   r'   r   rj   torb   einsummathsqrtr   r#   r$   softmaxrr   r   
contiguousrv   r   )rB   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r)   r   AltRobertaSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr+   )r   r   r   rr   r   r   rj   r   rY   r   r   NNNNNNF)rJ   rK   rL   rM   rd   r&   Tensorr   r   rO   r   boolr   rQ   r   r   s   @r)   r   r      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	c cr+   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AltRobertaSelfOutputi>  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrW   )rc   rd   r#   r   rg   densern   ro   rp   rq   rr   rx   s     r)   rd   AltRobertaSelfOutput.__init__?  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r+   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   rr   rn   rB   r   r   s      r)   r   AltRobertaSelfOutput.forwardE  5    

=1]3}'CDr+   rn   r   rr   
rJ   rK   rL   rM   rd   r&   r   r   rQ   r   r   s   @r)   r   r   >  6    >U\\  RWR^R^  r+   r   eagerc                   .  ^  \ rS rSrSU 4S jjrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )AltRobertaAttentioniR  c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )NrY   )	rc   rd   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationrB   r   outputsetpruned_headsr   s      r)   rd   AltRobertaAttention.__init__S  s@    6v7R7RS
	 +62Er+   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )r(   r   rB   r   r   r   r   r   r   r   r   r   r   union)rB   headsindexs      r)   prune_headsAltRobertaAttention.prune_heads[  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r+   r   r   r   r   r   r   r   r   c           	      p    U R                  UUUUUUU5      nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r   )rB   r   )rB   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r)   r   AltRobertaAttention.forwardm  sW     yy!"
  ;;|AF#%QR(88r+   )r   r   rB   r   r   )rJ   rK   rL   rM   rd   r   r&   r   r   rO   r   r   r   rQ   r   r   s   @r)   r   r   R  s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 r+   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AltRobertaIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rc   rd   r#   r   rg   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrx   s     r)   rd   AltRobertaIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r+   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r	  rB   r   s     r)   r   AltRobertaIntermediate.forward  s&    

=100?r+   r  r   r   s   @r)   r  r    s(    9U\\ ell  r+   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AltRobertaOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rc   rd   r#   r   r  rg   r   rn   ro   rp   rq   rr   rx   s     r)   rd   AltRobertaOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r+   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r)   r   AltRobertaOutput.forward  r   r+   r   r   r   s   @r)   r  r    r   r+   r  c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )AltRobertaLayeri  c                 t  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is addedrZ   r   )rc   rd   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionr   crossattentionr  intermediater  r   rx   s     r)   rd   AltRobertaLayer.__init__  s    '-'E'E$,V4 ++#)#=#= ##?? D6)g!hii"5fV`"aD26:&v.r+   r   r   r   r   r   r   r   r   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nr   )r   r   r   r   r]   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r  r   r   r   r  r   feed_forward_chunkr  r  )rB   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r)   r   AltRobertaLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr+   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r   )rB   r   intermediate_outputr'  s       r)   r   "AltRobertaLayer.feed_forward_chunk  s)    "//0@A{{#6Ir+   )r  r  r  r  r  r   r   r  r   )rJ   rK   rL   rM   rd   r&   r   r   rO   r   r   r   r   rQ   r   r   s   @r)   r  r    s    /" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?B r+   r  c                   R  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\\
   S\\
   S\\
   S\\	\R                     \4   4S jjrSrU =r$ )AltRobertaEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
rc   rd   ry   r#   
ModuleListrangenum_hidden_layersr  layergradient_checkpointingrB   ry   _rz   s      r)   rd   AltRobertaEncoder.__init__  sR    ]]U6KcKcEd#eEdOF$;Ed#ef
&+# $f   A&r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                 8   U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  SOS n[        U R                  5       H  u  nnU	(       a  X4-   nUb  X?   OS nUb  Xo   OS nU R                  (       a4  U R                  (       a#  U R                  UR                  UUUUUUU5      nOU" UUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nU R                   R                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
NrI   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r]   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   rI   r@   vs     r)   rC   ,AltRobertaEncoder.forward.<locals>.<genexpr>E  s"      
A  s   	)last_hidden_stater9  r   
attentionscross_attentions)ry   r  r4  trainingloggerwarning_once	enumerater3  _gradient_checkpointing_func__call__rE   r   )rB   r   r   r   r   r   r9  r   r   r:  r;  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r)   r   AltRobertaEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4OA|#$58H$H!.7.CilO3B3N_/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(G  5J   14D D 
 "&%'(
 
 
 9+.+*1
 	
r+   )ry   r4  r3  )	NNNNNNFFT)rJ   rK   rL   rM   rd   r&   r   r   rO   r   r   r	   r   r   rQ   r   r   s   @r)   r-  r-    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
 S
r+   r-  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AltRobertaPooleriZ  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rc   rd   r#   r   rg   r   Tanh
activationrx   s     r)   rd   AltRobertaPooler.__init__[  s9    YYv1163E3EF
'')r+   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   rW  )rB   r   first_token_tensorpooled_outputs       r)   r   AltRobertaPooler.forward`  s6     +1a40

#566r+   )rW  r   r   r   s   @r)   rT  rT  Z  s(    $
U\\ ell  r+   rT  moduler   r   r   r   scalingrr   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr]   r   )r   rb   )prD  r   r   )r&   r   r   r#   r$   r   float32r   rb   rr   rD  r   )
r]  r   r   r   r   r^  rr   kwargsattn_weightsattn_outputs
             r)   eager_attention_forwardre  j  s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r+   c                      ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                  \\R                     4   4
S	 jjrS
rU =r$ )AltCLIPAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)rc   rd   ry   rg   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrr   	is_causalr#   r   k_projv_projq_projout_projrx   s     r)   rd   AltCLIPAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar+   r   r   causal_attention_maskr   r   c                    UR                   u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R	                  XVU R
                  U R                  5      R                  SS5      n
U R                  R                  S:w  a  Ub  Ub  X#-   nOUb  UnO	USLU l
        [        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU" U UU	U
UU R                  U R                  U R                   (       d  SOU R"                  S	9u  pUR%                  XVU5      R'                  5       nU R)                  U5      nU(       d  SnX4$ )
z#Input shape: Batch x Time x Channelr   r   flash_attention_2Nr   sdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )rp  r^  rr   )r   rs  rq  rr  r   rl  rm  r   ry   r   rp  re  rE  rF  r   rn  rD  rr   reshaper   rt  )rB   r   r   rv  r   
batch_sizer   rk  queriesrF   valuesattention_interfacerd  rc  s                 r)   r   AltCLIPAttention.forward  s    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0 L((r+   )ry   rr   rk  rm  rp  rq  rl  rt  rs  rn  rr  )NNF)rJ   rK   rL   rM   rN   rd   r&   r   r   r   r   r   rQ   r   r   s   @r)   rg  rg    s    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45) 5)r+   rg  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
AltCLIPMLPi  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )rc   rd   ry   r   r  activation_fnr#   r   rg   r  fc1fc2rx   s     r)   rd   AltCLIPMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJr+   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r  s     r)   r   AltCLIPMLP.forward  s4    /**=9/r+   )r  ry   r  r  r   r   s   @r)   r  r    s)    KU\\ ell  r+   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )AltCLIPEncoderLayeri  ry   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g r   )rc   rd   rg   rk  rg  	self_attnr#   rn   ro   layer_norm1r  mlplayer_norm2rx   s     r)   rd   AltCLIPEncoderLayer.__init__  sm    ++)&1<<F<Q<QRf%<<F<Q<QRr+   r   r   rv  r   r   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   rv  r   )r  r  r  r  )rB   r   r   rv  r   residualrc  r   s           r)   r   AltCLIPEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr+   )rk  r  r  r  r  F)rJ   rK   rL   rM   r   rd   r&   r   r   r   r   rO   r   rQ   r   r   s   @r)   r  r    sk    S} S -2&||& &  %||	&
 $D>& 
u  	!& &r+   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\\R                     S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jjrSrU =r$ )AltCLIPEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`AltCLIPEncoderLayer`].

Args:
    config: AltCLIPConfig
ry   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r/  )
rc   rd   ry   r#   r0  r1  r2  r  layersr4  r5  s      r)   rd   AltCLIPEncoder.__init__  sT    mm%PVPhPhJi$jJiQ%8%@Ji$jk&+# %kr8  r   rv  r   r:  r;  r   c                 L   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       Hr  u  pU(       a  Xy4-   nU R                  (       a1  U R                  (       a   U R                  UR                  U	UUU5      nO	U" U	UUUS9nUS   n	U(       d  Mj  XS   4-   nMt     U(       a  Xy4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrI   )r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rI   r>  s     r)   rC   )AltCLIPEncoder.forward.<locals>.<genexpr>l  s     e$Sq$Ss   	)rA  r   rB  )ry   r   r:  use_return_dictrG  r  r4  rD  rH  rI  rE   r   )rB   r   r   rv  r   r:  r;  encoder_statesall_attentionsr   idxencoder_layerrQ  s                r)   r   AltCLIPEncoder.forward   s8   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M  !/3C2E!E- #90  +.>>Ne]N$Seee+Vd
 	
r+   )ry   r4  r  )NNNNN)rJ   rK   rL   rM   rN   r   rd   r   r&   r   r   r	   r   r   r   rQ   r   r   s   @r)   r  r    s    ,} , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
 O
r+   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )AltCLIPVisionEmbeddingsis  ry   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebiasr   r   r[   r\   r^   )rc   rd   ry   rg   rk  
image_size
patch_sizer#   	Parameterr&   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsre   position_embeddingrs   r'   rt   rx   s     r)   rd    AltCLIPVisionEmbeddings.__init__t  s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr+   r   heightwidthr   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr]   g      ?r
   r   bicubicF)rv   modealign_cornersr   )r   r  weightr   r&   jit
is_tracingr[   r  r   r{  r   r#   r$   interpolater   r   )rB   r   r  r  r  r  r  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r)   interpolate_pos_encoding0AltCLIPVisionEmbeddings.interpolate_pos_encoding  si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr+   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (ri  ra   r   r   r]   r   )r   r  r   r  r  rb   r   flattenr   r  rt   r&   r   r  r  r[   )rB   r  r  r|  r6  r  r  target_dtypepatch_embedsclass_embedsr   s              r)   r   AltCLIPVisionEmbeddings.forward  s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr+   )	r  ry   rk  r  r  r  r  r  r  r  )rJ   rK   rL   rM   r   rd   r&   r   r   r  rO   r   rQ   r   r   s   @r)   r  r  s  sj    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r+   r  c                   *    \ rS rSr\rSrSr/ rS r	Sr
g)AltCLIPPreTrainedModeli  altclipTc                 6   U R                   R                  n[        U[        5      (       a  U R                   R                  n[        R
                  R                  UR                  SUR                  S-  U-  S9  [        R
                  R                  UR                  R                  UR                   R                  U-  S9  [        R
                  R                  UR                  R                  UR                   R                  U-  S9  g[        U[        5      (       Ga   U R                   R                  nUR                  S-  SUR                   R                  -  S-  -  U-  nUR                  S-  U-  n[        R
                  R                  UR                  R                  US9  [        R
                  R                  UR                   R                  US9  [        R
                  R                  UR"                  R                  US9  [        R
                  R                  UR$                  R                  US9  g[        U[&        5      (       a  U R                   R                  nUR                   R(                  S-  SUR                   R                  -  S-  -  U-  nSUR                   R(                  -  S-  U-  n[        R
                  R                  UR*                  R                  US9  [        R
                  R                  UR,                  R                  US9  g[        U[.        5      (       a  [        R
                  R                  UR0                  R                  UR2                  S-  U R                   R                  -  S9  SUR0                  l        [        R
                  R                  UR6                  R                  UR8                  S-  U R                   R                  -  S9  SUR6                  l        g[        U[        R:                  5      (       aJ  UR<                  R>                  RA                  5         UR                  R>                  RC                  S5        g[        U[        RD                  5      (       ak  UR                  R>                  R                  SU R                   R                  S9  UR<                  b%  UR<                  R>                  RA                  5         gg[        U[        RF                  5      (       ax  UR                  R>                  R                  SU R                   R                  S9  URH                  b2  UR                  R>                  URH                     RA                  5         ggg)	zInitialize the weightsrz  rj  )meanstd)r  r   Tg      ?N)%ry   initializer_factorr  r  r#   initnormal_r  rk  r  r  initializer_ranger  rg  r2  rs  rq  rr  rt  r  rg   r  r  AltCLIPModeltext_projectiontext_embed_dim_is_hf_initializedvisual_projectionvision_embed_dimrn   r  datazero_fill_r   re   rV   )rB   r]  factorin_proj_stdout_proj_stdfc_stds         r)   _init_weights$AltCLIPPreTrainedModel._init_weights  s   //f566[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 011[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE
++[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?--GGOO&&--))4/$++2P2PP   9=F""5GGOO((//++T1DKK4R4RR   ;?F$$7--KK""$MM$$S)		**MM&&CT[[5S5S&T{{&  &&( '--MM&&CT[[5S5S&T!!-""6#5#56<<> . .r+   rI   N)rJ   rK   rL   rM   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_moduler  rQ   rI   r+   r)   r  r    s     L!&*#+?r+   r  c                      ^  \ rS rSrS\4U 4S jjr\     SS\\R                     S\\
   S\\
   S\\
   S\\
   S	\\\4   4S
 jj5       rSrU =r$ )AltCLIPVisionTransformeri  ry   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r   )rc   rd   ry   rg   r  r   r#   rn   ro   pre_layrnormr  encoderpost_layernorm)rB   ry   rk  rz   s      r)   rd   !AltCLIPVisionTransformer.__init__  sd    &&	1&9LL8M8MN%f- ll9:O:OPr+   r  r   r:  r;  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  U5      nU R                  UUUUS9nUS   nUS S 2SS S 24   n	U R                  U	5      n	U(       d	  X4USS  -   $ [        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r  )r   r   r:  r;  r   r   rA  pooler_outputr   rB  )ry   r   r:  r  r   r   r  r  r  r   r   rB  )
rB   r  r   r:  r;  r  r   encoder_outputsrA  r[  s
             r)   r    AltCLIPVisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%58KKK)/')77&11	
 	
r+   )ry   r   r  r  r  )NNNNF)rJ   rK   rL   rM   r   rd   r   r   r&   rO   r   r	   r   r   r   rQ   r   r   s   @r)   r  r    s    Q2 Q  59,0/3&*38'
u001'
 $D>'
 'tn	'

 d^'
 #+4.'
 
u00	1'
 '
r+   r  c                      ^  \ rS rSr\rSrS\4U 4S jjrS\R                  4S jr
\     SS\\R                     S\\   S\\   S	\S
\\   S\\\4   4S jj5       rSrU =r$ )AltCLIPVisionModeli1  r  ry   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rc   rd   r  vision_model	post_initrx   s     r)   rd   AltCLIPVisionModel.__init__5  s'     4V<r+   r   c                 B    U R                   R                  R                  $ r   )r  r   r  rG   s    r)   get_input_embeddings'AltCLIPVisionModel.get_input_embeddings;  s      ++;;;r+   r   r:  r  r;  c                 ^    Ub  UOU R                   R                  nU R                  UUUUUS9$ )aN  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AltCLIPVisionModel

>>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```r  r   r:  r  r;  )ry   r  r  )rB   r  r   r:  r  r;  s         r)   r   AltCLIPVisionModel.forward>  sA    : &1%<k$++B]B]  %/!5%=# ! 
 	
r+   )r  NNNFN)rJ   rK   rL   rM   r   r  main_input_namerd   r#   Moduler  r   r   r&   rO   r   r	   r   r   r   rQ   r   r   s   @r)   r  r  1  s    &L$O2 <bii <  59,0/3).&*$
u001$
 $D>$
 'tn	$

 #'$
 d^$
 
u00	1$
 $
r+   r  a(  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    )custom_introc                      ^  \ rS rSr\rSU 4S jjrS rS rS r	\
             SS\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )AltRobertaModelif  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rc   rd   ry   rS   r   r-  r  rT  poolerr  )rB   ry   add_pooling_layerrz   s      r)   rd   AltRobertaModel.__init__x  sL    
 	 .v6(02C&v. 	r+   c                 .    U R                   R                  $ r   r   ri   rG   s    r)   r  $AltRobertaModel.get_input_embeddings  s    ...r+   c                 $    XR                   l        g r   r  rB   r   s     r)   set_input_embeddings$AltRobertaModel.set_input_embeddings  s    */'r+   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r3  r  r   )rB   heads_to_pruner3  r   s       r)   _prune_headsAltRobertaModel._prune_heads  s<    
 +002LELLu%//;;EB 3r+   r   r   r`   r[   r   r   r   r   r9  r   r   r:  r;  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nU	b  U	S   S   R                  S   OSnUc  [        R                  " UUU-   4US9nUcs  [        U R                  S5      (       a4  U R                  R                  S S 2S U24   nUR!                  UU5      nUnO$[        R"                  " U[        R$                  US	9nU R'                  X.5      nU R                   R                  (       aE  UbB  UR                  5       u  nnnUU4nUc  [        R                  " UUS9nU R)                  U5      nOS nU R+                  XPR                   R,                  5      nU R                  UUUUUS
9nU R/                  UUUUUU	U
UUUS9
nUS   nU R0                  b  U R1                  U5      OS nU(       d
  UU4USS  -   $ [3        UUUR4                  UR6                  UR8                  UR:                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer]   z5You have to specify either input_ids or inputs_embedsr   r   r!   r`   r}   )r   r[   r`   r   r   )	r   r   r   r   r9  r   r   r:  r;  r   )rA  r  r9  r   rB  rC  )ry   r   r:  r  r   r   r   %warn_if_padding_and_no_attention_maskrv   r"   r   r&   onesr   r   r`   rt   ru   rw   get_extended_attention_maskinvert_attention_maskget_head_maskr2  r  r  r   r9  r   rB  rC  )rB   r   r   r`   r[   r   r   r   r   r9  r   r   r:  r;  r   r|  r   r"   r   r   r   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr6  encoder_hidden_shapeencoder_extended_attention_maskembedding_outputr  sequence_outputr[  s                                  r)   r   AltRobertaModel.forward  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r+   )ry   r   r  r  )T)NNNNNNNNNNNNN)rJ   rK   rL   rM   r   r  rd   r  r  r  r   r   r&   r   r   rO   r   r	   r   r   r   rQ   r   r   s   @r)   r  r  f  sr    %L /0C  -11515/3,0048<9==A$(,0/3&*l
ELL)l
 !.l
 !.	l

 u||,l
 ELL)l
  -l
  (5l
 !) 6l
 "$u'8'8"9:l
 D>l
 $D>l
 'tnl
 d^l
 
uU\\"$PP	Ql
 l
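
# How the additive attention mask built by `get_extended_attention_mask` above behaves,
# as a minimal self-contained sketch (illustrative values): padded positions receive a
# large negative bias so softmax drives their attention weights to ~0.
#
#     import torch
#     attention_mask = torch.tensor([[1, 1, 0]])                    # 1 = attend, 0 = padding
#     extended = attention_mask[:, None, None, :].float()           # [batch, 1, 1, seq]
#     extended = (1.0 - extended) * torch.finfo(torch.float32).min  # 0 -> very negative bias
#     # `extended` is added to the raw attention scores before softmax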
class AltCLIPTextModel(AltCLIPPreTrainedModel):
    config_class = AltCLIPTextConfig

    def __init__(self, config):
        super().__init__(config)
        self.roberta = AltRobertaModel(config, add_pooling_layer=False)
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.roberta.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.roberta.embeddings.word_embeddings = value

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        return super().resize_token_embeddings(new_num_tokens)
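
    # A minimal sketch of growing the vocabulary after adding tokens (the tokenizer
    # handling is illustrative, not part of this module):
    #
    #     tokenizer.add_tokens(["<new_token>"])
    #     model.resize_token_embeddings(len(tokenizer))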
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndProjection]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
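
        >>> # A minimal check (not in the upstream example): the projection head changes the
        >>> # feature width from config.hidden_size to config.project_dim
        >>> assert last_hidden_state.shape[-1] == model.config.project_dim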
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # last module outputs
        sequence_output = outputs[0]

        # project every module
        sequence_output = self.pre_LN(sequence_output)

        # pooler
        projection_state = self.transformation(sequence_output)
        pooler_output = projection_state[:, 0]

        if not return_dict:
            return (projection_state, pooler_output) + outputs[2:4]

        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AltCLIPModel(AltCLIPPreTrainedModel):
    config_class = AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        super().__init__(config)

        if not isinstance(config.vision_config, AltCLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )
        if not isinstance(config.text_config, AltCLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type AltCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.project_dim
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = AltCLIPTextModel(text_config)
        self.vision_model = AltCLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
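
        >>> # Not part of the upstream example: the features come back unnormalized, so
        >>> # normalize them first if dot products should behave as cosine similarities
        >>> text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        >>> similarity = text_features @ text_features.t()  # pairwise cosine similarities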
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
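
        >>> # Not part of the upstream example: pair with features from `get_text_features`
        >>> # (`text_features` here is assumed to come from such a call), normalizing both sides
        >>> image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        >>> scores = text_features @ image_features.t()  # one row of image scores per text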
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )
        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```N)r   r   r`   r[   r   r:  r;  r  r   r   r]   T)r`  r   keepdim)r5   r6   r7   r8   r9   r:   r;   )ry   r   r:  r  rG  r  r  r  normrI  expr&   r   r.   Tr1   r3   )rB   r   r  r   r[   r`   rU  r   r:  r  r;  rL  rQ  r9   r8   rI  r7   r6   r5   r   s                       r)   r   AltCLIPModel.forward  s   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%/!5# ' 
 **%/!5%=# + 
 &a(--l;"1o**;7 $&7&7!T&7&RR!$4$4qb$$4$OO &&**,,,{NN4DES*,,_-D&T`qF)-)9TGf$EvE-+#%* .
 	
r+   )rI  rF  r  rG  r  r  r  r  )NNNNNNNr  )
NNNNNNNNFN)rJ   rK   rL   rM   r   r  rd   r   r   r&   r   r   rO   rN  rS  
LongTensorr	   r   r3   r   rQ   r   r   s   @r)   r  r  `  s(    L} >  -115/3,0/3&*,ELL), !., u||,	, $D>, 'tn, d^, 
		, ,\  59,0/3).&*-u001- $D>- 'tn	-
 #'- d^- 
		- -^  1548153715&*,0/3).&*[
E,,-[
 u001[
 !.	[

 u//0[
 !.[
 d^[
 $D>[
 'tn[
 #'[
 d^[
 
um#	$[
 [
r+   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`): Tensor of input token ids.
        padding_idx (`int`): Id of the padding symbol.
        past_key_values_length (`int`, *optional*, defaults to 0): Number of positions already consumed by cached
            key/value states.

    Returns: torch.Tensor
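
    Example (assuming `padding_idx = 1` and no cached key/values, i.e. `past_key_values_length = 0`):

        input_ids           = [[0, 9, 9, 1, 1]]
        mask                = [[1, 1, 1, 0, 0]]
        cumsum(mask) * mask = [[1, 2, 3, 0, 0]]
        result              = [[2, 3, 4, 1, 1]]  # masked cumsum + padding_idx; pads keep padding_idx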
r   r   )ner   r&   cumsumtype_asrw   )r   rV   r   maskincremental_indicess        r)   r~   r~   A  sW     <<$((*D <<!4<<TBE[[_cc##%33r+   )r  r  r+  r  )rz  )r   )HrN   r   dataclassesr   typingr   r   r   r   r   r	   r&   torch.nnr#   torch.utils.checkpointactivationsr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_altclipr   r   r   
get_loggerrJ   rE  r   r*   r1   r3   r  rS   r   r   r   r   r  r  r  r-  rT  floatre  rg  r  r  r  r  r  r  r  r  r+  r  r~   __all__rI   r+   r)   <module>rp     s     ! > >    !  G l l D D X X 
		H	%
`U\\ `ell `-%,, -5<< - !
K !
 !
JV=299 V=tCbii CN299  $& "0")) 0hRYY  ryy Sbii SnZ
		 Z
|ryy . %II%<<% 
% <<	%
 U\\*% % %,L)ryy L)` /")) /d^
RYY ^
DPbii Pf 1?_ 1? 1?h3
ryy 3
l2
/ 2
j P
, P
P
fV
- V
r]
) ]
B4  _r+   