
    fThn                        S SK JrJrJrJr  S SKrS SKJr  SSKJ	r	  SSK
Jr  SSKJrJr  SSKJrJr  SSKJr  SS	KJrJrJrJr  S
SKJr  \R6                  " \5      r " S S\R<                  5      r " S S\R<                  5      r  " S S\R<                  5      r! S2S\R<                  S\RD                  S\RD                  S\RD                  S\\RD                     S\#S\#4S jjr$S r%S\RD                  S\&S\RD                  4S jr'S\RD                  S \RD                  S!\RD                  S"\RD                  S\\RD                  \RD                  4   4
S# jr( " S$ S%\R<                  5      r) " S& S'\R<                  5      r* " S( S)\R<                  5      r+ " S* S+\R<                  5      r,\ " S, S-\5      5       r-\" S.S/9 " S0 S1\-5      5       r.S-S1/r/g)3    )CallableOptionalTupleUnionN   )ACT2FN)FlashAttentionKwargs)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging	torch_int   )MLCDVisionConfigc                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MLCDMLP&   c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)super__init__configr   
hidden_actactivation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr   	__class__s     ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mlcd/modeling_mlcd.pyr   MLCDMLP.__init__'   sb    #F$5$5699V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r#   r   r$   )r&   r+   s     r(   forwardMLCDMLP.forward.   s4    /**=9/r*   )r   r   r#   r$   )
__name__
__module____qualname____firstlineno__r   torchTensorr.   __static_attributes____classcell__r'   s   @r(   r   r   &   s)    KU\\ ell  r*   r   c                   f   ^  \ rS rSrSS\S\SS4U 4S jjjrS\S\S\R                  4S	 jr	S
r
U =r$ )MLCDRotaryEmbedding5   dimthetar,   Nc           	         > [         TU ]  5         SU[        R                  " SUS[        R                  S9U-  -  -  nU R                  SUSS9  g )N      ?r      dtypeinv_freqF
persistent)r   r   r4   arangefloatregister_buffer)r&   r<   r=   rC   r'   s       r(   r   MLCDRotaryEmbedding.__init__6   sK    %ELLC%++$NQT$TUVZeDr*   num_patches_heightnum_patches_widthc                 ~   [         R                  " XR                  R                  S9R	                  S5      R                  SU5      n[         R                  " X R                  R                  S9R	                  S5      R                  US5      n[         R                  " UR                  5       UR                  5       /SS9n[        X5      n[         R                  " X`R                  R                  U R                  R                  S9n[         R                  " XpR                  5      nX   R                  S5      n	U	$ )aE  
    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        """
        # Generate position IDs along the height and width axes of the patch grid.
        hpos_ids = (
            torch.arange(num_patches_height, device=self.inv_freq.device).unsqueeze(1).expand(-1, num_patches_width)
        )
        wpos_ids = (
            torch.arange(num_patches_width, device=self.inv_freq.device).unsqueeze(0).expand(num_patches_height, -1)
        )

        # One (height, width) pair per patch, flattened in row-major order.
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)
        max_grid_size = max(num_patches_height, num_patches_width)

        # Build the full frequency table for the largest axis and gather the entries for each patch.
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)

        return rotary_pos_emb


class MLCDVisionEmbeddings(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def apply_rotary_pos_emb_vision(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    orig_q_dtype = q.dtype
    orig_k_dtype = k.dtype
    q, k = q.float(), k.float()
    cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    q_embed = q_embed.to(orig_q_dtype)
    k_embed = k_embed.to(orig_k_dtype)
    return q_embed, k_embed

class MLCDAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper
    Multi-headed attention with RoPE. Refer to papers:
        - Attention is all you need:
            https://arxiv.org/abs/1706.03762
        - RoFormer: Enhanced Transformer with Rotary Position Embedding:
            https://arxiv.org/abs/2104.09864
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.num_key_value_groups = 1

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length = hidden_states.shape[:-1]

        # Each of shape: [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)

        # Apply the 2D rotary position embeddings to queries and keys.
        cos = position_embeddings[0].unsqueeze(0).float()
        sin = position_embeddings[1].unsqueeze(0).float()
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Each of shape: [batch_size, num_heads, seq_length, head_dim]
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        # The attention implementations return [batch_size, seq_length, num_heads, head_dim].
        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class MLCDEncoderLayer(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = MLCDAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = MLCDMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
class MLCDEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([MLCDEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False
    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    position_embeddings,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states=hidden_states,
                    position_embeddings=position_embeddings,
                    attention_mask=attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MLCDVisionTransformer(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = MLCDVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = MLCDEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Build the 2D rotary position embeddings for the current patch grid, with the learned
        # class-token entry prepended.
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)

        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config_class = MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
@auto_docstring(
    custom_intro="""
    The vision model from M_L_C_D without any head or projection on top.
    """
)
class MLCDVisionModel(MLCDPreTrainedModel):
    config_class = MLCDVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["MLCDEncoderLayer"]

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_model = MLCDVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDPreTrainedModel", "MLCDVisionModel"]
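
# Usage sketch (illustrative, extending the docstring example above): token 0 of `last_hidden_state`
# is the class token, followed by the (image_size // patch_size) ** 2 patch tokens in row-major order,
# e.g. a 32x32 grid for 448x448 inputs with 14x14 patches. The patch tokens can therefore be folded
# back into a 2D feature map:
#
#     last_hidden_state = outputs.last_hidden_state       # (batch, 1 + 32 * 32, hidden_size)
#     patch_tokens = last_hidden_state[:, 1:, :]          # drop the class token
#     feature_grid = patch_tokens.unflatten(1, (32, 32))  # (batch, 32, 32, hidden_size)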