from typing import Callable, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ..clip.modeling_clip import (
    CLIPMLP,
    CLIPAttention,
    CLIPEncoder,
    CLIPEncoderLayer,
    CLIPVisionEmbeddings,
    CLIPVisionModel,
    CLIPVisionTransformer,
)
from ..llama.modeling_llama import eager_attention_forward
from ..qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding, apply_rotary_pos_emb_vision


logger = logging.get_logger(__name__)


class MLCDVisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of an [`MLCDVisionModel`]. It is used to instantiate an MLCD
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
[DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1664):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 8192):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    projection_dim (`int`, *optional*, defaults to 1024):
        Dimensionality of text and vision projection layers.
    num_hidden_layers (`int`, *optional*, defaults to 48):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    image_size (`int`, *optional*, defaults to 336):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import MLCDVisionConfig, MLCDVisionModel

>>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
>>> configuration = MLCDVisionConfig()

>>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
>>> model = MLCDVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

    model_type = "mlcd_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1664,
        intermediate_size=8192,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_groups=1,
        num_channels=3,
        image_size=336,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_groups = num_key_value_groups
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class MLCDMLP(CLIPMLP):
    pass


class MLCDRotaryEmbedding(VisionRotaryEmbedding):
    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

Args:
    num_patches_height (int): Number of patches in the height dimension.
    num_patches_width (int): Number of patches in the width dimension.

Returns:
    torch.Tensor: Rotary positional embeddings for the given grid size.
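
Example (illustrative only; with the default MLCD-bigG configuration, a 336x336 image with patch size 14
gives a 24x24 patch grid, and the rotary dimension is `hidden_size // num_attention_heads // 2 = 52`):

```python
>>> rotary = MLCDRotaryEmbedding(dim=52)
>>> emb = rotary(num_patches_height=24, num_patches_width=24)
>>> emb.shape
torch.Size([576, 52])
```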
)devicer:   r   dim)rS   dtype)torcharangeinv_freqrS   	unsqueezeexpandstackflattenmaxrW   outer)
r2   rO   rP   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embs
             r5   forwardMLCDRotaryEmbedding.forward   s     LL+MM4H4HISSTUV]]^`bst 	 LL*==3G3GHRRSTU\\]oqst 	
 ++x//183C3C3EFBO .Bll=1E1ET]]M`M`a#kk#}}= -5==a@r7   r"   N)	r?   r@   rA   rB   intrX   Tensorrh   rF   r"   r7   r5   rM   rM      s     # # %,, r7   rM   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )MLCDVisionEmbeddings   configc                 (   > [         TU ]  U5        U ?g N)r#   r$   position_embeddingr2   ro   r4   s     r5   r$   MLCDVisionEmbeddings.__init__   s     #r7   pixel_valuesrQ   c                 H   UR                   S   nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n[        R                  " XT/SS9nU$ )Nr   )rW   r   r:   rT   rU   )shapepatch_embeddingweightrW   tor^   	transposeclass_embeddingr\   rX   cat)r2   ru   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingss          r5   rh   MLCDVisionEmbeddings.forward   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
r7   r"   )r?   r@   rA   rB   r   r$   rX   FloatTensorrk   rh   rF   rG   rH   s   @r5   rm   rm      s2    $/ $
E$5$5 
%,, 
 
r7   rm   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\
\R                     S\\   S	\	\R                  \
\R                     4   4
S
 jjrSrU =r$ )MLCDAttention   zMulti-headed attention with RoPE. Refer to papers:
- Attention is all you need:
    https://arxiv.org/abs/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
    https://arxiv.org/abs/2104.09864
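
The `position_embeddings` argument is a `(cos, sin)` pair built by `MLCDVisionTransformer` from
`MLCDRotaryEmbedding`, roughly:

```python
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
```

It is applied to the query and key projections with `apply_rotary_pos_emb_vision` before the
attention weights are computed.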
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.num_key_value_groups = config.num_key_value_groups
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_length = hidden_states.shape[:-1]

        # Project to [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))
        key_states = self.k_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))
        value_states = self.v_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))

        # Apply 2D rotary position embeddings to queries and keys
        cos = position_embeddings[0].unsqueeze(0).float()
        sin = position_embeddings[1].unsqueeze(0).float()
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Move to [batch_size, num_heads, seq_length, head_dim] for the attention kernels
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class MLCDEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.self_attn = MLCDAttention(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
        Represents the hidden states from the previous layer or the input embeddings.
    position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
        A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
        Represents absolute positional embeddings for the query and key in the attention mechanism.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
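
Returns:
    `Tuple[torch.FloatTensor]`: The transformed `hidden_states` of shape `(batch, seq_len, embed_dim)`,
    followed by the attention weights when `output_attentions=True`.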
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class MLCDEncoder(CLIPEncoder):
    """
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`MLCDEncoderLayer`].

Args:
    config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__(config)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
        A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
        Represents absolute positional embeddings for the query and key in the attention mechanism.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    position_embeddings,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states=hidden_states,
                    position_embeddings=position_embeddings,
                    attention_mask=attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )


class MLCDVisionTransformer(CLIPVisionTransformer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Build the (cos, sin) rotary pair from the patch grid, with an extra slot for the class token
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config_class = MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class MLCDVisionModel(CLIPVisionModel):
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
Example:

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import AutoProcessor, MLCDVisionModel
>>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
>>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs, output_attentions=True)

>>> features = outputs.last_hidden_state
>>> print(f"Extracted features shape: {features.shape}")
>>> print(f"Number of attention layers: {len(outputs.attentions)}")
>>> print(f"Attention shape: {outputs.attentions[0].shape}")
```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDVisionConfig", "MLCDVisionModel", "MLCDPreTrainedModel"]