
import math
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from .configuration_llama4 import Llama4VisionConfig
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, HybridChunkedCache
from ...generation import GenerationMixin
from ...integrations.hub_kernels import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    ModelOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_llama4 import Llama4Config, Llama4TextConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)

class Llama4TextExperts(nn.Module):
    def __init__(self, config: Llama4TextConfig):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size
        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        """
        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
        gate_up = torch.bmm(hidden_states, self.gate_up_proj)
        gate, up = gate_up.chunk(2, dim=-1)
        next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
        next_states = next_states.view(-1, self.hidden_size)
        return next_states
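
# Illustrative sketch (not part of the original file): the expert computation above groups the
# token stream as (num_experts, tokens_per_expert, hidden_size) and uses torch.bmm so that every
# expert applies its own weight slice in a single call. The toy sizes below are assumptions chosen
# only to make the example self-contained.
def _example_expert_bmm_shapes():
    num_experts, tokens_per_expert, hidden_size, expert_dim = 4, 3, 8, 16
    hidden_states = torch.randn(num_experts * tokens_per_expert, hidden_size)
    gate_up_proj = torch.randn(num_experts, hidden_size, 2 * expert_dim)
    down_proj = torch.randn(num_experts, expert_dim, hidden_size)

    grouped = hidden_states.view(num_experts, -1, hidden_size)  # (4, 3, 8)
    gate, up = torch.bmm(grouped, gate_up_proj).chunk(2, dim=-1)  # each (4, 3, 16)
    out = torch.bmm(up * F.silu(gate), down_proj)  # (4, 3, 8), one matmul per expert
    return out.view(-1, hidden_size)  # back to (12, 8), same layout as the input
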

class Llama4TextMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()

        if intermediate_size is None:
            intermediate_size = config.intermediate_size

        self.config = config
        self.gate_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.activation_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(down_proj)


class Llama4TextL2Norm(torch.nn.Module):
    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.eps = eps

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        return self._norm(x.float()).type_as(x)

    def extra_repr(self):
        return f"eps={self.eps}"

class Llama4TextRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5):
        """
        Llama4RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


@use_kernel_forward_from_hub("Llama4TextMoe")
class Llama4TextMoe(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.experts = Llama4TextExperts(config)
        self.router = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
        self.shared_expert = Llama4TextMLP(config)

    def forward(self, hidden_states):
        batch, seq_len, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = self.router(hidden_states)
        tokens_per_expert = batch * seq_len

        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)
        router_scores = (
            torch.full_like(router_logits, float("-inf")).scatter_(1, router_indices, router_top_value).transpose(0, 1)
        )
        router_indices = (
            torch.arange(tokens_per_expert, device=hidden_states.device).view(1, -1).expand(router_scores.size(0), -1)
        )
        router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype)

        router_indices = router_indices.reshape(-1, 1).expand(-1, hidden_dim)
        routed_in = torch.gather(input=hidden_states, dim=0, index=router_indices).to(hidden_states.device)
        # gathered tokens are scaled by their routing score before entering the experts
        routed_in = routed_in * router_scores.reshape(-1, 1)
        routed_out = self.experts(routed_in)
        out = self.shared_expert(hidden_states)
        # scatter-add the expert outputs back onto the shared-expert output
        out.scatter_add_(dim=0, index=router_indices, src=routed_out.view(-1, hidden_dim))
        return out, router_scores


class Llama4TextRotaryEmbedding(nn.Module):
    def __init__(self, config: Llama4TextConfig, device=None):
        super().__init__()
        self.rope_type = "llama3" if config.rope_scaling is not None else "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2)
            freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
            freqs_cis = freqs_cis * self.attention_scaling
        return freqs_cis


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    xq_out = torch.view_as_real(xq_ * freqs_cis[:, :, None, :]).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis[:, :, None, :]).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
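
# Illustrative sketch (not part of the original file): the helper above is a reshaped-expand
# version of torch.repeat_interleave along the head dimension. The tiny tensors below are
# assumptions used only to demonstrate the equivalence.
def _example_repeat_kv_equivalence():
    kv = torch.randn(2, 4, 5, 8)  # (batch, num_key_value_heads, seq_len, head_dim)
    expanded = repeat_kv(kv, n_rep=3)  # (2, 12, 5, 8)
    reference = torch.repeat_interleave(kv, repeats=3, dim=1)
    assert torch.equal(expanded, reference)
    return expanded.shape
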

def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def vision_eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    # the vision tower always scales by 1 / sqrt(head_dim); `scaling` may be passed as None
    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * module.head_dim**-0.5
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Llama4TextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Llama4TextConfig, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attn_scale = config.attn_scale
        self.floor_scale = config.floor_scale
        self.attn_temperature_tuning = config.attn_temperature_tuning
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        self.use_rope = config.no_rope_layers[layer_idx]
        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        if self.config.use_qk_norm and self.use_rope:
            self.qk_norm = Llama4TextL2Norm(config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(*input_shape, -1, self.head_dim)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if self.use_rope:  # the 16E model skips rope for long context on specific layers
            query_states, key_states = apply_rotary_emb(
                query_states, key_states, position_embeddings.to(query_states.device)
            )

        if hasattr(self, "qk_norm"):  # the 128E model does not use qk_norm
            query_states = self.qk_norm(query_states)
            key_states = self.qk_norm(key_states)

        # Use temperature tuning from https://arxiv.org/abs/2501.19399 on NoROPE layers
        if self.attn_temperature_tuning and not self.use_rope:
            attn_scales = (
                torch.log(torch.floor((cache_position.float() + 1.0) / self.floor_scale) + 1.0) * self.attn_scale + 1.0
            )
            attn_scales = attn_scales.view((1, input_shape[-1], 1, 1)).expand((*input_shape, 1, 1))  # batch size > 1
            query_states = (query_states * attn_scales).to(query_states.dtype)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)

        if past_key_value is not None:
            # cache_position is needed for the static cache
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Llama4TextDecoderLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Llama4TextAttention(config, layer_idx)
        self.use_chunked_attention = config.attention_chunk_size is not None and bool(config.no_rope_layers[layer_idx])
        self.is_moe_layer = layer_idx in config.moe_layers
        if self.is_moe_layer:  # the 128E model interleaves dense / sparse
            self.feed_forward = Llama4TextMoe(config)
        else:
            self.feed_forward = Llama4TextMLP(config, intermediate_size=config.intermediate_size_mlp)

        self.input_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        chunk_causal_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # use the local (chunked) attention mask for RoPE layers
        if self.use_chunked_attention and chunk_causal_mask is not None:
            attention_mask = chunk_causal_mask

        # Self Attention
        attention_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + attention_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        if self.is_moe_layer:
            hidden_states, router_logits = hidden_states
        else:
            router_logits = None
        hidden_states = residual + hidden_states.view(residual.shape)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


@auto_docstring
class Llama4PreTrainedModel(PreTrainedModel):
    config_class = Llama4Config
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = False
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.text_config.initializer_range
        )

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, Llama4TextRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Llama4TextExperts):
            module.gate_up_proj.data.normal_(mean=0.0, std=std)
            module.down_proj.data.normal_(mean=0.0, std=std)
        elif isinstance(module, Llama4VisionModel):
            module.class_embedding.data.normal_(std=module.scale)
            module.positional_embedding_vlm.data.normal_(std=module.scale)


@auto_docstring
class Llama4TextModel(Llama4PreTrainedModel):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "model"
    config_class = Llama4TextConfig

    def __init__(self, config: Llama4TextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Llama4TextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Llama4TextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids.to(self.embed_tokens.weight.device))

        if use_cache and past_key_values is None:
            if self.config.get_text_config().attention_chunk_size is not None:
                past_key_values = HybridChunkedCache(self.config, inputs_embeds.shape[0], inputs_embeds.shape[1])
            else:
                past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask, chunk_causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions, use_cache=use_cache
        )

        hidden_states = inputs_embeds
        freq_cis = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    chunk_causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    False,  # output_router_logits is only used during training
                    use_cache,
                    cache_position,
                    freq_cis,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    chunk_causal_mask=chunk_causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=freq_cis,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    @torch.compiler.disable(recursive=False)  # the operations in this function are not compilable
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
        chunked_attention_mask=None,
        use_cache=True,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask, attention_mask
            return None, None

        if self.config._attn_implementation not in ["sdpa", "flex_attention", "eager"]:
            return None, None

        sequence_length = input_tensor.shape[1]
        cache_position = cache_position.to(self.device)
        attention_chunk_size = self.config.attention_chunk_size
        using_chunked_attention = attention_chunk_size is not None

        first_cache_position = cache_position[0]

        if past_key_values is not None:
            full_cache_length = past_key_values.get_max_cache_shape() or sequence_length
        else:
            full_cache_length = attention_mask.shape[-1] if attention_mask is not None else sequence_length

        if using_chunked_attention:
            cond1 = first_cache_position >= attention_chunk_size
            cond2 = (first_cache_position < attention_chunk_size) & (
                first_cache_position + sequence_length > attention_chunk_size
            )
            key_length = (
                torch.where(
                    cond1,
                    attention_chunk_size + sequence_length - 1,
                    torch.where(cond2, first_cache_position + sequence_length, attention_chunk_size),
                )
                if use_cache
                else full_cache_length
            )

        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                if using_chunked_attention:
                    offsets = (first_cache_position, max(first_cache_position - attention_chunk_size + 1, 0))
                    chunked_attention_mask = make_flex_block_causal_mask(
                        attention_mask, attention_chunk_size, sequence_length, key_length, offsets=offsets
                    )
                attention_mask = make_flex_block_causal_mask(
                    attention_mask,
                    query_length=sequence_length,
                    key_length=full_cache_length,
                    offsets=(first_cache_position, 0),
                )
                return attention_mask, chunked_attention_mask
            if isinstance(attention_mask, BlockMask):
                return attention_mask, chunked_attention_mask

        dtype, device = input_tensor.dtype, input_tensor.device
        target_length = max(full_cache_length, attention_chunk_size) if using_chunked_attention else full_cache_length
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if using_chunked_attention and full_cache_length > attention_chunk_size:
            start_idx = max(first_cache_position - attention_chunk_size + 1, 0)
            end_idx = start_idx + key_length
            chunked_attention_mask = self.create_chunked_attention_mask(
                self.config.attention_chunk_size,
                start=start_idx,  # same offset as with flex attention
                end=end_idx,
                device=device,
            )

            local_attention_mask = attention_mask[:, start_idx:end_idx]  # offset here as well
            # it may be smaller than attention_chunk_size -> pad it
            requires_padding = local_attention_mask.shape[-1] < attention_chunk_size
            if requires_padding:
                local_attention_mask = nn.functional.pad(
                    local_attention_mask, (0, attention_chunk_size - local_attention_mask.shape[-1])
                )
            # depending on the padding, take the query tokens from the end or from cache_position
            if not requires_padding:
                chunked_attention_mask = chunked_attention_mask[None, None, -sequence_length:, :]
            else:
                chunked_attention_mask = chunked_attention_mask[None, None, cache_position, :]

            chunked_attention_mask = chunked_attention_mask.expand(input_tensor.shape[0], -1, -1, -1)
            chunked_attention_mask = chunked_attention_mask * local_attention_mask[:, None, None, :]
            if self.config._attn_implementation == "eager":
                min_dtype = torch.finfo(dtype).min
                chunked_attention_mask = torch.where(chunked_attention_mask == 0, min_dtype, 0.0).to(dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, as required by the memory-efficient SDPA path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        if self.config._attn_implementation == "sdpa" and chunked_attention_mask is not None:
            chunked_attention_mask = chunked_attention_mask.bool()
            causal_mask = causal_mask != torch.finfo(dtype).min
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=first_cache_position,
                is_training=self.training,
            ):
                causal_mask = None

        return causal_mask, chunked_attention_mask

    def create_chunked_attention_mask(
        self, attention_chunk_size: int, start: int, end: int, device: torch.device
    ) -> torch.Tensor:
        """
        Generate the following:

        'What'      :  0 ■ ⬚ ⬚ ⬚ ⬚ ⬚    |
        '▁is'       :  1 ■ ■ ⬚ ⬚ ⬚ ⬚     |
        '▁ch'       :  2 ■ ■ ■ ⬚ ⬚ ⬚     |
        'unked'     :  3 ⬚ ⬚ ⬚ ■ ⬚ ⬚    |
        '▁attention':  4 ⬚ ⬚ ⬚ ■ ■ ⬚    |
        '?'         :  5 ⬚ ⬚ ⬚ ■ ■ ■     |

        If the chunk size is 3.
        This can just be applied over the already created attention mask
        """
        arange_vector = torch.arange(start, end, device=device)
        block_pos = torch.abs(
            arange_vector.unsqueeze(0) // attention_chunk_size - arange_vector.unsqueeze(1) // attention_chunk_size
        )
        token_pos = arange_vector.unsqueeze(0) - arange_vector.unsqueeze(1)
        mask = (block_pos == 0) & (token_pos <= 0)
        return mask.to(device)
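
    # A worked illustration of the mask above (added comments, not original code): with
    # attention_chunk_size=3 and positions 0..5, position // 3 gives blocks [0, 0, 0, 1, 1, 1].
    # `block_pos == 0` keeps only query/key pairs inside the same chunk and `token_pos <= 0` keeps
    # only keys at or before the query, reproducing the two causal triangles in the docstring.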

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
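
# Illustrative sketch (not part of the original file): building the inverted 4D mask for a toy
# prefill of 4 tokens where the second sequence is left-padded. The sizes are assumptions chosen
# only for demonstration.
def _example_prepare_4d_causal_mask():
    attention_mask = torch.tensor([[1, 1, 1, 1], [0, 1, 1, 1]])  # second row is left-padded
    mask = Llama4TextModel._prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask,
        sequence_length=4,
        target_length=4,
        dtype=torch.float32,
        cache_position=torch.arange(4),
        batch_size=2,
    )
    # mask has shape (2, 1, 4, 4); allowed positions hold 0.0, masked ones hold the dtype minimum
    return mask.shape
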

class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "language_model"
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    config_class = Llama4TextConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = Llama4TextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
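
# Illustrative sketch (not part of the original file): the `-100` convention described in the
# docstring above matches the `ignore_index` of cross-entropy, so padded or prompt positions can
# be excluded from the loss. The tensors below are small assumptions used only for demonstration.
def _example_ignored_label_positions():
    vocab_size = 7
    logits = torch.randn(1, 4, vocab_size)
    labels = torch.tensor([[3, 5, -100, -100]])  # last two positions do not contribute to the loss
    loss = nn.functional.cross_entropy(logits.view(-1, vocab_size), labels.view(-1), ignore_index=-100)
    return loss
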

@dataclass
class Llama4CausalLMOutputWithPast(ModelOutput):
    """
    Base class for Llava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class Llama4VisionMLP2(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.fc1 = nn.Linear(self.intermediate_size, config.projector_input_dim, bias=False)
        self.fc2 = nn.Linear(config.projector_output_dim, config.projector_output_dim, bias=False)
        self.activation_fn = nn.GELU()
        self.dropout = config.projector_dropout

    def forward(self, hidden_states):
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        return self.activation_fn(self.fc2(hidden_states))


class Llama4MultiModalProjector(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(
            config.vision_config.vision_output_dim,
            config.text_config.hidden_size,
            bias=False,
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        return hidden_states


def pixel_shuffle(input_tensor, shuffle_ratio):
    # input_tensor: [batch_size, num_patches, channels]
    batch_size, num_patches, channels = input_tensor.shape
    patch_size = int(math.sqrt(num_patches))

    input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
    batch_size, height, width, channels = input_tensor.size()

    reshaped_tensor = input_tensor.view(batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio))
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()

    reshaped_tensor = reshaped_tensor.view(
        batch_size, int(height * shuffle_ratio), int(width * shuffle_ratio), int(channels / (shuffle_ratio**2))
    )
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()

    output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
    return output_tensor


class Llama4VisionPixelShuffleMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
        self.inner_dim = int(config.projector_input_dim // (self.pixel_shuffle_ratio**2))
        self.output_dim = config.projector_output_dim
        self.mlp = Llama4VisionMLP2(config)

    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
        encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
        return self.mlp(encoded_patches)


def reshape_for_broadcast(freqs_ci: torch.Tensor, query: torch.Tensor):
    ndim = query.ndim
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(query.shape)]
    return freqs_ci.view(*shape)


def vision_apply_rotary_emb(
    query: torch.Tensor,
    key: torch.Tensor,
    freqs_ci: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
    key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
    freqs_ci = reshape_for_broadcast(freqs_ci=freqs_ci, query=query_)  # freqs_ci[:, :, None, :]
    freqs_ci = freqs_ci.to(query_.device)
    query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
    key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
    return query_out.type_as(query), key_out.type_as(key)


class Llama4VisionAttention(nn.Module):
    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.num_key_value_groups = 1
        self.attention_dropout = config.attention_dropout
        self.scaling = self.head_dim**-0.5

        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.embed_dim, bias=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        query_states, key_states = vision_apply_rotary_emb(query_states, key_states, freqs_ci=freqs_ci)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attention_interface: Callable = vision_eager_attention_forward
        # flex attention is intentionally not used for the vision tower
        if self.config._attn_implementation not in ["eager", "flex_attention"]:
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            None,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=None,
            is_causal=False,  # HAS TO BE ENFORCED
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Llama4VisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = nn.GELU()
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=True)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Llama4VisionEncoderLayer(nn.Module):
    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Llama4VisionAttention(config)
        self.mlp = Llama4VisionMLP(config)

        self.input_layernorm = nn.LayerNorm(config.hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        hidden_state: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
    ):
        # Self Attention
        residual = hidden_state

        hidden_state = self.input_layernorm(hidden_state)

        hidden_state, attn_weights = self.self_attn(
            hidden_state,
            freqs_ci=freqs_ci,
            attention_mask=attention_mask,
        )
        hidden_state = residual + hidden_state

        # Feed forward
        residual = hidden_state
        hidden_state = self.post_attention_layernorm(hidden_state)
        hidden_state = self.mlp(hidden_state)
        hidden_state = residual + hidden_state

        outputs = (hidden_state,)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Llama4VisionEncoderLayer`].

Args:
    config: Llama4VisionConfig
r)   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        Xl        g s  snf )NF)
r-   r.   r)   r4   r  r  r  rk  r  r  )r=   r)   _r>   s      r?   r.   Llama4VisionEncoder.__init__  sY    mmuU[UmUmOn$oOn!%=f%EOn$op&+# %ps   A,rB   rO  r   r,  r  r  rC   c                 .   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnU R                   Hn  n	U(       a  Xq4-   nU R
                  (       a1  U R                  (       a   U R                  U	R                  UUUU5      n
O	U	" UUUUS9n
U(       a  XS   4-   nU
S   nMp     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
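# Minimal sketch of the eager attention path used by `Llama4VisionAttention` when
# `config._attn_implementation == "eager"`: plain softmax(Q @ K^T * scale) @ V with no
# causal mask, since vision tokens attend bidirectionally. The function name and the
# default scaling are illustrative, not part of the public API.
def _sketch_eager_vision_attention(query, key, value, scaling=None):
    # query/key/value: (batch, num_heads, seq_len, head_dim)
    scaling = query.shape[-1] ** -0.5 if scaling is None else scaling
    attn_weights = torch.matmul(query, key.transpose(-2, -1)) * scaling
    attn_weights = torch.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_output = torch.matmul(attn_weights, value)
    # return in (batch, seq_len, num_heads, head_dim) layout, ready to be reshaped and projected
    return attn_output.transpose(1, 2).contiguous(), attn_weights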

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
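# Minimal sketch of how the rotary table produced by `Llama4VisionRotaryEmbedding` is
# consumed: pairs of channels are viewed as complex numbers and rotated by `freqs_ci`.
# The function name is illustrative; it assumes `freqs_ci` is a complex tensor
# broadcastable against the complex view of `x`.
def _sketch_apply_vision_rotary(x: torch.Tensor, freqs_ci: torch.Tensor) -> torch.Tensor:
    # x: (..., head_dim) with an even head_dim
    x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    x_rotated = torch.view_as_real(x_complex * freqs_ci).flatten(-2)
    return x_rotated.type_as(x)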
NrS  )rn  r   r,  rO  r!   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frf   rS  .0vs     r?   	<genexpr>.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     e$Sq$S   	r  rB   r  )r)   r,  r  r  r  r  r   r  r  r   r   )r=   rB   rO  r   r,  r  r  encoder_statesall_attentionsencoder_layerr  s              r?   rO   Llama4VisionEncoder.forward  s'   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[M#!/2B!B**t}} $ A A!**!"%! !.!.#1&7%	! !!/3C2E!E)!,M- )0  +.>>Ne]N$Seee+Vd
 	
rA   )r)   r  r  NNNN)rQ   rR   rS   rT   r?  r	   r.   r6   rU   r   rF  r   r   r   rO   rV   rW   rX   s   @r?   rr  rr    s    1  26,0/3&*G
||G
 ,,G
 !.	G

 $D>G
 'tnG
 d^G
 
uo%	&G
 G
rA   rr  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4UnfoldConvolutioni  c                 8  > [         TU ]  5         UR                  n[        U[        5      (       a  X"4n[
        R                  R                  X!R                  S9U l        [        R                  " UR                  US   -  US   -  UR                  SS9U l        g )N)kernel_sizestrider   r!   Fr^   )r-   r.   r=  r   r  r6   r4   Unfoldunfoldr`   num_channelsr2   linear)r=   r)   r  r>   s      r?   r.    Llama4UnfoldConvolution.__init__  s    ''k3''&4Khhoo+FWFWoXii+a.0;q>A
rA   rB   rC   c                 p    U R                  U5      nUR                  SSS5      nU R                  U5      nU$ )Nr   r,   r!   )r  r9  r  r)  s     r?   rO   Llama4UnfoldConvolution.forward'  s8    M2%--aA6M2rA   )r  r  rN  rX   s   @r?   r  r    s(    

U\\ ell  rA   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4VisionRotaryEmbeddingi.  c                   > [         TU ]  5         UR                  UR                  -  n[        R
                  " US-  [        R                  S9R                  US-  S5      n[        R                  " X3S S /SS9nSUS'   X2-  nX2-  nUR                  UR                  -  S-  nSUR                  [        R
                  " SUS5      S US-   R                  5       U-  -  -  nUS-   S	   US S S S 24   -  R                  SS
S9nUS-   S	   US S S S 24   -  R                  SS
S9n	[        R                  " X/S
S9R                  5       R                  5       SS S S24   n
U
R                  UR                  S
SS5      S:  S5      n
[        R                   " [        R"                  " [        R$                  " U
5      [        R&                  " U
5      /S
S95      nXl        g )Nr,   )r   r!   r   rF   r   )rE   rE   r)  ).NrE   .)r-   r.   
image_sizer=  r6   r   int32r   catr2   r  
rope_thetar|   repeat_interleaver  r  r   stackcossinrO  )r=   r)   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   r  r>   s               r?   r.   $Llama4VisionRotaryEmbedding.__init__/  s   6#4#44,,sAvU[[9AA#q&!L))Wbqk2:%%)C)CCqH6,,a11MN_QY]^Q^1`1f1f1hks1stu	!A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc rA   c                 L    U R                   R                  UR                  5      $ rf   )rO  r   r   r)  s     r?   rO   #Llama4VisionRotaryEmbedding.forward@  s    }} 4 455rA   rb  ri   rX   s   @r?   r  r  .  s    !"6 6rA   r  c                      ^  \ rS rSrSrS/r\rS\4U 4S jjrS r	    SS\
R                  S\\
R                     S	\\   S
\\   S\\   S\\\\
R                  S4   4   4S jjrSrU =r$ )rh  iD  vision_modelrk  r)   c                 ~  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  -  S-  S-   U l        UR                  S-  U l        [        U5      U l	        [        R                  " U R                  [        R                  " U R                  5      -  5      U l        [        R                  " U R                  [        R                  " U R                  U R                  5      -  5      U l        [!        U5      U l        [        R$                  " U R                  5      U l        [        R$                  " U R                  5      U l        [+        U5      U l        [/        U5      U l        U R3                  5         g )Nr,   r!   r  )r-   r.   r  r=  r2   r  r;  rj  r  patch_embeddingr4   r5   r6   randnri  rk  r  rotary_embeddingrf  layernorm_prelayernorm_postrr  rz  rD  vision_adapterr  r<   s     r?   r.   Llama4VisionModel.__init__I  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&ArA   c                     U R                   $ )zW
This function is used to fetch the first embedding layer to activate grads on inputs.
)r  r   s    r?   r  &Llama4VisionModel.get_input_embeddingsb  s     ###rA   pixel_valuesr   r,  r  r  rC   .c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUR                  u  pgpSn
SnU R                  U5      nUR                  u  pnUR                  Xj-  U-  X5      nU R                  R                  UR                  S   SUR                  S   5      n[        R                  " UU/SS9nUS-  nUR                  Xj-  XU5      nU R                  R                  UR                  UR                  S9nUU-   nU R                  U5      nUR!                  USU5      nU R#                  U5      nU R%                  USUUUS9nUR&                  nU R)                  U5      nUSS2SS2SS24   nU R+                  U5      nU(       a  UR,                  OSnU(       a  US   nOSnU(       d  [/        S	 UUU4 5       5      $ [1        UUUS
9$ )aN  

Example:

```python
>>> from PIL import Image
>>> import requests
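# Minimal, self-contained sketch of the unfold-based patch embedding performed by
# `Llama4UnfoldConvolution`: an image is cut into non-overlapping patch_size x patch_size
# tiles and each flattened tile is linearly projected to the hidden size. All sizes below
# are made up for the example.
def _sketch_unfold_patchify():
    batch, channels, image_size, patch_size, hidden_size = 2, 3, 56, 14, 32
    pixels = torch.randn(batch, channels, image_size, image_size)

    unfold = nn.Unfold(kernel_size=patch_size, stride=patch_size)
    linear = nn.Linear(channels * patch_size * patch_size, hidden_size, bias=False)

    tiles = unfold(pixels)          # (batch, channels * patch_size**2, num_patches)
    tiles = tiles.permute(0, 2, 1)  # (batch, num_patches, channels * patch_size**2)
    tokens = linear(tiles)          # (batch, num_patches, hidden_size)
    assert tokens.shape == (batch, (image_size // patch_size) ** 2, hidden_size)
    return tokens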
>>> from transformers import AutoProcessor, MllamaVisionModel

>>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
>>> model = MllamaVisionModel.from_pretrained(checkpoint)
>>> processor = AutoProcessor.from_pretrained(checkpoint)

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")

>>> output = model(**inputs)

>>> print(output.last_hidden_state.shape)
torch.Size([1, 1, 4, 1025, 7680])
```
Nr!   r   rE   rF   )r   r   )r   r  r,  rO  r,   c              3   .   #    U  H  oc  M  Uv   M     g 7frf   rS  rx  s     r?   r{  ,Llama4VisionModel.forward.<locals>.<genexpr>  s     _$Mq$Mr}  r~  )r)   r,  r  r  r   r  r   ri  r   r6   r  rk  r   r   r   r  rH   r  rz  r  r  r  rB   r   r   )r=   r  r   r,  r  r  batch_size_times_num_tilesr  r>  r?  num_concurrent_media
num_chunksrn  rt  r;  r   ri  positional_embeddingrO  r   rB   r  s                         r?   rO   Llama4VisionModel.forwardh  sB   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"& 
++L9%1%7%7"
 $++&=
JK
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&=zXb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___*'!
 	
rA   )ri  r2   r  r  r  rz  r  r;  r  r=  rk  r  rj  r  r  )rQ   rR   rS   rT   r  r  r	   rn  r.   r  r6   rU   r   rF  r   r   r   rO   rV   rW   rX   s   @r?   rh  rh  D  s    &34%L1 2$ 26,0/3&*_
ll_
 !._
 $D>	_

 'tn_
 d^_
 
ellC&7 88	9_
 _
rA   rh  c            (         ^  \ rS rSrSS/r0 rSr\rS\4U 4S jjr	S r
S rS	 rS
 rS rS rS\R"                  S\\\\   4   S\4S jr\                S(S\R0                  S\R"                  S\\R4                     S\\R0                     S\\\R"                        S\\R"                     S\\\\\   4      S\\   S\\R0                     S\\   S\\   S\\   S\\   S\\R0                     S\\\R4                  4   S\R4                  S\\   S\\\4   4$S  jj5       r       S)S! jr!\"S\R4                  S"\S#\S$\RF                  S\R4                  S%\4S& j5       r$S'r%U =r&$ )*Llama4ForConditionalGenerationi  rB  rk   r)   c                 j  > [         TU ]  U5        [        UR                  5      U l        [        U5      U l        [        UR                  5      U l	        UR                  R                  U l
        U R                  R                  b  U R                  R                  OSU l        U R                  5         g )NrE   )r-   r.   rh  r.  r  r,  multi_modal_projectorr  r`  r  r}  r)   r|  r  r<   s     r?   r.   'Llama4ForConditionalGeneration.__init__  s     -f.B.BC%>v%F"/0B0BC ,,778<8P8P8\DKK44bdrA   c                 6    U R                   R                  5       $ rf   )r  r  r   s    r?   r  3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799rA   c                 :    U R                   R                  U5        g rf   )r  r  r  s     r?   r  3Llama4ForConditionalGeneration.set_input_embeddings  s    007rA   c                 6    U R                   R                  5       $ rf   )r  r  r   s    r?   r  4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::rA   c                 :    U R                   R                  U5        g rf   )r  r  r  s     r?   r  4Llama4ForConditionalGeneration.set_output_embeddings  s    11.ArA   c                 :    U R                   R                  U5        g rf   )r  r	  r  s     r?   r	  *Llama4ForConditionalGeneration.set_decoder  s    ''0rA   c                 6    U R                   R                  5       $ rf   )r  r  r   s    r?   r  *Llama4ForConditionalGeneration.get_decoder  s    ""..00rA   r  vision_feature_layervision_feature_select_strategyc                     US;  a  [        SU R                   35      eUR                  5        VVs0 s H  u  pVUc  M
  XV_M     nnnU R                  " U4SS0UD6nUR                  nU$ s  snnf )a  
Obtains image last hidden states from the vision tower and apply al projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_layer (`Union[int, List[int]]`):
        The index of the layer to select the vision feature. If multiple indices are provided,
        the vision feature of the corresponding indices will be concatenated to form the
        vision features.
    vision_feature_select_strategy (`str`):
        The feature selection strategy used to select the vision feature from the vision backbone.
        Can be one of `"default"` or `"full"`
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)r   r  z$Unexpected select feature strategy: r  F)r  r  itemsr  r  )	r=   r  r  r  r  krz  image_outputsrn  s	            r?   get_image_features1Llama4ForConditionalGeneration.get_image_features  s{    . *1DDCDDgDgChijj#)<<>C>41Q$!$>C)),]U]V\]$66 Ds
   	A+A+r  r   r   r\  r  r  rQ  r,  r  r  r'  r  image_sizesr  rC   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  R
                  nUb  UOU R                   R                  R                  nUSL USL-  (       a  [        S5      eUb  Ub  [        S5      eUc  U R                  5       " U5      nUGb_  U R                  UUUUS9nUR                  nUR                  SUR                  S5      5      nU R                  U5      nXR                   R                  :H  R                  S5      nUR!                  UR"                  5      nUR                  SUR                  S5      5      nUS   R%                  S5      nUR'                  5       nUUR                  S5      :w  a   [        SU S	UR                  S5       35      eUR                  S5      R)                  SUR                  S5      5      nUR+                  UU5      nUR                  U5      nU R,                  " SUUUUU
UUUUUS
.
UD6nUS   nSnU	Gb>  Ub  USS2UR                  S   S-
  * S24   R!                  UR"                  5      nUSSS2SS24   UR!                  UR"                  5      S:g     R/                  5       nU	SSS24   UR!                  U	R"                  5      S:g     R/                  5       n O1USSS2SS24   R/                  5       nU	SSS24   R/                  5       n [0        R2                  " 5       n!U!" UR                  SUR                  S5      5      U R                  S5      R!                  UR"                  5      5      nU(       d  U4USS -   n"Ub  U4U"-   $ U"$ [5        UUUR6                  UR8                  UR:                  Ub  WS9$ SS9$ )a   
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, LlavaForConditionalGeneration

>>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
>>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

>>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
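# Minimal sketch of how projected image features replace the placeholder image-token
# embeddings inside `inputs_embeds` via `masked_scatter`, as done in
# `Llama4ForConditionalGeneration.forward`. The token id, shapes, and values are made up
# for the example.
def _sketch_scatter_image_features():
    image_token_id, hidden = 7, 8
    input_ids = torch.tensor([[1, 7, 7, 2]])
    inputs_embeds = torch.zeros(1, 4, hidden)
    image_features = torch.ones(2, hidden)  # one projected row per image placeholder

    special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
    inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
    assert torch.equal(inputs_embeds[0, 1], torch.ones(hidden))
    return inputs_embeds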
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
```Nr  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)r  r  r  r  rE   ).r   r   zMismatch: final_mask wants z0 embeddings, but multi_modal_projector returned )
r   r   r\  r  rQ  r,  r  r  r'  r  r!   .)r  r  r\  rB   r  r  rS  )r)   r,  r  r  r.  r  r  r  r  r  r   rH   r   r  image_token_idr  r   r   r   sumr   masked_scatterr  r  r4   CrossEntropyLossr  r\  rB   r  )#r=   r  r  r   r   r\  r  r  r  r  rQ  r,  r  r  r'  r  r  r  r4  original_inputs_embeds_shapevision_flatprojected_vision_flatspecial_image_mask
final_maskfinal_mask_1dnum_tokens_to_fillexpanded_maskrW  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr   s#                                      r?   rO   &Llama4ForConditionalGeneration.forward  s[   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] $/ !**?? 	 .9 +**II 	' -t";<YZZ#(Av    557	BM#!44)%9/M'	 5 N ,9+>+>((--b.2E2Eb2IJK$($>$>{$K!"+{{/I/I"I!T!TUW!X+..}/C/CDJ)..r=3E3Eb3IJM&v.66r:M!.!2!2!4!%:%?%?%BB 12D1E F::O:T:TUV:W9XZ 
 *33B7>>r=CUCUVXCYZM)88H]^M)../KLM%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
rA   c           	      f    U R                   R                  " U4UUUUUS.UD6n	US   S:X  a  XIS'   U	$ )N)r\  r  r   r'  r  r   r  )r  prepare_inputs_for_generation)
r=   r  r\  r  r  r   r'  r  r  model_inputss
             r?   r  <Llama4ForConditionalGeneration.prepare_inputs_for_generation  sZ     **HH
+')))
 
 !! ,8(rA   r  r  r   r  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
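# Minimal, self-contained sketch of the mask layout documented for
# `_prepare_4d_causal_attention_mask_with_cache_position`, assuming no padding mask:
# positions a query may attend to hold 0.0 and future positions hold the dtype minimum.
# The sizes and dtype are example values.
def _sketch_causal_4d_mask(sequence_length: int = 3, target_length: int = 5, dtype=torch.float32) -> torch.Tensor:
    min_dtype = torch.finfo(dtype).min
    causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
    cache_position = torch.arange(sequence_length)
    # keep the fill value only where the key position lies in the future of the query token
    causal_mask = causal_mask * (torch.arange(target_length) > cache_position.reshape(-1, 1))
    return causal_mask[None, None, :, :]  # (batch=1, 1, query_length, key_value_length)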
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr  r  r!   r  r   rE   r   r  r  s              r?   r  TLlama4ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  s}   < %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 rA   )r  r  r|  r  r}  )NNNNNNNNNNNNNNr   N)NNNNNN)'rQ   rR   rS   rT   r  r  r  r"   rn  r.   r  r  r  r  r	  r  r6   rY  r   r  r   r   r  r   r@  r   rU   rF  r   r  r   r  rO   r  r  r   r  rV   rW   rX   s   @r?   r  r    s   13MNHL	| 	:8;B11'' $CcN3 ),	<  '+*.1537=A59@D8<-1$(,0/3&*5934$(#U
##U
 ''U
 !.	U

 u//0U
 "$u'8'8"9:U
   1 12U
 'uS$s)^'<=U
 )1U
 ))*U
 D>U
 $D>U
 'tnU
 d^U
 !!1!12U
  c5<</0!U
" \\#U
$ *+%U
& 
u22	3'U
 U
t < 444 4 {{	4
 4 4 4rA   r  )r[  ry  rh  r  r  )r.  )`r7  dataclassesr   typingr   r   r   r   r   r6   torch.nnr4   torch.nn.functionalr   r(  /transformers.models.llama4.configuration_llama4r	   activationsr   cache_utilsr   r   r   
generationr   integrations.hub_kernelsr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    configuration_llama4r"   r#   !torch.nn.attention.flex_attentionr$   integrations.flex_attentionr%   
get_loggerrQ   r5  Moduler'   rZ   rk   r   r   r   rU   r   r  r   r|   r
  r  r  rB  r[  ry  r  r  r  r  r,  rB  rD  rT  rZ  r\  re  rk  rr  r  r  rh  r  __all__rS  rA   r?   <module>r     s     ! 9 9     N ! B B ) C > B m m K F & h h @  !!;J			H	%		 B)BII )$!uxx !=		 =( _-)"BII )" .)"X		 >	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % %D %II%<<% 
% <<	%
 U\\*% % %4`)")) `)FFRYY FR $KO $K $KN \+ \ \~
 ?,j >m
- m
` $<; $< $<N;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:>)BII >)Bbii )ryy )XW
")) W
tbii (6")) 6,C
- C
Ll%:O l^	rA   