import math
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers.models.llama4.configuration_llama4 import Llama4VisionConfig

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, HybridChunkedCache
from ...generation import GenerationMixin
from ...integrations.hub_kernels import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    ModelOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    LossKwargs,
    auto_docstring,
    can_return_tuple,
    is_torch_flex_attn_available,
    logging,
)
from .configuration_llama4 import Llama4Config, Llama4TextConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class Llama4TextExperts(nn.Module):
    def __init__(self, config: Llama4TextConfig):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size
        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
        self.act_fn = ACT2FN[config.hidden_act]
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        """
        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
        gate_up = torch.bmm(hidden_states, self.gate_up_proj)
        gate, up = gate_up.chunk(2, dim=-1)
        next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
        next_states = next_states.view(-1, self.hidden_size)
        return next_states


class Llama4TextMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()
        if intermediate_size is None:
            intermediate_size = config.intermediate_size
        self.config = config
        self.gate_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.activation_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(down_proj)


class Llama4TextL2Norm(torch.nn.Module):
    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.eps = eps

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        return self._norm(x.float()).type_as(x)

    def extra_repr(self):
        return f"eps={self.eps}"
class Llama4TextRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5):
        """
        Llama4RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


@use_kernel_forward_from_hub("Llama4TextMoe")
class Llama4TextMoe(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.experts = Llama4TextExperts(config)
        self.router = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
        self.shared_expert = Llama4TextMLP(config)

    def forward(self, hidden_states):
        batch, seq_len, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = self.router(hidden_states)
        tokens_per_expert = batch * seq_len

        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)
        # keep -inf for the non top-k experts so the sigmoid gate is ~0 for them
        router_scores = (
            torch.full_like(router_logits, float("-inf"))
            .scatter_(1, router_indices, router_top_value)
            .transpose(0, 1)
        )
        # index of every token, repeated once per expert
        router_indices = (
            torch.arange(tokens_per_expert, device=hidden_states.device).view(1, -1).expand(router_scores.size(0), -1)
        )
        router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype)

        router_indices = router_indices.reshape(-1, 1).expand(-1, hidden_dim)
        # gather the inputs corresponding to each expert, in expert order
        routed_in = torch.gather(input=hidden_states, dim=0, index=router_indices).to(hidden_states.device)
        routed_in = routed_in * router_scores.reshape(-1, 1)
        routed_out = self.experts(routed_in)
        out = self.shared_expert(hidden_states)
        # scatter-add the expert outputs back onto the shared-expert output
        out.scatter_add_(dim=0, index=router_indices, src=routed_out.view(-1, hidden_dim))
        return out, router_scores


class Llama4TextRotaryEmbedding(nn.Module):
    def __init__(self, config: Llama4TextConfig, device=None):
        super().__init__()
        self.rope_type = "llama3" if config.rope_scaling is not None else "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings
        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex (cos + i sin)
            freqs_cis = freqs_cis * self.attention_scaling
        return freqs_cis


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    xq_out = torch.view_as_real(xq_ * freqs_cis[:, :, None, :]).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis[:, :, None, :]).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


def vision_eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) / math.sqrt(module.head_dim)
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class Llama4TextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Llama4TextConfig, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attn_scale = config.attn_scale
        self.floor_scale = config.floor_scale
        self.attn_temperature_tuning = config.attn_temperature_tuning
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        self.use_rope = config.no_rope_layers[layer_idx]
        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        if self.config.use_qk_norm and self.use_rope:
            self.qk_norm = Llama4TextL2Norm(config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if self.use_rope:  # some layers skip RoPE entirely (NoPE layers for long context)
            query_states, key_states = apply_rotary_emb(
                query_states, key_states, position_embeddings.to(query_states.device)
            )

        if hasattr(self, "qk_norm"):  # only present when the config enables qk-norm
            query_states = self.qk_norm(query_states)
            key_states = self.qk_norm(key_states)

        # temperature tuning of the queries on the NoPE layers
        if self.attn_temperature_tuning and not self.use_rope:
            attn_scales = (
                torch.log(torch.floor((cache_position.float() + 1.0) / self.floor_scale) + 1.0) * self.attn_scale + 1.0
            )
            attn_scales = attn_scales.view((1, input_shape[-1], 1, 1)).expand((*input_shape, 1, 1))
            query_states = (query_states * attn_scales).to(query_states.dtype)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)

        if past_key_value is not None:
            # cache_position is needed to correctly place new entries in the (possibly chunked) cache
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Llama4TextDecoderLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Llama4TextAttention(config, layer_idx)
        self.use_chunked_attention = config.attention_chunk_size is not None and bool(config.no_rope_layers[layer_idx])
        self.is_moe_layer = layer_idx in config.moe_layers
        if self.is_moe_layer:  # dense and sparse MLP layers are interleaved
            self.feed_forward = Llama4TextMoe(config)
        else:
            self.feed_forward = Llama4TextMLP(config, intermediate_size=config.intermediate_size_mlp)

        self.input_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        chunk_causal_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # use the local (chunked) attention mask on the layers that attend within a chunk
        if self.use_chunked_attention and chunk_causal_mask is not None:
            attention_mask = chunk_causal_mask

        # Self Attention
        attention_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = residual + attention_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        if self.is_moe_layer:
            hidden_states, router_logits = hidden_states
        else:
            router_logits = None
        hidden_states = residual + hidden_states.view(residual.shape)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if output_router_logits:
            outputs += (router_logits,)
        return outputs


@auto_docstring
class Llama4PreTrainedModel(PreTrainedModel):
    config_class = Llama4Config
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.text_config.initializer_range
        )
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, Llama4TextRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Llama4TextExperts):
            module.gate_up_proj.data.normal_(mean=0.0, std=std)
            module.down_proj.data.normal_(mean=0.0, std=std)
        elif isinstance(module, Llama4VisionModel):
            module.class_embedding.data.normal_(std=module.scale)
            module.positional_embedding_vlm.data.normal_(std=module.scale)


@auto_docstring
class Llama4TextModel(Llama4PreTrainedModel):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "model"
    config_class = Llama4TextConfig

    def __init__(self, config: Llama4TextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Llama4TextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Llama4TextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids.to(self.embed_tokens.weight.device))

        if use_cache and past_key_values is None:
            # models with chunked attention need a hybrid cache (global + sliding chunk)
            if self.config.get_text_config().attention_chunk_size is not None:
                past_key_values = HybridChunkedCache(self.config, inputs_embeds.shape[0], inputs_embeds.shape[1])
            else:
                past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask, chunk_causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions, use_cache=use_cache
        )

        hidden_states = inputs_embeds
        freq_cis = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    chunk_causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    False,
                    use_cache,
                    cache_position,
                    freq_cis,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    chunk_causal_mask=chunk_causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=freq_cis,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    @torch.compiler.disable(recursive=False)  # the operations in this method are not compilable
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
        chunked_attention_mask=None,
        use_cache=True,
    ):
        # returns a (global_causal_mask, chunked_causal_mask) pair; flash-attention builds its own masks
        if self.config._attn_implementation == "flash_attention_2":
            return None, None
        if self.config._attn_implementation not in ["sdpa", "flex_attention", "eager"]:
            return None, None

        sequence_length = input_tensor.shape[1]
        attention_chunk_size = self.config.attention_chunk_size
        using_chunked_attention = attention_chunk_size is not None
        first_cache_position = cache_position[0]
        full_cache_length = (
            past_key_values.get_max_cache_shape() if past_key_values is not None else None
        ) or (attention_mask.shape[-1] if attention_mask is not None else sequence_length)

        if using_chunked_attention:
            # the local window never needs to be longer than one chunk plus the current query block
            cond1 = first_cache_position >= attention_chunk_size
            cond2 = (first_cache_position < attention_chunk_size) & (
                first_cache_position + sequence_length > attention_chunk_size
            )
            key_length = (
                torch.where(
                    cond1,
                    attention_chunk_size + sequence_length - 1,
                    torch.where(cond2, first_cache_position + sequence_length, attention_chunk_size),
                )
                if use_cache
                else full_cache_length
            )

        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                if using_chunked_attention:
                    offsets = (first_cache_position, max(first_cache_position - attention_chunk_size + 1, 0))
                    chunked_attention_mask = make_flex_block_causal_mask(
                        attention_mask, attention_chunk_size, sequence_length, key_length, offsets=offsets
                    )
                attention_mask = make_flex_block_causal_mask(
                    attention_mask,
                    query_length=sequence_length,
                    key_length=full_cache_length,
                    offsets=(first_cache_position, 0),
                )
            return attention_mask, chunked_attention_mask

        dtype, device = input_tensor.dtype, input_tensor.device
        target_length = max(full_cache_length, attention_chunk_size) if using_chunked_attention else full_cache_length
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if using_chunked_attention and full_cache_length > attention_chunk_size:
            # build the local (chunked) mask over the cache window that is still visible;
            # the upstream implementation additionally pads/slices this window for partially filled chunks
            start_idx = max(first_cache_position - attention_chunk_size + 1, 0)
            end_idx = start_idx + key_length
            chunked_attention_mask = self.create_chunked_attention_mask(
                attention_chunk_size, start=start_idx, end=end_idx, device=device
            )
            if attention_mask is not None:
                local_attention_mask = attention_mask[:, start_idx:end_idx]
                if local_attention_mask.shape[-1] < chunked_attention_mask.shape[-1]:
                    local_attention_mask = nn.functional.pad(
                        local_attention_mask, (0, chunked_attention_mask.shape[-1] - local_attention_mask.shape[-1])
                    )
                chunked_attention_mask = (
                    chunked_attention_mask[None, None, :, :] * local_attention_mask[:, None, None, :]
                )
            else:
                chunked_attention_mask = chunked_attention_mask[None, None, :, :].expand(
                    input_tensor.shape[0], 1, -1, -1
                )
            if self.config._attn_implementation == "eager":
                min_dtype = torch.finfo(dtype).min
                chunked_attention_mask = torch.where(chunked_attention_mask == 0, min_dtype, 0.0).to(dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # attend to all tokens in fully-masked rows, required by memory-efficient SDPA kernels
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
        elif self.config._attn_implementation == "sdpa" and AttentionMaskConverter._ignore_causal_mask_sdpa(
            attention_mask,
            inputs_embeds=input_tensor,
            past_key_values_length=first_cache_position,
            is_training=self.training,
        ):
            causal_mask = None

        return causal_mask, chunked_attention_mask

    def create_chunked_attention_mask(
        self, attention_chunk_size: int, start: int, end: int, device: torch.device
    ) -> torch.Tensor:
        """
        Generate the following:

        'What'      :  0 ■ ⬚ ⬚ ⬚ ⬚ ⬚    |
        '▁is'       :  1 ■ ■ ⬚ ⬚ ⬚ ⬚     |
        '▁ch'       :  2 ■ ■ ■ ⬚ ⬚ ⬚     |
        'unked'     :  3 ⬚ ⬚ ⬚ ■ ⬚ ⬚    |
        '▁attention':  4 ⬚ ⬚ ⬚ ■ ■ ⬚    |
        '?'         :  5 ⬚ ⬚ ⬚ ■ ■ ■     |

        If the chunk size is 3.
        This can just be applied over the already created attention mask
        """
        arange_vector = torch.arange(start, end, device=device)
        block_pos = torch.abs(
            arange_vector.unsqueeze(0) // attention_chunk_size - arange_vector.unsqueeze(1) // attention_chunk_size
        )
        token_pos = arange_vector.unsqueeze(0) - arange_vector.unsqueeze(1)
        mask = (block_pos == 0) & (token_pos <= 0)
        return mask.to(device)
    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # the mask already comes in an inverted, additive 4D form and needs no further processing
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring
class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "language_model"
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    config_class = Llama4TextConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = Llama4TextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        # only compute the logits that are actually needed (e.g. the last token during generation)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@dataclass
class Llama4CausalLMOutputWithPast(ModelOutput):
    """
    Base class for Llava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class Llama4VisionMLP2(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.fc1 = nn.Linear(self.hidden_size, config.projector_input_dim, bias=False)
        self.fc2 = nn.Linear(config.projector_input_dim, config.projector_output_dim, bias=False)
        self.activation_fn = nn.GELU()
        self.dropout = config.projector_dropout

    def forward(self, hidden_states):
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        return self.activation_fn(self.fc2(hidden_states))


class Llama4MultiModalProjector(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(
            config.vision_config.vision_output_dim,
            config.text_config.hidden_size,
            bias=False,
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        return hidden_states


def pixel_shuffle(input_tensor, shuffle_ratio):
    # input_tensor: [batch_size, num_patches, channels]
    batch_size, num_patches, channels = input_tensor.shape
    patch_size = int(math.sqrt(num_patches))

    input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
    batch_size, height, width, channels = input_tensor.size()

    reshaped_tensor = input_tensor.view(batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio))
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()

    reshaped_tensor = reshaped_tensor.view(
        batch_size, int(height * shuffle_ratio), int(width * shuffle_ratio), int(channels / (shuffle_ratio**2))
    )
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()

    output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
    return output_tensor


class Llama4VisionPixelShuffleMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
        self.inner_dim = int(config.projector_input_dim // (self.pixel_shuffle_ratio**2))
        self.output_dim = config.projector_output_dim
        self.mlp = Llama4VisionMLP2(config)

    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
        encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
        return self.mlp(encoded_patches)


def reshape_for_broadcast(freqs_ci: torch.Tensor, query: torch.Tensor):
    ndim = query.ndim
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(query.shape)]
    return freqs_ci.view(*shape)


def vision_apply_rotary_emb(
    query: torch.Tensor,
    key: torch.Tensor,
    freqs_ci: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
    key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
    freqs_ci = reshape_for_broadcast(freqs_ci=freqs_ci, query=query_)
    freqs_ci = freqs_ci.to(query_.device)
    query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
    key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
    return query_out.type_as(query), key_out.type_as(key)


class Llama4VisionAttention(nn.Module):
    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.num_key_value_groups = 1
        self.attention_dropout = config.attention_dropout
        self.scaling = self.head_dim**-0.5

        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.embed_dim, bias=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        query_states, key_states = vision_apply_rotary_emb(query_states, key_states, freqs_ci=freqs_ci)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attention_interface: Callable = vision_eager_attention_forward
        if self.config._attn_implementation not in ["eager", "flex_attention"]:
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            None,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            is_causal=False,  # vision attention is full / bidirectional
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Llama4VisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = nn.GELU()
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=True)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Llama4VisionEncoderLayer(nn.Module):
    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Llama4VisionAttention(config)
        self.mlp = Llama4VisionMLP(config)

        self.input_layernorm = nn.LayerNorm(config.hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        hidden_state: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
    ):
        # Self Attention
        residual = hidden_state
        hidden_state = self.input_layernorm(hidden_state)
        hidden_state, attn_weights = self.self_attn(hidden_state, freqs_ci=freqs_ci, attention_mask=attention_mask)
        hidden_state = residual + hidden_state

        # Feed forward
        residual = hidden_state
        hidden_state = self.post_attention_layernorm(hidden_state)
        hidden_state = self.mlp(hidden_state)
        hidden_state = residual + hidden_state

        outputs = (hidden_state,)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs
class Llama4VisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    """

    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Llama4VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    freqs_ci,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_state=hidden_states,
                    attention_mask=attention_mask,
                    output_attentions=output_attentions,
                    freqs_ci=freqs_ci,
                )

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

            hidden_states = layer_outputs[0]

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Llama4UnfoldConvolution(nn.Module):
    def __init__(self, config):
        super().__init__()
        kernel_size = config.patch_size
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size)
        self.linear = nn.Linear(
            config.num_channels * kernel_size[0] * kernel_size[1],
            config.hidden_size,
            bias=False,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.unfold(hidden_states)
        hidden_states = hidden_states.permute(0, 2, 1)
        hidden_states = self.linear(hidden_states)
        return hidden_states


class Llama4VisionRotaryEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        idx = config.image_size // config.patch_size
        img_idx = torch.arange(idx**2, dtype=torch.int32).reshape(idx**2, 1)
        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
        img_idx[-1, -1] = -2  # the class-token slot gets a sentinel id
        frequencies_x = img_idx % idx  # x coordinate of each patch in the 2D grid
        frequencies_y = img_idx // idx  # y coordinate of each patch in the 2D grid
        freq_dim = config.hidden_size // config.num_attention_heads // 2
        rope_freq = 1.0 / (config.rope_theta ** (torch.arange(0, freq_dim, 2)[: (freq_dim // 2)].float() / freq_dim))
        freqs_x = ((frequencies_x + 1)[..., None] * rope_freq[None, None, :]).repeat_interleave(2, dim=-1)
        freqs_y = ((frequencies_y + 1)[..., None] * rope_freq[None, None, :]).repeat_interleave(2, dim=-1)
        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
        freq_cis = torch.view_as_complex(torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1))
        self.freqs_ci = freq_cis

    def forward(self, hidden_states):
        return self.freqs_ci.to(hidden_states.device)


class Llama4VisionModel(Llama4PreTrainedModel):
    base_model_prefix = "vision_model"
    _no_split_modules = ["Llama4VisionEncoderLayer"]
    config_class = Llama4VisionConfig

    def __init__(self, config: Llama4VisionConfig):
        super().__init__(config)
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.hidden_size = config.hidden_size
        self.num_channels = config.num_channels

        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
        self.scale = config.hidden_size**-0.5

        self.patch_embedding = Llama4UnfoldConvolution(config)
        self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
        self.positional_embedding_vlm = nn.Parameter(self.scale * torch.randn(self.num_patches, self.hidden_size))

        self.rotary_embedding = Llama4VisionRotaryEmbedding(config)

        # layer norms
        self.layernorm_pre = nn.LayerNorm(self.hidden_size)
        self.layernorm_post = nn.LayerNorm(self.hidden_size)

        # encoders
        self.model = Llama4VisionEncoder(config)
        self.vision_adapter = Llama4VisionPixelShuffleMLP(config)
        self.post_init()

    def get_input_embeddings(self):
        """
        This function is used to fetch the first embedding layer to activate grads on inputs.
        """
        return self.patch_embedding

    def forward(
        self,
        pixel_values: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]:
        """

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size_times_num_tiles, num_channels, height, width = pixel_values.shape
        num_concurrent_media = 1
        num_chunks = 1
        hidden_state = self.patch_embedding(pixel_values)
        _, num_patches, hidden_dim = hidden_state.shape

        # Add cls token
        hidden_state = hidden_state.reshape(
            batch_size_times_num_tiles * num_concurrent_media * num_chunks, num_patches, hidden_dim
        )
        class_embedding = self.class_embedding.expand(hidden_state.shape[0], 1, hidden_state.shape[-1])
        hidden_state = torch.cat([hidden_state, class_embedding], dim=1)
        num_patches += 1

        # Add positional embeddings
        hidden_state = hidden_state.reshape(
            batch_size_times_num_tiles * num_concurrent_media, num_chunks, num_patches, hidden_dim
        )
        positional_embedding = self.positional_embedding_vlm.to(dtype=hidden_state.dtype, device=hidden_state.device)
        hidden_state = hidden_state + positional_embedding

        hidden_state = self.layernorm_pre(hidden_state)

        hidden_state = hidden_state.view(batch_size_times_num_tiles, -1, hidden_dim)
        freqs_ci = self.rotary_embedding(pixel_values)

        output = self.model(
            hidden_state,
            attention_mask=None,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            freqs_ci=freqs_ci,
        )

        hidden_state = output.last_hidden_state
        hidden_state = self.layernorm_post(hidden_state)

        # drop the class token, then project with pixel shuffle + MLP adapter
        hidden_state = hidden_state[:, :-1, :]
        hidden_state = self.vision_adapter(hidden_state)

        hidden_states = output.hidden_states if output_hidden_states else None
        attentions = output[2] if output_attentions else None

        if not return_dict:
            return tuple(v for v in [hidden_state, hidden_states, attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_state,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@auto_docstring
class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
    _no_split_modules = ["Llama4TextDecoderLayer", "Llama4VisionEncoderLayer"]
    _tp_plan = {}
    config_class = Llama4Config

    def __init__(self, config: Llama4Config):
        super().__init__(config)
        self.vision_model = Llama4VisionModel(config.vision_config)
        self.multi_modal_projector = Llama4MultiModalProjector(config)
        self.language_model = Llama4ForCausalLM(config.text_config)
        self.vocab_size = config.text_config.vocab_size
        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Union[int, List[int]],
        vision_feature_select_strategy: str,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and apply a projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, List[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        """
        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(f"Unexpected select feature strategy: {vision_feature_select_strategy}")
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        image_outputs = self.vision_model(pixel_values, output_hidden_states=False, **kwargs)
        hidden_state = image_outputs.last_hidden_state
        return hidden_state
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: torch.Tensor = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, Llama4CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                image_sizes=image_sizes,
            )
            original_inputs_embeds_shape = inputs_embeds.shape

            vision_flat = image_features.view(-1, image_features.size(-1))
            projected_vision_flat = self.multi_modal_projector(vision_flat)

            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            final_mask = special_image_mask.to(inputs_embeds.device)
            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1))

            final_mask_1d = final_mask[..., 0].reshape(-1)
            num_tokens_to_fill = final_mask_1d.sum()

            if num_tokens_to_fill != projected_vision_flat.size(0):
                raise ValueError(
                    f"Mismatch: final_mask wants {num_tokens_to_fill} embeddings, "
                    f"but multi_modal_projector returned {projected_vision_flat.size(0)}"
                )

            expanded_mask = final_mask_1d.unsqueeze(-1).expand(-1, inputs_embeds.size(-1))
            inputs_embeds = inputs_embeds.masked_scatter(expanded_mask, projected_vision_flat)
            inputs_embeds = inputs_embeds.view(original_inputs_embeds_shape)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        logits = outputs[0]

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            if attention_mask is not None:
                # use the 2D input attention mask to keep only non-padded positions in the loss
                shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
                shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
            else:
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
            )

        return Llama4CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- pixel values are only forwarded on the prefill step; during cached
        # decoding the image tokens are already merged into the cache.
        model_inputs = self.language_model.prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values

        return model_inputs
    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # the mask already comes in an inverted, additive 4D form and needs no further processing
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask


__all__ = [
    "Llama4PreTrainedModel",
    "Llama4TextModel",
    "Llama4ForCausalLM",
    "Llama4VisionModel",
    "Llama4ForConditionalGeneration",
]