o
    ZhT                    @   s  d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 e* rd dl3m4Z4 ddl5m6Z6 e+7e8Z9eG dd de'Z:e(dde(G dd de"Z;edG dd  d e
j<Z=G d!d" d"e
j<Z>G d#d$ d$e
j<Z?d%d& Z@dMd'd(ZAd)ejBd*eCd+ejBfd,d-ZD	.dNd/e
j<d0ejBd1ejBd2ejBd3eejB d4eEd5eEfd6d7ZFG d8d9 d9e
j<ZGG d:d; d;eZHe(G d<d= d=e;ZIG d>d? d?e
j<ZJG d@dA dAee&ZKe(dBdG dCdD dDe;eZLG dEdF dFe
j<ZMe(G dGdH dHe;ZNe(dIdG dJdK dKe;e2ZOg dLZPdS )O    )	dataclass)CallableListOptionalTupleUnionN   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsModelOutputauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )	AutoModel   )	CsmConfigCsmDepthDecoderConfig)CsmGenerationMixin)	BlockMask)make_flex_block_causal_maskc                   @   s   e Zd ZU dZdZeej ed< dZ	ejed< dZ
eeeej   ed< dZeeejdf  ed< dZeeejdf  ed< dZeej ed	< dZejed
< dZeeeej   ed< dZeeejdf  ed< dZeeejdf  ed< dZeej ed< dS )CsmOutputWithPastaf  
    Base class for the model autoregressive outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction) of the depth decoder model.
        depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
        depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
        depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
        backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction) of the backbone model.
    Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss)__name__
__module____qualname____doc__r'   r   torchFloatTensor__annotations__r(   r)   r   r*   r+   r,   r-   r.   r/   r0   r1    r9   r9   S/var/www/auris/lib/python3.10/site-packages/transformers/models/csm/modeling_csm.pyr&   6   s   
 ,r&   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    )Zcustom_introc                   @   sD   e Zd ZeZdZdZdgZdgZdZ	dZ
dZdZdZdZdd ZdS )CsmPreTrainedModelmodelTCsmDecoderLayerr)   c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tra|j}t|d D ]}|jj| jd|d qQd S t|tro|jjd d S d S )N        )meanstdr    g      ?)configZinitializer_range
isinstancennLinearweightdataZnormal_biasZzero_	Embeddingpadding_idxCsmCodebooksHeadnum_codebooksrange
CsmRMSNormZfill_)selfmoduler@   rK   ir9   r9   r:   _init_weights   s&   



z CsmPreTrainedModel._init_weightsN)r2   r3   r4   r!   config_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attn_2Z_supports_sdpaZ_supports_cache_classZ_supports_quantized_cacheZ_supports_static_cacheZ_supports_attention_backendrQ   r9   r9   r9   r:   r;   q   s    r;   ZRMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	rM   ư>c                    s&   t    tt|| _|| _dS )z9
        CsmRMSNorm is equivalent to T5LayerNorm
        N)super__init__rC   	Parameterr6   onesrE   variance_epsilon)rN   hidden_sizeeps	__class__r9   r:   rU      s   

zCsmRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   T)Zkeepdim)	dtypetor6   float32powr?   ZrsqrtrX   rE   )rN   r*   Zinput_dtypeZvariancer9   r9   r:   forward   s
   zCsmRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerE   shaperX   rN   r9   r9   r:   
extra_repr   s   zCsmRMSNorm.extra_repr)rS   )r2   r3   r4   rU   rb   rf   __classcell__r9   r9   r[   r:   rM      s    rM   c                       s8   e Zd Zddef fddZe edd Z  Z	S )CsmRotaryEmbeddingNrA   c                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultinv_freqF
persistent)rT   rU   hasattrri   getrj   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenrA   r   Zrope_init_fnattention_scalingregister_bufferrm   Zoriginal_inv_freq)rN   rA   devicerm   r[   r9   r:   rU      s   
zCsmRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r]   r    ZmpscpuF)device_typeenabledr   dim)r^   )rm   floatexpandrd   r_   rt   rB   rk   strr6   Zautocast	transposecatcosrr   sinr^   )
rN   xposition_idsZinv_freq_expandedZposition_ids_expandedrv   ZfreqsZembr   r   r9   r9   r:   rb      s   0&zCsmRotaryEmbedding.forwardN)
r2   r3   r4   r!   rU   r6   Zno_gradr   rb   rg   r9   r9   r[   r:   rh      s
    rh   c                       $   e Zd Z fddZdd Z  ZS )CsmMLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )NrG   )rT   rU   rA   rY   Zintermediate_sizerC   rD   Zmlp_bias	gate_projup_proj	down_projr	   Z
hidden_actact_fnrN   rA   r[   r9   r:   rU      s   
zCsmMLP.__init__c                 C   s$   |  | | || | }|S r   )r   r   r   r   )rN   r   r   r9   r9   r:   rb      s    zCsmMLP.forwardr2   r3   r4   rU   rb   rg   r9   r9   r[   r:   r      s    
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr]   r   rx   )rd   r6   r~   )r   x1Zx2r9   r9   r:   rotate_half   s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   Zunsqueeze_dimZq_embedZk_embedr9   r9   r:   apply_rotary_pos_emb   s
   

r   r*   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rd   r{   reshape)r*   r   batchnum_key_value_headsslenhead_dimr9   r9   r:   	repeat_kv  s
   0r   r>   rO   querykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   r]   )ry   r^   )ptrainingr    )r   num_key_value_groupsr6   matmulr}   rd   rC   
functionalZsoftmaxr`   r_   r^   r   r   
contiguous)rO   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr9   r9   r:   eager_attention_forward  s   
&r   c                       s   e Zd ZdZdedef fddZ		ddejde	ejejf d	e
ej d
e
e de
ej dee de	eje
ej e
e	ej  f fddZ  ZS )CsmAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrA   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr   g      Tr   )rT   rU   rA   r   getattrrY   Znum_attention_headsr   r   r   r   attention_dropoutZ	is_causalrC   rD   Zattention_biasq_projk_projv_projo_projrN   rA   r   r[   r9   r:   rU   *  s(   
zCsmAttention.__init__Nr*   position_embeddingsr   past_key_valuecache_positionr   r   c                 K   sH  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkrw| jjdkrq|ddrqtd	 nt| jj }|| |	|
||f| jsd
n| j| jd|\}}|jg |dR   }| |}||fS )Nr]   r    r   )r   r   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r>   )r   r   )rd   r   r   viewr}   r   r   r   updater   r   rA   _attn_implementationrq   loggerwarning_oncer   r   r   r   r   r   r   )rN   r*   r   r   r   r   r   Zinput_shapeZhidden_shapeZquery_statesr   r   r   r   Zcache_kwargsZattention_interfacer   r   r9   r9   r:   rb   A  s@   	

zCsmAttention.forward)NN)r2   r3   r4   r5   r!   intrU   r6   Tensorr   r   r
   
LongTensorr   r   rb   rg   r9   r9   r[   r:   r   '  s(    r   c                       s   e Zd Zdedef fddZ							ddejdeej d	eej	 d
ee
 dee dee deej	 deeejejf  dee deejeeejejf  f fddZ  ZS )r=   rA   r   c                    sR   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
d S )N)rA   r   rZ   )rT   rU   rY   r   	self_attnr   mlprM   rms_norm_epsinput_layernormpost_attention_layernormr   r[   r9   r:   rU   u  s   

zCsmDecoderLayer.__init__NFr*   r   r   r   r   	use_cacher   r   r   r   c	                 K   st   |}
|  |}| jd||||||||d|	\}}|
| }|}
| |}| |}|
| }|f}|r8||f7 }|S )N)r*   r   r   r   r   r   r   r   r9   )r   r   r   r   )rN   r*   r   r   r   r   r   r   r   r   ZresidualZself_attn_weightsoutputsr9   r9   r:   rb     s.   
	



zCsmDecoderLayer.forward)NNNFFNN)r2   r3   r4   r!   r   rU   r6   r   r   r   r
   boolr   r   r   r7   rb   rg   r9   r9   r[   r:   r=   t  s<    	
r=   c                       s   e Zd ZeZ fddZdd Zdd Zee											d!de
jd	ee
j d
ee
j dee
j dee dee
j dee dee dee dee
j dee deeef fddZ	d"d
ee
jdf de
jde
jdedef
ddZed
e
jdedede
jde
jdefdd Z  ZS )#CsmDepthDecoderModelc                    s   t     j| _ j| _t j j  j| _	t
 fddt jD | _t j jd| _t d| _d| _tj j jdd| _|   d S )Nc                       g | ]}t  |qS r9   r=   .0r   rA   r9   r:   
<listcomp>      z1CsmDepthDecoderModel.__init__.<locals>.<listcomp>r   r   Fr   )rT   rU   pad_token_idrI   
vocab_sizerC   rH   rK   Zbackbone_hidden_sizeembed_tokens
ModuleListrL   num_hidden_layerslayersrM   rY   r   normrh   
rotary_embgradient_checkpointingrD   inputs_embeds_projector	post_initr   r[   r   r:   rU     s   zCsmDepthDecoderModel.__init__c                 C      | j S r   r   re   r9   r9   r:   get_input_embeddings     z)CsmDepthDecoderModel.get_input_embeddingsc                 C   
   || _ d S r   r   rN   r   r9   r9   r:   set_input_embeddings     
z)CsmDepthDecoderModel.set_input_embeddingsN	input_idsbackbone_last_hidden_stater   r   r)   inputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr   c                 K   sX  |durt j std d}|dur|n| jj}|	dur |	n| jj}	|dur*|n| jj}|du |duA r:t	d| j
rI| jrI|rItd d}|rR|du rRt }|
du r|dur^| nd}|duri|jd n|jd }|duru|jn|j}t j||| |d}
|du rt j|
d dd	}|| j }| || }|
d dk}|dur||dddf< nt j s|rtd
 | |}| |||
||}|}|
d}| ||}|	rdnd}|rdnd}| jd| jj D ]'}|	r||f7 }||f||||||
|d|}|d }|r||d f7 }q| |}|	r||f7 }t||r&|nd||dS )aJ  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        NzCustom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines position_ids from `cache_position` and as it requires them to be identical across the batch, the provided position_ids will be ignored.z;You must specify exactly one of input_ids or inputs_embeds.X`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r    rt   )minzvWhen the first codebook token is provided, `backbone_last_hidden_state` should also be provided for correct inference.r9   r   r   r   r   r   r   r   Zlast_hidden_stater)   r*   r+   )r6   compilerZis_compilingr   r   rA   r   r   r   
ValueErrorr   r   r   get_seq_lengthrd   rt   arangeclampr   r   warningr   _update_causal_maskr   r   r   r   r   r   )rN   r   r   r   r   r)   r   r   r   r   r   r   past_seen_tokensZinputs_seq_lengthrt   codebook_idxsoffsetZinput_ids_are_first_codebookr   r*   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr9   r9   r:   rb     s   




	

zCsmDepthDecoderModel.forwardFr$   input_tensorc                 C   :  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S NZflash_attention_2r>   Zflex_attentionr   Fr   )r   Zpast_key_values_lengthZis_trainingr    r]   )sequence_lengthtarget_lengthr^   r   
batch_size)cudaZxpuZnpurA   r   anyrB   r6   r   r%   r   Zis_compileabler   Z_ignore_causal_mask_sdpar   r^   rd   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrt   rk   finfor   Z_unmask_unattendedrN   r   r  r   r)   r   r   Zusing_compilable_cacher^   r  r  r   	min_dtyper9   r9   r:   r   7  T   




z(CsmDepthDecoderModel._update_causal_maskr  r  r^   r  c                 K   D  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S 	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )Z
fill_valuer^   rt   r    )Zdiagonalr   r]   r   ry   r6   r  r   fullrt   Ztriur   r   r{   clonerd   r_   Zmasked_fillr   r  r  r^   r   r  r   r   r  Zmask_lengthZpadding_maskr9   r9   r:   r  {  ,    $
6  zJCsmDepthDecoderModel._prepare_4d_causal_attention_mask_with_cache_position)
NNNNNNNNNNF)r2   r3   r4   r"   rR   rU   r   r   r   r   r6   r   r   r7   r   r
   r   r   r   r   r   r   rb   r   staticmethodr   r^   r  rg   r9   r9   r[   r:   r     s    	

x
Dr   c                       s&   e Zd Z fddZdddZ  ZS )rJ   c                    s0   t    || _tt| jd ||| _d S Nr    )rT   rU   rK   rC   rV   r6   emptyrE   )rN   rY   rK   r   r[   r9   r:   rU     s   
 zCsmCodebooksHead.__init__Nc                    sf   |d u rj d }| jt|  n	|d }| j|   fddt j d D tjddS )Nr    c              	      s2   g | ]}t jd d |d d f  | jqS r   )rC   r   ZlinearT)r   Zcodebook_idxZcodebook_weightr*   r9   r:   r     s    $z,CsmCodebooksHead.forward.<locals>.<listcomp>r   rx   )rd   rE   r6   r   rL   stack)rN   r*   r   Z
seq_lengthr   r9   r  r:   rb     s   

zCsmCodebooksHead.forwardr   r   r9   r9   r[   r:   rJ     s    rJ   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r2   r3   r4   r9   r9   r9   r:   r    s    r  a$  
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    c                !       sJ  e Zd ZdZdZdZ fddZdd Zdd Zdd	 Z	d
d Z
ee												ddejdeej deej deej deeeeej f  deej deej dee dee dee deej deeejf dee deeef fddZ				d dejdee deej deej deej f
 fddZ  ZS )!CsmDepthDecoderForCausalLMNc                    s>   t  | t|| _|j| _t|j|j|j| _| 	  d S r   )
rT   rU   r   r<   r   rJ   rY   rK   codebooks_headr   r   r[   r9   r:   rU     s
   
z#CsmDepthDecoderForCausalLM.__init__c                 C      | j jS r   r<   r   re   r9   r9   r:   r        z/CsmDepthDecoderForCausalLM.get_input_embeddingsc                 C      || j _d S r   r#  r   r9   r9   r:   r        z/CsmDepthDecoderForCausalLM.set_input_embeddingsc                 C   r   r   r<   )rN   decoderr9   r9   r:   set_decoder  r   z&CsmDepthDecoderForCausalLM.set_decoderc                 C   r   r   r'  re   r9   r9   r:   get_decoder  r   z&CsmDepthDecoderForCausalLM.get_decoderr   r   r   r   r   r)   r   labelsr   r   r   r   logits_to_keepr   r   c                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
| jd||||||||	|
|d
|}|d }t|trA|dkr:tdd}n	t| d}n|}| |dd|ddf |durW|| nd}| }d}|dur}|dddf  }| j	d|d| j j
|d|}t|||j|j|jdS )	a  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)
r   r   r   r   r)   r   r   r   r   r   r   r    .)r(   r+  r   shift_labels)r'   r(   r)   r*   r+   r9   )rA   r   r   r<   rB   r   slicer!  r   loss_functionr   r   r)   r*   r+   )rN   r   r   r   r   r)   r   r+  r   r   r   r   r,  r   r   r*   slice_indicesr(   r'   r-  r9   r9   r:   rb     sT   
&z"CsmDepthDecoderForCausalLM.forwardc           	         sH   t  j|||||fi |}|d d dk}|s|d |d |S )Nr   r   r   r   )rT   prepare_inputs_for_generationpop)	rN   r   r)   r   r   r   r   model_inputsZis_first_generation_stepr[   r9   r:   r1  <  s   	


z8CsmDepthDecoderForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNr   NNNN)r2   r3   r4   _tied_weights_keysZ_tp_planZ_pp_planrU   r   r   r)  r*  r   r   r6   r   r   r7   r   r   r
   r   r   r   r   r  r   r   rb   r1  rg   r9   r9   r[   r:   r     s    		

Or   c                       r   )CsmBackboneModelEmbeddingsc                    sD   t    t|j|j |j| _| jdt	
|j|j dd d S )Naudio_tokens_offsetsFrn   )rT   rU   rC   rH   rK   r   rY   embed_audio_tokensrs   r6   r   r   r[   r9   r:   rU   T  s
   

z#CsmBackboneModelEmbeddings.__init__c                 C   s    |  || j }|jdd}|S )Nr   rx   )r8  r7  sum)rN   r   Zinput_embedsr9   r9   r:   rb   [  s   z"CsmBackboneModelEmbeddings.forwardr   r9   r9   r[   r:   r6  S  s    r6  c                       s  e Zd Z fddZdd Zdd Zee									d dee	j
 d	ee	j d
ee	j
 dee dee	j dee dee dee dee	j
 dee defddZ	d!d	ee	jdf de	jde	jdedef
ddZed	e	jdedede	jde	jdefddZ  ZS )"CsmBackboneModelc                    sv   t     j| _ j| _t | _t fddt	 j
D | _t j jd| _t d| _d| _|   d S )Nc                    r   r9   r   r   r   r9   r:   r   i  r   z-CsmBackboneModel.__init__.<locals>.<listcomp>r   r   F)rT   rU   r   rI   r   r6  r   rC   r   rL   r   r   rM   rY   r   r   rh   r   r   r   r   r[   r   r:   rU   c  s   
zCsmBackboneModel.__init__c                 C   r   r   r   re   r9   r9   r:   r   r  r   z%CsmBackboneModel.get_input_embeddingsc                 C   r   r   r   r   r9   r9   r:   r   u  r   z%CsmBackboneModel.set_input_embeddingsNr   r   r   r)   r   r   r   r   r   r   r   c
                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du |duA r*td| jr9| jr9|r9td d}t	|t
dtfsFtd|du rO| |}|rX|du rXt }|	du rt|durd| nd}tj|||jd  |jd}	|du r}|	d}| |||	||}|}| ||}|rd	nd}|rd	nd}| jd| j j D ]&}|r||f7 }||f||||||	|d
|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd||dS )a&  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r    r   r9   r   r   )rA   r   r   r   r   r   r   r   r   rB   rk   r
   r   r   r   r6   r   rd   rt   r   r   r   r   r   r   r   )rN   r   r   r   r)   r   r   r   r   r   r   r   r   r*   r   r   r   r   r  r9   r9   r:   rb   x  sx   



	


zCsmBackboneModel.forwardFr$   r  c                 C   r  r  r	  r  r9   r9   r:   r     r  z$CsmBackboneModel._update_causal_maskr  r  r^   r  c                 K   r  r  r  r  r9   r9   r:   r  (  r  zFCsmBackboneModel._prepare_4d_causal_attention_mask_with_cache_position)	NNNNNNNNNr  )r2   r3   r4   rU   r   r   r   r   r   r6   r   r   r
   r7   r   r   r   r   rb   r   r   r  r   r^   r  rg   r9   r9   r[   r:   r:  a  s    	
p
Dr:  z
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    c                #       s  e Zd ZddgZ fddZdd Zdd Zd	d
 Zdd Zdd Z	e
 fddZ fddZ				d0deej deej deej deej deej f
ddZ				d0dejdee deej deej deej f
 fdd Zee													!d1dejdeej deej deej d"eej deeeeej f  deej deej d#ee d$ee d%ee deej d&eeejf d'ee deeef fd(d)Ze dejd*ed+ed,ej!dejd-efd.d/Z"  Z#S )2CsmForConditionalGenerationz5backbone_model.embed_tokens.embed_audio_tokens.weightz'depth_decoder.model.embed_tokens.weightc                    sp   t  | |j| _tj|j|jdd| _t|j|j| _	t
|| _t|j| _t|j| _|   d S )NFr   )rT   rU   r   rC   rD   rY   lm_headrH   Ztext_vocab_sizeembed_text_tokensr:  Z_from_configbackbone_modelr   Zdepth_decoder_configdepth_decoderr   Zfrom_configZcodec_configcodec_modelr   r   r[   r9   r:   rU   k  s   z$CsmForConditionalGeneration.__init__c                 C   r"  r   r>  r   re   r9   r9   r:   r   w  r$  z0CsmForConditionalGeneration.get_input_embeddingsc                 C   r%  r   rA  r   r9   r9   r:   r   z  r&  z0CsmForConditionalGeneration.set_input_embeddingsc                 C   r   r   r<  re   r9   r9   r:   get_output_embeddings}  r   z1CsmForConditionalGeneration.get_output_embeddingsc                 C   r   r   rB  )rN   Znew_embeddingsr9   r9   r:   set_output_embeddings  r   z1CsmForConditionalGeneration.set_output_embeddingsc                 C   s(   | j jr| | jjj| jjj d S d S r   )rA   Ztie_codebooks_embeddingsZ_tie_or_clone_weightsr>  r   r8  r?  r<   re   r9   r9   r:   _tie_weights  s   z(CsmForConditionalGeneration._tie_weightsc                    s   | ddrt j|i |\}}n	t j|i |}d t  fddt|j D }t|jjddi| |D ]
}t	|j |  q?d|v rR||fS |S )NZoutput_loading_infoFdepth_decoder_c                    s(   i | ]\}}|  r|d  |qS r   )
startswith)r   attrr   prefix
prefix_lenr9   r:   
<dictcomp>  s    z?CsmForConditionalGeneration.from_pretrained.<locals>.<dictcomp>Z_from_model_config)
rq   rT   from_pretrainedlenvarsgeneration_configitemsr?  r   delattr)clsargsr   r<   Zloading_infodepth_decoder_attrsrH  r[   rI  r:   rM    s   z+CsmForConditionalGeneration.from_pretrainedc                    sV   d}| j j }|dd  | D ]\}}t| j|| | qt j|i | d S )NrF  Ztransformers_version)r?  rP  Zto_diff_dictr2  rQ  setattrrT   save_pretrained)rN   rT  r   rJ  rU  rH  r   r[   r9   r:   rW    s   z+CsmForConditionalGeneration.save_pretrainedNr   input_valuesinput_values_cutoffsr+  r   c                    s  |  |}|durtj|d}||dk  }||dk }tj| |jd	t
|d}||dk }g }t||D ]?\}	}
|
|
dk }
t|
jd d D ]+}|
| }|
|d  }|	d||f }| j|d}|jdd}||d  qPq=tdd	 |D  t fd
d|D }| j|}| jj}||k}| j|}|| ||< tjdd| jjf|jtjd| jj }| j|d}|| jjk}| |! d||< |dur|d dd| jj}|| ||< |dkj"dd}d||d |d ddf< |}||dS )a  
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retreive codebook token.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and position the target codebook tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        Nr    r   r   r   r]   r    .c                 s   s    | ]}|j d  V  qdS )r   N)rd   r   elr9   r9   r:   	<genexpr>  s    zQCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<genexpr>c                    s,   g | ]}t j|d d d  |jd   fqS )r   )rC   r   padrd   r[  Zmax_audio_framesr9   r:   r     s   , zRCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<listcomp>)rt   r^   iTas_tuple)r   r+  )#r=  rC   r   r^  diffr6   r   maxrt   r{   rN  r   ziprL   rd   r@  encodeZaudio_codesr}   appendr  Zget_audio_codes_maskrA   audio_token_idr>  r   rW   rK   longZcodebook_eos_token_idZsqueezeZaudio_eos_token_idrepeatr9  nonzero)rN   r   rX  rY  r+  r   Zaudio_lengthsZinput_values_maskZaudio_tokens_listZbatch_input_valuesZbatch_input_values_cutoffsrP   Z	start_idxZend_idxZaudio_batchZcodec_outputsZcodebook_idsZbatched_audio_token_idsZaudio_codes_maskrh  Zaudio_token_maskZaudio_embedsZaudio_eos_frame_idsZaudio_eos_embedsZaudio_eos_token_maskZlabels_expandedZ depth_decoder_ignore_frames_idxsr9   r_  r:   "_merge_input_ids_with_input_values  sV   



z>CsmForConditionalGeneration._merge_input_ids_with_input_valuesr)   r   r   r   c           	         s   t  jd	|||||d|}|d ur>|jdkr>|dd u r>| j||d|d|dd}||d |d d d |S )
N)r   r)   r   r   r   r   r   rX  rY  r+  )r   rX  rY  r+  )r   r+  r   r9   )rT   r1  ndimrq   rl  r   )	rN   r   r)   r   r   r   r   r3  merged_inputsr[   r9   r:   r1    s(   	 	z9CsmForConditionalGeneration.prepare_inputs_for_generationr   r   r   r   r   r,  r   c                 K   s  |
dur|
n| j j}
|dur|n| j j}|dur/|jdkr/| ||||}|d }|d }d}| jd||||||	|
||d	|}|d }t|trPt| dn|}| 	|dd|ddf }d}d}d}d}|dur|dddddf }| j
d||| j jd|}|ddddddf d	kjd
d }|| dd| j jd f }tjj|ddd}|jdd}||d |d d ddf }|| }| j|||	|
|d|d}|j}|| }t|||||j|j|j|dur|jnd|dur|jnd|dur|jnd|dur	|jdS ddS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
            If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
            where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
            the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
            Requires targeted `input_values` to be provided as audio tokens will be infered from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frames (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support another value than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "eustlb/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        ```Nr   r   r+  )	r   r   r   r)   r   r   r   r   r   r   )r(   r+  r   r    rb  r]   rx   .rZ  )r   Tr`  )r   r   r   r   r   Zreturn_dictr+  )r'   r1   r,   r(   r)   r*   r+   r-   r.   r/   r0   r9   )rA   r   r   rm  rl  r>  rB   r   r.  r<  r/  r   allrK   rC   r   r^  rk  r?  r'   r&   r)   r*   r+   r(   )rN   r   rX  r   rY  r   r)   r   r+  r   r   r   r   r,  r   rn  Zbackbone_outputsZbackbone_hidden_statesr0  Zbackbone_logitsr'   r1   r,   Zdepth_decoder_outputsZbackbone_labelsZ
train_maskZdepth_decoder_input_idsZ
train_idxsZbackbone_last_hidden_statesZdepth_decoder_labelsr9   r9   r:   rb     s   V

(
z#CsmForConditionalGeneration.forwardr  r  r^   r  c                 K   r  r  r  r  r9   r9   r:   r    r  zQCsmForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_positionr4  )NNNNNNNNNNNNr   )$r2   r3   r4   r5  rU   r   r   rC  rD  rE  classmethodrM  rW  r   r6   r   rl  r   r
   r7   r1  r   r   r   r   r   r   r   r  r   r&   rb   r  r^   r  rg   r9   r9   r[   r:   r;  `  s    
S	

 )r;  )r;   r:  r   r   r;  r  )r>   )Qdataclassesr   typingr   r   r   r   r   r6   Ztorch.nnrC   Zactivationsr	   Zcache_utilsr
   r   Z
generationr   Zintegrationsr   Zmodeling_attn_mask_utilsr   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   r   autor   Zconfiguration_csmr!   r"   Zgeneration_csmr#   Z!torch.nn.attention.flex_attentionr$   Zintegrations.flex_attentionr%   Z
get_loggerr2   r   r&   r;   ModulerM   rh   r   r   r   r   r   r   rz   r   r   r=   r   rJ   r  r   r6  r:  r;  __all__r9   r9   r9   r:   <module>   s    
:!"

M5      