from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import LossKwargs, logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaModel,
    LlamaPreTrainedModel,
)
from .configuration_granite import GraniteConfig


logger = logging.get_logger(__name__)


class GraniteAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: GraniteConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        # Granite replaces the default 1/sqrt(head_dim) scaling with a configurable multiplier
        self.scaling = config.attention_multiplier


class GraniteDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: GraniteConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.residual_multiplier = config.residual_multiplier
        self.self_attn = GraniteAttention(config=config, layer_idx=layer_idx)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states * self.residual_multiplier  # main diff with Llama

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class GranitePreTrainedModel(LlamaPreTrainedModel):
    pass


class GraniteModel(LlamaModel):
    def __init__(self, config: GraniteConfig):
        super().__init__(config)
        self.embedding_multiplier = config.embedding_multiplier
        self.layers = nn.ModuleList(
            [GraniteDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        inputs_embeds = inputs_embeds * self.embedding_multiplier  # main diff with Llama

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class GraniteForCausalLM(LlamaForCausalLM):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute logits for the requested trailing positions
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        logits = logits / self.config.logits_scaling  # main diff with Llama

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["GraniteForCausalLM", "GraniteModel", "GranitePreTrainedModel"]