"""PyTorch Qwen3 model."""

from typing import Callable, Optional, Tuple

import torch
import torch.utils.checkpoint

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import LossKwargs, logging
from ..gemma.modeling_gemma import GemmaMLP
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaRMSNorm,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..mistral.modeling_mistral import MistralModel
from .configuration_qwen3 import Qwen3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"


class Qwen3RMSNorm(LlamaRMSNorm):
    pass


class Qwen3MLP(GemmaMLP):
    pass


class Qwen3Attention(LlamaAttention):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        # Query/key states are RMS-normalized per attention head (over head_dim only).
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # difference with Llama: per-layer sliding window
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Qwen3DecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Qwen3MLP(config)
        if config.sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )


class Qwen3Model(MistralModel):
    pass


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class Qwen3ForCausalLM(LlamaForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class Qwen3ForSequenceClassification(LlamaForSequenceClassification):
    pass


class Qwen3ForTokenClassification(LlamaForTokenClassification):
    pass


class Qwen3ForQuestionAnswering(LlamaForQuestionAnswering):
    pass


__all__ = [
    "Qwen3ForCausalLM",
    "Qwen3ForQuestionAnswering",
    "Qwen3Model",
    "Qwen3PreTrainedModel",  # noqa: F822
    "Qwen3ForSequenceClassification",
    "Qwen3ForTokenClassification",
]