
"""PyTorch Qwen3 model."""

from typing import Callable, Optional, Tuple

import torch
import torch.utils.checkpoint

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import LossKwargs, logging
from ..gemma.modeling_gemma import GemmaMLP
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaRMSNorm,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..mistral.modeling_mistral import MistralModel
from .configuration_qwen3 import Qwen3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"


class Qwen3RMSNorm(LlamaRMSNorm):
    pass


class Qwen3MLP(GemmaMLP):
    pass


class Qwen3Attention(LlamaAttention):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        # Qwen3 applies RMSNorm over the head dimension of queries and keys (QK-norm).
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # difference with Llama
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Qwen3DecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Qwen3MLP(config)
        if config.sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )


class Qwen3Model(MistralModel):
    # Inherits Mistral's mask handling, which supports sliding-window attention.
    pass


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class Qwen3ForCausalLM(LlamaForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class Qwen3ForSequenceClassification(LlamaForSequenceClassification):
    pass


class Qwen3ForTokenClassification(LlamaForTokenClassification):
    pass


class Qwen3ForQuestionAnswering(LlamaForQuestionAnswering):
    pass


__all__ = [
    "Qwen3ForCausalLM",
    "Qwen3ForQuestionAnswering",
    "Qwen3Model",
    "Qwen3PreTrainedModel",
    "Qwen3ForSequenceClassification",
    "Qwen3ForTokenClassification",
]
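

# --- Hedged usage sketch (not part of the upstream module) --------------------
# A minimal illustration of the per-head q_norm/k_norm attention defined above,
# exercised through the generated modeling file rather than this modular file
# (modular_*.py is only consumed by the modular converter). The tiny config
# values, the direct construction of Qwen3Attention/Qwen3RotaryEmbedding, and
# the default "eager" attention path are assumptions made for illustration;
# real checkpoints use far larger dimensions.
if __name__ == "__main__":
    from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
    from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention, Qwen3RotaryEmbedding

    config = Qwen3Config(hidden_size=64, num_attention_heads=4, num_key_value_heads=2, head_dim=16)
    attn = Qwen3Attention(config, layer_idx=0).eval()
    rotary = Qwen3RotaryEmbedding(config)

    hidden = torch.randn(1, 8, config.hidden_size)  # (batch, seq_len, hidden_size)
    position_ids = torch.arange(8).unsqueeze(0)     # (batch, seq_len)
    cos, sin = rotary(hidden, position_ids)         # RoPE tables for this sequence

    with torch.no_grad():
        out, _ = attn(hidden, (cos, sin), attention_mask=None)
    print(out.shape)  # expected: torch.Size([1, 8, 64])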