
from typing import Optional, Tuple, Union

import torch

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import LossKwargs, logging
from ..glm.modeling_glm import (
    GlmAttention,
    GlmForCausalLM,
    GlmForSequenceClassification,
    GlmForTokenClassification,
)
from ..phi3.modeling_phi3 import Phi3MLP
from .configuration_glm4 import Glm4Config
from .modeling_glm4 import Glm4RMSNorm


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "THUDM/GLM-4-9B-Chat-0414"


class Glm4MLP(Phi3MLP):
    pass


class Glm4DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Glm4Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Glm4Attention(config=config, layer_idx=layer_idx)
        self.mlp = Glm4MLP(config)
        self.input_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_self_attn_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_mlp_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self-attention sub-block, followed by an extra post-attention RMSNorm
        # before the residual addition.
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = self.post_self_attn_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        # MLP sub-block, likewise wrapped in a pre-norm and a post-norm.
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_mlp_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class Glm4Attention(GlmAttention):
    pass


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class Glm4ForCausalLM(GlmForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Glm4ForCausalLM

>>> model = Glm4ForCausalLM.from_pretrained("THUDM/GLM-4-9B-Chat-0414")
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4-9B-Chat-0414")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""
        return super().forward(**super_kwargs)


class Glm4ForSequenceClassification(GlmForSequenceClassification):
    pass


class Glm4ForTokenClassification(GlmForTokenClassification):
    pass


__all__ = [
    "Glm4PreTrainedModel",
    "Glm4Model",
    "Glm4ForCausalLM",
    "Glm4ForSequenceClassification",
    "Glm4ForTokenClassification",
]
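

# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the library: Glm4DecoderLayer above
# uses a "sandwich" pattern in which each sub-block (self-attention, MLP) is
# wrapped in a pre-RMSNorm *and* an extra post-RMSNorm before the residual
# addition. The toy shapes and the Linear stand-in for the sub-block are
# assumptions chosen only to make the pattern concrete; nn.RMSNorm
# (PyTorch >= 2.4) stands in for Glm4RMSNorm here.
if __name__ == "__main__":
    from torch import nn

    hidden_size = 8
    x = torch.randn(2, 4, hidden_size)  # (batch, seq_len, hidden)

    pre_norm = nn.RMSNorm(hidden_size)   # role of input_layernorm
    post_norm = nn.RMSNorm(hidden_size)  # role of post_self_attn_layernorm
    sub_block = nn.Linear(hidden_size, hidden_size)  # stand-in for self_attn / mlp

    residual = x
    out = pre_norm(x)
    out = sub_block(out)
    out = post_norm(out)
    out = residual + out
    print(out.shape)  # torch.Size([2, 4, 8])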