
"""PyTorch XGLM model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_xglm import XGLMConfig


logger = logging.get_logger(__name__)


class XGLMScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale
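

# Illustrative sketch (added; not part of the original file): `XGLMScaledWordEmbedding`
# only rescales the usual lookup, and XGLM sets the scale to sqrt(d_model) when
# `config.scale_embedding` is True. The sizes below are made up for the demo.
#
#   emb = XGLMScaledWordEmbedding(num_embeddings=8, embedding_dim=4, padding_idx=1, embed_scale=2.0)
#   ids = torch.tensor([[2, 3]])
#   assert torch.equal(emb(ids), nn.Embedding.forward(emb, ids) * 2.0)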


class XGLMSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the parameter
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last dimension for odd embedding sizes
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0):
        bsz, seq_len = position_ids.size()
        position_ids += self.offset

        # Expand embeddings if needed. `position_ids.max()` is NOT used to keep torch.fx compatibility.
        max_pos = 2 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
U 4S jjjrS	\	R                  S
\S\4S jr     SS\	R                  S\\	R                     S\\\	R                        S\\	R                     S\\	R                     S\S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )XGLMAttentionf   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbiasc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rr   )r   r   rn   ro   rp   head_dim
ValueErrorscalingrq   r   Lineark_projv_projq_projout_proj)r   rn   ro   rp   rq   rr   r    s         r!   r   XGLMAttention.__init__i   s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$ii	4@ii	4@ii	4@		)TBr#   tensorrf   re   c                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   r;   )rX   ro   ru   	transpose
contiguous)r   r~   rf   re   s       r!   _shapeXGLMAttention._shape   s5    {{3GQQRSUVWbbddr#   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 4   USLnUR                  5       u  pn
U R                  U5      U R                  -  nU(       a  Ub  US   nUS   nGOU(       aE  U R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nOUby  U R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      n[        R                  " US   U/SS9n[        R                  " US   U/SS9nODU R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nU R                  (       a  X4nXR                  -  SU R                  4nU R                  XU5      R                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XR                  -  X4:w  a-  [        SXR                  -  X4 SUR                  5        35      eUb  UR                  5       USX4:w  a"  [        S	USX4 SUR                  5        35      eUR                  XR                  X5      U-   n[        R                  " U[        R                   " [        R"                  " UR$                  5      R&                  UR(                  S
95      nUR                  XR                  -  X5      nUR$                  [        R*                  :X  aK  [,        R.                  R1                  US[        R2                  S9R5                  [        R*                  5      nO[,        R.                  R1                  USS9nUb  UR                  5       U R                  4:w  a*  [        SU R                  4 SUR                  5        35      eUR                  SSSS5      UR                  XR                  X5      -  nUR                  XR                  -  X5      nU(       a;  UR                  XR                  X5      nUR                  XR                  -  X5      nOSn[,        R.                  R7                  UU R6                  U R8                  S9n[        R                  " UU5      nUR                  5       XR                  -  XR                  4:w  a5  [        SXR                  XR                  4 SUR                  5        35      eUR                  XR                  XR                  5      nUR                  SS5      nUR;                  XU R<                  5      nU R?                  U5      nUUU4$ )z#Input shape: Batch x Time x ChannelNr   r   rN   r;   rL   z$Attention weights should be of size z	, but is z!Attention mask should be of size )rC   )rM   rB   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size ) ra   r{   rw   r   ry   rz   r1   rU   rq   ro   ru   rX   bmmr   rv   maxr~   finforB   minrC   float16r   
functionalsoftmaxfloat32rG   rp   r   reshapern   r|   )r   r   r   r   r   r   r   is_cross_attentionre   tgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r!   r&   XGLMAttention.forward   s    .T9',,.a {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? )7NNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 %""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL 99ell5;;|7I7I+J+N+NWcWjWjkL (,,S>>-A7TL .==002U]]0[^^_d_l_lmL==0020FL&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfm?wwL',,S>>-A7TL
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK01>AAr#   )
rp   rn   ru   rq   ry   ro   r|   r{   rw   rz   )        FT)NNNNF)r*   r+   r,   r-   r.   r/   r0   boolr   r1   r2   r   r   r   r&   r3   r4   r5   s   @r!   rl   rl   f   s9   G  CC C 	C
 C C C6eU\\ eC ec e 488<1526"'vB||vB #5<<0vB !u||!45	vB
 !.vB "%,,/vB  vB 
u||Xell3XeELL>Q5RR	SvB vBr#   rl   c                   L  ^  \ rS rSrS\4U 4S jjr        SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
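

# Shape sketch (added; sizes are hypothetical): the attention module flattens heads into
# the batch dimension, i.e. (bsz * num_heads, seq_len, head_dim), before the two bmm calls.
#
#   attn = XGLMAttention(embed_dim=16, num_heads=4, is_decoder=True)
#   x = torch.randn(2, 5, 16)
#   out, weights, past = attn(x, output_attentions=True)
#   assert out.shape == (2, 5, 16)
#   assert weights.shape == (2, 4, 5, 5)  # (bsz, num_heads, tgt_len, src_len)
#   assert past[0].shape == (2, 4, 5, 4)  # cached keys: (bsz, num_heads, seq_len, head_dim)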


class XGLMDecoderLayer(nn.Module):
    def __init__(self, config: XGLMConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = XGLMAttention(
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        if config.add_cross_attention:
            self.encoder_attn = XGLMAttention(
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
g)XGLMPreTrainedModelix  modelTr   c                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Nr   )meanstd)r   init_std
isinstancer   rx   weightdatanormal_rr   zero_	Embeddingr   )r   moduler   s      r!   _init_weights!XGLMPreTrainedModel._init_weights  s    kk""fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r#    N)r*   r+   r,   r-   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r3   r   r#   r!   r   r   x  s!    L&*#+,	?r#   r   c                      ^  \ rS rSrSS\S\\R                     4U 4S jjjrS r	S r


@auto_docstring
class XGLMModel(XGLMPreTrainedModel):
    def __init__(self, config: XGLMConfig, embed_tokens: Optional[nn.Embedding] = None):
        r"""
        embed_tokens (`nn.Embedding`, *optional*):
            output embeddings
        """
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = XGLMScaledWordEmbedding(
                config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
            )

        self.embed_positions = XGLMSinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            config.pad_token_id,
        )
        self.layers = nn.ModuleList([XGLMDecoderLayer(config) for _ in range(config.num_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if position_ids is None:
            position_ids = torch.arange(
                past_key_values_length,
                input_shape[-1] + past_key_values_length,
                dtype=torch.long,
                device=input_ids.device if input_ids is not None else inputs_embeds.device,
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            )

        hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length).to(
            inputs_embeds.device
        )
        hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )
        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.layer_norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = XGLMModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits,
                labels,
                vocab_size=self.config.vocab_size,
                pad_token_id=self.config.pad_token_id,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
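

# Note on labels (added): as with other causal LMs in the library, the shift between
# inputs and targets is handled inside the loss, so `labels` can simply be `input_ids`
# (padding positions are ignored). The tiny config below is hypothetical.
#
#   cfg = XGLMConfig(d_model=16, attention_heads=4, ffn_dim=32, num_layers=1, vocab_size=128)
#   lm = XGLMForCausalLM(cfg)
#   ids = torch.tensor([[5, 6, 7, 2]])
#   loss = lm(input_ids=ids, labels=ids).loss  # scalar training loss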


__all__ = ["XGLMForCausalLM", "XGLMModel", "XGLMPreTrainedModel"]