"""PyTorch CPMAnt"""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_cpmant import CpmAntConfig


logger = logging.get_logger(__name__)


class CpmAntLayerNorm(nn.Module):
    """
    We use Root Mean Square (RMS) Layer Normalization; please see https://arxiv.org/abs/1910.07467 for details.
    """

    def __init__(self, config: CpmAntConfig):
        super().__init__()

        self.eps = config.eps
        self.dim_norm = config.hidden_size
        self.weight = nn.Parameter(torch.empty(config.hidden_size))

    def forward(self, hidden_states: torch.Tensor):
        """
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        """
        if hidden_states.size(-1) != self.dim_norm:
            raise AssertionError("hidden_states.size(-1) != self.dim_norm")
        old_dtype = hidden_states.dtype
        variance = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
        hidden_states = (hidden_states * torch.rsqrt(variance + self.eps)).to(old_dtype) * self.weight
        return hidden_states
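# Editorial sketch, not part of the original file: a plain-torch reference for the RMS
# normalization above, useful for checking CpmAntLayerNorm numerically. The function
# name `_rms_norm_reference` is ours, not a transformers API.
def _rms_norm_reference(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # x / sqrt(mean(x^2) + eps) * weight, computed in float32 like the module above
    old_dtype = hidden_states.dtype
    variance = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
    return (hidden_states.to(torch.float32) * torch.rsqrt(variance + eps)).to(old_dtype) * weight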
   4S jjrSrU =r$ )CpmAntAttention>   r   c                 .  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R
                  U R                  -  SS9U l	        [        R                  " U R                  U R
                  U R                  -  SS9U l
        [        R                  " U R                  U R
                  U R                  -  SS9U l        [        R                  " U R
                  U R                  -  U R                  SS9U l        [        R                  R                  SS9U l        UR                   b-  [        R                  R#                  UR                   S9U l        g S U l        g )NFbiasr+   r-   )p)r   r   r   	dim_modelnum_attention_heads	num_headsdim_headr   Linear	project_q	project_k	project_vattention_outr   Softmaxsoftmax	dropout_pDropoutdropoutr"   s     r%   r   CpmAntAttention.__init__?   s   ++334>>4>>DMM3QX]^4>>4>>DMM3QX]^4>>4>>DMM3QX]^YYt~~'Et~~\abxx''B'/' 88++f.>.>+?DLDLr'   hidden_q	hidden_kvattention_maskposition_biasoutput_attentionspast_key_values	use_cachec           
         UR                  S5      nUR                  S5      n	UR                  S5      n
U R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SSSS5      nUR	                  XU R
                  U R                  5      R                  SSSS5      nUR	                  XU R
                  U R                  5      R                  SSSS5      nUbE  [        R                  " US   U/SS9n[        R                  " US   U/SS9nUR                  S5      n
[        R                  " XR                  SS5      5      [        R                  " U R                  5      -  nX-   n[        R                  " UUR	                  USX5      [        R                  " S	5      :H  [        R                   " [#        S
5      UR$                  UR&                  S95      nU R)                  U5      n[        R                  " UUR	                  USX5      [        R                  " S	5      :H  [        R                   " SUR$                  UR&                  S95      nU(       a  UnOSnU R*                  b  U R+                  U5      n[        R                  " X5      nUR	                  XR
                  XR                  5      R                  SSSS5      nUR-                  5       R	                  XU R
                  U R                  -  5      nU R/                  U5      nSnU(       a  X4nXU4$ )a  
Args:
    hidden_q (`torch.Tensor`):
        Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
    hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)):
        Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)`
    attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
        Avoid invalid areas to participate in the calculation of self-attention.
    position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
        Provide positional information to self-attention block.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers.
    past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
        Cached past key and value projection states.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
r   r   r,   r	   NrJ   r+   Fz-inf)devicer1   )r/   rQ   rR   rS   viewrN   rO   permuter   catmatmul	transposemathsqrtmasked_filltensorscalar_tensorfloatrd   r1   rV   rY   
contiguousrT   )r#   r[   r\   r]   r^   r_   r`   ra   
batch_sizelen_qlen_kquerykeyvaluescoreattn_weightss                   r%   r9   CpmAntAttention.forwardR   s   8 ]]1%
a q!x(nnY'y)

:dnndmmLTTUVXY[\^_`hhz$..$--HPPQRTUWXZ[\

:dnndmmLTTUVXY[\^_`&))_Q/52>CIIq159rBEHHRLE UMM"b$9:TYYt}}=UU%!!
Au<U@SSfell%++V

 U#!!
Au<U@SS%,,ekkJ

  LL<<#LL'E U*

:~~ummLTTUVXY[\^_`  "''
4>>DMM;YZ""5)"lOO33r'   )	rT   rO   rL   rY   rN   rR   rQ   rS   rV   )FNN)r;   r<   r=   r>   r   r   r   r@   
BoolTensorr   boolr   r9   rA   rB   rC   s   @r%   rE   rE   >   s     |  2 -2GK$(Q4,,Q4 <<Q4 ((	Q4
 ||Q4 $D>Q4 "%ell(B"CDQ4 D>Q4 Q4r'   rE   c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\R                  S\\R                     S\\	   S\\
\R                  \R                  4      S	\\	   4S
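# Editorial sketch, not part of the original file: the key/value cache bookkeeping done
# by CpmAntAttention.forward, shown on toy tensors. A cached key of length 3 concatenated
# with one new step yields len_k = 4, which is why `attention_mask` and `position_bias`
# must always cover the full (cached + new) key length.
def _attention_cache_shapes_demo() -> None:
    batch_size, num_heads, dim_head = 1, 2, 8
    cached_key = torch.randn(batch_size, num_heads, 3, dim_head)  # would come from past_key_values[0]
    new_key = torch.randn(batch_size, num_heads, 1, dim_head)  # current decoding step's projection
    key = torch.cat([cached_key, new_key], dim=-2)
    assert key.size(-2) == 4  # len_k grows by one per decoding step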
 jjrSrU =r$ )CpmAntSelfAttentionBlock   r   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        UR                  (       a/  [        R                  R                  UR                  5      U l
        g S U l
        g r   )r   r   r   layernorm_before_attentionrE   self_attentionrW   r   r   rX   rY   r"   s     r%   r   !CpmAntSelfAttentionBlock.__init__   sT    *9&*A'-f5 88++F,<,<=DLDLr'   r(   r]   r^   r_   r`   ra   c           	          U R                  U5      nU R                  XwX#XEU5      nUu  pxn	U R                  b  U R                  U5      nX-   nXU	4$ )a  
Args:
    hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
        Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
    attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
        Avoid invalid areas to participate in the calculation of self-attention.
    position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
        Provide positional information to self-attention block.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers.
    past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
        Cached past key and value projection states.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
)r   r   rY   )
r#   r(   r]   r^   r_   r`   ra   outputsrx   current_key_values
             r%   r9    CpmAntSelfAttentionBlock.forward   sh    2 11-@%%n=Naj
 4;00<<#ll7+G%/,===r'   )rY   r   r   NFNNr;   r<   r=   r>   r   r   r   r@   r   r{   r   r9   rA   rB   rC   s   @r%   r}   r}      s     |   15,1GK$($>||$> $>  -	$>
 $D>$> "%ell(B"CD$> D>$> $>r'   r}   c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r


class CpmAntDenseGatedACT(nn.Module):
    def __init__(self, config: CpmAntConfig):
        super().__init__()
        self.w_0 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
        self.w_1 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
        self.act = torch.nn.GELU()

    def forward(self, hidden_states: torch.Tensor):
        """Transform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        """
        gate_score = self.act(self.w_0(hidden_states))
        hidden_states = self.w_1(hidden_states)

        hidden_states = gate_score * hidden_states
        return hidden_states


class CpmAntFeedForward(nn.Module):
    def __init__(self, config: CpmAntConfig):
        super().__init__()
        self.w_in = CpmAntDenseGatedACT(config)
        if config.dropout_p is not None:
            self.dropout = torch.nn.Dropout(config.dropout_p)
        else:
            self.dropout = None

        self.w_out = nn.Linear(config.dim_ff, config.hidden_size, bias=False)

    def forward(self, hidden_states: torch.Tensor):
        """
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        """
        hidden_states = self.w_in(hidden_states)

        if self.dropout is not None:
            hidden_states = self.dropout(hidden_states)

        hidden_states = self.w_out(hidden_states)

        return hidden_states
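# Editorial sketch, not part of the original file: the gated-GELU feed-forward computed
# by CpmAntDenseGatedACT + CpmAntFeedForward, in plain torch. `w_0` produces the gate,
# `w_1` the value, and `w_out` projects the `dim_ff`-wide hidden back to `hidden_size`.
def _gated_ffn_reference(
    x: torch.Tensor, w_0: torch.Tensor, w_1: torch.Tensor, w_out: torch.Tensor
) -> torch.Tensor:
    gate = F.gelu(x @ w_0.t())  # (batch, seq_len, dim_ff)
    value = x @ w_1.t()  # (batch, seq_len, dim_ff)
    return (gate * value) @ w_out.t()  # back to (batch, seq_len, hidden_size)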


class CpmAntFFNBlock(nn.Module):
    def __init__(self, config: CpmAntConfig):
        super().__init__()
        self.layernorm_before_ffn = CpmAntLayerNorm(config)
        self.ffn = CpmAntFeedForward(config)
        if config.dropout_p:
            self.dropout = torch.nn.Dropout(config.dropout_p)
        else:
            self.dropout = None

    def forward(self, hidden_states: torch.Tensor):
        """
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Hidden states before the feed-forward layer.
        """
        ln_outputs = self.layernorm_before_ffn(hidden_states)
        outputs = self.ffn(ln_outputs)
        if self.dropout is not None:
            outputs = self.dropout(outputs)
        hidden_states = hidden_states + outputs
        return hidden_states
 jjrSrU =r$ )CpmAntTransformerBlocki   r   c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )r   r   r}   self_attr   r   r"   s     r%   r   CpmAntTransformerBlock.__init__!  s&    08!&)r'   r(   r]   r^   r_   r`   ra   c           	      ^    U R                  UUUUUUS9nUu  pnU R                  U5      nXU4$ )a,  
Args:
    hidden_states (`torch.Tensor`):
        Input to the layer of shape `(batch, seq_len, dim_model)`
    attention_mask (`torch.Tensor`):
        Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
    position_bias (`torch.Tensor`):
        Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers.
    past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*):
        Cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
)r]   r^   r_   r`   ra   )r   r   )	r#   r(   r]   r^   r_   r`   ra   rx   r   s	            r%   r9   CpmAntTransformerBlock.forward&  sQ    2 )'/+ & 
 :G6%6/,===r'   )r   r   r   r   rC   s   @r%   r   r      s    *| * 15,1GK$(&>||&> &>  -	&>
 $D>&> "%ell(B"CD&> D>&> &>r'   r   c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\R                  S\R                  S\\	   S\\	   S	\\
\R                  \R                  4      S
\\	   4S jjrSrU =r$ )CpmAntEncoderiO  r   c                    > [         TU ]  5         UR                  U l        [        R
                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        U5      U l
        g s  snf r   )r   r   num_hidden_layers
num_layersr   
ModuleListranger   layersr   output_layernorm)r#   r   ithr$   s      r%   r   CpmAntEncoder.__init__P  sa     22mmuUYUdUdOe$fOe%;F%COe$fg / 7 %gs   A:r(   r]   r^   r_   output_hidden_statesr`   ra   c           
      <   U(       a  SOSnU(       a  SOSn	U(       a  SOSn
[        U R                  5       HB  u  pU(       a  X4-  nU" UUUUU(       a  Xk   OSUS9nUu  pnU(       a  X4-  n	Uc  M=  X4-   n
MD     U R                  U5      nU(       a  X4-  nXX4$ )a  
Args:
    hidden_states (`torch.Tensor`):
        Input to the layer of shape `(batch, seq_len, dim_model)`
    attention_mask (`torch.Tensor`):
        Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
    position_bias (`torch.Tensor`):
        Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers.
    past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*):
        Cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
 N)r_   r`   ra   )	enumerater   r   )r#   r(   r]   r^   r_   r   r`   ra   all_hidden_statesall_self_attnscurrent_key_valuesilayerlayer_outputsrx   r   s                   r%   r9   CpmAntEncoder.forwardW  s    8 #7BD0d#,R$!$++.HA#!%55!!"36E 24#M >K:M): /1 ,%7:N%N" /" --m<!112CSSr'   )r   r   r   )NNNNr   rC   s   @r%   r   r   O  s    8| 8 -1/3GK$(6T||6T 6T ||	6T
 $D>6T 'tn6T "%ell(B"CD6T D>6T 6Tr'   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )CpmAntIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r   r   r   rP   r   intermediate_sizedense
isinstance
hidden_actstrr
   intermediate_act_fnr"   s     r%   r   CpmAntIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r'   r(   returnc                 J    U R                  U5      nU R                  U5      nU$ r   r   r   r   s     r%   r9   CpmAntIntermediate.forward  s&    

=100?r'   r   
r;   r<   r=   r>   r   r   r@   r9   rA   rB   rC   s   @r%   r   r     s(    9U\\ ell  r'   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S jrS	 r	SS
 jr
SrU =r$ )CpmAntSegmentPositionEmbeddingi  r   c                 f  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l	        [        R                  " [        R                  " UR                  UR                  -  UR                  -   UR                  5      5      U l        g r   )r   r   rM   rN   position_bias_num_bucketsnum_bucketsposition_bias_max_distancemax_distancesegment_typesnum_segmentsr   r   r   r    relative_attention_biasr"   s     r%   r   'CpmAntSegmentPositionEmbedding.__init__  s    33!;;"=="00')||KK$$v';';;f>^>^^**(
$r'   key_pos	query_poskey_segmentquery_segmentc           
      *   [         R                  " 5          UR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      UR                  S5      :w  a0  [        SUR                  S5       SUR                  S5       S35      eXcR                  S5      :w  d  XtR                  S5      :w  a!  [        SU SUR                  S5       S35      eXtR                  S5      :w  a!  [        SU SUR                  S5       S35      eUR	                  USU5      nUR	                  XWS5      nUR	                  USU5      nUR	                  XWS5      nU R                  XC5      nXR                  -   nU R                  [         R                  " U[         R                  UR                  S	9S S S 24   [         R                  " U[         R                  UR                  S	9S S 2S 4   -
  U R                  U R                  S
9n	[         R                  " X4:H  U	S S S 2S S 24   U5      nS S S 5        [        R                  " WU R                  5      n
U
R!                  SSSS5      R#                  5       n
U
$ ! , (       d  f       NS= f)Nr   r   z>key_pos.size(0) should be equal to query_pos.size(0), but got z and !z7keylen should be equal to key_segment.size(1), but got z;querylen should be equal to query_segment.size(1), but got r+   r1   rd   )r   r   r	   r,   )r   no_gradr/   r0   re   !_segment_relative_position_bucketr   _position_bucketarangeint32rd   r   whereF	embeddingr   rf   rp   )r#   r   r   r   r   batchkeylenquerylenrelative_position_bucketabsolute_position_bucketembedss              r%   r9   &CpmAntSegmentPositionEmbedding.forward  s|    ]]_LLOE\\!_F ~~a(H||A).."33$TU\UaUabcUdTeejktkykyz{k|j}}~  ))!,,<N<Nq<Q0Q$MfXUZ[f[k[klm[nZoopq  --a00$QRZQ[[`anasastuav`wwxy  ll5"f5G!u;I%**5"f=K)..uCM'+'M'Mm'i$'?BRBR'R$ (,'<'<V5;;?W?^?^_`dfg`gh,,xu{{C[CbCbcdegkdklm ,,!..	 (= ($ (-{{-(q!4(($C P 5t7S7ST1a+668W _s   H!J
Jc                 $    XR                   -  U-   $ r   )r   )r#   r   r   s      r%   r   @CpmAntSegmentPositionEmbedding._segment_relative_position_bucket  s    000;>>r'   c                 0   SnUS-  nUS:  R                  [        R                  5      U-  n[        R                  " U5      nUS-  nX:  nU[        R                  " UR                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " XaR                  [        R                  5      U5      -  nU$ )Nr   r,   r   )
r2   r   r   abslogro   rj   min	full_liker   )r#   relative_positionr   r   relative_buckets	max_exactis_smallrelative_postion_if_larges           r%   r   /CpmAntSegmentPositionEmbedding._position_bucket  s    -155ekkB[P!II&781$	$0$-II'--/);<hh|/01&( "U[[/	%!
 %*II%OO5QG%
! 	EKK2F2Fu{{2SUnoor'   )r   r   rN   r   r   )       )r;   r<   r=   r>   r   r   r   r@   r9   r   r   rA   rB   rC   s   @r%   r   r     sZ    
| 
22 <<2 \\	2
 ||2h?   r'   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )CpmAntOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )N)r   )r   r   r   rP   r   r   r   	LayerNormlayer_norm_epsrX   hidden_dropout_probrY   r"   s     r%   r   CpmAntOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r'   r(   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   )r   rY   r  )r#   r(   r  s      r%   r9   CpmAntOutput.forward  s5    

=1]3}'CDr'   )r  r   rY   r   rC   s   @r%   r	  r	    s6    >U\\  RWR^R^  r'   r	  c                   "    \ rS rSr\rSrS rSrg)CpmAntPreTrainedModeli  cpmantc                 *   [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a&  UR                  R                  R                  S5        g[        U[        5      (       a9  UR                   R                  R                  SU R                  R                  S9  gg)zInitialize the weightsg        )r5   stdNg      ?)r   r   rP   r!   datanormal_r   init_stdrI   zero_	Embeddingpadding_idxr  fill_r   r   r   )r#   modules     r%   _init_weights#CpmAntPreTrainedModel._init_weights  sj   fbii((MM&&CT[[5I5I&J{{&  &&( '--MM&&CT[[5I5I&J!!-""6#5#56<<> .--KK""$MM$$S)00MM$$S) >??**//77SdkkFZFZ7[ @r'   r   N)	r;   r<   r=   r>   r   config_classbase_model_prefixr  rA   r   r'   r%   r  r    s    L \r'   r  c                      ^  \ rS rSrS\4U 4S jjrS rS rS r\	      SS\
\R                     S\
\   S	\
\   S
\
\\\R                           S\
\   S\


@auto_docstring
class CpmAntModel(CpmAntPreTrainedModel):
    def __init__(self, config: CpmAntConfig):
        super().__init__(config)
        self.encoder = CpmAntEncoder(config)
        self.segment_embedding = nn.Embedding(config.segment_types, config.hidden_size)
        self.input_embedding = nn.Embedding(
            config.vocab_size + config.prompt_types * config.prompt_length, config.hidden_size
        )
        self.position_bias = CpmAntSegmentPositionEmbedding(config)
        self.prompt_length = config.prompt_length
        self.vocab_size = config.vocab_size

        self.post_init()

    def get_input_embeddings(self):
        return self.input_embedding

    def set_input_embeddings(self, embeddings, **kwargs):
        self.input_embedding = embeddings

    def _prepare_attention_mask(self, input_ids, span, context, length):
        batch = input_ids.size(0)
        seqlen = input_ids.size(1)
        device = input_ids.device
        directional_mask_2d = torch.arange(seqlen, device=device) <= torch.arange(seqlen, device=device).view(-1, 1)
        attention_mask = context[:, None, :] | (
            context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen)
        )
        attention_mask = attention_mask & (span[:, None, :] == span[:, :, None])
        # mask for left padding
        mask_1d = (
            torch.tensor(list(range(seqlen - self.prompt_length))[::-1], device=device)[None, :].repeat(batch, 1)
            < length[:, None]
        )
        mask_1d = torch.cat((torch.ones(batch, self.prompt_length, device=device).bool(), mask_1d), dim=1)
        attention_mask = mask_1d.view(batch, seqlen, 1) & mask_1d.view(batch, 1, seqlen) & attention_mask
        return attention_mask

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]:
        r"""
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        # add prompts ahead of the input ids
        if input_ids.dtype != torch.int32:
            input_ids = input_ids.to(torch.int32)
        dtype, device = input_ids.dtype, input_ids.device
        segment = torch.where(input_ids != 0, 2, 0).to(dtype=dtype, device=device)
        length = (segment != 0).sum(-1).to(dtype=dtype, device=device)
        input_ids = torch.cat(
            (
                torch.arange(
                    self.prompt_length * 2 + self.vocab_size,
                    self.prompt_length * 3 + self.vocab_size,
                    dtype=dtype,
                    device=device,
                ).repeat(input_ids.size(0), 1),
                input_ids,
            ),
            dim=1,
        )
        batch, seq_length = input_ids.size()
        segment = torch.cat((torch.zeros(batch, self.prompt_length, dtype=dtype, device=device), segment), dim=1)
        context = torch.full((batch, seq_length), 1, dtype=dtype, device=device)
        position = torch.arange(seq_length, dtype=dtype, device=device).repeat(batch, 1)
        span = torch.full((batch, seq_length), 0, dtype=dtype, device=device)

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * self.encoder.num_layers)
            input_ids = input_ids.contiguous()
            hidden_states = self.input_embedding(input_ids)
            segment_states = self.segment_embedding(segment)
            hidden_states = hidden_states + segment_states
        else:
            past_length = past_key_values[0][0].size(-2)
            segment_states = self.segment_embedding(segment)
            hidden_states = self.input_embedding(input_ids) + segment_states[:, -1:, :]

        attention_mask = self._prepare_attention_mask(input_ids, span, context, length)
        position_bias = self.position_bias(position, position, segment, segment)

        attention_mask = attention_mask[:, past_length:, :]
        position_bias = position_bias[:, :, past_length:, :]
        hidden_states = hidden_states[:, past_length:, :]

        hidden_states, present_key_values, all_hidden_states, all_attentions = self.encoder(
            hidden_states,
            attention_mask,
            position_bias,
            output_attentions,
            output_hidden_states,
            past_key_values,
            use_cache,
        )

        if past_length == 0:
            hidden_states = hidden_states[:, self.prompt_length :, :]
            # drop the prompt positions from the returned attentions and hidden states
            if all_attentions is not None:
                new_attentions = ()
                for attention in all_attentions:
                    new_attentions += (attention[:, :, self.prompt_length :, self.prompt_length :],)
                all_attentions = new_attentions
            if all_hidden_states is not None:
                new_hidden_states = ()
                for hidden_state in all_hidden_states:
                    new_hidden_states += (hidden_state[:, self.prompt_length :, :],)
                all_hidden_states = new_hidden_states

        if not return_dict:
            return tuple(
                v for v in [hidden_states, present_key_values, all_hidden_states, all_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=present_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )
    )custom_introc                   N  ^  \ rS rSrS/rS\4U 4S jjr\        SS\\	R                     S\\\\	R                  \	R                  4         S\\   S\\   S	\\   S
\\	R                     S\\   S\\	R                     S\\\4   4S jj5       rS rS rS rS rS rSrU =r$ )CpmAntForCausalLMi  zlm_head.weightr   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  UR                  UR                  -  -   SS9U l
        U R                  5         g r   )r   r   r$  r  r   rP   r   r(  r)  r*  lm_headr,  r"   s     r%   r   CpmAntForCausalLM.__init__  sd     !&) yy 1 1F4G4G&J^J^4^ ^ej
 	r'   r=  r`   ra   r_   r   labelsrF  r]   r   c	                    Ub  UOU R                   R                  nU R                  XXRX75      n
U(       a  U
R                  OU
S   nU R	                  U5      nSnUbA  [        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                  U
R                  U
R                  S9$ )u  
input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss.

Example:

Text Generation with CpmAntForCausalLM.
```python
>>> from transformers import CPMAntTokenizer, CpmAntForCausalLM

>>> texts = "今天天气不错，"
>>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
>>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
>>> input_ids = tokenizer(texts, return_tensors="pt")
>>> outputs = model.generate(**input_ids)
>>> output_texts = tokenizer.batch_decode(outputs)
>>> print(output_texts)
['今天天气不错，阳光明媚，我和妈妈一起去超市买东西。\n在超市里，我看到了一个很好玩的玩具，它的名字叫“机器人”。它有一个圆圆的脑袋，两只圆圆的眼睛，还有一个圆圆的']
```
Nr   r+   r   )losslogitsr`   r(   rN  )r   rO  r  rM  rd  r   re   r/   r   r`   r(   rN  )r#   r=  r`   ra   r_   r   rf  rF  r]   r5  model_outputr(   ri  rh  	loss_funcoutputs                   r%   r9   CpmAntForCausalLM.forward  s    P &1%<k$++B]B]{{*>QZ
 ;F66<XY?m,(*IV[[V[[_=v{{2ODYab!11F)-)9TGf$EvE%(88&44#..
 	
r'   c                 .    U R                   R                  $ r   r  r+  r0  s    r%   r1  &CpmAntForCausalLM.get_input_embeddings	  s    {{***r'   c                 $    XR                   l        g r   ro  )r#   r4  s     r%   r6  &CpmAntForCausalLM.set_input_embeddings  s    &0#r'   c                     U R                   $ r   rd  r0  s    r%   get_output_embeddings'CpmAntForCausalLM.get_output_embeddings  s    ||r'   c                     Xl         g r   rt  )r#   new_embeddingss     r%   set_output_embeddings'CpmAntForCausalLM.set_output_embeddings  s    %r'   c                     U Vs/ s H  o3b  [        U5      OUPM     nnU H  nUS   U   US'   US   U   US'   M     U$ s  snf )Nr   r   )r:  )r#   r`   beam_idxeachkey_value_layers        r%   _reorder_cache CpmAntForCausalLM._reorder_cache  sd    P_`P_)94:tCP_`.O!0!3H!=OA!0!3H!=OA  / 	 as   A)r  rd  )NNNNNNNN)r;   r<   r=   r>   _tied_weights_keysr   r   r   r   r   r@   r   r   r{   r   r   r9   r1  r6  ru  ry  r  rA   rB   rC   s   @r%   rb  rb    s    ++|   -1MQ$(,0/3)-&*15?
ELL)?
 "$uU\\5<<-G'H"IJ?
 D>	?

 $D>?
 'tn?
 &?
 d^?
 !.?
 
u,,	-?
 ?
B+1& r'   rb  )rb  r$  r  )0r?   rj   typingr   r   r   r   r   torch.nn.functionalr   


__all__ = ["CpmAntForCausalLM", "CpmAntModel", "CpmAntPreTrainedModel"]