
    fTh+                     P   S r SSKrSSKJr  SSKJr  SSKJrJrJ	r	J
r
  SSKrSSKrSSKJr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJr  SSKJr  \R6                  " \5      rSqS r " S S\R@                  RB                  5      r"S%S jr#S%S jr$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RJ                  5      r(\ " S S\5      5       r)\ " S S\5      5       r*\ " S S\5      5       r+\ " S S\)5      5       r,\" S S!9 " S" S#\)\5      5       r-/ S$Qr.g)&zPyTorch RWKV model.    N)	dataclass)Path)ListOptionalTupleUnion)nn   )GenerationMixin)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                    SSK Jn  [        [        5      R	                  5       R
                  R
                  R
                  S-  S-  nS Vs/ s H  o2U-  PM	     nn[        b  [        R                  U :X  a  g [        R                  SU  S35        SS	S
SSSSU  3/nU" SU  3U[        R                  " 5       [        R                  :H  US9qU [        l        g s  snf )Nr   )loadkernelsrwkv)z
wkv_op.cppzwkv_cuda.cuzwkv_cuda_bf16.cuz2Loading CUDA kernel for RWKV at context length of .z
-res-usagez--maxrregcount 60z--use_fast_mathz-O3z-Xptxas -O3z--extra-device-vectorizationz-DTmax=wkv_)namesourcesverboseextra_cuda_cflags)torch.utils.cpp_extensionr   r   __file__resolveparentrwkv_cuda_kernelmax_seq_lengthloggerinfor   get_verbosityDEBUG)context_lengthload_kernelkernel_folderfcuda_kernel_filesflagss         ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/rwkv/modeling_rwkv.pyload_wkv_cuda_kernelr0   .   s    = N**,33::AAIMPVVM4ef4eq*4ef #(8(G(G>(Y
KKD^DTTUVW 	&
.!"E #N#$!&&(GMM9	 '5#/ gs   Cc                   <    \ rS rSr\SS j5       r\SS j5       rSrg)RwkvLinearAttentionN   Nc                    UR                  5       u  pxn	U[        R                  :  a   [        SU S[        R                   S35      eXy-  [	        U	S5      -  S:w  a  [        SU SU	 S[	        U	S5       S	35      eUR
                  U l        UR                  R                  S
:w  dN  UR                  R                  S
:w  d4  UR                  R                  S
:w  d  UR                  R                  S
:w  a  [        S5      e[        R                  " UR                  5       R                  5       5      * nUR
                  [        R                  :X  a0  UR                  5       nUR                  5       nUR                  5       nUR                  5       nUR                  5       nUR                  5       n[        R                  " U[        R                  S9n
U(       d  Ub  UcT  [        R                   " UU	S[        R"                  UR                  [        R                  S9nUS S 2S S 2S4==   S-  ss'   OB[        R$                  " U Vs/ s H  oR'                  S5      PM     snSS9R                  5       nUR
                  [        R(                  :X  a  [        R*                  nO[        R,                  nU" XX4X5        OHUR
                  [        R(                  :X  a  [        R.                  O[        R0                  nU" XX4U
5        U R3                  XX4U
5        Ub4  [        R4                  " USSS9 Vs/ s H  oR7                  S5      PM     nnU
R9                  U R                  5      U4$ s  snf s  snf )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of r   cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr
   )dtypedevicer8      籡*G)dim)sizer#   r$   
ValueErrorminr9   input_dtyper:   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32cat	unsqueezebfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunksqueezeto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputsforward_funcs                r/   rR   RwkvLinearAttention.forwardO   s   +.88:(
[%444.wi7b#2233DF  #c+r&::a?-j\9L[M Z";34A7 
 )) ""f,  %%/zz&(||  F*tuuii
 0 0 2 = = ?@@
99%#))+J))+CKKME**,
nn  "!!#U5L5LM5,}--::"'"9"9 aAg$&		5"A5a;;q>5"AqITTVyyENN*/GG/BBVK<?II<W+88]m]u]uLVDjc&I+0;;uaQ+GH+GaYYq\+GEHyy)500 #B Is   	M07M5c                 .   U R                   nU R                  u  pEpgn[        R                  " U[        R                  U[        R
                  :X  a  [        R
                  O[        R                  S9n	[        R                  " U[        R                  S9n
[        R                  " U[        R                  S9n[        R                  " U[        R                  S9nU[        R                  :X  a  UR                  5       nU[        R
                  :X  a  [        R                  O[        R                  nU" UUUUUUR                  5       U	U
UU5
        U	R                  U5      U
R                  U5      UR                  U5      UR                  U5      S S 4$ )N)r8   r9   r7   )rA   saved_tensorsrC   rH   rI   rN   rK   rG   rE   r#   backward_bf16backwardrF   rV   )rW   g_outputg_staterA   rX   rY   rZ   r[   ra   g_time_decayg_time_firstg_keyg_valuebackward_funcs                 r/   rh   RwkvLinearAttention.backward   sF    oo585F5F2
F''11$/5>>$A%..u}}

 ''
%BYBYZ  E4K4KL""58O8OP%--'~~'H:E:W(66]m]v]v!	
 OOK(OOK(HH[!JJ{#
 	
     NFN)__name__
__module____qualname____firstlineno__staticmethodrR   rh   __static_attributes__rr   rq   r/   r2   r2   N   s)    <1 <1| %
 %
rq   r2   c                    UR                  5       u  pgn[        R                  " U5      nUc  [        R                  " US S 2S4   [        R                  S9n	[        R                  " US S 2S4   [        R                  S9n
[        R                  " US S 2S4   [        R                  S9S-
  nOUu  pn[        R                  " U 5      * n [        U5       GH
  nUS S 2U4   R                  5       nUS S 2U4   n[        R                  " XU-   5      n[        R                  " X-
  5      n[        R                  " X-   U-
  5      nUU	-  UU-  -   nUU
-  U-   nUU-  R                  UR                  5      US S 2U4'   [        R                  " X-   U5      n[        R                  " X-   U-
  5      n[        R                  " UU-
  5      nUU	-  UU-  -   n	UU
-  U-   n
UnGM     U(       d  Ub  XU/nX4$ )Nr   )r9   r<   )
r>   rC   
zeros_likerK   rD   rangerE   maximumrV   r9   )rX   rY   rZ   r[   r\   r]   _
seq_lengthra   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_states                        r/   rwkv_linear_attention_cpur      s    xxzA1c"F}$$SAYemmD	$$SAYemmD	$$SAYemmDtK	*/'	i
 ))J''Jz*!]*+113a./ y
2JKYYy12YY{/.@ANR-%77	9nr)$-$;#?#?#Mq-  i&<kJYYy-=>YY{]23NR-%77	NR'	!	% +( u(y1=rq   c           	          [        S XX#4 5       5      nUR                  S5      S:H  n[        b  U(       d  U(       a  [        XX#XES9$ [        R                  XX#XE5      $ )Nc              3   R   #    U  H  oR                   R                  S :g  v   M     g7f)r6   N)r:   rB   ).0ts     r/   	<genexpr>(rwkv_linear_attention.<locals>.<genexpr>   s     X3Wa((--6)3Ws   %'r   r\   r]   )anyr>   r#   r   r2   apply)rX   rY   rZ   r[   r\   r]   no_cuda	one_tokens           r/   rwkv_linear_attentionr      sZ    XJC3WXXG q I7i(SXtt"((Uaarq   c                   @   ^  \ rS rSrSU 4S jjrSS jrSS jrSrU =r$ )	RwkvSelfAttention   c                   > [         TU ]  5         Xl        [        S L=(       a    [        R                  UR
                  :H  n[        5       (       a,  [        5       (       a  U(       d   [        UR
                  5        X l        UR                  nUR                  b  UR                  OUnXPl        [        R                   " ["        R$                  " U5      5      U l        [        R                   " ["        R$                  " U5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R0                  " S5      U l        [        R4                  " XESS9U l        [        R4                  " XESS9U l        [        R4                  " XESS9U l        [        R4                  " XTSS9U l        g ! [         a    [        R                  S5         GNf = f)Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr#   r$   r)   r   r   r0   	Exceptionr%   r&   layer_idr`   attention_hidden_sizer	   	ParameterrC   emptyrX   rY   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearrZ   r[   
receptancera   )selfr   r   kernel_loadedr`   r   	__class__s         r/   r   RwkvSelfAttention.__init__   s   (4q9I9X9X\b\q\q9q$;$=$=mY$V%:%:; !((,2,H,H,TF((Ze 	 &;",,u{{3H'IJ,,u{{3H'IJLLQ;)GH ll5;;q![+IJ#%<<Aq+0N#O ,,}599[eLYY{N
))KUSii 5O)  YWXYs   (H% %IIc                 p   UR                  S5      S:X  a  Ub  US   S S 2S S 2U R                  4   nO4U R                  U5      nUb   US   S S 2S S 2U R                  4   US S 2S4'   XR                  -  USU R                  -
  -  -   nXR                  -  USU R                  -
  -  -   nXR
                  -  USU R
                  -
  -  -   nU R                  U5      nU R                  U5      n[        R                  " U R                  U5      5      nUb   US S 2S4   US   S S 2S S 2U R                  4'   XdXR4$ Nr   r   r   )r>   r   r   r   r   r   rZ   r[   rC   sigmoidr   )r   hiddenr\   shiftedrZ   r[   r   s          r/   extract_key_value#RwkvSelfAttention.extract_key_value  s2   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1(((7a$:K:K6K+LL,,,w!d>Q>Q:Q/RR666AH`H`D`9aa
hhsm

5!]]4??:#>?
,21b5ME!HQ4==(),,rq   c           	        ^  T R                  XS9u  pEpbUb  [        U 4S jUSS   5       5      OS n[        T R                  T R                  UUUUS9u  pUbT  US   US   S S 2S S 2T R
                  4'   US   US   S S 2S S 2T R
                  4'   US   US   S S 2S S 2T R
                  4'   T R                  XH-  5      U4$ )	Nr\   c              3   N   >#    U  H  oS S 2S S 2TR                   4   v   M     g 7frt   r   )r   rb   r   s     r/   r   ,RwkvSelfAttention.forward.<locals>.<genexpr>#  s     FIqaDMM12Is   "%r;   r   r   r   r
      )r   tupler   rX   rY   r   ra   )	r   r   r\   	use_cacher   rZ   r[   layer_stater   s	   `        r/   rR   RwkvSelfAttention.forward!  s    (,(>(>v(>(S%
JOJ[eFE!"IFFae1OOOO"
 ",7NE!HQ4==(),7NE!HQ4==(),7NE!HQ4==(){{:,-u44rq   )r   r   rZ   r   ra   r   rX   rY   r   r   r   r   r[   r   rt   rs   )	ru   rv   rw   rx   r   r   rR   rz   __classcell__r   s   @r/   r   r      s    P<-&5 5rq   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )RwkvFeedForwardi5  c                 8  > [         TU ]  5         Xl        X l        UR                  nUR
                  b  UR
                  OSUR                  -  n[        R                  " S5      U l        [        R                  " [        R                  " SSU5      5      U l        [        R                  " [        R                  " SSU5      5      U l        [        R                  " X4SS9U l        [        R                  " X3SS9U l        [        R                  " XCSS9U l        g )Nr   r   r   Fr   )r   r   r   r   r`   intermediate_sizer	   r   r   r   rC   r   r   r   r   rZ   r   r[   )r   r   r   r`   r   r   s        r/   r   RwkvFeedForward.__init__6  s     (((.(@(@(LF$$RSV\VhVhRh 	 ,,}5LLQ;)GH#%<<Aq+0N#O 99[%H))K5IYY0EJ
rq   c                    UR                  S5      S:X  a  Ub  US   S S 2S S 2U R                  4   nO4U R                  U5      nUb   US   S S 2S S 2U R                  4   US S 2S4'   XR                  -  USU R                  -
  -  -   nXR                  -  USU R                  -
  -  -   n[
        R                  " [
        R                  " U R                  U5      5      5      nU R                  U5      n[
        R                  " U R                  U5      5      nUb   US S 2S4   US   S S 2S S 2U R                  4'   XV-  U4$ r   )r>   r   r   r   r   rC   squarerelurZ   r[   r   r   )r   r   r\   r   rZ   r   r[   s          r/   rR   RwkvFeedForward.forwardG  s#   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1(((7a$:K:K6K+LL666AH`H`D`9aa
ll5::dhhsm45

3]]4??:#>?
,21b5ME!HQ4==()!5((rq   )r   rZ   r   r   r   r   r   r[   r   rt   ru   rv   rw   rx   r   rR   rz   r   r   s   @r/   r   r   5  s    K") )rq   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )	RwkvBlocki[  c                   > [         TU ]  5         Xl        X l        US:X  a.  [        R
                  " UR                  UR                  S9U l        [        R
                  " UR                  UR                  S9U l	        [        R
                  " UR                  UR                  S9U l
        [        X5      U l        [        X5      U l        g )Nr   )eps)r   r   r   r   r	   	LayerNormr`   layer_norm_epsilonpre_lnln1ln2r   	attentionr   feed_forward)r   r   r   r   s      r/   r   RwkvBlock.__init__\  s     q=,,v'9'9v?X?XYDK<< 2 28Q8QR<< 2 28Q8QR*6<+F=rq   c                    U R                   S:X  a  U R                  U5      nU R                  U R                  U5      X#S9u  pRX-   nU R	                  U R                  U5      US9u  pbX-   nX4nU(       a  Xu4-  nU$ US-  nU$ )Nr   )r\   r   r   rt   )r   r   r   r   r   r   )r   r   r\   r   output_attentionsr   r   outputss           r/   rR   RwkvBlock.forwardj  s    ==A[[(F>>$((6*:%>]	#"//0@/N&/|#G  wGrq   )r   r   r   r   r   r   r   )NFFr   r   s   @r/   r   r   [  s    > rq   r   c                   8    \ rS rSr\rSrS/rSS/rSr	Sr
S rSrg	)
RwkvPreTrainedModeli}  r   r   rX   rY   Tc           	         [        U[        5      (       Gav  UR                  nUR                  R                  nUR                  R
                  nUR                  nX#S-
  -  nSX#-  -
  n[        R                  " [        U5       Vs/ s H  oU-  PM	     snUR                  R                  UR                  R                  S9n	U	SSSS24   n	[        U5       V
s/ s H  n
SSXS-
  -  SSU-  -   -  -  -   PM     nn
[        R                  " XR                  R                  UR                  R                  S9n[        R                  " [        U5       Vs/ s H  oS-   S	-  S-
  PM     snUR                  R                  UR                  R                  S9S
-  n[        R                  " 5          XR                  l        [        R"                  " UR                  [$        R&                  " S5      -  U-   5      UR                  l        [        R(                  " X5      UR                  l        [        R(                  " X5      SU-  -   UR*                  l        [        R(                  " U	S
U-  5      UR,                  l        SSS5        g[        U[.        5      (       Ga  UR                  nUR                  R                  nUR                  R
                  nSX#-  -
  n[        R                  " [        U5       Vs/ s H  oU-  PM	     snUR                  R                  UR                  R                  S9n	U	SSSS24   n	[        R                  " 5          [        R(                  " X5      UR                  l        [        R(                  " X5      UR,                  l        SSS5        ggs  snf s  sn
f s  snf ! , (       d  f       g= fs  snf ! , (       d  f       g= f)zInitialize the weights.r   g      ?r9   r:   N   gffffff?g?r
   g      ?g333333?)
isinstancer   r   r   num_hidden_layersr`   r   rC   tensorr}   r   r9   r:   rX   rY   no_graddata	ones_likemathlogpowr   r   r   )r   moduler   r   r`   r   ratio_0_to_1ratio_1_to_almost0itime_weighthdecay_speedzigzags                r/   _init_weights!RwkvPreTrainedModel._init_weights  s6   f/00H & ? ? --33K$*$@$@!#1'<=L!$(D!E,,*/*<=*<Q[*<=))//**11K
 &dD!m4K 455A Q!q89sS<EW?WXXX5    ,,{:K:K:Q:QZ`ZkZkZrZrsK.34I.JK.J!eq[1_.JK ++11!,,33
   )4!!&).9J9JTXXVY]9Z]c9c)d!!&+099[+U##(-2YY{-WZ]`lZl-l%%*27))KOaIa2b**/ ! 00H & ? ? --33K!$(D!E,,*/*<=*<Q[*<=))//**11K
 &dD!m4K+099[+U##(27))K2\**/ ! 17 > L ! > !s2   N0 N5N:1CN??OAO?
O
O#rr   N)ru   rv   rw   rx   r   config_classbase_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr   rz   rr   rq   r/   r   r   }  s1    L$)<8&*#L7]rq   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                        \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)

RwkvOutputi  a^  
Class for the RWKV model outputs.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlast_hidden_stater\   .hidden_states
attentionsrr   )ru   rv   rw   rx   __doc__r  r   rC   FloatTensor__annotations__r\   r   r  r   r  rz   rr   rq   r/   r  r    sw    , 6:x 1 129/3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rq   r  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)RwkvCausalLMOutputi  a0  
Base class for causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlosslogitsr\   .r  r  rr   )ru   rv   rw   rx   r  r  r   rC   r  r	  r  r\   r   r  r   r  rz   rr   rq   r/   r  r    s    0 )-D(5$$
%,*.FHU&&'./3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rq   r  c                     ^  \ rS rSrU 4S jrS rS r\        SS\\	R                     S\\	R                     S\\	R                     S\\\	R                        S	\\   S
\\   S\\   S\\   S\\\4   4S jj5       rS rS rSrU =r$ )	RwkvModeli  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        [        R                  " UR
                  5      U l        SU l        SU l        U R!                  5         g s  snf )Nr   F)r   r   r	   	Embedding
vocab_sizer`   
embeddings
ModuleListr}   r   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_init)r   r   idxr   s      r/   r   RwkvModel.__init__  s     ,,v'8'8&:L:LMmmPUV\VnVnPo$pPoYv%DPo$pqll6#5#56#( &+# 	 %qs   (Cc                     U R                   $ rt   r  r   s    r/   get_input_embeddingsRwkvModel.get_input_embeddings  s    rq   c                     Xl         g rt   r  r   new_embeddingss     r/   set_input_embeddingsRwkvModel.set_input_embeddings  s    (rq   	input_idsattention_maskinputs_embedsr\   r   r   output_hidden_statesreturn_dictreturnc	           	      6   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R
                  nUb  [        R                  S5        U R                  U R                  :X  a  U R                  5         Ub  Ub  [        S5      eUc  Uc  [        S5      eUc  U R                  U5      nU(       a  Uc  UR                  S5      U R                   R                  U R                   R                  4n	[        S5       V
s/ s HC  n
[         R"                  " XS::  a  UR$                  O[         R&                  UR(                  S	.6PME     nn
US
==   S-  ss'   U R*                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUnU(       a  SOSnU(       a  SOSn[-        U R.                  5       H  u  pU R*                  (       a2  U R                  (       a!  U R1                  UR2                  XXV5      u  pnO
U" XXVS9u  pnU R                  (       a?  U R                   R4                  S:  a%  US-   U R                   R4                  -  S:X  a  US-  nU(       a  X4-   nU(       d  M  UU4-   nM     U R7                  U5      nU(       a  X4-   nU(       d  [9        S XX4 5       5      $ [;        UUUUS9$ s  sn
f )a'  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the last state is returned and can be used to quickly generate the next logits.
NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr      r   r   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...rr   )r\   r   r   r;   c              3   .   #    U  H  oc  M  Uv   M     g 7frt   rr   )r   xs     r/   r   $RwkvModel.forward.<locals>.<genexpr>{  s     t$bq$bs   	)r  r\   r  r  )r   r   r)  trainingr   use_return_dictr%   warning_oncer  _rescale_layersr?   r  r>   r`   r   r}   rC   rJ   r9   rK   r:   r  	enumerater  _gradient_checkpointing_func__call__rescale_everyr  r   r  )r   r&  r'  r(  r\   r   r   r)  r*  shaper   r  all_self_attentionsall_hidden_statesr  blockr  s                    r/   rR   RwkvModel.forward  s   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]% ^_==D444  " ]%>cdd=#8TUU  OOI6M"''*DKK,C,CT[[EbEbcE
 q	 "A a-"5"5U]][h[o[o "	   !HH&&4==##p "	%$5b4"6BD#DKK0JC**t}}373T3TNNM)40j 49!)40j
 ((KK--11W 9 99Q> - 1#$58H$H!  &9ZM&I#+ 1. M2 14D Dt];L$bttt++*	
 	
es   A
Lc           	         U R                   U R                  (       + :X  a  g U R                  R                  S:  Ga  [        R
                  " 5          [        U R                  5       GH  u  pU R                  (       a  UR                  R                  R                  R                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R                  S[        XR                  R                  -  5      -  5        M  [        UR                  R                  R                  S5      (       a  UR                  R                  R                  R                   R#                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R                   R#                  S[        XR                  R                  -  5      -  5        GM  [        UR                  R                  R                  S5      (       aO  U R%                  UR                  R                  U5        U R%                  UR                  R                  U5        GM!  UR                  R                  R                  R#                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R#                  S[        XR                  R                  -  5      -  5        GM     S S S 5        U R                  (       + U l         g ! , (       d  f       N%= f)Nr   r;   SCBquant_state)r  r1  r   r8  rC   r   r5  r  r   ra   weightmul_intr   r[   hasattrr?  div_ _bnb_4bit_dequantize_and_rescale)r   block_idr<  s      r/   r4  RwkvModel._rescale_layers  s?   ##DMM(9:;;$$q('0'=OH}}..55::1HP[P[PiPiDi@j;jk**0077<<Q#hR]R]RkRkFkBl=lm #5??#9#9#@#@%HH!OO2299==BB1HXcXcXqXqLqHrCrs!..44;;??DDQ#hZeZeZsZsNsJtEtu$U__%;%;%B%BMRR AA%//BXBXZbc AA%BTBTBZBZ\de!OO2299>>qCT_T_TmTmHmDn?no!..44;;@@c(VaVaVoVoJoFpApq (> !" (,}}#4 # !s   KM
Mc                    [        5       (       d  [        S5      eSSKnUR                  R	                  UR
                  R                  UR
                  R                  5      nUR                  S[        X R                  R                  -  5      -  5        UR                  R                  UR                  S5      SS9R                  UR                  5      n[!        USU5        g)	z
Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
be quantized again.
z/Please install bitsandbytes to use this method.r   Nr;   cpuF)requires_gradrA  )r   ImportErrorbitsandbytes
functionaldequantize_4bitrA  r   r@  rE  rC  r   r8  r	   
Params4bitrV   r:   setattr)r   target_layerrG  bnbdequant_weightsquant_weights         r/   rF  *RwkvModel._bnb_4bit_dequantize_and_rescale  s    
 )**OPP"..889L9L9Q9QS_SfSfSrSrsQ#h++2K2K&K"LLM vv((););E)BRW(X[[\k\r\rsh5rq   )r  r  r  r  r  )NNNNNNNN)ru   rv   rw   rx   r   r  r$  r   r   rC   
LongTensorr  r   boolr   r   r  rR   r4  rF  rz   r   r   s   @r/   r  r    s    )  15595937$(,0/3&*l
E,,-l
 !!1!12l
   1 12	l

 U../0l
 D>l
 $D>l
 'tnl
 d^l
 
uj 	!l
 l
\506 6rq   r  z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                   @  ^  \ rS rSrS/rU 4S jrS rS rSS jr\	         SS\
\R                     S\
\R                     S	\
\R                     S
\
\\R                        S\
\R                     S\
\   S\
\   S\
\   S\
\   S\\\4   4S jj5       rSrU =r$ )RwkvForCausalLMi  zhead.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r   r   r  r   r	   r   r`   r  headr  )r   r   r   s     r/   r   RwkvForCausalLM.__init__  sH     f%	IIf00&2C2C%P	 	rq   c                     U R                   $ rt   r]  r  s    r/   get_output_embeddings%RwkvForCausalLM.get_output_embeddings  s    yyrq   c                     Xl         g rt   r`  r"  s     r/   set_output_embeddings%RwkvForCausalLM.set_output_embeddings  s    "	rq   c                 j    Ub  US S 2S4   R                  S5      nUb  Uc  SU0nOSU0nX&S'   XFS'   U$ )Nr   r(  r&  r\   r   )rM   )r   r&  r\   r(  r   kwargsmodel_inputss          r/   prepare_inputs_for_generation-RwkvForCausalLM.prepare_inputs_for_generation  sY     !!R%(2226I $+];L'3L %W$-[!rq   r&  r'  r(  r\   labelsr   r   r)  r*  r+  c
           
      x   U	b  U	OU R                   R                  n	U R                  UUUUUUU	S9nUS   nU R                  U5      nSnUb*  U R                  " UU4SU R                   R
                  0U
D6nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the last state is returned and can be used to quickly generate the next logits.
N)r(  r\   r   r   r)  r*  r   r  r   )r  r  r\   r  r  )
r   r2  r   r]  loss_functionr  r  r\   r  r  )r   r&  r'  r(  r\   rk  r   r   r)  r*  rg  rwkv_outputsr  r  r  ra   s                   r/   rR   RwkvForCausalLM.forward  s    J &1%<k$++B]B]yy'/!5# ! 
 %Q=)%%  ;;11 	D Yab!11F)-)9TGf$EvE!$$&44#..
 	
rq   )r]  r   )NNN)	NNNNNNNNN)ru   rv   rw   rx   _tied_weights_keysr   ra  rd  ri  r   r   rC   rW  r  r   rX  r   r   r  rR   rz   r   r   s   @r/   r[  r[    s    (#"  15595937-1$(,0/3&*F
E,,-F
 !!1!12F
   1 12	F

 U../0F
 ))*F
 D>F
 $D>F
 'tnF
 d^F
 
u((	)F
 F
rq   r[  )r[  r  r   rs   )/r  r   dataclassesr   pathlibr   typingr   r   r   r   rC   torch.utils.checkpointr	   
generationr   modeling_utilsr   utilsr   r   r   r   r   r   configuration_rwkvr   
get_loggerru   r%   r#   r0   autogradFunctionr2   r   r   Moduler   r   r   r   r  r  r  r[  __all__rr   rq   r/   <module>r~     s[      !  / /    ) -  + 
		H	%  5@g
%..11 g
T)XbC5		 C5L#)bii #)L		 D ?]/ ?] ?]D ? ? ?: ? ? ?@ o6# o6 o6d i
)? i
i
X Brq   