
    fThl/                       S r SSKrSSKrSSKJrJrJrJr  SSKr	SSK
r
SSK
Jr  SSKJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJrJr  SSK J!r!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.J/r/J0r0  \" 5       (       a  SSKJ1r1  \$Rd                  " \35      r4 " S S\Rj                  5      r6 " S S\65      r7\6\7S.r8 " S S\Rj                  5      r9 " S S\Rj                  5      r: " S S\Rj                  5      r;\! " S S \5      5       r< " S! S"\<\5      r=\!" S#S$9 " S% S&\=5      5       r>\!" S'S$9 " S( S)\=5      5       r?\!" S*S$9 " S+ S,\<5      5       r@\!" S-S$9 " S. S/\<5      5       rA/ S0QrBg)1zPyTorch BARK model.    N)DictOptionalTupleUnion)nn)
functional   )GenerationMixin)#AlternatingCodebooksLogitsProcessor!BarkEosPrioritizerLogitsProcessorSuppressTokensLogitsProcessor)_prepare_4d_attention_mask)!flash_attn_supports_top_left_maskis_flash_attn_available)CausalLMOutputWithPastMaskedLMOutput)PreTrainedModelget_parameter_device)auto_docstringis_accelerate_availableis_torch_accelerator_availablelogging   )	AutoModel   )BarkCoarseConfig
BarkConfigBarkFineConfigBarkSemanticConfigBarkSubModelConfig)BarkCoarseGenerationConfigBarkFineGenerationConfigBarkSemanticGenerationConfig)_flash_attention_forwardc                   V   ^  \ rS rSrSU 4S jjrS rS rS	S jr     S
S jrSr	U =r
$ )BarkSelfAttention@   c                   > [         TU ]  5         UR                  U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        UR                  U l        UR                  U l	        U R                  U R                  -  U l
        UR                  UR                  -  S:w  a&  [        SU R                   SU R                   S35      e[        R                  " UR                  SUR                  -  UR                  S9U l        [        R                  " UR                  UR                  UR                  S9U l        X l        U(       a^  UR"                  n[$        R&                  " [$        R(                  " X34[*        S95      R-                  SSX35      nU R/                  S	U5        g g )
Nr   z;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r	   biasdtyper   r*   )super__init__dropoutr   Dropoutattn_dropoutresid_dropouthidden_size	embed_dim	num_headshead_dim
ValueErrorLinearr*   att_projout_proj	is_causal
block_sizetorchtrilonesboolviewregister_buffer)selfconfigr;   r<   r*   	__class__s        ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/bark/modeling_bark.pyr.   BarkSelfAttention.__init__D   sd    ~~JJv~~6ZZ7++))$..8 0 00A5MdnnM] ^NN#2'  		&"4"4a&:L:L6LSYS^S^_		&"4"4f6H6Hv{{["**J::ejj*)ANOTTUVXY[erD  .     c                 z    UR                  5       SS X#4-   nUR                  U5      nUR                  SSSS5      $ ):
Splits hidden_size dim into attn_head_size and num_heads
Nr   r   r   r	   )sizerA   permuterC   tensorr5   attn_head_size	new_shapes        rF   _split_headsBarkSelfAttention._split_headsb   sA     KKM#2&))DD	Y'~~aAq))rH   c                     UR                  SS5      R                  5       nUR                  UR                  5       SS X#-  4-   5      nU$ )C
Merges attn_head_size dim and num_attn_heads dim into hidden_size
r   r   N)	transpose
contiguousrA   rL   rC   rO   r5   rP   s       rF   _merge_headsBarkSelfAttention._merge_headsj   sJ     !!!Q'224V[[]3B/93M2OOPrH   c                    [         R                  " XR                  SS5      5      S[        R                  " U R
                  5      -  -  nU R                  (       ay  UR                  S5      UR                  S5      pUR                  U R                  S S 2S S 2X-
  U2S U24   S:H  [         R                  " UR                  5      R                  5      nUb  Xd-   n[        R                  R                  USS9nUR!                  UR                  5      nU R#                  U5      nUb  Xe-  n[         R                  " Xc5      n	X4$ )NrK   rV         ?r   dim)r=   matmulrW   mathsqrtr6   r;   rL   masked_fillr*   finfor,   minr   r   softmaxtor1   )
rC   querykeyvalueattention_mask	head_maskattn_weightsquery_length
key_lengthattn_outputs
             rF   _attnBarkSelfAttention._attnv   s   ||E==R+@AS499UYUbUbKcEcd>>',zz"~sxx|* (33		!Q
 9J FSTXYYL../33L
 %'8L}},,\r,B#u{{3((6  '3L ll<7((rH   c                    U R                  U5      R                  U R                  SS9u  pxn	U R                  XpR                  U R
                  5      nU R                  XR                  U R
                  5      nU R                  XR                  U R
                  5      n	Ub6  US   n
US   n[        R                  " X4SS9n[        R                  " X4SS9n	USL a  X4nOS nU R                  XxXU5      u  pU R                  XR                  U R
                  5      nU R                  U5      nU R                  U5      nX4nU(       a  X4-  nU$ )Nr   r^   r   r   rV   T)r9   splitr4   rR   r5   r6   r=   catrq   rZ   r:   r2   )rC   hidden_statesrk   past_key_valuesrl   	use_cacheoutput_attentionsrh   ri   rj   past_key
past_valuepresentrp   rm   outputss                   rF   forwardBarkSelfAttention.forward   s5    !MM-8>>t~~ST>UE!!%G^^T]]C!!%G&&q)H(+J))XO4CIIz1r:ElGG$(JJu5R[$\!''^^T]]SmmK0((5(&GrH   )	r9   r1   r/   r4   r6   r;   r5   r:   r2   F)NNNNNFF)__name__
__module____qualname____firstlineno__r.   rR   rZ   rq   r~   __static_attributes____classcell__rE   s   @rF   r&   r&   @   s3    /<*
)D % %rH   r&   c                   L   ^  \ rS rSrSrU 4S jrS rS r     SS jrSr	U =r
$ )	BarkSelfFlashAttention2   a8  
Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 D   > [         TU ]  " U0 UD6  [        5       U l        g N)r-   r.   r   _flash_attn_uses_top_left_mask)rC   argskwargsrE   s      rF   r.    BarkSelfFlashAttention2.__init__   s#    $)&)
 /P.Q+rH   c                 V    UR                  5       SS X#4-   nUR                  U5      nU$ )rJ   NrK   )rL   rA   rN   s        rF   rR   $BarkSelfFlashAttention2._split_heads   s3     KKM#2&))DD	Y' rH   c                 V    UR                  UR                  5       SS X#-  4-   5      nU$ )rU   NrV   )rA   rL   rY   s       rF   rZ   $BarkSelfFlashAttention2._merge_heads   s/     V[[]3B/93M2OOPrH   c                    UR                  5       u  pxn	U R                  U5      R                  U R                  SS9u  pnU R	                  XR
                  U R                  5      n
U R	                  XR
                  U R                  5      nU R	                  XR
                  U R                  5      nUbV  US   R                  SS5      nUS   R                  SS5      n[        R                  " X4SS9n[        R                  " X4SS9nUSL a%  UR                  SS5      UR                  SS5      4nOS n[        U
UUUUU R                  (       a  U R                  OSU R                  U R                  S9nU R                  UU R
                  U R                  5      nU R!                  U5      nU R#                  U5      nUU4nU(       a  S nUU4-  nU$ )Nr   r^   r   r   T        )r/   use_top_left_maskr;   )rL   r9   rt   r4   rR   r5   r6   rW   r=   ru   r$   trainingr/   r   r;   rZ   r:   r2   )rC   rv   rk   rw   rl   rx   ry   
batch_size	query_len_rh   ri   rj   rz   r{   r|   rp   r}   rm   s                      rF   r~   BarkSelfFlashAttention2.forward   s    $1#5#5#7 
q !MM-8>>t~~ST>UE!!%G^^T]]C!!%G&&q)33Aq9H(+55a;J))XO3CIIz1q9E}}Q*EOOAq,ABGG.$(MMDLLs"AAnn	
 ''T^^T]]SmmK0((5(L&GrH   )r   r   )r   r   r   r   __doc__r.   rR   rZ   r~   r   r   r   s   @rF   r   r      s3    R 4 4rH   r   )eagerflash_attention_2c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )BarkLayerNormi  zOLayerNorm but with an optional bias. PyTorch doesn't support simply bias=False.c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        U(       a0  [        R                  " [        R                  " U5      5      U l        g S U l        g r   )	r-   r.   r   	Parameterr=   r?   weightzerosr*   )rC   r3   r*   rE   s      rF   r.   BarkLayerNorm.__init__  sH    ll5::k#:;>BBLL[!9:		rH   c                     [         R                  " XR                  R                  U R                  U R                  SS9$ )Ngh㈵>)eps)F
layer_normr   shaper*   )rC   inputs     rF   r~   BarkLayerNorm.forward$  s*    ||E;;#4#4dkk499RVWWrH   )r*   r   )T)	r   r   r   r   r   r.   r~   r   r   r   s   @rF   r   r     s    YM
X XrH   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BarkMLPi(  c                   > [         TU ]  5         [        R                  " UR                  SUR                  -  UR
                  S9U l        [        R                  " SUR                  -  UR                  UR
                  S9U l        [        R                  " UR                  5      U l	        [        R                  " 5       U l        g )N   r)   )r-   r.   r   r8   r3   r*   in_projr:   r0   r/   GELUgelurC   rD   rE   s     rF   r.   BarkMLP.__init__)  s    yy!3!3Q9K9K5KRXR]R]^		!f&8&8"8&:L:LSYS^S^_zz&..1GGI	rH   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r:   r/   )rC   rv   s     rF   r~   BarkMLP.forward0  s@    ]3		-0m4]3rH   )r/   r   r   r:   r   r   r   r   r.   r~   r   r   r   s   @rF   r   r   (  s     rH   r   c                   @   ^  \ rS rSrSU 4S jjr     SS jrSrU =r$ )	BarkBlocki8  c                   > [         TU ]  5         U(       aG  [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        OJ[        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        UR                     " XS9U l        [        U5      U l        g )Nr)   r;   )r-   r.   r   r3   r*   layernorm_1layernorm_2r   	LayerNormBARK_ATTENTION_CLASSES_attn_implementationattnr   mlp)rC   rD   r;   rE   s      rF   r.   BarkBlock.__init__9  s      -V-?-?fkkRD,V-?-?fkkRD!||F,>,>?D!||F,>,>?D*6+F+FGd	6?rH   c           	          U R                  U5      nU R                  UUUUUUS9nUS   n	USS  n
X-   nXpR                  U R                  U5      5      -   nU(       a  U4U
-   n
U
$ U4U
SS  -   n
U
$ )Nrw   rk   rl   rx   ry   r   r   )r   r   r   r   )rC   rv   rw   rk   rl   rx   ry   intermediary_hidden_statesattn_outputsrp   r}   s              rF   r~   BarkBlock.forwardJ  s     &*%5%5m%D"yy&+)/ ! 
 #1oqr"%2%@"%?((78C
 &
" 13g=G  23gabkAGrH   )r   r   r   r   r   r   r   r   s   @rF   r   r   8  s#    #( ! !rH   r   c                   f   ^  \ rS rSr\rSrSrS rU 4S jr	\
S\R                  4S j5       rSrU =r$ )	BarkPreTrainedModelin  FTc                    [        U[        R                  45      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weights.r   )meanstdNr]   )
isinstancer   r8   r   datanormal_rD   initializer_ranger*   zero_	Embeddingpadding_idxr   fill_rC   modules     rF   _init_weights!BarkPreTrainedModel._init_weightst  s   fryyl++ MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .rH   c                 &   > [         TU ]  " U0 UD6  g r   )r-   r.   )rC   inputsr   rE   s      rF   r.   BarkPreTrainedModel.__init__  s    &+F+rH   returnc                 d   [        U S5      (       d  [        U 5      $ U R                  5        Hv  n[        US5      (       d  M  [        UR                  S5      (       d  M3  UR                  R                  c  ML  [
        R                  " UR                  R                  5      s  $    [        U 5      $ )u
`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
_hf_hookexecution_device)hasattrr   modulesr   r   r=   devicer   s     rF   r   BarkPreTrainedModel.device  s     tZ(('--llnF
++FOO-?@@OO44@||FOO$D$DEE % $D))rH    )r   r   r   r   r   config_classsupports_gradient_checkpointing_supports_flash_attn_2r   r.   propertyr=   r   r   r   r   s   @rF   r   r   n  s=    L&+#!* , * * *rH   r   c                     ^  \ rS rSr\rU 4S jrS rS rSS jr	\
           SS\\R                     S\\\R                        S\\R                     S	\\R                     S
\\R                     S\\R                      S\\R                     S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       r\S\\\R                        S\R                  S\\\R                        4S j5       rSrU =r$ )BarkCausalModeli  c           
        > [         TU ]  U5        Xl        [        R                  " UR
                  UR                  5      U l        [        R                  " UR                  UR                  5      U l	        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        USS9PM     sn5      U l        UR$                  S:H  U l        [)        UR                  UR*                  S9U l        [        R.                  " UR                  UR0                  SS9U l        SU l        U R7                  5         g s  snf )NTr   r   r)   F)r-   r.   rD   r   r   input_vocab_sizer3   input_embeds_layerr<   position_embeds_layerr0   r/   drop
ModuleListrange
num_layersr   layersr   _use_flash_attention_2r   r*   layernorm_finalr8   output_vocab_sizelm_headgradient_checkpointing	post_initrC   rD   r   rE   s      rF   r.   BarkCausalModel.__init__  s     #%,,v/F/FHZHZ"[%'\\&2C2CVEWEW%X"JJv~~.	mmPUV\VgVgPh$iPh1Yv%FPh$ij&,&A&AEX&X#,V-?-?fkkRyy!3!3V5M5MTYZ&+# 	 %js   E!c                     U R                   $ r   r   rC   s    rF   get_input_embeddings$BarkCausalModel.get_input_embeddings  s    &&&rH   c                     Xl         g r   r  rC   new_embeddingss     rF   set_input_embeddings$BarkCausalModel.set_input_embeddings  s    "0rH   c                     UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUbZ  UR                  S   nUS   S   R                  S   nUR                  S   U:  a  Un	OUR                  S   S-
  n	US S 2U	S 24   nS nO8Ub&  UR                  S5      (       a  UR                  S   nOUR                  S   nUb  US S 2S U24   nUb  US S 2S U24   nUb[  UcX  UR                  5       R                  S5      S-
  nUR	                  US:H  S5        U(       a  US S 2UR                  S   * S 24   nOS nUb.  UR                  S5      (       a  S UUUR                  S5      UUS	.$ UUUR                  S5      UUS
.$ )Ninput_embedsrk   position_idsr   r   r   rx   rK   )	input_idsr  rw   rx   r  rk   )r  rw   rx   r  rk   )getr   longcumsummasked_fill_)
rC   r  rw   r   r  rk   r  seq_lenpast_lengthremove_prefix_lengths
             rF   prepare_inputs_for_generation-BarkCausalModel.prepare_inputs_for_generation  s   zz.$7$4d;zz.$7&ooa(G)!,Q/55a8K q!K/'2$ (1q'9A'=$!!%9%:":;I  L'FJJ{,C,C&,,Q/#//!, %+AxxK8N#'8G84L%,*>)..077;a?L%%n&91=+A	0B/B/D,DEL#

;(?(?! ,#2#ZZ4 ,"0  #.K0(,
 	
rH   r  rw   rk   r  rl   labelsr  rx   ry   output_hidden_statesreturn_dictr   c           
      j   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nSnUb  [        S5      eUb  Ub  [        S5      eUb  Uc  O$Ub  U R                  U5      nOUb  O[        S5      eUR                  5       SS nUR                  S   nUS   nUb  UR                  OUR                  nUc%  Sn[        S/[        U R                  5      -  5      nOUS   S   R                  S5      nUc9  [        R                  " UUU-   [        R                   US9nUR#                  S5      nU R%                  U5      nUbS  US::  a  [        S	5      eU R&                  (       a  SU;   a  UOSnO&UR)                  US5      n[+        X7R,                  S
S9nU R/                  XPR                   R0                  5      nU R3                  UU-   5      nUUR                  S5      4-   nU R4                  (       a/  U R6                  (       a  U(       a  [8        R;                  S5        SnU(       a  SOSnU	(       a  SOSnU
(       a  SOSn[=        [?        U R                  U5      5       H  u  nu  nnU
(       a  UU4-   nU R4                  (       a6  U R6                  (       a%  U RA                  URB                  USUUU   UU	5      nOU" UUUUU   UU	S9nUS   nU(       a	  UUS
   4-   nU	(       d  M  UUU(       a  SOS
   4-   nM     U RE                  U5      nUR)                  U5      nU
(       a  UU4-   nU RG                  U5      nU(       d  [        S SUUUU4 5       5      $ [I        UUUUUS9$ )a  
input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
    Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
    have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
    is used in priority instead of `input_ids`.
NzXTraining is not implemented yet for Bark - ensure you do not pass `labels` to the model.CYou cannot specify both input_ids and input_embeds at the same time4You have to specify either input_ids or input_embedsrK   r   rV   r,   r   $batch_size has to be defined and > 0r   tgt_lenzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   .0vs     rF   	<genexpr>*BarkCausalModel.forward.<locals>.<genexpr>  s      eae   	)losslogitsrw   rv   
attentions)%rD   ry   r  rx   use_return_dictNotImplementedErrorr7   r   rL   r   r   tuplelenr   r=   aranger  	unsqueezer   r   rA   r   r,   get_head_maskr   r   r   r   loggerwarning_once	enumeratezip_gradient_checkpointing_func__call__r   r   r   )rC   r  rw   rk   r  rl   r  r  rx   ry   r  r  r)  input_shaper   
seq_lengthr   r  position_embedsrv   output_shapepresent_key_valuesall_self_attentionsall_hidden_statesiblockpast_layer_key_valuesr}   r*  s                                rF   r~   BarkCausalModel.forward  s   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]%j   \%=bcc%/*A "229=L%STT"'')#2.!''*
 _
%.%:!!@S@S"K#TFS-=$=>O)!,Q/44R8K <<Z+5MUZU_U_hnoL'11!4L44\B %Q !GHH**343FD!/!4!4Z!D "<NL^L^hi!j &&y++2H2HI			,"@A"m&8&8&<%>>&&4==##p "	#,R$$5b4"6BD1:3t{{O;\1]-A-,#$58H$H!**t}};;NN!"aL%  !$9#1'l'&7 $AJM%771:-%G"  &9W)QYZ=[<]&]#= 2^@ ,,];%**<8   1]4D Dm,  &*<>OQde   &.+*
 	
rH   beam_idxc                 .   ^ [        U4S jU  5       5      $ )z
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
c              3   N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectrg   r   )r$  
past_staterD  s     rF   r&  ;BarkCausalModel._reorder_cache.<locals>.<genexpr>.<genexpr>  s1     j_iQ[))!X[[9J9J-KLL_is   7:Nr.  )r$  
layer_pastrD  s     rF   r&  1BarkCausalModel._reorder_cache.<locals>.<genexpr>  s'      
-
 j_ijjj-s   "%rK  )rw   rD  s    `rF   _reorder_cacheBarkCausalModel._reorder_cache  s      
-
 
 	
rH   )	r   rD   r   r   r   r   r   r   r   r   )NNNNNNNNNNN)r   r   r   r   r    r   r.   r  r	  r  r   r   r=   Tensorr   FloatTensor
LongTensorr@   r   r   r~   staticmethodrN  r   r   r   s   @rF   r   r     s   %L*'1<
|  -1>B15/3,0-1/3$(,0/3&*X
ELL)X
 "%(9(9":;X
 !.	X

 u||,X
 ELL)X
 ))*X
 u||,X
 D>X
 $D>X
 'tnX
 d^X
 
uU\\"$::	;X
 X
t 
uU\\23
?D||
	uU\\"	#
 
rH   r   z
    Bark semantic (or text) model. It shares the same architecture as the coarse model.
    It is a GPT-2 like autoregressive model with a language modeling head on top.
    )custom_introc                      ^  \ rS rSrSr\r   S
S\R                  S\	S\
\\\R                  4      S\
\R                     S\R                  4
U 4S jjjrS	rU =r$ )BarkSemanticModeli  semanticr  semantic_generation_confighistory_promptrk   r   c           
        > Uc  [        S5      eUR                  S   nUR                  nXR                  -   nUb-  UR	                  SU-
  R                  5       UR                  5      nUbB  US   U* S n[        R                  R                  USU[        U5      -
  4UR                  SS9nOJ[        R                  " UR                  /U-  [        R                  S9R                  U R                   5      n[        R"                  " US   USS	9n[        R                  " UR$                  //U-  [        R                  S9R                  U R                   5      n	[        R&                  " U R)                  USS2SU24   5      U R)                  USS2SUS-   24   5      -   U R)                  U	5      /SS	9n
[+        [-        UR.                  UR                  5      5      nUR1                  [+        [-        UR                  S-   U R2                  R4                  5      5      5        [7        XR                   S
9nUR9                  SUR:                  5      n[=        UR>                  XR                   S9n[@        TU ]  " [        RD                  " XgS-   4[        R                  U R                   S94U
X/US.UD6nUSS2US-   S24   nU$ )a   
Generates text semantic tokens from an input prompt and an additional optional `Bark` speaker prompt.

Args:
    input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
        Input ids, i.e tokenized input sentences. Will be truncated up to
        semantic_generation_config.max_input_semantic_length tokens. Note that the output audios will be as
        long as the longest generation among the batch.
    semantic_generation_config (`BarkSemanticGenerationConfig`):
        Generation config indicating how to generate the semantic tokens.
    history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
        Optional `Bark` speaker prompt.
    attention_mask (`Optional[torch.Tensor]`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
Returns:
    torch.LongTensor: Output semantic tokens.
N/`semantic_generation_config` has to be providedr   r   semantic_promptconstant)rj   moder+   r^   r   	min_eos_p)eos_token_idr`  r   r  )r  logits_processorgeneration_config)#r7   r   max_input_semantic_lengthtext_encoding_offsetrc   r@   text_pad_tokenr   r   padr/  semantic_pad_tokenr=   rO   intrg   r   repeat_interleavesemantic_infer_tokenru   r   listr   semantic_vocab_sizeextendrD   r   r   r  r`  r   ra  r-   generater?   )rC   r  rX  rY  rk   r   r   rd  semantic_historyinfer_arrayr  tokens_to_suppress suppress_tokens_logits_processorr`  early_stopping_logits_processorsemantic_outputrE   s                   rF   ro  BarkSemanticModel.generate  s   < &-NOO__Q'
$>$X$X! O OO	%!--q>/A.G.G.IKeKtKtuI%-.?@B[A[A\]!}}00 -4D0EEF0CC	  1    %||+>>?B[[chclcl bo  !223CD3I:[\]ll(==>?*LTYT]T]

"T[[/ 	 yy''	!5O6O5O2O(PQ))*:1>]@Y\]@]>];]*^_`''4
 
 ",@@B\BoBop
 	!!1DDqH$++JgJghi	
 ,II[dtdt+u(JJ{,F,P,PQ	*K3@@I^n^n+
'  '*JJ
$AB%))\`\g\gh
%>`8	

 
 *!-F-J-L*LMrH   r   NNN)r   r   r   r   base_model_prefixr   r   r=   rP  r#   r   r   strrR  ro  r   r   r   s   @rF   rV  rV    s     #%L
 DH<@15a<<a %Aa !c5<<&7!89	a
 !.a 
		a arH   rV  z
    Bark coarse acoustics model.
    It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
    language modeling head on top.
    c                   8  ^  \ rS rSrSr\r SS\S\S\S\S\S\\	\
\R                  4      4S	 jjr     SS
\R                  S\S\S\S\\	\
\R                  4      S\\   S\\R$                  \\R$                  \R$                  4   4   4U 4S jjjrSrU =r$ )BarkCoarseModeli  coarse_acousticsmax_coarse_historysemantic_to_coarse_ratior   rX  codebook_sizerY  c           
         UGb  [         R                  " US   S   USS9nUS   R                  5       nUb2  [        SUR                  S   5       H  n	XSS24==   XY-  -  ss'   M     [         R
                  " USS5      R                  S5      nXR                  -   n[         R                  " US   USS9n[        [        R                  " X-  5      5      n
[        U
UR                  S   UR                  S   S-  -
  [        [        R                  " UR                  S   U-  5      5      /5      n[        [        X-  5      5      nUSS2U* S24   R                  5       nUSS2U* S24   R                  5       nUSS2SS	24   nXx4$ [         R                  " / /U-  [         R                  U R                  S
9n[         R                  " / /U-  [         R                  U R                  S
9nXx4$ )ax  
Preprocess the optional `Bark` speaker prompts before `self.generate`.

Args:
    max_coarse_history (`int`):
        Maximum size of coarse tokens used.
    semantic_to_coarse_ratio (`int`):
        Ratio of semantic to coarse frequency
    batch_size (`int`):
        Batch size, i.e the number of samples.
    semantic_generation_config (`BarkSemanticGenerationConfig`):
        Generation config indicating how to generate the semantic tokens.
    codebook_size (`int`):
        Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
    history_prompt (`Optional[Dict[str,torch.Tensor]]`):
        Optional `Bark` speaker prompt.
Returns: Returns:
    `tuple(torch.FloatTensor)`:
    - **x_semantic_history** (`torch.FloatTensor` -- Processed semantic speaker prompt.
    - **x_coarse_history** (`torch.FloatTensor`) -- Processed coarse speaker prompt.
Nr\  r   r^   coarse_promptr   rK   r   rV   r  )r=   rj  cloner   r   rW   reshaperm  ri  npfloorre   roundrO   r   )rC   r}  r~  r   rX  r  rY  x_semantic_historyx_coarse_historynmax_semantic_historyn_semantic_hist_providedn_coarse_hist_provideds                rF   preprocess_histories$BarkCoarseModel.preprocess_histories  s   < %!&!8!8HY9Z[_9`blrs!t-o>DDF (q"2"8"8";<A$T*m.??* =
  %/?AFNNrR/2`2``$667G7Mz_`a $'rxx0B0]'^#_ '*(&,,Q/2D2J2J12MPQ2QQ!1!7!7!:=U!UVW($ &)/G/b)c%d"!3A8P7P7Q4Q!R!V!V!X/4J3J3K0KLPPR/3B37 "33 "'rdZ.?uyyY]YdYd!e$||RD:,=UYYW[WbWbc!33rH   ru  coarse_generation_configreturn_output_lengthsr   c           
        > Uc  [        S5      eUc  [        S5      eUR                  nUR                  n	UR                  n
UR	                  XR
                  :H  UR                  5        UR                  UR                  -  UR                  -  n[        [        R                  " X-  5      5      nXR                  :g  R                  S5      n[        R                  " X-  UR                  -  5      n[        R                  " XR                  -  5      R                  5       n[        R                   " U5      R#                  5       nUR$                  S   nU R'                  UU	UUUUS9u  nnUR$                  S   n[        R(                  " UU/5      n[        [        R*                  " X-  5      5      nSnUR$                  S   n[-        U5       GHZ  nU[        [        UU-  5      5      -   nUSS2[        R                   " SUU-
  /5      S24   nUSS2SU24   n[.        R0                  " USUUR$                  S   -
  4SUR                  5      n[        R(                  " U[        R2                  " UR4                  //U-  U R6                  S	9USS2U	* S24   /5      n[9        UR$                  S   UR:                  U5      n[<        TU ]|  " U4U/[A        XU-
  5      US
.UD6nUR$                  S   n[        R(                  " UUSS2US24   /5      nUR$                  S   U-
  nAGM]     USS2US24   nU(       a  UU4$ U$ )a  
Generates coarse acoustics tokens from input text semantic tokens and an additional optional `Bark` speaker
prompt.

Args:
    semantic_output (`torch.Tensor` of shape (batch_size, seq_len), *optional*):
        Input text semantic ids, i.e the output of `BarkSemanticModel.generate`.
    semantic_generation_config (`BarkSemanticGenerationConfig`):
        Generation config indicating how to generate the semantic tokens.
    coarse_generation_config (`BarkCoarseGenerationConfig`):
        Generation config indicating how to generate the coarse tokens.
    codebook_size (`int`, *optional*, defaults to 1024):
        Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
    history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
        Optional `Bark` speaker prompt.
    return_output_lengths (`bool`, *optional*):
        Whether or not to return the output lengths. Useful when batching.
Returns:
    By default:
        torch.LongTensor: Output coarse acoustics tokens.
    If `return_output_lengths=True`:
        `Tuple(torch.Tensor, torch.Tensor): The output coarse acoustics tokens, and the length of each sample
        of the batch.
Nr[  -`coarse_generation_config` has to be providedr   r   )rY  r}  r~  r   rX  r  rK   r]  r_  )rb  max_new_tokensrc  )!r7   max_coarse_input_lengthr}  sliding_window_lenr  rh  coarse_semantic_pad_tokencoarse_rate_hzsemantic_rate_hzn_coarse_codebooksri  r  r  sumr=   r  maxitemr   r  hstackceilr   r   rg  rO   coarse_infer_tokenr   r   rm  r-   ro  re   )rC   ru  rX  r  r  rY  r  r   r  r}  r  r~  r  output_lengthsmax_generated_lenr   r  x_coarsebase_semantic_idxn_window_stepstotal_generated_lenlen_coarse_historyr   semantic_idxinput_coarsealternatingLogitsProcessoroutput_coarseinput_coarse_lencoarse_outputrE   s                                rF   ro  BarkCoarseModel.generateg  s   F &-NOO#+LMM":"R"R5HH5HH 	$$LLL$>>	
 %33(99:&99: 	!
  #288,>,Y#Z[)-_-__ddefg58P8c8cc
 ^6a6a%abffh!IIn5::<$**1-
'+'@'@)1%=!'A' (A (
$H /44Q7,,(:O'LMRWW%6%KLM%^^A.~&A,s59LOg9g3h/iiL +1bffaH\9\5].^.`+`aL'+C,C+C(CDL55+l.@.@.DDE(BB	L !<< LL#;#N#N"O!PS]!]fjfqfqrQ!3 3 445L *M""1%*>>*& "G,"<!="#5K^7^_":	
 M  ,11!4||X}Q@P@Q=Q/R$STH"*.."36H"HQ 'T !$6$7!78  .00rH   r   r   )NN   NN)r   r   r   r   rx  r   r   ri  r   r   ry  r=   rP  r  r#   r!   r@   r   rR  r   ro  r   r   r   s   @rF   r{  r{    s    +#L =AH4H4 #&H4 	H4
 %(H4 H4 !c5<<&7!89H4Z DH?C!<@04FF %AF #=	F
 F !c5<<&7!89F  (~F 
uu'7'79I9I'I!JJ	KF FrH   r{  z
    Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
    language modeling heads, one for each codebook.
    c                   N  ^  \ rS rSrSr\rSrU 4S jrS r	S r
S rS rS#S	 jr   S$S
\\   S\\   S\S\R$                  4S jjrS rS r\         S%S\S\\R0                     S\\R0                     S\\R0                     S\\R0                     S\\R2                     S\\R0                     S\\   S\\   S\\   S\\\R0                     \4   4S jj5       r     S&S\R0                  S\S\S\ S\S \\!\"\R0                  4      S\R2                  4S! jjr#S"r$U =r%$ )'BarkFineModeli  fine_acousticscodebook_idxc                   > [         TU ]  U5        Xl        [        R                  " [        UR                  5       Vs/ s H.  n[        R                  " UR                  UR                  5      PM0     sn5      U l
        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " [        UR                   5       Vs/ s H  n[#        USS9PM     sn5      U l        UR&                  S:H  U l        [        R*                  " UR                  5      U l        [        R                  " [        UR.                  UR                  5       Vs/ s H-  n[        R0                  " UR                  UR2                  SS9PM/     sn5      U l        SU l        UR                  U l        U R9                  5         g s  snf s  snf s  snf )NFr   r   r)   )r-   r.   rD   r   r   r   n_codes_totalr   r   r3   input_embeds_layersr<   r   r0   r/   r   r   r   r   r   r   r   r   n_codes_givenr8   r   lm_headsr   r   r   s      rF   r.   BarkFineModel.__init__  s     $&==PUV\VjVjPklPk1R\\&1163E3EFPkl$
  &(\\&2C2CVEWEW%X"JJv~~.	mmQVW]WhWhQi$jQiAYv%GQi$jk&,&A&AEX&X#!||F,>,>? v33V5I5IJJA 		&,,f.F.FUSJ
 ',##11 	+ m %ks   5G/;G44G9c                     U R                   $ r   r  r  s    rF   r  "BarkFineModel.get_input_embeddings  s    '''rH   c                     Xl         g r   r  r  s     rF   r	  "BarkFineModel.set_input_embeddings  s    #1 rH   c                     U R                   $ r   r  r  s    rF   get_output_embeddings#BarkFineModel.get_output_embeddings"  s    }}rH   c                     Xl         g r   r  )rC   new_output_embeddingss     rF   set_output_embeddings#BarkFineModel.set_output_embeddings&  s    -rH   c                 "   U R                  5       n[        R                  " U Vs/ s H  nU R                  XQX#5      PM     sn5      nU R	                  U5        US   R
                  R                  S   nU R                  5       bq  U R                  R                  (       dV  U R                  5       n[        R                  " U Vs/ s H  oR                  X5      PM     sn5      n	U R                  U	5        U R                  5       $ s  snf s  snf )Nr   )r  r   r   _get_resized_embeddingsr	  r   r   r  rD   tie_word_embeddings_get_resized_lm_headr  )
rC   new_num_tokenspad_to_multiple_ofmean_resizingold_embeddings_listold_embeddingsnew_embeddings_listold_lm_head_listold_lm_headnew_lm_head_lists
             rF   _resize_token_embeddings&BarkFineModel._resize_token_embeddings*  s    "779 mm ':&9N ,,^M_o&9
 	!!"56,Q/66<<Q? %%'3DKK<[<[#99;!}}[kl[kK**;G[kl  &&'78((**! ms   DDr  r  r  r   c                    U R                  XU5      nUc  Uc  U$ US   R                  R                  S   U R                  l        US   R                  R                  S   U R                  l        US   R                  R                  S   U l        US   R                  R                  S   U l        U R                  5         U$ )a  
Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

Arguments:
    new_num_tokens (`int`, *optional*):
        The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
        returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the embedding matrix to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
        details about this, or help on choosing the correct value for resizing, refer to this guide:
        https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
    mean_resizing (`bool`):
        Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
        covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

        Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
        where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
        old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
        Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

Return:
    `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
r   )r  r   r   rD   r   
vocab_sizetie_weights)rC   r  r  r  model_embedss        rF   resize_token_embeddings%BarkFineModel.resize_token_embeddings?  s    F 44^Yfg!&8&@ )5Q(>(>(D(DQ(G%!-a!7!7!=!=a!@!-a!7!7!=!=a!@&q/0066q9 	rH   c                 v   [        U R                  SS5      (       a  / U l        U R                  5       nU R	                  5       n[        U R                  R                  U R                  R                  -
  5       H;  nU R                  X   X#S-      5        U R                  R                  SU S35        M=     g g )Nr  Tr   	lm_heads..weight)
getattrrD   _tied_weights_keysr  r  r   r  r  _tie_or_clone_weightsappend)rC   output_embeddingsinput_embeddingsr@  s       rF   _tie_weightsBarkFineModel._tie_weightsq  s    4;; 5t<<&(D# $ : : <#88:4;;44t{{7P7PPQ**+<+?AQVWRWAXY''..1#W/EF R =rH   c                    [        U R                  SS5      (       a  / U l        U R                  5       nU R	                  5       n[        U R                  R                  U R                  R                  -
  5       H;  nU R                  X   X#S-      5        U R                  R                  SU S35        M=     U R                  5        H&  n[        US5      (       d  M  UR                  5         M(     g)z
Tie the weights between the input embeddings list and the output embeddings list.

If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
weights instead.
r  Tr   r  r  r  N)r  rD   r  r  r  r   r  r  r  r  r   r   r  )rC   r  r  r@  r   s        rF   r  BarkFineModel.tie_weights|  s     4;; 5t<<&(D# $ : : <#88:4;;44t{{7P7PPQ**+<+?AQVWRWAXY''..1#W/EF R
 llnFv~..##% %rH   r  rk   r  rl   r  r  ry   r  r  c           
      B   Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
SnUb  [	        S5      eUS:X  a  [        S5      eUb  Ub  [        S5      eUc  Uc  [        S5      eUb~  [        U R                  5       VVs/ s H&  u  pU" USS2SS2U4   5      R                  S5      PM(     nnn[        R                  " USS9nUSS2SS2SS2SUS	-   24   R                  SS9nUR                  5       SS nUR                  S   nUS	   nUb  UR                  OUR                  nUc6  [        R                  " SU[        R                   US
9nUR                  S5      nU R#                  U5      nUbA  US::  a  [        S5      eU R$                  (       a  SU;   a  UOSnO['        X7R(                  S	S9nU R+                  XPR                   R,                  5      nU R/                  UU-   5      nUUR                  S5      4-   nU(       a  SOSnU	(       a  SOSn[        U R0                  5       H5  u  nnU	(       a  UU4-   nU" UUX\   US9nUS   nU(       d  M,  UUS	   4-   nM7     U R3                  U5      nUR5                  U5      nU	(       a  UU4-   nU R6                  XR                   R8                  -
     " U5      nU
(       d  [;        S SUUU4 5       5      $ [=        UUUUS9$ s  snnf )a  
codebook_idx (`int`):
    Index of the codebook that will be predicted.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    NOT IMPLEMENTED YET.
input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
    `past_key_values` is used, optionally only the last `input_embeds` have to be input (see
    `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
    associated vectors than the model's internal embedding lookup matrix.
NzTraining is not implemented yetr   zRCannot predict 0th codebook - 0th codebook should be predicted by the coarse modelr  r  rK   r^   r   r  r  r   r   )rk   rl   ry   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   r#  s     rF   r&  (BarkFineModel.forward.<locals>.<genexpr>  s     l$Zq$Zr(  )r)  r*  rv   r+  )rD   ry   r  r,  r-  r7   r5  r  r1  r=   ru   r  rL   r   r   r0  r  r   r   r   r,   r2  r   r   r   r   rA   r  r  r.  r   )rC   r  r  rk   r  rl   r  r  ry   r  r  r)  r@  r   r9  r   r:  r   r;  rv   r<  r>  r?  rA  r}   r*  s                             rF   r~   BarkFineModel.forward  sT   2 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%&GHH1qrr \%=bcc!5STT  .7t7O7O-P-P)A #9Q1W#56@@D-P   !99\r:L'1a1C<!3C1C(CDHHRHPL"'')#2.!''*
 ^
%.%:!!@S@S <<:UZZPVWL'11!4L44\B %Q !GHH**343FD "<NL^L^hi!j&&y++2H2HI			,"@A"m&8&8&<%>>$5b4"6BD!$++.HAu#$58H$H!-#,"3	G $AJM  &9WQZM&I# /  ,,];%**<8   1]4D D|kk.G.GGHWlT63DFY$Zlll+*	
 	
Es   0-Lr  rX  r  fine_generation_configr  rY  c           	         Uc  [        S5      eUc  [        S5      eUc  [        S5      eUR                  SUR                  5      nUR                  n	UR                  n
UR                  UR                  S   SUR                  5      n[        R                  " XR                  -
  U5      nUR                  S   nUb'  [        R                  " US   R                  S   USS	9nOSnUR                  n[        R                  " USUR                  U-
  4S
U5      nUbA  [        R                   " USS2U	* S2SS24   U/SS	9nUSS2U	* S2SS24   R                  S   nOSnSnUR                  S   U
:  a,  XR                  S   -
  n[        R                  " USSSU4S
US9nUR                  S   X-
  -
  U	-  n[#        [$        R&                  " U5      5      n[)        SU5      S-   n[+        U5       GH  n[-        UU	-  UR                  S   U
-
  /5      n[-        UUU	-  -   UR                  S   U	-
  /5      nUU-
  nUSS2UUU
-   2SS24   n[+        XR                  5       H  nU R/                  UU5      R0                  nUb  US:X  a&  USS2US2SU24   n[        R2                  " US5      nOgUSS2SS2SU24   U-  n[        R4                  " USS	9SS2UU
24   nUR7                  SU45      n[        R8                  " USS9R                  US5      nUR;                  [        R<                  5      nUUSS2US2U4'   AAM     [+        XR                  5       H  nUSS2US2U4   USS2UUU
U-
  -   2U4'   M!     AGM     UR?                  SS5      SS2SS2US24   nUS:  a  USS2SS2SU* 24   nUR                  S   UR                  S   :w  a  [        S5      eU$ )a  
Generates fine acoustics tokens from input coarse acoustics tokens and an additional optional `Bark` speaker
prompt.

Args:
    coarse_output (`torch.Tensor` of shape (batch_size, seq_len)):
        Input coarse acoustics ids, i.e the output of `BarkCoarseModel.generate`.
    semantic_generation_config (`BarkSemanticGenerationConfig`):
        Generation config indicating how to generate the semantic tokens.
    coarse_generation_config (`BarkCoarseGenerationConfig`):
        Generation config indicating how to generate the coarse tokens.
    fine_generation_config (`BarkFineGenerationConfig`):
        Generation config indicating how to generate the fine tokens.
    codebook_size (`int`, *optional*, defaults to 1024):
        Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
    history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
        Optional `Bark` speaker prompt.
Returns:
    torch.LongTensor: Output fine acoustics tokens.
Nr[  r  z+`fine_generation_config` has to be providedtemperaturer   rK   fine_promptr^   r]  r   )r^  rj   r]   )num_samplesr   rV   z-input and output should have the same seq_len) r7   r  r  max_fine_history_lengthmax_fine_input_lengthrA   r   r  r=   	remainderrm  rj  Tr   rg  n_fine_codebooksru   ri  r  r  r  r   re   r~   r*  argmaxrf   r  multinomialrg   int32rW   )rC   r  rX  r  r  r  rY  r   r  r  r  r   x_fine_historyn_coarse
fine_input	n_historyn_remove_from_endn_loopsn_outer	start_idxstart_fill_idxrel_start_fill_idxinput_buffern_innerr*  relevant_logitscodebook_predsprobss                               rF   ro  BarkFineModel.generate  st   < &-NOO#+LMM!)JKK
 jj0F0R0RS"8"P"P 6 L L &**=+>+>q+A2G_GrGrs 8f8f(fhuv"((+
%"44^M5R5T5TUY5Z\flmnN "N+>> UU&77(BC	

 %N17N6N6OQR3R$SU_#`fghJ 'q+B*B*CQ'FGMMaPIIA!66 58H8H8K KzAq!5F+Gj`mnJ !&&q)-B-NOSjjbggg&'a/A%W~GW'>>
@P@PQR@SVk@klmI W'>>>
@P@PQR@SVm@mnN "0)!;%aYAV5V)VXY&YZL +R+RSg|<CC&+*<&,Q0B0C^m^-S&TO%*\\/2%FN&,Q>M>-A&B[&POIIo2>qBTUjBj?jkE!MM2}*=>E%*%6%6u!%L%Q%QR\^`%aN!/!2!25;;!?@NQ 2 3W<=N T$ !+R+RS !$6$7!@A ~:ORd:d(eegnn T = &@  ))!Q/1ij0@A
q #Aq*=,=+=*=$=>JB=#6#6r#::LMMrH   )r  r   rD   r   r   r  r   r   r  r  r   r   r  NT)NNT)	NNNNNNNNN)NNNr  N)&r   r   r   r   rx  r   r   main_input_namer.   r  r	  r  r  r  r   ri  r@   r   r   r  r  r  r   r=   rP  rR  r   r   r   r~   r#   r!   r"   r   ry  ro  r   r   r   s   @rF   r  r    s    )!L$O>(2.+. )-,0"	0 0 %SM0 	0
 
0d	G&*  -115/3,0-1/3,0/3&*w
w
 ELL)w
 !.	w

 u||,w
 ELL)w
 ))*w
 u||,w
 $D>w
 'tnw
 d^w
 
uU\\"N2	3w
 w
x DH?C;?!<@F||F %AF #=	F
 !9F F !c5<<&7!89F 
		F FrH   r  a7  
    The full Bark model, a text-to-speech model composed of 4 sub-models:
    - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
      takes
    as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
    - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
    that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary
    to `encodec`.
    - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
    predicts the last codebooks based on the sum of the previous codebooks embeddings.
    - having predicted all the codebook channels from the [`EncodecModel`], Bark uses it to decode the output audio
      array.

    It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
    output sound according to specific predefined voice.
    c                     ^  \ rS rSr\rU 4S jr\S\4S j5       r	\
S\R                  4S j5       r SS\\   4S jjrSS jr\R"                  " 5          SS	\\R$                     S
\\\\R$                  4      S\\   S\R*                  4S jj5       r\    SS\\R.                     S\\\\\\4   4      S\S\4U 4S jjj5       rSrU =r$ )	BarkModeli  c                   > [         TU ]  U5        [        UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l
        [        R                  " UR                  5      U l        Xl        g r   )r-   r.   rV  semantic_configrW  r{  coarse_acoustics_configr|  r  fine_acoustics_configr  r   from_configcodec_configcodec_modelrD   r   s     rF   r.   BarkModel.__init__  sf     )&*@*@A /0N0N O+F,H,HI$001D1DErH   r   c                     gr	  r   )clss    rF   can_generateBarkModel.can_generate  s     rH   c                 x   [        U R                  S5      (       d  [        U 5      $ U R                  R                  5        Hv  n[        US5      (       d  M  [        UR                  S5      (       d  M3  UR                  R
                  c  ML  [        R                  " UR                  R
                  5      s  $    g)r   r   r   N)r   rW  r   r   r   r   r=   r   r   s     rF   r   BarkModel.device  s     t}}j11'--mm++-F
++FOO-?@@OO44@||FOO$D$DEE .rH   accelerator_idc                    [        5       (       a  SSKJn  O[        S5      eUR	                  SS5      nUS:w  a  [
        R                  " S[        5        UnSn[        5       (       a(  [        R                  R                  5       R                  n[        R                  " U SU 35      n[        [        U5      nU R                  R                  S:w  a!  U R                  S5        UR!                  5         U" U R"                  R$                  U5      u  U R"                  l        nS	n	U R"                  U R&                  U R(                  4 H  n
U" XU	S
9u  pM     Xl        U" U R,                  XiS
9u  pXl        g	)aP  
Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
method moves one whole sub-model at a time to the accelerator when it is used, and the sub-model remains in accelerator until the next sub-model runs.

Args:
    accelerator_id (`int`, *optional*, defaults to 0):
        accelerator id on which the sub-models will be loaded and offloaded. This argument is deprecated.
    kwargs (`dict`, *optional*):
        additional keyword arguments:
            `gpu_id`: accelerator id on which the sub-models will be loaded and offloaded.
r   )cpu_offload_with_hookz1`enable_model_cpu_offload` requires `accelerate`.gpu_idzThe argument `gpu_id` is deprecated and will be removed in version 4.54.0 of Transformers. Please use `accelerator_id` instead.cuda:cpuN)prev_module_hook)r   
accelerater  ImportErrorr  warningswarnFutureWarningr   r=   acceleratorcurrent_acceleratortyper   r  rg   empty_cacherW  r   r|  r  fine_acoustics_hookr  codec_model_hook)rC   r  r   r  r  device_typer   torch_accelerator_moduler   hookcpu_offloaded_models              rF   enable_cpu_offloadBarkModel.enable_cpu_offload  sP     #$$8QRRHa(Q;MM R $N)++++??AFFKQ~.>?@#*5+#> ;;u$GGEN$002 /DDMMDdDdfl.m+(!MM!!$

 ,,?Z^_GAt$
 $( '(8(8&X !%rH   c                    UR                  SS5      nU R                  R                  R                  U5      nUbq  [	        X25       VVs/ s H  u  pEUSS2SU24   R                  S5      PM!     nnnU Vs/ s H+  o@R                  R                  U5      R                  5       PM-     nnU$ U R                  R                  U5      nUR                  S5      nU$ s  snnf s  snf )z:Turn quantized audio codes into audio array using encodec.r   r   N)rW   r  	quantizerdecoder6  r1  decodersqueeze)rC   fine_outputr  embsamplelout	audio_arrs           rF   codec_decodeBarkModel.codec_decode  s     "++Aq1((//<% BESAYZAY+66!RaR%=**1-AYCZRUVRU))11&9AACRUIV
  ""**3/CAI [Vs   	&C62C r  rY  r  c           	         [        S0 U R                  R                  D6n[        S0 U R                  R                  D6n[        S0 U R                  R                  D6nUR                  SS5      UR                  SS5      S.n0 n	0 n
UR                  5        H  u  pUR                  S5      (       a  U[        S5      S nXU'   M/  UR                  S5      (       a  U[        S5      S nXU'   MY  UR                  S5      (       a  U[        S5      S nXU'   M  X;  a  XU'   X;  a  XU'   X;  d  M  XU'   M     SU;   a  UR                  S5        U R                  R                  " U4UUS	.UD6nSU	;   a  U	R                  S5        U R                  R                  " U4UUUU R                  R                  US
.U	D6nSnU(       a  Uu  pXR                  -  nSU
;   a  U
R                  S5        U R                   R                  " U4UUUUU R                  R                  S.U
D6n[#        U SS5      bD  U R$                  R'                  5         U R(                  R+                  U R,                  5      U l        U R/                  UU5      n[#        U SS5      b  U R0                  R'                  5         U(       aH  U Vs/ s H  n[        U5      PM     nn[2        R4                  R6                  R9                  USSS9nUU4$ U$ s  snf )aV  
Generates audio from an input prompt and an additional optional `Bark` speaker prompt.

Args:
    input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
        Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
        longest generation among the batch.
    history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
        Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
    kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:

        - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
        - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
        semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.

        This means you can, for example, specify a generation strategy for all sub-models except one.
    return_output_lengths (`bool`, *optional*):
        Whether or not to return the waveform lengths. Useful when batching.
Returns:
    By default:
        - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
    When `return_output_lengths=True`:
        Returns a tuple made of:
        - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
        - **output_lengths** (`torch.Tensor` of shape (batch_size)): The length of each waveform in the batch
Example:

```python
>>> from transformers import AutoProcessor, BarkModel

>>> processor = AutoProcessor.from_pretrained("suno/bark-small")
>>> model = BarkModel.from_pretrained("suno/bark-small")

>>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
>>> voice_preset = "v2/en_speaker_6"

>>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)

>>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
>>> audio_array = audio_array.cpu().numpy().squeeze()
```
rk   Nr`  )rk   r`  	semantic_coarse_fine_rc  )rY  rX  )rY  rX  r  r  r  )rY  rX  r  r  r  r,  r-  Tr   )batch_firstpadding_valuer   )r#   rc  r  r!   r  r"   r  popitems
startswithr/  rW  ro  r|  r  r  r  r  r,  offloadr  rg   r   r?  r-  r   utilsrnnpad_sequence)rC   r  rY  r  r   rX  r  r  kwargs_semantickwargs_coarsekwargs_fineri   rj   ru  r  r  outputaudior;  s                      rF   ro  BarkModel.generate  s    h &B%kDDZDZDjDj%k"#=#o@V@V@n@n#o !9!iD<R<R<h<h!i %jj)94@K6

  ,,.JC~~k**#k*,-',$	**#i.*+%*c"((#g,.)#(C  -+0C(+).#&)',$% )* /1 34--00
)'A
 	
 -/12--66
)'A%=00>>"7
 
  ,9)M+/Z/ZZN +-OO/0$$--
)'A%=#900>>
 
 4.5A $$,,.#//224;;?D !!&.94+T2>!!))+ 8=>fc&kN>HHLL--eUV-WE.((	 ?s    Ltorch_dtype
device_maphard_check_onlycheck_device_mapc                    > [         TU ]  XX4US9nUR                  UR                  l        UR                  UR                  l        UR                  UR
                  l        U$ )a  
`_check_and_enable_flash_attn_2` originally don't expand flash attention enabling to the model
sub-configurations. We override the original method to make sure that Bark sub-models are using Flash Attention
if necessary.

If you don't know about Flash Attention, check out the official repository of flash attention:
https://github.com/Dao-AILab/flash-attention

For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a look at this
specific section of the documentation to learn more about it:
https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models

The method checks if the current setup is compatible with Flash Attention as it requires the model to be in
half precision and not ran on CPU.

If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flash_attention_2" so that the model
can initialize the correct attention module
)rV  rW  )r-   _check_and_enable_flash_attn_2r   r  r  r  )r  rD   rT  rU  rV  rW  rE   s         rF   rY  (BarkModel._check_and_enable_flash_attn_2  sd    6 7_o 8 
 7=6Q6Q3>D>Y>Y&&;<B<W<W$$9rH   )r|  r  r-  rD   r  r,  rW  )r   r   rw  )NNFF)r   r   r   r   r   r   r.   classmethodr@   r  r   r=   r   r   ri  r2  r?  no_gradrP  r   ry  rR  ro  r,   r   rY  r   r   r   s   @rF   r  r    sJ   & L	 T   F F F& )*8% 8%t$ ]]_ -1<@04	OELL)O !c5<<&7!89O  (~	O 
		O Ob  .2;? %!&! ekk*! U3S#X#678	!
 ! ! !rH   r  )r  rV  r{  r  r   r   )Cr   ra   r%  typingr   r   r   r   numpyr  r=   r   torch.nnr   r   
generationr
   generation.logits_processr   r   r   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   modeling_outputsr   r   modeling_utilsr   r   rK  r   r   r   r   autor   configuration_barkr   r   r   r   r    generation_configuration_barkr!   r"   r#   r$   
get_loggerr   r3  Moduler&   r   r   r   r   r   r   r   rV  r{  r  r  __all__r   rH   rF   <module>rl     s      / /    $ ) 
 C h F C     J 
		H	%z		 zzV/ Vt 0 	XBII 	Xbii  3		 3l **/ ** **\D
)? D
N e eeP To TTn [' [[| $h# h%$hV	rH   