"""PyTorch BARK model."""

import math
import warnings
from typing import Dict, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from ...generation import GenerationMixin
from ...generation.logits_process import (
    AlternatingCodebooksLogitsProcessor,
    BarkEosPrioritizerLogitsProcessor,
    SuppressTokensLogitsProcessor,
)
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput
from ...modeling_utils import PreTrainedModel, get_parameter_device
from ...utils import auto_docstring, is_accelerate_available, is_torch_accelerator_available, logging
from ..auto import AutoModel
from .configuration_bark import (
    BarkCoarseConfig,
    BarkConfig,
    BarkFineConfig,
    BarkSemanticConfig,
    BarkSubModelConfig,
)
from .generation_configuration_bark import (
    BarkCoarseGenerationConfig,
    BarkFineGenerationConfig,
    BarkSemanticGenerationConfig,
)


if is_flash_attn_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward


logger = logging.get_logger(__name__)
class BarkSelfAttention(nn.Module):
    # adapted from GPT-2 attention
    def __init__(self, config, is_causal=False):
        super().__init__()

        # regularization
        self.dropout = config.dropout
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads

        if config.hidden_size % config.num_heads != 0:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        # key, query, value projections for all heads, but in a batch
        self.att_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias)
        # output projection
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.bias)

        self.is_causal = is_causal
        if is_causal:
            block_size = config.block_size
            bias = torch.tril(torch.ones((block_size, block_size), dtype=bool)).view(1, 1, block_size, block_size)
            self.register_buffer("bias", bias)

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, num_heads, seq_length, head_dim)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.transpose(1, 2).contiguous()
        tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))
        return tensor

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # unlike GPT-2, the attention weights are scaled by 1 / sqrt(head_dim)
        attn_weights = torch.matmul(query, key.transpose(-1, -2)) * (1.0 / math.sqrt(self.head_dim))

        if self.is_causal:
            query_length, key_length = query.size(-2), key.size(-2)

            # fill the upper triangle of the attention weights with the dtype minimum
            attn_weights = attn_weights.masked_fill(
                self.bias[:, :, key_length - query_length : key_length, :key_length] == 0,
                torch.finfo(attn_weights.dtype).min,
            )

        if attention_mask is not None:
            # apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)  # (batch, num_heads, seq_len, head_dim)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        past_key_values=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if past_key_values is not None:
            past_key = past_key_values[0]
            past_value = past_key_values[1]
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs
dZ  ZS )BarkSelfFlashAttention2aH  
    Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                    s   t  j|i | t | _d S N)r)   r*   r   _flash_attn_uses_top_left_mask)r=   argskwargsr?   rA   rB   r*      s   z BarkSelfFlashAttention2.__init__c                 C   s&   |  dd ||f }||}|S )rC   NrD   )rE   r<   rF   rA   rA   rB   rI      s   
z$BarkSelfFlashAttention2._split_headsc                 C   s$   | | dd || f }|S )rJ   NrK   )r<   rE   rN   rA   rA   rB   rO      s    z$BarkSelfFlashAttention2._merge_headsNFc              
   C   s>  |  \}}}	| |j| jdd\}
}}| |
| j| j}
| || j| j}| || j| j}|d urV|d dd}|d dd}tj	||fdd}tj	||fdd}|du rg|dd|ddf}nd }t
|
||||| jru| jnd| j| jd}| || j| j}| |}| |}||f}|rd }||f7 }|S )Nr   rQ   r   r   T        )r+   Zuse_top_left_maskr7   )rE   r5   rb   r0   rI   r1   r2   rL   r9   rc   r#   trainingr+   ru   r7   rO   r6   r.   )r=   rd   r]   re   r^   rf   rg   
batch_sizeZ	query_len_rZ   r[   r\   rh   ri   rj   r`   rk   r_   rA   rA   rB   rl      s>   	


zBarkSelfFlashAttention2.forwardrn   )	ro   rp   rq   __doc__r*   rI   rO   rl   rr   rA   rA   r?   rB   rs      s    
rs   )eagerflash_attention_2c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )BarkLayerNormzOLayerNorm but with an optional bias. PyTorch doesn't support simply bias=False.Tc                    s@   t    tt|| _|rtt|| _d S d | _d S rt   )	r)   r*   r   	Parameterr9   r:   weightZzerosr&   )r=   r/   r&   r?   rA   rB   r*     s   
$zBarkLayerNorm.__init__c                 C   s   t j|| jj| j| jddS )Ngh㈵>)Zeps)FZ
layer_normr   shaper&   )r=   inputrA   rA   rB   rl   $  s   zBarkLayerNorm.forward)T)ro   rp   rq   r|   r*   rl   rr   rA   rA   r?   rB   r     s    r   c                       s$   e Zd Z fddZdd Z  ZS )BarkMLPc                    s^   t    tj|jd|j |jd| _tjd|j |j|jd| _t|j	| _	t
 | _d S )N   r%   )r)   r*   r   r4   r/   r&   in_projr6   r,   r+   ZGELUgelur=   r>   r?   rA   rB   r*   )  s
   
zBarkMLP.__init__c                 C   s,   |  |}| |}| |}| |}|S rt   )r   r   r6   r+   )r=   rd   rA   rA   rB   rl   0  s
   



zBarkMLP.forwardro   rp   rq   r*   rl   rr   rA   rA   r?   rB   r   (  s    r   c                       s2   e Zd Zd fdd	Z					dddZ  ZS )		BarkBlockFc                    sr   t    |rt|j|jd| _t|j|jd| _nt|j| _t|j| _t	|j
 ||d| _t|| _d S )Nr%   r7   )r)   r*   r   r/   r&   layernorm_1layernorm_2r   	LayerNormBARK_ATTENTION_CLASSES_attn_implementationattnr   mlp)r=   r>   r7   r?   rA   rB   r*   9  s   
zBarkBlock.__init__Nc                 C   sx   |  |}| j||||||d}|d }	|dd  }
||	 }|| | | }|r1|f|
 }
|
S |f|
dd   }
|
S )Nre   r]   r^   rf   rg   r   r   )r   r   r   r   )r=   rd   re   r]   r^   rf   rg   Zintermediary_hidden_statesZattn_outputsr`   rk   rA   rA   rB   rl   J  s(   
		
zBarkBlock.forwardrm   rn   r   rA   rA   r?   rB   r   8  s    r   c                       sD   e Zd ZeZdZdZdd Z fddZe	de
jfdd	Z  ZS )
@auto_docstring
class BarkPreTrainedModel(PreTrainedModel):
    config_class = BarkSubModelConfig
    supports_gradient_checkpointing = False
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """

        # if the model has been offloaded with accelerate, the device has to be found in the `_hf_hook`
        if not hasattr(self, "_hf_hook"):
            return get_parameter_device(self)
        for module in self.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)

        return get_parameter_device(self)


# GPT2-like autoregressive model
class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
    config_class = BarkSubModelConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # initialize as an autoregressive GPT-like model
        self.input_embeds_layer = nn.Embedding(config.input_vocab_size, config.hidden_size)
        self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([BarkBlock(config, is_causal=True) for _ in range(config.num_layers)])
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        self.layernorm_final = BarkLayerNorm(config.hidden_size, bias=config.bias)

        self.lm_head = nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.input_embeds_layer

    def set_input_embeddings(self, new_embeddings):
        self.input_embeds_layer = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        input_embeds = kwargs.get("input_embeds", None)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if past_key_values is not None:
            # Omit tokens covered by past_key_values
            seq_len = input_ids.shape[1]
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already only pass the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to the old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

            # input_embeds have already been used and are not required anymore
            input_embeds = None
        else:
            if input_embeds is not None and kwargs.get("use_cache"):
                seq_len = input_embeds.shape[1]
            else:
                seq_len = input_ids.shape[1]

        # ensure that attention_mask and position_ids shapes are aligned with the weird Bark hack of reducing
        # sequence length on the first forward pass
        if attention_mask is not None:
            attention_mask = attention_mask[:, :seq_len]
        if position_ids is not None:
            position_ids = position_ids[:, :seq_len]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]
        else:
            position_ids = None

        if input_embeds is not None and kwargs.get("use_cache"):
            return {
                "input_ids": None,
                "input_embeds": input_embeds,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
            }
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        input_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]:
        r"""
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
            have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
            is used in priority instead of `input_ids`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            raise NotImplementedError(
                "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
            )

        # Verify whether input_embeds already exists, then compute embeddings.
        if input_ids is not None and input_embeds is not None:
            raise ValueError("You cannot specify both input_ids and input_embeds at the same time")
        elif input_embeds is not None and past_key_values is None:
            # we want to return the input_embeds in priority so that it is in line with a weird hack
            # of Bark which concatenates two bits of the input_embeds on the first forward pass of the semantic model
            pass
        elif input_ids is not None:
            input_embeds = self.input_embeds_layer(input_ids)  # token embeddings of shape (b, t, n_embd)
        elif input_embeds is not None:
            pass
        else:
            raise ValueError("You have to specify either input_ids or input_embeds")

        input_shape = input_embeds.size()[:-1]
        batch_size = input_embeds.shape[0]
        seq_length = input_shape[-1]

        device = input_ids.device if input_ids is not None else input_embeds.device

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.layers))
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)  # shape (1, seq_length)

        position_embeds = self.position_embeds_layer(position_ids)  # position embeddings of shape (1, t, n_embd)

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            if self._use_flash_attention_2:
                attention_mask = attention_mask if 0 in attention_mask else None
            else:
                attention_mask = attention_mask.view(batch_size, -1)
                # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length]
                # from_seq_length is 1 to easily broadcast
                attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1)

        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        hidden_states = self.drop(input_embeds + position_embeds)
        output_shape = input_shape + (hidden_states.size(-1),)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        present_key_values = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for i, (block, past_layer_key_values) in enumerate(zip(self.layers, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    None,
                    attention_mask,
                    head_mask[i],
                    use_cache,
                    output_attentions,
                )
            else:
                outputs = block(
                    hidden_states,
                    past_key_values=past_layer_key_values,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]

            if use_cache:
                present_key_values = present_key_values + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.layernorm_final(hidden_states)

        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        logits = self.lm_head(hidden_states)

        if not return_dict:
            return tuple(
                v for v in [None, logits, present_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=present_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
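# Illustrative sketch of how `prepare_inputs_for_generation` above derives position ids
# from a left-padded attention mask (the mask below is hypothetical; padding positions
# get a dummy value of 1).
def _example_position_ids_from_mask():
    attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
    position_ids = attention_mask.long().cumsum(-1) - 1
    position_ids.masked_fill_(attention_mask == 0, 1)
    assert position_ids.tolist() == [[1, 1, 0, 1, 2]]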
@auto_docstring(
    custom_intro="""
    Bark semantic (or text) model. It shares the same architecture as the coarse model.
    It is a GPT-2 like autoregressive model with a language modeling head on top.
    """
)
class BarkSemanticModel(BarkCausalModel):
    base_model_prefix = "semantic"
    config_class = BarkSemanticConfig

    def generate(
        self,
        input_ids: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates text semantic tokens from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids, i.e. tokenized input sentences. Will be truncated up to
                semantic_generation_config.max_input_semantic_length tokens. Note that the output audios will be as
                long as the longest generation among the batch.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
            attention_mask (`Optional[torch.Tensor]`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        Returns:
            torch.LongTensor: Output semantic tokens.
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        batch_size = input_ids.shape[0]

        max_input_semantic_length = semantic_generation_config.max_input_semantic_length

        input_ids = input_ids + semantic_generation_config.text_encoding_offset

        if attention_mask is not None:
            input_ids = input_ids.masked_fill((1 - attention_mask).bool(), semantic_generation_config.text_pad_token)

        if history_prompt is not None:
            semantic_history = history_prompt["semantic_prompt"][-max_input_semantic_length:]
            semantic_history = nn.functional.pad(
                semantic_history,
                (0, max_input_semantic_length - len(semantic_history)),
                value=semantic_generation_config.semantic_pad_token,
                mode="constant",
            )
        else:
            semantic_history = torch.tensor(
                [semantic_generation_config.semantic_pad_token] * max_input_semantic_length, dtype=torch.int
            ).to(self.device)

        semantic_history = torch.repeat_interleave(semantic_history[None], batch_size, dim=0)

        infer_array = torch.tensor(
            [[semantic_generation_config.semantic_infer_token]] * batch_size, dtype=torch.int
        ).to(self.device)

        input_embeds = torch.cat(
            [
                self.input_embeds_layer(input_ids[:, :max_input_semantic_length])
                + self.input_embeds_layer(semantic_history[:, : max_input_semantic_length + 1]),
                self.input_embeds_layer(infer_array),
            ],
            dim=1,
        )

        tokens_to_suppress = list(
            range(semantic_generation_config.semantic_vocab_size, semantic_generation_config.semantic_pad_token)
        )
        tokens_to_suppress.extend(
            list(range(semantic_generation_config.semantic_pad_token + 1, self.config.output_vocab_size))
        )

        suppress_tokens_logits_processor = SuppressTokensLogitsProcessor(tokens_to_suppress, device=input_ids.device)

        min_eos_p = kwargs.get("min_eos_p", semantic_generation_config.min_eos_p)
        early_stopping_logits_processor = BarkEosPrioritizerLogitsProcessor(
            eos_token_id=semantic_generation_config.eos_token_id, min_eos_p=min_eos_p, device=input_ids.device
        )

        # pass input_ids in order to stay consistent with the transformers generate method even though it is not used
        # (except to get the input seq_len - that's why we keep the first 257 tokens)
        semantic_output = super().generate(
            torch.ones((batch_size, max_input_semantic_length + 1), dtype=torch.int).to(self.device),
            input_embeds=input_embeds,
            logits_processor=[suppress_tokens_logits_processor, early_stopping_logits_processor],
            generation_config=semantic_generation_config,
            **kwargs,
        )

        # take the generated semantic tokens
        semantic_output = semantic_output[:, max_input_semantic_length + 1 :]

        return semantic_output
@auto_docstring(
    custom_intro="""
    Bark coarse acoustics model.
    It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
    language modeling head on top.
    """
)
class BarkCoarseModel(BarkCausalModel):
    base_model_prefix = "coarse_acoustics"
    config_class = BarkCoarseConfig

    def preprocess_histories(
        self,
        max_coarse_history: int,
        semantic_to_coarse_ratio: int,
        batch_size: int,
        semantic_generation_config: BarkSemanticGenerationConfig,
        codebook_size: int,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
    ):
        """
        Preprocess the optional `Bark` speaker prompts before `self.generate`.

        Args:
            max_coarse_history (`int`):
                Maximum size of coarse tokens used.
            semantic_to_coarse_ratio (`int`):
                Ratio of semantic to coarse frequency
            batch_size (`int`):
                Batch size, i.e. the number of samples.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            codebook_size (`int`):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`):
                Optional `Bark` speaker prompt.
        Returns:
            `tuple(torch.FloatTensor)`:
            - **x_semantic_history** (`torch.FloatTensor`) -- Processed semantic speaker prompt.
            - **x_coarse_history** (`torch.FloatTensor`) -- Processed coarse speaker prompt.
        """
        if history_prompt is not None:
            x_semantic_history = torch.repeat_interleave(history_prompt["semantic_prompt"][None], batch_size, dim=0)
            # clone to avoid modifying history_prompt.coarse_prompt
            x_coarse_history = history_prompt["coarse_prompt"].clone()

            # offset x_coarse_history
            if codebook_size is not None:
                for n in range(1, x_coarse_history.shape[0]):
                    # offset
                    x_coarse_history[n, :] += codebook_size * n

            # flatten x_coarse_history
            x_coarse_history = torch.transpose(x_coarse_history, 0, 1).reshape(-1)

            x_coarse_history = x_coarse_history + semantic_generation_config.semantic_vocab_size

            x_coarse_history = torch.repeat_interleave(x_coarse_history[None], batch_size, dim=0)
            # e.g: after SEMANTIC_VOCAB_SIZE (10000), 1024 tokens dedicated to first codebook, 1024 next tokens
            # dedicated to second codebook.

            max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
            # trim histories correctly
            n_semantic_hist_provided = min(
                [
                    max_semantic_history,
                    x_semantic_history.shape[1] - x_semantic_history.shape[1] % 2,
                    int(np.floor(x_coarse_history.shape[1] / semantic_to_coarse_ratio)),
                ]
            )

            n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))

            x_semantic_history = x_semantic_history[:, -n_semantic_hist_provided:].int()
            x_coarse_history = x_coarse_history[:, -n_coarse_hist_provided:].int()
            # bit of a hack for time alignment (sounds better) - from the original Bark implementation
            x_coarse_history = x_coarse_history[:, :-2]

        else:
            # shape: (batch_size, 0)
            x_semantic_history = torch.tensor([[]] * batch_size, dtype=torch.int, device=self.device)
            x_coarse_history = torch.tensor([[]] * batch_size, dtype=torch.int, device=self.device)

        return x_semantic_history, x_coarse_history

    def generate(
        self,
        semantic_output: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        coarse_generation_config: BarkCoarseGenerationConfig = None,
        codebook_size: int = 1024,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        return_output_lengths: Optional[bool] = None,
        **kwargs,
    ) -> Union[torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]]:
        """
        Generates coarse acoustics tokens from input text semantic tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            semantic_output (`torch.Tensor` of shape (batch_size, seq_len), *optional*):
                Input text semantic ids, i.e. the output of `BarkSemanticModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
            return_output_lengths (`bool`, *optional*):
                Whether or not to return the output lengths. Useful when batching.
        Returns:
            By default:
                torch.LongTensor: Output coarse acoustics tokens.
            If `return_output_lengths=True`:
                `Tuple(torch.Tensor, torch.Tensor)`: The output coarse acoustics tokens, and the length of each sample
                of the batch.
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        if coarse_generation_config is None:
            raise ValueError("`coarse_generation_config` has to be provided")

        max_coarse_input_length = coarse_generation_config.max_coarse_input_length
        max_coarse_history = coarse_generation_config.max_coarse_history
        sliding_window_len = coarse_generation_config.sliding_window_len

        # replace semantic_pad_token (eos_tok and pad_tok here) with coarse_semantic_pad_token, i.e. the pad_token
        # used in the next model
        semantic_output.masked_fill_(
            semantic_output == semantic_generation_config.semantic_pad_token,
            coarse_generation_config.coarse_semantic_pad_token,
        )

        semantic_to_coarse_ratio = (
            coarse_generation_config.coarse_rate_hz
            / semantic_generation_config.semantic_rate_hz
            * coarse_generation_config.n_coarse_codebooks
        )
        max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))

        output_lengths = (semantic_output != coarse_generation_config.coarse_semantic_pad_token).sum(1)
        output_lengths = torch.floor(
            output_lengths * semantic_to_coarse_ratio / coarse_generation_config.n_coarse_codebooks
        )
        output_lengths = torch.round(output_lengths * coarse_generation_config.n_coarse_codebooks).int()

        max_generated_len = torch.max(output_lengths).item()

        batch_size = semantic_output.shape[0]

        x_semantic_history, x_coarse = self.preprocess_histories(
            history_prompt=history_prompt,
            max_coarse_history=max_coarse_history,
            semantic_to_coarse_ratio=semantic_to_coarse_ratio,
            batch_size=batch_size,
            semantic_generation_config=semantic_generation_config,
            codebook_size=codebook_size,
        )
        base_semantic_idx = x_semantic_history.shape[1]

        semantic_output = torch.hstack([x_semantic_history, semantic_output])

        n_window_steps = int(np.ceil(max_generated_len / sliding_window_len))

        total_generated_len = 0

        len_coarse_history = x_coarse.shape[1]

        for _ in range(n_window_steps):
            semantic_idx = base_semantic_idx + int(round(total_generated_len / semantic_to_coarse_ratio))

            # pad from right side
            input_coarse = semantic_output[:, np.max([0, semantic_idx - max_semantic_history]) :]
            input_coarse = input_coarse[:, :max_coarse_input_length]
            input_coarse = F.pad(
                input_coarse,
                (0, max_coarse_input_length - input_coarse.shape[-1]),
                "constant",
                coarse_generation_config.coarse_semantic_pad_token,
            )

            input_coarse = torch.hstack(
                [
                    input_coarse,
                    torch.tensor([[coarse_generation_config.coarse_infer_token]] * batch_size, device=self.device),
                    x_coarse[:, -max_coarse_history:],
                ]
            )

            alternatingLogitsProcessor = AlternatingCodebooksLogitsProcessor(
                input_coarse.shape[1],
                semantic_generation_config.semantic_vocab_size,
                codebook_size,
            )

            output_coarse = super().generate(
                input_coarse,
                logits_processor=[alternatingLogitsProcessor],
                max_new_tokens=min(sliding_window_len, max_generated_len - total_generated_len),
                generation_config=coarse_generation_config,
                **kwargs,
            )

            input_coarse_len = input_coarse.shape[1]

            x_coarse = torch.hstack([x_coarse, output_coarse[:, input_coarse_len:]])
            total_generated_len = x_coarse.shape[1] - len_coarse_history

            del output_coarse

        coarse_output = x_coarse[:, len_coarse_history:]

        if return_output_lengths:
            return coarse_output, output_lengths

        return coarse_output
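# Illustrative arithmetic behind `semantic_to_coarse_ratio` above, using the rates from
# the default Bark generation configs (49.9 Hz semantic, 75 Hz coarse, 2 coarse
# codebooks): each semantic token expands into roughly three interleaved coarse tokens.
def _example_semantic_to_coarse_ratio():
    coarse_rate_hz, semantic_rate_hz, n_coarse_codebooks = 75, 49.9, 2
    ratio = coarse_rate_hz / semantic_rate_hz * n_coarse_codebooks
    return ratio  # ~3.006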
@auto_docstring(
    custom_intro="""
    Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
    language modeling heads, one for each codebook.
    """
)
class BarkFineModel(BarkPreTrainedModel):
    base_model_prefix = "fine_acoustics"
    config_class = BarkFineConfig
    main_input_name = "codebook_idx"

    def __init__(self, config):
        # non-causal GPT-like model with one embedding layer and one lm_head for each codebook of Encodec
        super().__init__(config)
        self.config = config

        # initialize a modified non-causal GPT-like model
        self.input_embeds_layers = nn.ModuleList(
            [nn.Embedding(config.input_vocab_size, config.hidden_size) for _ in range(config.n_codes_total)]
        )
        self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([BarkBlock(config, is_causal=False) for _ in range(config.num_layers)])
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        self.layernorm_final = nn.LayerNorm(config.hidden_size)

        self.lm_heads = nn.ModuleList(
            [
                nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
                for _ in range(config.n_codes_given, config.n_codes_total)
            ]
        )
        self.gradient_checkpointing = False
        self.n_codes_total = config.n_codes_total

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # one embedding layer for each codebook
        return self.input_embeds_layers

    def set_input_embeddings(self, new_embeddings):
        # one embedding layer for each codebook
        self.input_embeds_layers = new_embeddings

    def get_output_embeddings(self):
        # one lm_head for each codebook
        return self.lm_heads

    def set_output_embeddings(self, new_output_embeddings):
        # one lm_head for each codebook
        self.lm_heads = new_output_embeddings

    def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True):
        old_embeddings_list = self.get_input_embeddings()
        new_embeddings_list = nn.ModuleList(
            [
                self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of, mean_resizing)
                for old_embeddings in old_embeddings_list
            ]
        )
        self.set_input_embeddings(new_embeddings_list)
        new_num_tokens = new_embeddings_list[0].weight.shape[0]

        # if word embeddings are not tied, make sure that lm heads are resized as well
        if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
            old_lm_head_list = self.get_output_embeddings()
            new_lm_head_list = nn.ModuleList(
                [self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list]
            )
            self.set_output_embeddings(new_lm_head_list)

        return self.get_input_embeddings()

    def resize_token_embeddings(
        self,
        new_num_tokens: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        mean_resizing: bool = True,
    ) -> nn.Embedding:
        """
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
        """
        model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        if new_num_tokens is None and pad_to_multiple_of is None:
            return model_embeds

        # Update base model and current model config
        self.config.output_vocab_size = model_embeds[0].weight.shape[0]
        self.config.vocab_size = model_embeds[0].weight.shape[0]
        self.output_vocab_size = model_embeds[0].weight.shape[0]
        self.vocab_size = model_embeds[0].weight.shape[0]

        # Tie weights again if needed
        self.tie_weights()

        return model_embeds

    def _tie_weights(self):
        if getattr(self.config, "tie_word_embeddings", True):
            self._tied_weights_keys = []
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()

            for i in range(self.config.n_codes_total - self.config.n_codes_given):
                # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight
                self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
                self._tied_weights_keys.append(f"lm_heads.{i}.weight")

    def tie_weights(self):
        """
        Tie the weights between the input embeddings list and the output embeddings list.

        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
        weights instead.
        """
        if getattr(self.config, "tie_word_embeddings", True):
            self._tied_weights_keys = []
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()

            for i in range(self.config.n_codes_total - self.config.n_codes_given):
                # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight
                self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
                self._tied_weights_keys.append(f"lm_heads.{i}.weight")

        for module in self.modules():
            if hasattr(module, "_tie_weights"):
                module._tie_weights()

    @auto_docstring
    def forward(
        self,
        codebook_idx: int,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        input_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        codebook_idx (`int`):
            Index of the codebook that will be predicted.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            NOT IMPLEMENTED YET.
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
            `past_key_values` is used, optionally only the last `input_embeds` have to be input (see
            `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
            associated vectors than the model's internal embedding lookup matrix.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not implemented yet")

        if codebook_idx == 0:
            raise ValueError("Cannot predict 0th codebook - 0th codebook should be predicted by the coarse model")

        if input_ids is not None and input_embeds is not None:
            raise ValueError("You cannot specify both input_ids and input_embeds at the same time")

        if input_ids is None and input_embeds is None:
            raise ValueError("You have to specify either input_ids or input_embeds")

        if input_ids is not None:
            # the input_embeddings are the sum of the j previous codebooks embeddings before
            # the current codebook_idx codebook
            input_embeds = [
                input_embeds_layer(input_ids[:, :, i]).unsqueeze(-1)
                for i, input_embeds_layer in enumerate(self.input_embeds_layers)
            ]  # token embeddings of shape (b, t, n_embd)
            input_embeds = torch.cat(input_embeds, dim=-1)
            input_embeds = input_embeds[:, :, :, : codebook_idx + 1].sum(dim=-1)

        input_shape = input_embeds.size()[:-1]
        batch_size = input_embeds.shape[0]
        seq_length = input_shape[1]

        device = input_ids.device if input_ids is not None else input_embeds.device

        if position_ids is None:
            position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)  # shape (1, seq_length)

        position_embeds = self.position_embeds_layer(position_ids)  # position embeddings of shape (1, t, n_embd)

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            if self._use_flash_attention_2:
                attention_mask = attention_mask if 0 in attention_mask else None
            else:
                # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length]
                # from_seq_length is 1 to easily broadcast
                attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1)

        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        hidden_states = self.drop(input_embeds + position_embeds)
        output_shape = input_shape + (hidden_states.size(-1),)

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for i, block in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                output_attentions=output_attentions,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        hidden_states = self.layernorm_final(hidden_states)
        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        logits = self.lm_heads[codebook_idx - self.config.n_codes_given](hidden_states)

        if not return_dict:
            return tuple(v for v in [None, logits, all_hidden_states, all_self_attentions] if v is not None)

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def generate(
        self,
        coarse_output: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        coarse_generation_config: BarkCoarseGenerationConfig = None,
        fine_generation_config: BarkFineGenerationConfig = None,
        codebook_size: int = 1024,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates fine acoustics tokens from input coarse acoustics tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            coarse_output (`torch.Tensor` of shape (batch_size, seq_len)):
                Input coarse acoustics ids, i.e. the output of `BarkCoarseModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            fine_generation_config (`BarkFineGenerationConfig`):
                Generation config indicating how to generate the fine tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
        Returns:
            torch.LongTensor: Output fine acoustics tokens.
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        if coarse_generation_config is None:
            raise ValueError("`coarse_generation_config` has to be provided")

        if fine_generation_config is None:
            raise ValueError("`fine_generation_config` has to be provided")

        # since the fine model is an autoencoder, only temperature is used from the classic GenerationConfig
        # parameters; manually impose the kwargs priority over the generation config
        temperature = kwargs.get("temperature", fine_generation_config.temperature)

        max_fine_history_length = fine_generation_config.max_fine_history_length
        max_fine_input_length = fine_generation_config.max_fine_input_length

        # shape: (batch, n_coarse_codebooks * seq_len) -> (batch, seq_len, n_coarse_codebooks)
        coarse_output = coarse_output.view(coarse_output.shape[0], -1, coarse_generation_config.n_coarse_codebooks)

        # brings ids into the range [0, codebook_size - 1]
        coarse_output = torch.remainder(coarse_output - semantic_generation_config.semantic_vocab_size, codebook_size)
        batch_size = coarse_output.shape[0]

        if history_prompt is not None:
            # transpose to get to shape (seq_len, n_fine_codebooks)
            x_fine_history = torch.repeat_interleave(history_prompt["fine_prompt"].T[None], batch_size, dim=0)
        else:
            x_fine_history = None

        n_coarse = coarse_generation_config.n_coarse_codebooks

        # pad the remaining fine codebooks
        fine_input = F.pad(
            coarse_output,
            (0, fine_generation_config.n_fine_codebooks - n_coarse),
            "constant",
            codebook_size,
        )

        # prepend history if available (at most max_fine_history_length)
        if x_fine_history is not None:
            x_fine_history = x_fine_history.to(coarse_output.device)
            fine_input = torch.cat([x_fine_history[:, -max_fine_history_length:, :], fine_input], dim=1)

            # length of the fine history that has been added to fine_input
            n_history = x_fine_history[:, -max_fine_history_length:, :].shape[1]
        else:
            n_history = 0

        n_remove_from_end = 0
        # need to pad if too short (since this is a non-causal model)
        if fine_input.shape[1] < max_fine_input_length:
            n_remove_from_end = max_fine_input_length - fine_input.shape[1]
            fine_input = F.pad(fine_input, (0, 0, 0, n_remove_from_end), mode="constant", value=codebook_size)

        # we can be lazy about the fractional loop and just keep overwriting codebooks.
        # If we needed to pad because the input was too short, n_loops is always 1 (because n_remove_from_end > 0);
        # if not, we loop over at least twice.
        n_loops = (coarse_output.shape[1] - (max_fine_input_length - n_history)) / max_fine_history_length
        n_loops = int(np.ceil(n_loops))
        n_loops = max(0, n_loops) + 1

        for n_outer in range(n_loops):
            start_idx = min([n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_input_length])

            start_fill_idx = min(
                [n_history + n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_history_length]
            )
            rel_start_fill_idx = start_fill_idx - start_idx
            input_buffer = fine_input[:, start_idx : start_idx + max_fine_input_length, :]
            for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
                logits = self.forward(n_inner, input_buffer).logits
                if temperature is None or temperature == 1.0:
                    relevant_logits = logits[:, rel_start_fill_idx:, :codebook_size]
                    codebook_preds = torch.argmax(relevant_logits, -1)
                else:
                    relevant_logits = logits[:, :, :codebook_size] / temperature
                    # apply softmax
                    probs = F.softmax(relevant_logits, dim=-1)[:, rel_start_fill_idx:max_fine_input_length]
                    # reshape to 2D: (batch_size, seq_len, codebook_size) -> (batch_size * seq_len, codebook_size)
                    probs = probs.reshape((-1, codebook_size))
                    # multinomial then reshape: (batch_size * seq_len) -> (batch_size, seq_len)
                    codebook_preds = torch.multinomial(probs, num_samples=1).view(batch_size, -1)
                codebook_preds = codebook_preds.to(torch.int32)
                input_buffer[:, rel_start_fill_idx:, n_inner] = codebook_preds
                del logits, codebook_preds

            # transfer into fine_input
            for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
                fine_input[
                    :, start_fill_idx : start_fill_idx + (max_fine_input_length - rel_start_fill_idx), n_inner
                ] = input_buffer[:, rel_start_fill_idx:, n_inner]
            del input_buffer

        fine_input = fine_input.transpose(1, 2)[:, :, n_history:]
        if n_remove_from_end > 0:
            fine_input = fine_input[:, :, :-n_remove_from_end]

        if fine_input.shape[-1] != coarse_output.shape[-2]:
            raise ValueError("input and output should have the same seq_len")

        return fine_input
@auto_docstring(
    custom_intro="""
    The full Bark model, a text-to-speech model composed of 4 sub-models:
    - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
      takes as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
    - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
      that takes as input the results of the last model. It aims at regressing the first two audio codebooks necessary
      for `encodec`.
    - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
      predicts the last codebooks based on the sum of the previous codebooks embeddings.
    - having predicted all the codebook channels from the [`EncodecModel`], Bark uses it to decode the output audio
      array.

    It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
    output sound according to a specific predefined voice.
    """
)
class BarkModel(BarkPreTrainedModel):
    config_class = BarkConfig

    def __init__(self, config):
        super().__init__(config)

        self.semantic = BarkSemanticModel(config.semantic_config)
        self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
        self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)

        self.codec_model = AutoModel.from_config(config.codec_config)

        self.config = config

    @classmethod
    def can_generate(cls) -> bool:
        # Bark has a custom `generate` method that does not rely on `prepare_inputs_for_generation`,
        # so the base `can_generate()` check is overridden
        return True

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """
        # for BarkModel, the device must be verified on its sub-models
        # if the sub-model has a `_hf_hook`, it has been offloaded so the device has to be found in the hook
        if not hasattr(self.semantic, "_hf_hook"):
            return get_parameter_device(self)
        for module in self.semantic.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)

    def enable_cpu_offload(self, accelerator_id: Optional[int] = 0, **kwargs):
        """
        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
        method moves one whole sub-model at a time to the accelerator when it is used, and the sub-model remains in
        accelerator until the next sub-model runs.

        Args:
            accelerator_id (`int`, *optional*, defaults to 0):
                accelerator id on which the sub-models will be loaded and offloaded. This argument is deprecated.
            kwargs (`dict`, *optional*):
                additional keyword arguments:
                    `gpu_id`: accelerator id on which the sub-models will be loaded and offloaded.
        """
        if is_accelerate_available():
            from accelerate import cpu_offload_with_hook
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")

        gpu_id = kwargs.get("gpu_id", 0)
        if gpu_id != 0:
            warnings.warn(
                "The argument `gpu_id` is deprecated and will be removed in version 4.54.0 of Transformers. Please"
                " use `accelerator_id` instead.",
                FutureWarning,
            )
            accelerator_id = gpu_id

        device_type = "cuda"
        if is_torch_accelerator_available():
            device_type = torch.accelerator.current_accelerator().type
        device = torch.device(f"{device_type}:{accelerator_id}")
        torch_accelerator_module = getattr(torch, device_type)

        if self.device.type != "cpu":
            self.to("cpu")
            torch_accelerator_module.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

        # this layer is used outside the first forward pass of semantic so it needs to be loaded before semantic
        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)

        hook = None
        for cpu_offloaded_model in [
            self.semantic,
            self.coarse_acoustics,
            self.fine_acoustics,
        ]:
            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

        self.fine_acoustics_hook = hook

        _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)

        # We'll offload the last model manually.
        self.codec_model_hook = hook

    def codec_decode(self, fine_output, output_lengths=None):
        """Turn quantized audio codes into audio array using encodec."""

        fine_output = fine_output.transpose(0, 1)
        emb = self.codec_model.quantizer.decode(fine_output)

        if output_lengths is not None:
            # encodec uses LSTMs, which behave differently with appended padding;
            # decoding with encodec takes around 0.1% of the total generation time,
            # so to keep generation quality we break batch decoding and decode sample by sample
            audio_arr = [sample[:, :l].unsqueeze(0) for (sample, l) in zip(emb, output_lengths)]
            audio_arr = [self.codec_model.decoder(sample).squeeze() for sample in audio_arr]
        else:
            out = self.codec_model.decoder(emb)
            audio_arr = out.squeeze(1)  # squeeze the codebook dimension

        return audio_arr

    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        return_output_lengths: Optional[bool] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates audio from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
                longest generation among the batch.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
            kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:

                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
                - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
                  semantic, coarse and fine models respectively. It has the priority over the keywords without a prefix.

                This means you can, for example, specify a generation strategy for all sub-models except one.
            return_output_lengths (`bool`, *optional*):
                Whether or not to return the waveform lengths. Useful when batching.
        Returns:
            By default:
                - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
            When `return_output_lengths=True`:
                Returns a tuple made of:
                - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
                - **output_lengths** (`torch.Tensor` of shape (batch_size)): The length of each waveform in the batch
        Example:

        ```python
        >>> from transformers import AutoProcessor, BarkModel

        >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
        >>> model = BarkModel.from_pretrained("suno/bark-small")

        >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
        >>> voice_preset = "v2/en_speaker_6"

        >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)

        >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
        >>> audio_array = audio_array.cpu().numpy().squeeze()
        ```
        """
        semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
        coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
        fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)

        kwargs_semantic = {
            # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
            "attention_mask": kwargs.pop("attention_mask", None),
            "min_eos_p": kwargs.pop("min_eos_p", None),
        }
        kwargs_coarse = {}
        kwargs_fine = {}
        for key, value in kwargs.items():
            if key.startswith("semantic_"):
                key = key[len("semantic_") :]
                kwargs_semantic[key] = value
            elif key.startswith("coarse_"):
                key = key[len("coarse_") :]
                kwargs_coarse[key] = value
            elif key.startswith("fine_"):
                key = key[len("fine_") :]
                kwargs_fine[key] = value
            else:
                # If the key is already in a specific config, then it's been set with a
                # sub-model specific value and we don't override
                if key not in kwargs_semantic:
                    kwargs_semantic[key] = value
                if key not in kwargs_coarse:
                    kwargs_coarse[key] = value
                if key not in kwargs_fine:
                    kwargs_fine[key] = value

        # 1. Generate from the semantic model
        if "generation_config" in kwargs_semantic:
            kwargs_semantic.pop("generation_config")
        semantic_output = self.semantic.generate(
            input_ids,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            **kwargs_semantic,
        )

        # 2. Generate from the coarse model
        if "generation_config" in kwargs_coarse:
            kwargs_coarse.pop("generation_config")
        coarse_output = self.coarse_acoustics.generate(
            semantic_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            codebook_size=self.generation_config.codebook_size,
            return_output_lengths=return_output_lengths,
            **kwargs_coarse,
        )

        output_lengths = None
        if return_output_lengths:
            coarse_output, output_lengths = coarse_output
            # (batch_size, seq_len*coarse_codebooks) -> (batch_size, seq_len)
            output_lengths = output_lengths // coarse_generation_config.n_coarse_codebooks

        # 3. "generate" from the fine model
        if "generation_config" in kwargs_fine:
            kwargs_fine.pop("generation_config")
        output = self.fine_acoustics.generate(
            coarse_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            fine_generation_config=fine_generation_config,
            codebook_size=self.generation_config.codebook_size,
            **kwargs_fine,
        )

        if getattr(self, "fine_acoustics_hook", None) is not None:
            # Manually offload fine_acoustics to CPU
            # and load codec_model to the accelerator,
            # since bark doesn't use codec_model's forward pass
            self.fine_acoustics_hook.offload()
            self.codec_model = self.codec_model.to(self.device)

        # 4. Decode the output and generate audio array
        audio = self.codec_decode(output, output_lengths)

        if getattr(self, "codec_model_hook", None) is not None:
            # Offload codec_model to CPU
            self.codec_model_hook.offload()

        if return_output_lengths:
            output_lengths = [len(sample) for sample in audio]
            audio = nn.utils.rnn.pad_sequence(audio, batch_first=True, padding_value=0)
            return audio, output_lengths

        return audio

    @classmethod
    def _check_and_enable_flash_attn_2(
        cls,
        config,
        torch_dtype: Optional[torch.dtype] = None,
        device_map: Optional[Union[str, Dict[str, int]]] = None,
        hard_check_only: bool = False,
        check_device_map: bool = False,
    ):
        """
        `_check_and_enable_flash_attn_2` originally doesn't expand flash attention enabling to the model
        sub-configurations. We override the original method to make sure that Bark sub-models use Flash Attention
        if necessary.

        If you don't know about Flash Attention, check out the official repository of flash attention:
        https://github.com/Dao-AILab/flash-attention

        For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a look at this
        specific section of the documentation to learn more about it:
        https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models

        The method checks if the current setup is compatible with Flash Attention as it requires the model to be in
        half precision and not run on CPU.

        If all checks pass and `hard_check_only` is False, the method will set the config attribute
        `_attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
        """
        config = super()._check_and_enable_flash_attn_2(
            config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map
        )

        config.semantic_config._attn_implementation = config._attn_implementation
        config.coarse_acoustics_config._attn_implementation = config._attn_implementation
        config.fine_acoustics_config._attn_implementation = config._attn_implementation
        return config


__all__ = [
    "BarkFineModel",
    "BarkSemanticModel",
    "BarkCoarseModel",
    "BarkModel",
    "BarkPreTrainedModel",
    "BarkCausalModel",
]