import copy
import inspect
import os
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from huggingface_hub import file_exists
from packaging import version
from torch import nn
from torch.nn import functional as F

from ..cache_utils import (
    Cache,
    DynamicCache,
    EncoderDecoderCache,
    HybridChunkedCache,
    OffloadedCache,
    OffloadedHybridCache,
    QuantizedCacheConfig,
)
from ..configuration_utils import PretrainedConfig
from ..dynamic_module_utils import (
    check_python_requirements,
    get_cached_module_file,
    get_class_in_module,
    resolve_trust_remote_code,
)
from ..integrations.deepspeed import is_deepspeed_zero3_enabled
from ..integrations.fsdp import is_fsdp_managed_module
from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
from ..pytorch_utils import isin_mps_friendly
from ..tokenization_utils import ExtensionsTrie
from ..utils import (
    ModelOutput,
    is_accelerate_available,
    is_hqq_available,
    is_optimum_quanto_available,
    is_torchdynamo_exporting,
    logging,
)
from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint
from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
from .candidate_generator import (
    AssistantVocabTranslatorCache,
    AssistedCandidateGenerator,
    AssistedCandidateGeneratorDifferentTokenizers,
    CandidateGenerator,
    EarlyExitCandidateGenerator,
    PromptLookupCandidateGenerator,
    UniversalSpeculativeDecodingGenerator,
    _crop_past_key_values,
    _prepare_attention_mask,
    _prepare_token_type_ids,
)
from .configuration_utils import (
    NEED_SETUP_CACHE_CLASSES_MAPPING,
    QUANT_BACKEND_CLASSES_MAPPING,
    GenerationConfig,
    GenerationMode,
)
from .logits_process import (
    EncoderNoRepeatNGramLogitsProcessor,
    EncoderRepetitionPenaltyLogitsProcessor,
    EpsilonLogitsWarper,
    EtaLogitsWarper,
    ExponentialDecayLengthPenalty,
    ForcedBOSTokenLogitsProcessor,
    ForcedEOSTokenLogitsProcessor,
    HammingDiversityLogitsProcessor,
    InfNanRemoveLogitsProcessor,
    LogitNormalization,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    MinNewTokensLengthLogitsProcessor,
    MinPLogitsWarper,
    NoBadWordsLogitsProcessor,
    NoRepeatNGramLogitsProcessor,
    PrefixConstrainedLogitsProcessor,
    RepetitionPenaltyLogitsProcessor,
    SequenceBiasLogitsProcessor,
    SuppressTokensAtBeginLogitsProcessor,
    SuppressTokensLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    TypicalLogitsWarper,
    UnbatchedClassifierFreeGuidanceLogitsProcessor,
)
from .stopping_criteria import (
    ConfidenceCriteria,
    EosTokenCriteria,
    MaxLengthCriteria,
    MaxTimeCriteria,
    StoppingCriteria,
    StoppingCriteriaList,
    StopStringCriteria,
)


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel
    from ..tokenization_utils_base import PreTrainedTokenizerBase
    from .streamers import BaseStreamer

logger = logging.get_logger(__name__)

if is_accelerate_available():
    from accelerate.hooks import AlignDevicesHook, add_hook_to_module

# Cache attribute names used across architectures (e.g. mamba-based models use `cache_params`)
ALL_CACHE_NAMES = ["past_key_values", "cache_params", "state", "mems", "past_buckets_states"]


@dataclass
class GenerateDecoderOnlyOutput(ModelOutput):
    """
    Outputs of decoder-only generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
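
    Example — an illustrative sketch rather than a reference transcript (assumes the small decoder-only
    checkpoint `openai-community/gpt2`; any causal LM behaves analogously):

    ```python
    >>> from transformers import AutoModelForCausalLM, AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    >>> inputs = tokenizer(["Today is"], return_tensors="pt")

    >>> outputs = model.generate(**inputs, max_new_tokens=3, return_dict_in_generate=True, output_scores=True)
    >>> type(outputs).__name__
    'GenerateDecoderOnlyOutput'
    >>> # one processed score tensor per generated token, each of shape (batch_size, vocab_size)
    >>> len(outputs.scores), outputs.scores[0].shape
    (3, torch.Size([1, 50257]))
    ```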
    """

    sequences: torch.LongTensor
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None


@dataclass
class GenerateEncoderDecoderOutput(ModelOutput):
    """
    Outputs of encoder-decoder generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
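
    Example — an illustrative sketch (assumes the encoder-decoder checkpoint `google-t5/t5-small`):

    ```python
    >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
    >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
    >>> inputs = tokenizer("translate English to German: Hello", return_tensors="pt")

    >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_attentions=True)
    >>> type(outputs).__name__
    'GenerateEncoderDecoderOutput'
    >>> # encoder attentions are computed once (one tensor per encoder layer), while decoder and
    >>> # cross attentions hold one entry per generated token
    >>> len(outputs.encoder_attentions)  # t5-small has 6 encoder layers
    6
    ```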
    """

    sequences: torch.LongTensor
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None


@dataclass
class GenerateBeamDecoderOnlyOutput(ModelOutput):
    """
    Outputs of decoder-only generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
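
    Example — an illustrative sketch (assumes `openai-community/gpt2`; `sequences_scores` and
    `beam_indices` require `output_scores=True`):

    ```python
    >>> from transformers import AutoModelForCausalLM, AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    >>> inputs = tokenizer(["Today is"], return_tensors="pt")

    >>> outputs = model.generate(
    ...     **inputs, max_new_tokens=5, num_beams=4, num_return_sequences=2,
    ...     return_dict_in_generate=True, output_scores=True
    ... )
    >>> outputs.sequences.shape[0]  # batch_size * num_return_sequences
    2
    >>> outputs.sequences_scores.shape  # one final beam score per returned sequence
    torch.Size([2])
    ```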
    """

    sequences: torch.LongTensor
    sequences_scores: Optional[torch.FloatTensor] = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    beam_indices: Optional[torch.LongTensor] = None
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None


@dataclass
class GenerateBeamEncoderDecoderOutput(ModelOutput):
    """
    Outputs of encoder-decoder generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
            sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

    sequences: torch.LongTensor
    sequences_scores: Optional[torch.FloatTensor] = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    beam_indices: Optional[torch.LongTensor] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None


# Equivalent classes (kept for retrocompatibility purposes)
GreedySearchDecoderOnlyOutput = GenerateDecoderOnlyOutput
ContrastiveSearchDecoderOnlyOutput = GenerateDecoderOnlyOutput
SampleDecoderOnlyOutput = GenerateDecoderOnlyOutput

ContrastiveSearchEncoderDecoderOutput = GenerateEncoderDecoderOutput
GreedySearchEncoderDecoderOutput = GenerateEncoderDecoderOutput
SampleEncoderDecoderOutput = GenerateEncoderDecoderOutput

BeamSearchDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput
BeamSampleDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput

BeamSearchEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput
BeamSampleEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput

GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput]
SampleOutput = Union[SampleEncoderDecoderOutput, SampleDecoderOnlyOutput]
BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput]
BeamSampleOutput = Union[BeamSampleEncoderDecoderOutput, BeamSampleDecoderOnlyOutput]
ContrastiveSearchOutput = Union[ContrastiveSearchEncoderDecoderOutput, ContrastiveSearchDecoderOnlyOutput]

# Typing shortcuts
GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput]
GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput]
GenerateOutput = Union[GenerateNonBeamOutput, GenerateBeamOutput]


class GenerationMixin:
    """
    A class containing all functions for auto-regressive text generation, to be used as a mixin in model classes.
    Inheriting from this class causes the model to have special generation-related behavior, such as loading a
    `GenerationConfig` at initialization time or ensuring `generate`-related tests are run in `transformers` CI.

    A model class should inherit from `GenerationMixin` to enable calling methods like `generate`, or when it
    has defined a custom `generate` method that relies on `GenerationMixin`, directly or indirectly, which
    approximately shares the same interface to public methods like `generate`. Three examples:
        - `LlamaForCausalLM` should inherit from `GenerationMixin` to enable calling `generate` and other public
            methods in the mixin;
        - `BlipForQuestionAnswering` has a custom `generate` method that approximately shares the same interface as
           `GenerationMixin.generate` (it has a few extra arguments, and the same output). That function also calls
           `GenerationMixin.generate` indirectly, through an inner model. As such, `BlipForQuestionAnswering` should
           inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
        - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
            However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
            `BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
        - *greedy decoding* if `num_beams=1` and `do_sample=False`
        - *contrastive search* if `penalty_alpha>0` and `top_k>1`
        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
        - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
        - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
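
    A minimal sketch of how the strategies above are selected — the same `generate` call, switched purely
    through configuration flags (assumes a `model` and tokenized `inputs` built as in the examples elsewhere
    in this file):

    ```python
    greedy = model.generate(**inputs, do_sample=False, num_beams=1)
    sampled = model.generate(**inputs, do_sample=True, top_k=50, temperature=0.8)
    beam = model.generate(**inputs, do_sample=False, num_beams=4)
    contrastive = model.generate(**inputs, penalty_alpha=0.6, top_k=4)
    ```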
    Npretrained_model_name_or_pathtrust_remote_codereturnc           	      K   s   t j|}d}|rt jt j|dsd}nt|dsd}|s)td| dd| d}t|||| |d t|fd	d
i| t|fddi|}t	d|}|S )at  
        Loads and returns a custom generate function, given a model repo.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            trust_remote_code (`bool`, *optional*):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            **kwargs:
                Additional keyword arguments for remote code loading.

        Raises:
            OSError: If `pretrained_model_name_or_path` does not contain a `custom_generate` subdirectory.

        Returns:
            A callable that can be used to generate text.
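
        Example (a sketch — the repository name is hypothetical, and the exact call signature of the
        returned function is defined by the loaded `generate.py` file):

        ```python
        custom_generate = model.load_custom_generate("my-org/model-with-custom-generate", trust_remote_code=True)
        output = custom_generate(model, **inputs)
        ```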
        Tzcustom_generate/generate.pyF`zw` does not contain a `custom_generate` subdirectory with a `generate.py` file, can't load the custom generate function.zThe repository `zS` contains custom generation code that will override the default `generate` method.)Zhas_local_codeZhas_remote_codeerror_messageZrequirements_filez custom_generate/requirements.txtZmodule_filegenerate)
ospathexistsjoinr   OSErrorr   r   r   r   )	selfr   r   kwargsZis_local_codeZhas_custom_generate_folderr   modulecustom_generate_functionru   ru   rv   load_custom_generate  sJ   


	
z$GenerationMixin.load_custom_generate	input_idsinputs_embedscache_positionc                 C   s   t  r
| |||S |dur'|jd dkr'|dd|jd  df }||fS |dus4|d |jd krF|dd|jd  df }||fS |jd |jd krX|dd|f }||fS )a  
        Generic cache-dependent input preparation
        The code is put in a separate function to allow granular unit testing
        as it needs a different implementation to be exportable.

        If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        - Exception 1: when passing `inputs_embeds`, `input_ids` may be missing entries
        - Exception 2: some generation methods do special slicing of `input_ids`, so we don't need to do it here
        - Exception 3: with synced GPUs, `cache_position` may go out of bounds, but we only want a dummy token in that case
        - Exception 4: if `inputs_embeds` are passed, slice them through `cache_position` to keep only the unprocessed
          tokens, and generate the first token for each sequence. Later, use the generated input ids for continuation.

        The current implementation does not rely on ``self`` and could be
        a class method. It is left as a standard method to be easily rewritten.
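
        A toy illustration of the default slicing (assumed shapes, decoder-only case):

        ```python
        >>> import torch

        >>> input_ids = torch.arange(8).unsqueeze(0)  # 8 prompt tokens, batch of 1
        >>> cache_position = torch.tensor([5, 6, 7])  # only the last 3 tokens are not in the cache yet
        >>> input_ids[:, cache_position]  # the unprocessed suffix is kept
        tensor([[5, 6, 7]])
        ```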
        """
        if is_torchdynamo_exporting():
            return self._cache_dependant_input_preparation_exporting(input_ids, inputs_embeds, cache_position)
        if inputs_embeds is not None and input_ids.shape[1] == 0:  # Exception 4
            inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]
        elif inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:  # Exceptions 1 and 3
            input_ids = input_ids[:, -cache_position.shape[0] :]
        elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else" no-op is Exception 2)
            input_ids = input_ids[:, cache_position]
        return inputs_embeds, input_ids

    def _cache_dependant_input_preparation_exporting(
        self,
        input_ids: torch.LongTensor,
        inputs_embeds: Optional[torch.FloatTensor],
        cache_position: Optional[torch.LongTensor],
    ) -> Tuple[torch.FloatTensor, torch.LongTensor]:
        """
        This method implements method ``_cache_dependant_input_preparation``
        with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
        The code is put in a separate function to allow granular unit testing.
        """
        if inputs_embeds is None:
            input_ids = input_ids[:, cache_position]
        else:
            # The branches mirror the eager implementation above; `torch.cond` keeps them traceable
            def branch_1(inputs_embeds, cache_position):
                return inputs_embeds[:, -cache_position.shape[0] :]

            def branch_2(input_ids, cache_position):
                return input_ids[:, -cache_position.shape[0] :]

            def branch_3(input_ids, cache_position):
                return input_ids[:, cache_position]

            inputs_embeds, input_ids = torch.cond(
                input_ids.shape[1] == 0,
                lambda input_ids, inputs_embeds, cache_position: (
                    branch_1(inputs_embeds, cache_position),
                    input_ids,
                ),
                lambda input_ids, inputs_embeds, cache_position: (
                    inputs_embeds,
                    torch.cond(
                        cache_position[-1] >= input_ids.shape[1],
                        branch_2,
                        lambda input_ids, cache_position: torch.cond(
                            input_ids.shape[1] != cache_position.shape[0],
                            branch_3,
                            lambda input_ids, cache_position: input_ids,
                            [input_ids, cache_position],
                        ),
                        [input_ids, cache_position],
                    ),
                ),
                [input_ids, inputs_embeds, cache_position],
            )
        return inputs_embeds, input_ids

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        """
        Prepare the model inputs for generation. It includes operations like computing the 4D attention mask or
        slicing inputs given the existing cache.

        See the forward pass in the model documentation for expected arguments (different models might have different
        requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
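
        A sketch of the manual decoding loop this method is designed for (`generate` drives it the same way
        internally; assumes a decoder-only `model` and tokenized `inputs`):

        ```python
        input_ids = inputs["input_ids"]
        model_kwargs = {"attention_mask": inputs["attention_mask"], "use_cache": True}
        for _ in range(3):
            model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
            outputs = model(**model_inputs, return_dict=True)
            next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            model_kwargs = model._update_model_kwargs_for_generation(outputs, model_kwargs)
        ```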
        """
        ...

    def _prepare_model_inputs(
        self,
        inputs: Optional[torch.Tensor] = None,
        bos_token_id: Optional[torch.Tensor] = None,
        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]:
        """
        This function extracts the model-specific `inputs` for generation.
        """
        ...

    def _maybe_initialize_input_ids_for_generation(
        self,
        inputs: Optional[torch.Tensor] = None,
        bos_token_id: Optional[torch.Tensor] = None,
        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> torch.LongTensor:
        """Initializes input ids for generation, if necessary."""
        ...

    def _prepare_attention_mask_for_generation(
        self,
        inputs_tensor: torch.Tensor,
        generation_config: GenerationConfig,
        model_kwargs: Dict[str, Any],
    ) -> torch.LongTensor:
        ...

    def _prepare_encoder_decoder_kwargs_for_generation(
        self,
        inputs_tensor: torch.Tensor,
        model_kwargs: Dict[str, Any],
        model_input_name: Optional[str],
        generation_config: GenerationConfig,
    ) -> Dict[str, Any]:
        ...

    def _prepare_decoder_input_ids_for_generation(
        self,
        batch_size: int,
        model_input_name: str,
        model_kwargs: Dict[str, torch.Tensor],
        decoder_start_token_id: torch.Tensor,
        device: Optional[torch.device] = None,
    ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
        ...

    @staticmethod
    def _expand_inputs_for_generation(
        expand_size: int = 1,
        is_encoder_decoder: bool = False,
        input_ids: Optional[torch.LongTensor] = None,
        **model_kwargs,
    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
        """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
        # Do not call torch.repeat_interleave if expand_size is 1 because it clones the input tensor
        if expand_size == 1:
            return input_ids, model_kwargs

        def _expand_dict_for_generation(dict_to_expand):
            for key in dict_to_expand:
                if (
                    key != "cache_position"
                    and dict_to_expand[key] is not None
                    and isinstance(dict_to_expand[key], torch.Tensor)
                ):
                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
            return dict_to_expand

        if input_ids is not None:
            input_ids = input_ids.repeat_interleave(expand_size, dim=0)

        model_kwargs = _expand_dict_for_generation(model_kwargs)

        if is_encoder_decoder:
            if model_kwargs.get("encoder_outputs") is None:
                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])

        return input_ids, model_kwargs

    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
        num_new_tokens: int = 1,
    ) -> Dict[str, Any]:
        ...

    def _reorder_cache(self, past_key_values, beam_idx):
        raise NotImplementedError(
            f"Make sure that a `_reorder_cache` function is correctly implemented in "
            f"{self.__class__.__module__} to enable beam search for {self.__class__}"
        )

    def _get_candidate_generator(
        self,
        generation_config: GenerationConfig,
        input_ids: torch.LongTensor,
        inputs_tensor: torch.Tensor,
        assistant_model: "PreTrainedModel",
        logits_processor: LogitsProcessorList,
        target_tokenizer: "PreTrainedTokenizerBase",
        assistant_tokenizer: "PreTrainedTokenizerBase",
        model_kwargs: Dict,
    ) -> CandidateGenerator:
        """
        Returns the candidate generator to be used in `assisted_generation`
        """
        ...

    def _get_logits_processor(
        self,
        generation_config: GenerationConfig,
        input_ids_seq_length: int,
        encoder_input_ids: torch.LongTensor,
        prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],
        logits_processor: Optional[LogitsProcessorList],
        device: Optional[str] = None,
        model_kwargs: Optional[Dict[str, Any]] = None,
        negative_prompt_ids: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
    ) -> LogitsProcessorList:
        """
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
        instances used to modify the scores of the language model head.
        """
        ...

    def _get_stopping_criteria(
        self,
        generation_config: GenerationConfig,
        stopping_criteria: Optional[StoppingCriteriaList],
        tokenizer: Optional["PreTrainedTokenizerBase"] = None,
        **kwargs,
    ) -> StoppingCriteriaList:
        ...

    def _merge_criteria_processor_list(
        self,
        default_list: Union[LogitsProcessorList, StoppingCriteriaList],
        custom_list: Union[LogitsProcessorList, StoppingCriteriaList],
    ) -> Union[LogitsProcessorList, StoppingCriteriaList]:
        """
        Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same
        processor/criteria is present on both lists, use the user-defined one.

        (Note: up to v4.49.0, this function threw an exception if the same logit processor was found twice.)
        """
        ...

    def compute_transition_scores(
        self,
        sequences: torch.Tensor,
        scores: Tuple[torch.Tensor],
        beam_indices: Optional[torch.Tensor] = None,
        normalize_logits: bool = False,
    ) -> torch.Tensor:
        """
        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
        used). This is a convenient method to quickly obtain the scores of the selected tokens at generation time.

        Parameters:
            sequences (`torch.LongTensor`):
                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
                shorter if all batches finished early due to the `eos_token_id`.
            scores (`tuple(torch.FloatTensor)`):
                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
                of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
                Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
                with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
            beam_indices (`torch.LongTensor`, *optional*):
                Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
                generate-time.
            normalize_logits (`bool`, *optional*, defaults to `False`):
                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).

        Return:
            `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
                the transition scores (logits)

        Examples:

        ```python
        >>> from transformers import GPT2Tokenizer, AutoModelForCausalLM
        >>> import numpy as np

        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
        >>> inputs = tokenizer(["Today is"], return_tensors="pt")

        >>> # Example 1: Print the scores for each token generated with Greedy Search
        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, normalize_logits=True
        ... )
        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
        >>> # encoder-decoder models, like BART or T5.
        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
        >>> generated_tokens = outputs.sequences[:, input_length:]
        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
        ...     # | token | token string | log probability | probability
        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
        |   262 |  the     | -1.414 | 24.33%
        |  1110 |  day     | -2.609 | 7.36%
        |   618 |  when    | -2.010 | 13.40%
        |   356 |  we      | -1.859 | 15.58%
        |   460 |  can     | -2.508 | 8.14%

        >>> # Example 2: Reconstruct the sequence scores from Beam Search
        >>> outputs = model.generate(
        ...     **inputs,
        ...     max_new_tokens=5,
        ...     num_beams=4,
        ...     num_return_sequences=4,
        ...     return_dict_in_generate=True,
        ...     output_scores=True,
        ... )
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
        ... )
        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
        >>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
        >>> # use case, you might want to recompute it with `normalize_logits=True`.
        >>> # Tip 2: the output length does NOT include the input length
        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
        >>> length_penalty = model.generation_config.length_penalty
        >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
        True
        ```
        """
        ...

    def _validate_assistant(self, assistant_model, tokenizer, assistant_tokenizer):
        ...

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        """Validates model kwargs for generation. Generate argument typos will also be caught here."""
        ...

    def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
        """Performs validation related to the resulting generated length"""
        ...

    def _prepare_generated_length(
        self,
        generation_config,
        has_default_max_length,
        has_default_min_length,
        model_input_name,
        input_ids_length,
        inputs_tensor,
    ):
        """Prepares max and min length in generation configs to avoid clashes between similar attributes"""
        ...

    def _prepare_generation_config(
        self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs
    ) -> Tuple[GenerationConfig, Dict]:
        """
        Prepares the base generation config, then applies any generation configuration options from kwargs. This
        function handles retrocompatibility with respect to configuration files.
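
        A sketch of the override order this implements — ad hoc `generate` kwargs take precedence over the
        model's `generation_config` defaults (illustrative use of a private helper, assuming a loaded `model`):

        ```python
        generation_config, model_kwargs = model._prepare_generation_config(None, max_new_tokens=8, do_sample=True)
        assert generation_config.max_new_tokens == 8 and generation_config.do_sample
        ```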
        """
        ...

    def _get_initial_cache_position(self, seq_length, device, model_kwargs):
        """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
        ...

    def _get_layer_device_map_for_cache_init(self):
        """
        Returns the device map for each decoder layer, to allocate the cache on the right device.
        Inspired by `dispatch_model` in accelerate.
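# --- illustrative sketch (assumed helper, not part of the serialized module) -
# The idea described above: map each decoder layer index to the execution
# device that accelerate's `hf_device_map` assigned to it, falling back to the
# main device for weights that are not explicitly sharded. The `prefix` naming
# scheme is an assumption; real models expose different module paths.
def layer_device_map_from_hf_device_map(hf_device_map, num_layers, prefix="model.layers"):
    if not hf_device_map:  # model lives on a single device
        return None
    main_device = next(d for d in hf_device_map.values() if d not in ("cpu", "disk"))
    return {idx: hf_device_map.get(f"{prefix}.{idx}", main_device) for idx in range(num_layers)}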
        Nr   cpudiskc                 S   s   g | ]}|d vr|qS )r  r  ru   )r   dru   ru   rv   rl  $  rm  zHGenerationMixin._get_layer_device_map_for_cache_init.<locals>.<listcomp>r   c                    s"   i | ]\}}||d v r n|qS r  ru   )r   namer   )main_deviceru   rv   r   %  s    zHGenerationMixin._get_layer_device_map_for_cache_init.<locals>.<dictcomp>r(    r   zw`model.get_decoder()` is not returning a named module of the model. This is unexpected, please open an issue on GitHub.c                    s   g | ]} |v r|qS ru   ru   )r   module_name)decoder_nameru   rv   rl  B  s    rn  TzDecoder name z" not found in execution device mapzlayer z! has not been mapped to a device.)r   r   r   r   r   r   r0  num_hidden_layersr   dictfromkeysrangeZnamed_modulesr   RuntimeErrorr   rsplit)
r   Zexecution_device_mapr  layer_device_mapr  r   Zdecoder_mapped_modulesidxr  layerru   )r  r  rv   $_get_layer_device_map_for_cache_init  st   
*


	
z4GenerationMixin._get_layer_device_map_for_cache_initcache_implementationmax_cache_lenc                 C   s  |dkrdt | jddv rd}t| }| jjp|ddu}t| dr,|r)| jjn| j}|d	kr7t| jj	|}t| d pNt
|| pN|j|kpNt
|ttf}	|d
krZ|	pY|j|k }	|rqt| drq|	pp| jjj|d d jd k}	|	rt| jdr~| jj}
n| j}
|  }| j |||
||d}|di || _|r| }|d d jd |d< t| j|di || _| jS | j  | jS )z
        Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized if a
        new `generate` call requires a larger cache or uses a different batch size.

        Returns the resulting cache object.
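# --- usage sketch (assumes `model`, `inputs` as in the end-to-end sketch after
# the `generate` docstring below; the model must support static caches) -------
# `_get_cache` is reached through the public API by selecting a
# `cache_implementation`; the resulting cache persists across calls as long as
# the batch size and maximum cache length still fit.
out = model.generate(**inputs, max_new_tokens=32, cache_implementation="static")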
        hybridZllama4r  r  Zhybrid_chunkedr   N_cachesliding_windowmambar   r(   _pre_quantization_dtype)r   max_batch_sizer  r   r   r  r  ru   )r   r   r8   r   r   r   r  self_attention_cacher{  r  r   r  r   r   r  Zcross_attention_cacher   r  r   r  r0  r  r   reset)r   r  r   r  r   r   Z	cache_clsrequires_cross_attention_cacheZcache_to_checkZneed_new_cacheZcache_dtyper  Zcache_kwargsr  ru   ru   rv   
_get_cacheg  sX   	



zGenerationMixin._get_cachec                 C   s6   | j od| jj vod| jj vod| jj vS )a  
        Return `True` if the current model can use a `DynamicCache` instance when initializing `past_key_values`.
        This is mostly the same as the `_supports_cache_class` attribute, but adds an exception for the `Jamba` model,
        which uses its own `HybridMambaAttentionDynamicCache` and does not need to initialize the Cache in advance in
        order to save memory (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
        for `HybridMambaAttentionDynamicCache`).
        ZjambaZzambaZbamba)r   r   rm   r  r   ru   ru   rv   _supports_default_dynamic_cache  s   	z/GenerationMixin._supports_default_dynamic_cachemax_cache_lengthc                 C   s6  d| j j vr
dnd}| jjp|ddu}||}	|	durF|jdur-td| dt|	t	rD| 
 rD|s=t|	nt|	||< dS |jdu rMdS | 
 sc|jduratd	|j d
t dS |dury|jdurytd|j d d|_|jpt| j dd|_|jdur
|jtv r|jdkr| jstd| j|jt|j|j| |||d||< dS |jdkr| jstd|jdur|jnt }
t|
j  }|
j dkrt! st"d|
j dkrt# st"d||
||< dS |jdkrt$ ||< dS |jdkrt ||< dS dS |st ntt t ||< dS )z
        Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is
        instantiated, writes it to `model_kwargs`, under the name expected by the model.
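# --- usage sketch (assumes `model`, `inputs` from the end-to-end sketch below)
# Instead of a `cache_implementation` string, a ready-made `Cache` object can
# be passed under `past_key_values`, in which case the initialization described
# above is skipped and the user-provided cache is used as-is.
from transformers import DynamicCache

out = model.generate(**inputs, max_new_tokens=16, past_key_values=DynamicCache())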
        r  rb   rc   r   NzMPassing both `cache_implementation` (used to initialize certain caches) and `zB` (a Cache object) is unsupported. Please use only one of the two.FzThis model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). `cache_implementation` (set to z) will be ignored.zRAn assistant model is provided, using a dynamic cache instead of a cache of type='z'.r  staticzThis model does not support `cache_implementation='static'`. Please check the following issue: https://github.com/huggingface/transformers/issues/28981)r  r   r  r   r   Z	quantizedzThis model does not support the quantized cache. If you want your model to support quantized cache, please open an issue and tag @zucchini-nlp.ZquantozYou need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. Please install it via  with `pip install optimum-quanto`ZHQQzYou need to install `HQQ` in order to use KV cache quantization with HQQ backend. Please install it via  with `pip install hqq`Z	offloadedZdynamic)%r   rm   r  r   r   r   r  r   r   tupler  r   from_legacy_cacher   r   rJ  rK  rL  r   r   r   r0  r8   _supports_static_cacher  rd  r?  num_return_sequencesZ_supports_quantized_cachecache_configr   r9   backendr%   ImportErrorr$   r   )r   r   r   r'  r   r  r   r"  r  Zuser_defined_cacher  Zcache_classru   ru   rv   _prepare_cache_for_generation  s   











z-GenerationMixin._prepare_cache_for_generationc                 C   s   dt t| jj v S )z
        Return True if the current model supports the keyword argument `logits_to_keep` in forward()
        to save memory. Checking it in this way avoids the need for a new model attribute.
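# --- illustrative sketch of the signature check described above ---------------
import inspect

def supports_logits_to_keep(model) -> bool:
    # True when `forward()` accepts `logits_to_keep`; no extra attribute needed
    return "logits_to_keep" in set(inspect.signature(model.forward).parameters)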
        logits_to_keep)r   r   r   r   r   r   r  ru   ru   rv   _supports_logits_to_keep/  s   z(GenerationMixin._supports_logits_to_keepkwargs_has_attention_maskc           	         sL  d fdd	}||j |d}||j|d}||j|d}||j|d} jjr/|dur-|n|}|dur=|jdkr=|d}|du r]|dur]|durP|sPt	d |d }t	d| d  jjri|du rit
d	|durt||d
 r|dur|std |durt|s|dk  rt	d| d ||_||_||_||_dS )a  
        Prepares the special tokens for generation, overwriting the generation config with their processed versions
        converted to tensor.

        Note that `generation_config` is changed in place and stops being serializable after this method is called.
        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
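# --- usage sketch (private API, shown only to illustrate the caveat above;
# assumes `model` from the end-to-end sketch below) ----------------------------
import copy

config_copy = copy.deepcopy(model.generation_config)  # original stays serializable
model._prepare_special_tokens(config_copy, kwargs_has_attention_mask=True)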
        Nc                    sF   | d u r| S |d ur|n j }t| tjr| |S tj| |tjdS )Nr   r   )r   r   rq   r   r   tensorr   )tokenr   r  ru   rv   _tensor_or_noneF  s   
z@GenerationMixin._prepare_special_tokens.<locals>._tensor_or_nonerC  r   zThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.z\`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.r   zThe attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z;`eos_token_id` should consist of positive integers, but is zq. Your generation will not stop until the maximum length is reached. Depending on other flags, it may even crash.r   )r   r   r   r
  r   r   r   r  r   rz  r   r    r   r   rq   Zis_floating_pointZ_bos_token_tensorr   r   _decoder_start_token_tensor)	r   r   r  r   r  Zbos_token_tensorZeos_token_tensorZpad_token_tensorZdecoder_start_token_tensorru   r  rv   _prepare_special_tokens6  sP   	


	
z'GenerationMixin._prepare_special_tokensc           	      C   s   |j rdS | jjdkpt|jduo|jj}t|dto"|d j	}|o)|o)| j
}t| dddur8|| jj	M }t| dr\t| j }d|v oMt|dk}|| M }d	|v }|| M }|jdurh|shtd
 |S )zp
        Determines whether to trigger auto-compilation of the model's forward pass at generation time.
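# --- illustrative sketch ------------------------------------------------------
# When the criteria above hold (compilable cache, compile-friendly device, no
# cpu/disk offloading hooks), `generate` ends up running a compiled forward,
# roughly equivalent to:
#     compiled_forward = torch.compile(model.forward)  # exact flags depend on compile_config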
        FcudaNrb   hf_quantizerr   r  r(   r  zsYou have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.)Zdisable_compiler   r3  boolcompile_configZ_compile_all_devicesr   r   r   r   r  r   r  r   r   r   r   r   r   r   )	r   r   r   Zvalid_hardwareZusing_compilable_cacheZcan_compileZall_model_devicesZhas_cpu_offloadZhas_disk_offloadru   ru   rv   _valid_auto_compile_criteria  s*   


z,GenerationMixin._valid_auto_compile_criteriasynced_gpusstreamerr_   custom_generatec           +         sX  |dur3| dd}ddhfddt  D }|| | j|fd|i|}|d>d| i|S | dd}| d	d}| j |fi |\ }| |  | ||| |du rjt	 sdt
| oit d
k}|durp|nt }|dury|nt }dtt| jj v }d|v}|dddu}| | j|\}}}|jd }|j}| j ||d | jjsو jdur|d
krt|jdkrt |dddf  jkdkrt!"d | jjs|dkrd _#|s|r|r| $| ||d< n|r
|dkr
t|d jdkr
t%d| jjrd|vr| &||| }| jjr0| j'||| j(|jd\}}n|dkr7|n| d} j)rF| *||}|durR|+|,  |jd
 }|ddu oc j-du}|ddu op j.du}| j/ |||||d | 0 rd|vrd
|d< | 1 ||  j-d
 }|jd
 |kr|dkr| jjs||jd
 7 }| 2 |||||  3|} |dur͈ j4d
krt%d| jj5|jj5krt67d|jj5 d| jj5 d| jj5 dt8 | j9 |||||j||	|
d 	}!| j:d> ||d!|}" j#|d"< | t;j<kri j=d
kr!t%d# j= d$|d
kr*t%d%|d" s3t%d& j>d'v r=t%d(| j?rJt%d)| j@jA | jB |||||||d*}#| jC|f|#|!|" ||d+|}$n| t;jDkr| j?r|t%d,| j@jA | jE|f jF|!|" ||d-|}$n| t;jGkr|d" st%d.| j?rt%d/| j@jA | jH|f|!|" ||d0|}$nS| t;jIt;jJfv r| jKd>| j=| jjd1|\}}| jL|f|!|" ||d0|}$n(| t;jMt;jNfv r| jKd>| j4| jjd1|\}}| jO|f|!|" |d2|}$n| t;jPkrJtQ| j4|j jR jS j= jT j-d3}%| jKd>| j4| jjd1|\}}| jU||%f|!|" |d2|}$n| t;jVkrg }& jWdur[ jW}& jXdur݇ fd4d5}'tY jXtZrvt jXdkry|'   jXD ]`}(tY|(d tZrtY|(tZrt|(dkr|'  t[d6d7 |(D r|'  t[d8d7 |(D r|'  t\|(})n!tY|(tZrt|(dkr|'  t[d9d7 |(D r|'  t]|(})|&^|) q|t_|&| j4|j jR jS j= j-d:}*| jKd>| j4| jjd1|\}}| j`|f|*|!|" |d;|}$ jadu r*tb|$d<r*tc|$jdd=dur*|$jde |$_d|$S )?a  

        Generates sequences of token ids for models with a language modeling head.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
            generation_config ([`~generation.GenerationConfig`], *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config, an error is thrown. This feature is intended for advanced users.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complements the default stopping criteria built from arguments and a
                generation config. If a stopping criteria is passed that is already created with the arguments or a
                generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
                sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
                intended for advanced users.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://arxiv.org/abs/2010.00904).
            synced_gpus (`bool`, *optional*):
                Whether to continue running the while loop until max_length. Unless overridden, this flag will be set
                to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid
                deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`.
            assistant_model (`PreTrainedModel`, *optional*):
                An assistant model that can be used to accelerate generation. The assistant model must have the exact
                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
                is much faster than running generation with the model you're calling generate from. As such, the
                assistant model should be much smaller.
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The negative prompt needed for some processors such as CFG. The batch size must match the input batch
                size. This is an experimental feature, subject to breaking API changes in future versions.
            negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Attention_mask for `negative_prompt_ids`.
            use_model_defaults (`bool`, *optional*):
                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
                generation configuration (`model.generation_config`), as opposed to the global defaults
                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
                `True`.
            custom_generate (`str`, *optional*):
                A string containing the name of a huggingface.co repository. If provided, the custom `generate`
                function defined in that repository's `custom_generate/generate.py` file will be executed instead of the
                standard `generate` method. Note that the logic for generation is entirely defined in that
                repository, and the return type may be different from the standard `generate` method.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.

                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateDecoderOnlyOutput`],
                    - [`~generation.GenerateBeamDecoderOnlyOutput`]

                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
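# --- end-to-end usage sketch (model name is illustrative) ---------------------
# Greedy decoding with the documented defaults; later sketches in this file
# reuse `model`, `tok` and `inputs` from here.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("The capital of France is", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=10, do_sample=False)
print(tok.decode(out[0], skip_special_tokens=True))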
        Nr   r   r   c                    s   i | ]\}}| vr||qS ru   ru   )r   r   r   )global_keys_to_excluderu   rv   r   &	  s    z,GenerationMixin.generate.<locals>.<dictcomp>modelrS  r*  r(   r   r   r   rC  r   r   zA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.r   Tr   z1`attention_mask` passed to `generate` must be 2D.)r   r   r   r
  r   r-  rM  )r   rx  r|  r   r   rw  r  zZ`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1.z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.)	r   r6  r7  r8  r(  r   r   r9  r:  )r   rR  rS  r   zFnum_return_sequences has to be 1 when doing assisted generate, but is rn  z6assisted generate is only supported for batch_size = 1z+assisted generate requires `use_cache=True`)r  r  r  z=assisted generate is not supported with Static cache classes`zCassisted generation is not supported with stateful models, such as )r   r   r   r'  r(  r)  r*  r   )r4  r(  rR  r   r  r  z=dola decoding is not supported with stateful models, such as )dola_layersr(  rR  r   r  r  z,Contrastive search requires `use_cache=True`zBcontrastive search is not supported with stateful models, such as )r(  rR  r   r  r  r   r  r   )r(  rR  r   r  )r   r?  r   length_penaltydo_early_stoppingnum_beam_hyps_to_keepr@  r-  c                      s   t d j d)Nzo`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` of positive integers, but is rn  )r   force_words_idsru   r   ru   rv   	typeerrorf
  s
   z+GenerationMixin.generate.<locals>.typeerrorc                 s   s    | ]	}t |t V  qd S r   )r   rP  r   Z	token_idsru   ru   rv   r   v
  s    z+GenerationMixin.generate.<locals>.<genexpr>c                 s   s"    | ]}t d d |D V  qdS )c                 s   $    | ]}t |t p|d k V  qdS r   Nr   r   r   Ztoken_idru   ru   rv   r   y
     " z5GenerationMixin.generate.<locals>.<genexpr>.<genexpr>Nr  r  ru   ru   rv   r   x
  s
    
c                 s   r  r  r  r  ru   ru   rv   r   
  r  )constraintsr   r?  r   r  r  r  r-  )constrained_beam_scorerr(  rR  r   r  rb   to_legacy_cacheru   )fr   localsr   r  r   r  rt  r  rp  r   r   distZget_world_sizerF   r[   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   rq   rc  r   rz  r   r   r   r	  r  r  Ztoken_healingheal_tokensputr  r-  rM  r}  r  ry  r  Zget_generation_moder?  r3  rJ  rK  rL  rQ  rY  r;   ZASSISTED_GENERATIONr  r  Z_is_statefulr   rm   r5  _assisted_decodingZDOLA_GENERATION_dola_decodingr  ZCONTRASTIVE_SEARCH_contrastive_searchZSAMPLEZGREEDY_SEARCHr  _sampleZBEAM_SAMPLEZBEAM_SEARCH_beam_searchZGROUP_BEAM_SEARCHr,   r  early_stoppingr@  _group_beam_searchZCONSTRAINED_BEAM_SEARCHr  r  r   rP  r   r)   r*   rI  r-   _constrained_beam_searchZreturn_legacy_cacher   r   rb   r  )+r   r   r   r(  rR  r8  r  r'  r  r9  r:  r~  r  r   r   Zgenerate_argumentsr   rS  r*  r   Zaccepts_attention_maskZrequires_attention_maskr  r   r   r   r   r   rw  rx  r|  r  Zgeneration_modeZprepared_logits_processorZprepared_stopping_criteriar4  resultbeam_scorerZfinal_constraintsr  Zword_ids
constraintr  ru   )r   r  rv   r     s  o




 



















	






zGenerationMixin.generatethis_peer_finishedc                 C   sL   |r t j|rdnd|d}tj|tjjd | dkrdS dS |r$dS dS )z
        Returns whether there are still unfinished sequences on the device. The existence of unfinished sequences is
        fed through `this_peer_finished`. ZeRO stage 3-friendly.
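# --- illustrative sketch of the synchronization described above ---------------
import torch
import torch.distributed as dist

def any_peer_unfinished(this_peer_finished: bool, device) -> bool:
    flag = torch.tensor(0.0 if this_peer_finished else 1.0, device=device)
    dist.all_reduce(flag, op=dist.ReduceOp.SUM)  # sum of per-GPU "still going" flags
    return flag.item() > 0.0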
        r=  rA  rC  )opFT)rq   r  r  Z
all_reduceZReduceOpZSUMr  )r   r  r  r   Zthis_peer_finished_flagru   ru   rv   _has_unfinished_sequences
  s   z)GenerationMixin._has_unfinished_sequencesc                    s  du rt djj}}t }td|d}dd j|ddD }|d	dd
j|j	}t
||k||}	 | dkrE|S |dddf  }dd   fdd|D }	tt||	D ]`\}
\}}||
 }t
||k r~qj	 fdd|j|dD }t|dkrqj||f  d7  < |j|d |dd }	 | dkrqjt|||k dkr||d< | j|d|d||
< qj|S )a  
        Applies token healing to the prompt: the tail token of each sequence is replaced with a better-matching extension before generation.
        Parameters:
            input_ids (`torch.LongTensor`): The sequence used as a prompt for the generation.
            tokenizer (`PreTrainedTokenizerBase`, *optional*): The tokenizer used to decode the input ids.
        Return:
            `torch.LongTensor` where each sequence has its tail token replaced with its appropriate extension.
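# --- usage sketch (assumes `model`, `tok`, `inputs` from the end-to-end sketch
# above) -- token healing is normally triggered via `generate`'s
# `token_healing` flag; calling the helper directly looks roughly like this:
healed_ids = model.heal_tokens(inputs["input_ids"], tokenizer=tok)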
        Nzs When generating with token healing, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.r(   )rv  r   c                 S   s   g | ]}|  qS ru   )stripr   ru   ru   rv   rl  
      z/GenerationMixin.heal_tokens.<locals>.<listcomp>T)Zskip_special_tokenspt)Zreturn_tensorspaddingr   r    c                 3   s"    | ]} |d  V  qdS )r  N)decodereplace)r   tZ	space_tokrS  ru   rv   r   
  s     z.GenerationMixin.heal_tokens.<locals>.<genexpr>c                    s   i | ]	}  |fd qS )g      $@)convert_tokens_to_ids)r   Zalt_tok)rS  ru   rv   r   
  s    z/GenerationMixin.heal_tokens.<locals>.<dictcomp>)prefixrA  r;  r  )r   r   r   r!   Z	get_vocabr:   Zbatch_decoder   r   r   rq   whereZnumeltolistZconvert_ids_to_tokensr  	enumeratezipr  r  
extensionsr   r  r   r  )r   r   rS  r   r   Z
vocab_trier   ZpromptsZtail_idsZ	tail_toks	batch_idxZtail_idZtail_tokZ	batch_idsZseq_biasZtrimmed_idsru   r  rv   r  
  sT   


zGenerationMixin.heal_tokensr  c           '   	      s\  | j jrtd|j}	|j}
|j}|j}|j}|j}t	dd |D }|j
}|r,|r,dnd}|r4|r4dnd}|r<|
r<dnd}|rD|
rDdnd}|rL|rLdnd}|jdd \}}tj|tj|jd}| ||j|}d}| j  j | j jsxd	}n dkrd}n	 dkrd
}nd	}t|tr|dkr| d kr|g}nL dkrtt| d dntt|dd}n5t|tr|dkrԈ dkrtt d  dn	tt d  d}nt|tr fdd|D }ntd|  }|du rtd| j|||jdr| j|fi |}| di |d|
dd}|jdddddf  jdtjd}|jdddddf  } i }!|D ]}"||j |" dddddf | j|!|"< q9| j!||| j jd}|rd|rdqt"||!| }#|#|j}#|||#}$|r|r||$f7 }|r||f7 }|
r|| j jr|j#fn|j$f7 }| j jr||j%f7 }|r|| j jr|j&fn|j f7 }|rt'j(j)|$dd}%tj*|%d
d+d
}&ntj,|$dd}&|r|&| |	d
|   }&tj-||&dddf gdd}|dur|.|&/  |||| @ }|0 d	k}| j|||jds|dur|1  |r,t2||||||3ddS |S )a
  
        Generates sequences of token ids for models with a language modeling head using **DoLa decoding** and can be
        used for decoder-only text models.
        The method is based on the paper "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language
        Models" (https://arxiv.org/abs/2309.03883) in ICLR 2024.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            dola_layers (`Union[str, List[int]]`):
                The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
                means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
                to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
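# --- usage sketch (assumes `model`, `inputs` from the end-to-end sketch above)
# DoLa is reached through `generate` via `dola_layers`; contrasting the final
# layer against the higher layers is a common starting point.
out = model.generate(**inputs, max_new_tokens=20, do_sample=False, dola_layers="high")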
        z8DoLa decoding is only available for decoder-only models.c                 s       | ]}t |d V  qdS r   Nr   r   rX  ru   ru   rv   r   Q  r  z1GenerationMixin._dola_decoding.<locals>.<genexpr>ru   Nr   r   Fr   r(   low(   ru  highc                    s   g | ]}| k r|qS ru   ru   r   iZfinal_layerru   rv   rl    rm  z2GenerationMixin._dola_decoding.<locals>.<listcomp>z?dola_layers must be either 'low', 'high' or a list of integers.zCDoLa is not supported for models that don't have output embeddings.rC  T)r  r  r  r   )r  r   r   r  Znum_samplesrb   rh   ri   rj   rk   rl   rb   )4r   r   r   r   r  r  output_scoresoutput_logitsreturn_dict_in_generater   r/  r   rq   r   r   r   r  r0  r  Ztie_word_embeddingsr   strrP  r  get_output_embeddingsr  r   rj   detachr   float32floatrl   r#  _dola_select_contrastrz   rk   r{   r|   r   r   softmaxmultinomialsqueezeargmaxr  r  r  rd  endrg   r   )'r   r   r  r(  rR  r   r  r  r   r   r  r  r  r  r  has_eos_stopping_criteriar/  ri   
raw_logitsrz   r{   r|   r   Z
cur_lengthunfinished_sequencesr  Zstart_layercandidate_premature_layersZlm_headr   r  Zfinal_layer_next_token_logitsfinal_logitscandidate_premature_logitsZcandidate_premature_layernext_token_logitsnext_token_scoresprobsnext_tokensru   r  rv   r    s   2

(






L	zGenerationMixin._dola_decodingc           ?         s
  t dd |D }|j |j}	|j}
|j}|j}|j}|j}|j}|j	}|r*|r*dnd}|r2|r2dnd}|r:|r:dnd}|rB|rBdnd}|rJ|rJdnd}|rh| j
jrh|r[|d dnd}|rf|d dnd}|jdd \}}tj|tj|jd	}| ||j|}tj|tjd
}| j
jrd|v r|d dur|d }n|d }|j dd}d}| j|||jdr|ddu st|d ttfr\|d  dkr\d|d< | j|fi |}| di |dd|d}| j
jr|jd }n|jd }|jdddddf jdtj|jd} | j ||| j
jd}|s)| j!d| | j
jd|\}!}|d}"|"du r<t"| j#j$ dt|"d t%tj&frS|"d d jd |kr\t"| j#j$ d||| }#t'j(j)|#dd}$tj*|$d d\}%}&|r|r~|| f7 }|r||#f7 }|r|| j
jr|j+fn|j,f7 }| j
jr||j-f7 }|r|| j
jr|jfn|jf7 }~|s|d }'t|'t.st|'trt|'j/t.r|'0  n%g }(|'D ]})g }*|)D ]}+|*1|+j dd q|(1t%|* qt%|(}'|'|d< |r^g },t2 D ]L}-| j|&dd|-f 3ddfi |}.| di |.dd|d}t|d t.sCt|d trNt|d j/t.rNd|d< |d 4d |,1| qt5|,| j
6 }n| j|&3ddfi |}.| di |.dd|d}~.| j
jr|jd }/|j}0n|jd }/|j}0|jdddddf 7 }1|j dd}2t8|2|/|%||	 }3tj9||:|jd dfgdd}|3d}3t; fddt<|3D }4|&t2t=|&|3f }5t>t?|/j@dd }/|/t2||3ddf }/tj9||/Adgdd}d}6|0D ]})t>t?|) t2||3ddf })|6|)f7 }6q|rF| j|&dd|3f 3ddfi |}7| di |7dddd}8|8d }9nNd}9tBD ]}:|9pTtC||:d}9qJt|9t.sjt|9trpt|9j/t.rp|9D|4 n$g }(|9D ]})g }*|)D ]}+|*1|+|4d f  qz|(1t%|* qtt%|(}9t>t?|1 t2||3ddf } | |j} | j
jrd};d}<|r|j-D ]})t>tj?|) ddt2||3d f })|;|)f7 };q|j+D ]})t>tj?|) ddt2||3d f })|<|)f7 }<qtE|9|6|<pd|;pdd!}n-d}=|r&|j,D ]})t>tj?|) ddt2||3d f })|=|)f7 }=qtF|9|6|=p-dd"}| j ||| j
jd}|rA|rAq|rN|5| |
d|   }5tj9||5dddf gdd}|duri|G|5H  |||| @ }|I dk}| j|||jds|dur|J  |r|ddurt|d t.st|d trt|d j/t.r|d 4d n-g }"|d D ] })g }>|)D ]}+|>1|+d ddddf  q|"1t%|> qt%|"|d< | j
jrtK|||||||||dd#	S tL||||||dd$S |S )%a  
        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
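# --- usage sketch (assumes `model`, `inputs` from the end-to-end sketch above)
# Contrastive search is selected implicitly by combining `penalty_alpha` with
# a small `top_k`.
out = model.generate(**inputs, max_new_tokens=40, penalty_alpha=0.6, top_k=4)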
        c                 s   r  r   r  r  ru   ru   rv   r     r  z6GenerationMixin._contrastive_search.<locals>.<genexpr>ru   Nr   rk   rl   r   r   r  r   r   r   r  FrC  rb   Tr   )r  r  r  r   r  r   r   r	  r  zQ does not support caching and therefore **can't** be used for contrastive search.z| does not have a standard cache format and therefore **can't** be used for contrastive search without further modifications.)r  r   r(   r  c                    s   g | ]
\}}||   qS ru   ru   )r   r  rq  rD  ru   rv   rl        z7GenerationMixin._contrastive_search.<locals>.<listcomp>.)rb   r|   rz   r{   )rb   rl   rk   	rh   ri   rj   rx   ry   rz   r{   r|   rb   r  )Mr   rD  penalty_alphar   r  r  r  r  r  
low_memoryr   r   r   r   rq   r   r   r   r  r  r  r  r   r   r   r  r   r|   rl   rj   r   r  r#  r  r   r   rm   r  r   r   r   r  topkrz   rk   r{   r   r  Zbatch_repeat_interleaverI  r  r  cropstack_model_outputsr0  r  _ranking_fastr  r  r  r  r   r_  splitr  r  r  r   Zbatch_select_indicesr   r   r  r  rd  r  rw   rg   )?r   r   r(  rR  r   r  r  r   r  r(  r   r  r  r  r  r  
sequentialr  ri   rz   r{   r|   rx   ry   r   cur_lenr  cosine_matrix_maskr  r   r  Zlast_hidden_statesZlogit_for_next_stepr   rb   Zprocessed_logit_for_next_stepZ
next_probsZtop_k_probsZ	top_k_idsZpastZnew_key_valuesr  r   r  Zall_outputsr  Znext_model_inputsnext_hiddenZfull_hidden_statesrj   context_hiddenselected_idxZaugmented_idxr#  Znext_decoder_hidden_statesZnext_model_inputZselected_outputsZnext_past_key_valuesr!  Znext_step_cross_attentionsZnext_step_decoder_attentionsZnext_step_attentionsZlayer_past_key_valuesru   r%  rv   r    s(  ,










&

	



$


$

$
$
$
    
 
	z#GenerationMixin._contrastive_searchc           $      K   s  |j }|j}	|j}
|j}|j}|j}tdd |D }|j}|r$|r$dnd}|r,|r,dnd}|r4|	r4dnd}|r<|	r<dnd}|rD|
rDdnd}|rb| jj	rb|	rU|d 
dnd}|
r`|d 
dnd}|jdd \}}d	}tj|tj|jd
}| ||j|}| j}| ||}|rdtjd< | |j}|jdur| j||fi |}d	}nd}| j|||jdr| j|fi |}||	rd|	ini  ||
rd|
ini  |r| di |ddi}d	}n|di |ddi}| j||| jj	d}|r|rq|jdddddf jdtj|jd} ||| }!|rW|r ||!f7 }|r(|| f7 }|	rE|| jj	r5|j fn|j!f7 }| jj	rE||j"f7 }|
rW|| jj	rR|j#fn|j$f7 }|rmt%j&j'|!dd}"tj(|"dd)d}#ntj*|!dd}#|r|#| |d|   }#tj+||#dddf gdd}|dur|,|#-  |||| @ }|. dk}|d7 }~| j|||jds|dur|/  |r| jj	rt0|||||||||
dd	S t1||||||
ddS |S )a  
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
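# --- usage and core-step sketch (assumes `model`, `inputs` from the end-to-end
# sketch above) -- multinomial sampling is selected by `do_sample=True`:
out = model.generate(**inputs, max_new_tokens=40, do_sample=True, temperature=0.7, top_p=0.9)
# The per-step core of the sampling loop, in isolation:
#     probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
#     next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)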
        c                 s   r  r   r  r  ru   ru   rv   r     r  z*GenerationMixin._sample.<locals>.<genexpr>ru   Nr   rk   rl   r   Fr   0ZTOKENIZERS_PARALLELISMTrC  r  r  r  r	  r   r$  r  r(   r
  r   rb   r'  r  )2r   r  r  r  r  r  r   r/  r   r   r   r   rq   r   r   r   r  __call__r  r   environget_compiled_callr  prefill_chunk_size_prefill_chunkingr  r   r  r#  rj   r   r  rz   rk   r{   r|   rl   r   r   r  r  r  r  r  r  r  rd  r  rw   rg   )$r   r   r(  rR  r   r  r  r   r   r  r  r  r  r  r  r/  ri   r  rz   r{   r|   rx   ry   r   r0  r  r  model_forwardcompile_forwardZ
is_prefillr   r  r   r!  r"  r#  ru   ru   rv   r    s   +

(





K
	zGenerationMixin._samplec                 C   sx   | j j }t|ttfr| ||}|S d|v r5t|ttfs(t	d| d| ||}t
|}|S || |S )aC  
        Temporary function to handle the different types of cache reordering processes while we roll out `Cache`.

        TODO: standardize cache formats and make all models compatible with `Cache`. It would remove the need
        for this function, with `Cache.reorder_cache` being the sole remaining code path.
        Z
gptbigcodez'Using an unsupported cache format with zG. Currently, it only supports the legacy tuple format or `DynamicCache`)r   rm   r  r   r  rP  r&  r   r   r   r  Zreorder_cache)r   rb   r%  Zmodel_classru   ru   rv   _temporary_reorder_cacheD  s   


z(GenerationMixin._temporary_reorder_cacher  c                 C   s0   t | j}t| |d |d  g|dd  S )z=[batch_size, num_beams, ...] -> [batch_size * num_beams, ...]r   r(   r   NrP  r   rq   r`  )r  r   ru   ru   rv   _flatten_beam_dim^  s   
&z!GenerationMixin._flatten_beam_dimr?  c                 C   s&   t | j}t| ||g|dd  S )z=[batch_size * num_beams, ...] -> [batch_size, num_beams, ...]r(   Nr>  )r  r   r?  r   ru   ru   rv   _unflatten_beam_dimd  s   
z#GenerationMixin._unflatten_beam_dimc                 C   sF   t |jt | jk r|d}t |jt | jk s
tj| |dd}|S )a  
        Gathers the beam slices indexed by `beam_indices` into a new beam array.

        Args:
            tensor (`torch.Tensor`): A tensor containing data to be gathered. The tensor is a 2D or a 3D tensor
                with the first two dimensions depicting the batch and the beam dimensions.
            beam_indices (`torch.Tensor` of shape `(batch_size, num_beams_to_select)`): The indices of the beams to
                select.

        Returns:
            A tensor with the selected beams.
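# --- shape sketch for the beam helpers above -----------------------------------
# Illustrative values; mirrors the flatten/unflatten/gather trio described here.
import torch

batch, beams, vocab = 2, 3, 5
t = torch.randn(batch, beams, vocab)
flat = t.reshape(batch * beams, vocab)        # _flatten_beam_dim
unflat = flat.reshape(batch, beams, vocab)    # _unflatten_beam_dim
idx = torch.tensor([[2, 0, 1], [1, 1, 0]])    # (batch, num_beams_to_select)
while idx.dim() < unflat.dim():               # expand indices to the data rank
    idx = idx.unsqueeze(-1)
gathered = torch.take_along_dim(unflat, idx, dim=1)  # _gather_beams
assert gathered.shape == (batch, 3, vocab)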
        r   r(   )inputrf  r  )r   r   r  rq   Ztake_along_dim)r  r   Zgathered_tensorru   ru   rv   _gather_beamsj  s
   
zGenerationMixin._gather_beamsrunning_beam_scoresbeam_scoresis_sent_finished!next_token_hits_stopping_criteriar0  r-  decoder_prompt_lenr  r  c	                 C   s   |dkr|dkr|| }	n|| }	| ddddf |	|  }
t |t j|dddd d}t |
|k}t ||du @  }t | }||@ |@ S )	zv
        Beam search stopping condition -- halts the generation loop if any of these conditions becomes False.
        neverr=  Nr(   Tr  Zkeepdimr       e)rq   r  r{  r   r  )rC  rD  rE  rF  r0  r-  rG  r  r  Zbest_hypothetical_lengthZbest_possible_running_scoreZworst_finished_scoreZimprovement_possibleZexists_open_beamZvalid_continuationsru   ru   rv   %_beam_search_has_unfinished_sequences~  s   
z5GenerationMixin._beam_search_has_unfinished_sequencesaccumulated_log_probsrunning_sequencesrunning_beam_indicesr/  beams_to_keepr1  c                 C   s   |rt jtjj|dd|d}t j|d|d}n	t j||d\}}||	 }| ||}| ||}||	 }||dddd|f< t j|
|j	d
dd| }|| }||dddd|| f< |||fS )	a'  
        Get top-K continuations given the accumulated log probs on the next token.

        A few notes to understand what's going on:
        1. Each item in the batch has `num_beams` * `vocab_size` candidate continuations. For each item, get the
        top K [K = (number of EOS tokens + 1) * `num_beams`] candidates with the highest accumulated
        log-probabilities, or sample them without replacement using the accumulated scores.
        2. We gather the top K (as opposed to `num_beams`, or any number lower than K) here so that we have at
        least `num_beams` sequences remaining to continue the live beam search.
        3. Note that other stopping criteria might result in impossible-to-continue beams, i.e. all continuations
        selected in this step hit the stopping criteria.
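# --- illustrative sketch of point 1 above --------------------------------------
import torch

batch_size, num_beams, vocab_size, n_eos = 2, 3, 11, 1
beams_to_keep = (n_eos + 1) * num_beams
accumulated = torch.randn(batch_size, num_beams * vocab_size)  # flat beam x vocab scores
topk_log_probs, topk_idx = torch.topk(accumulated, k=beams_to_keep)
source_beam = topk_idx // vocab_size  # which running beam each continuation extends
token_id = topk_idx % vocab_size      # which token performs the extension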
        r   r  r
  r(   )rA  r  indexr   NrC  )rq   r  r   r   r  re  r*  rB  r   r   r  )r   rL  rM  rN  r0  rG  r/  rO  r?  r1  r   Ztopk_indicestopk_log_probsZtopk_current_beam_indicestopk_running_beam_indicestopk_running_sequencesZtopk_idsZbatch_offsetZbatch_modified_indicesru   ru   rv   _get_top_k_continuations  s   
z(GenerationMixin._get_top_k_continuationsrR  rT  rS  c                 C   sT   || tjd  }tj||dd }| ||}| ||}	| ||}
||	|
fS )z
        Given the top-K continuations, their scores, and whether they hit a stopping criterion, select the
        best non-finished beams to continue beam search in the next iteration.
        rJ  rQ  r(   )r   rq   r  r*  rB  )r   rR  rT  rS  rF  r?  Ztopk_running_log_probsZnext_topk_indicesrM  rC  rN  ru   ru   rv   %_get_running_beams_for_next_iteration  s   
z5GenerationMixin._get_running_beams_for_next_iterationtop_num_beam_maskc                 C   s   ||	dddf @ }||d | |  }t j|ddd|du @ }||t jd 7 }|| d 7 }t j||fdd}t j||fdd}t j||fdd}t j||fdd}t j||
dd }| ||}| ||}| ||}| ||}||||fS )	z
        Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
        the current finished sequences.
        Nr(   r   T)ZaxisZkeepdimsrJ  r  rQ  )rq   r  r   r  r  r*  rB  )r   rh   rT  rD  rR  r   rS  rE  rF  rW  r?  r0  rG  r  r  Zdid_top_num_beams_just_finishedZbeams_in_batch_are_fullZmerged_sequencesZmerged_scoresZmerged_beam_indicesZmerged_is_sent_finishedZtopk_merged_indicesru   ru   rv   _update_finished_beams  s   z&GenerationMixin._update_finished_beamsc           8      K   s  |j }|j}|j}	|j}
|j}|j}|j}|j}|j}|j	}|j
}|j}|j}|jdd \}}|| }| jjdkr?| jj}n| jjdkrK|  j}n| j j}|}d}|dur^|jd nd}tdd| | }tjtj|tjdtj|| tjdfdd	|j}| ||j|}|j}|rt d
|r|rdnd}|r|rdnd}|r|rdnd} |r|	rdnd}!|r|	rdnd}"|r|
rdnd}#|r| jj!r|	r|d "dnd}$|
r|d "dnd}%|dur|p|d nd}&tj#|||f|&tj$|jd}'| %||||'ddddd|f< |'& ' }(tj||ftj(|jd})d|)ddddf< tj#||fdtj(|jd}*tj||ftj|jd}+tj||ftj|jd},tj#|||| fdtj)|jd}-|-& ' } | j*|||jdr| +|'ddddd|f }.| j,|.fi |}/|/-|	rd|	ini  |/-|
rd|
ini  | di |/ddi}0| j.|0|| jj!d}|r|rqd|0j/dddddf jdtj0|jd}1t1j2j3|1dd	}2||.|2}2|r&|r||1' f7 }|r|r||2' f7 }|	r|!| jj!r|0j4fn|0j5f7 }!| jj!r|"|0j6f7 }"|
r&|#| jj!r!|0j7fn|0j8f7 }#~0| %|2||}2|2|)dddddf  }2t9|2||| f}2| j:|2|'|-|||||||d
\}3}4}5|| +|4ddddd|d f |},| %|,||},| j;|3|4|5|,|d\}'})}-| j<|(|4|*|3| |5|+|,||||||d\}(}*} }+|"dddur| j=|d | +|-d|| f d|d< |d }| >|)|*|+|,|||||	 }| j*|||jdsn| +|(ddd|ddf }(| +|*ddd|f }*| +| ddd|ddf } | d  j?dd	 }6||6 }7|(ddd|7f }(| ddd|6f } |rO|s)d}*| jj!r@t@|(|*||| |$|%|!|"|#|"dd S tA|(|*||| |!|#|"dd!S |(S )"a	  
        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        If it's the first time you're diving into Beam Search, we recommend you read the following blog post:
        https://huggingface.co/blog/how-to-generate (especially the beam search section).

        You can recompute the sequence scores from the individual scores using the `compute_transition_scores` function
        (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.compute_transition_scores)

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`:
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
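# --- usage sketch (assumes `model`, `inputs` from the end-to-end sketch above)
out = model.generate(**inputs, max_new_tokens=40, num_beams=4, early_stopping=True)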
        Nr   ZMoshiDepthDecoderZImageGPTForCausalImageModelingFr   r(   r  r  z`low_memory=True` is not supported after the beam search refactor. Please check the discussion in #35802 *after the PR got merged*, and add a comment there if your questions are not yet answered.ru   r   rk   rl   r   )Z
fill_valuer   r   r   rJ  rC  r  r  r  Tr	  r$  )
rL  rM  rN  r0  rG  r/  rO  r?  r1  r   )rR  rT  rS  rF  r?  )rh   rT  rD  rR  r   rS  rE  rF  rW  r?  r0  rG  r  r  rb   .)rb   r%  rh   r~   ri   rj   r   rx   ry   rz   r{   r|   rb   rh   r~   ri   rj   r   rk   rl   rb   )Br   r   r  r  r  r  r  r/  r  r  r-  r?  r  r   r   rm   r   Zaudio_vocab_sizer  Zout_featuresr0  r1  rd  rq   r  r   r  zerosr   r   r  r)  r   r   r   fullr  r@  r  r   r  Zint32r  r?  r   r  r#  rj   r  r   r   rb  rz   rk   r{   r|   rl   r`  rU  rV  rX  r=  rK  rc  r   r}   )8r   r   r(  rR  r   r  r   r   r   r  r  r  r  r  r/  r  r  r-  r?  r  Zbatch_size_unflattenedr0  r   r1  rG  r  n_eos_tokensrO  rW  r/  Z
all_scoresr  r   rz   r{   r|   rx   ry   Zoutput_fill_valuerM  rh   rC  rD  rE  rF  rN  Zflat_running_sequencesr   model_outputsrj   Z	log_probsrR  rT  rS  Zmax_generated_lengthZoutput_lengthru   ru   rv   r  %  s  .
" $ (

"	
   
zGenerationMixin._beam_searchr  c           4         s  |j }|j}	|j}
|j}|j}|j}|j}|j|j}| t	|j
| |j}|j\}}| ||j|}|rG|rGfddt|D nd |kr\td  d| d|rb|rbdnd}|rj|rjdnd}|rr|
rrdnd}|rz|
rzdnd}|r|rdnd}|r| jjr|
r|d d	nd}|r|d d
nd}tjfdtj|d}d|ddddf< | f}d}|jd }| j|||jdrtj |j|d}tj tj|d}| j|fi |}||
rd|
ini  ||rd|ini  | di |ddi} | j| || jjd}|r&|r&|d }q|r8t| jdddddf }!|rM| jdddddf j d|jd}"t|D ] }#t!|# }$|$|# }%g }&tD ] |&" fddt|#|$D  qi||& }'| j|&dddf j tj#|jd}(t$j%j&|(dd})|)jd }*||'|)|d}+|+||& 'd })|)(|+})|r|+|!|&< |)|%|* })|	dur|	jd nd},tj)|)t*dd|, |% dddd\})}-tj+|-|*dd}.|-|* }-durt,dnd}/|j-|'|)|-|.||	|/|d	}0|0d  ||&< |0d! }1|0d" |r2|r2t.fd#d$tt	d D < |' ||&< tj/|'ddf |1'dgdd}'|'dddf ||&< tj+|%dd |# |%  ||&< qQ|r|rs||!f7 }|r{||"f7 }|
r|| jjr| j0fn| j1f7 }| jjr|| j2f7 }|r|| jjr| j3fn| j4f7 }tj/||'dgdd}~ |d%ddur| 5|d% ||d%< |d }|j6st7|||rd}| j|||jdsЈdurt,dnd}2|j8|||-|.||	|j9|2|d&	}3|r<|s
d|3d'< | jjr't:|3d( |3d' |||3d) ||||||d%d*S t;|3d( |3d' |||3d) |||d%d+S |3d( S ),a	  
        Generates sequences of token ids for models with a language modeling head using **diverse beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
                model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        """
        # Compiled body, not recoverable as source. The visible fragments give the
        # recoverable shape: beams are scored one group at a time (`beam_group_idx`,
        # `group_start_idx`/`group_end_idx`), the logits processors receive the
        # `current_tokens` already chosen by earlier groups so they can apply a
        # diversity penalty, and `reordering_indices` reorder the cache after each
        # step. A sketch of the penalty follows this method.
        ...
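    # --- Hedged sketch (not part of the original class): the heart of diverse beam
    # search is a Hamming diversity penalty, applied in the real code path by
    # `HammingDiversityLogitsProcessor`. `_demo_group_diversity_penalty` is a
    # hypothetical illustration and relies on the module-level `torch` import.
    @staticmethod
    def _demo_group_diversity_penalty(
        scores,  # (batch_size * group_size, vocab_size) log-probs for one beam group
        current_tokens,  # (batch_size, n_prev) ids already picked by earlier groups
        diversity_penalty: float,
        batch_size: int,
        group_size: int,
    ):
        vocab_size = scores.shape[-1]
        for batch_idx in range(batch_size):
            # How often earlier groups already chose each vocab id at this step.
            frequency = torch.bincount(current_tokens[batch_idx], minlength=vocab_size)
            rows = slice(batch_idx * group_size, (batch_idx + 1) * group_size)
            # Down-weight repeats so this group explores different continuations.
            scores[rows] = scores[rows] - diversity_penalty * frequency.to(scores.dtype)
        return scores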

    def _constrained_beam_search(
        self,
        input_ids: torch.LongTensor,
        constrained_beam_scorer: ConstrainedBeamSearchScorer,
        logits_processor: LogitsProcessorList,
        stopping_criteria: StoppingCriteriaList,
        generation_config: GenerationConfig,
        synced_gpus: bool,
        **model_kwargs,
    ) -> Union[GenerateBeamOutput, torch.LongTensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head using **constrained beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            constrained_beam_scorer (`ConstrainedBeamSearchScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation, while satisfying a list of positive constraints. For more information, read
                the documentation of [`ConstrainedBeamSearchScorer`].
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        """
        # Compiled body, not recoverable as source. The visible fragments match the
        # documented contract: `scores_for_all_vocab` is handed to
        # `constrained_beam_scorer.process`, which advances beams while tracking how
        # far each hypothesis has progressed through its constraints. A usage sketch
        # follows.
        ...
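    # --- Hedged usage sketch: `_constrained_beam_search` is reached through
    # `generate()` when `constraints` or `force_words_ids` is supplied. The
    # checkpoint name below is an arbitrary placeholder.
    #
    #     from transformers import AutoModelForCausalLM, AutoTokenizer, PhrasalConstraint
    #
    #     tokenizer = AutoTokenizer.from_pretrained("gpt2")
    #     model = AutoModelForCausalLM.from_pretrained("gpt2")
    #     forced = tokenizer("rainy day", add_special_tokens=False).input_ids
    #     inputs = tokenizer("The forecast says", return_tensors="pt")
    #     out = model.generate(
    #         **inputs, constraints=[PhrasalConstraint(forced)], num_beams=4, max_new_tokens=20
    #     )
    #     print(tokenizer.decode(out[0], skip_special_tokens=True))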

    def _assisted_decoding(
        self,
        input_ids: torch.LongTensor,
        candidate_generator: CandidateGenerator,
        logits_processor: LogitsProcessorList,
        stopping_criteria: StoppingCriteriaList,
        generation_config: GenerationConfig,
        synced_gpus: bool,
        streamer: Optional["BaseStreamer"],
        **model_kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
        **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
        candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
        models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            candidate_generator (`CandidateGenerator`):
                A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
                more information, the documentation of [`CandidateGenerator`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        """
        # Compiled body, not recoverable as source. Recoverable outline: fetch a
        # draft from `candidate_generator.get_candidates`, verify all drafted tokens
        # in a single forward pass, keep the longest accepted prefix (via
        # `_speculative_sampling` when sampling), crop the cache with
        # `_crop_past_key_values`, and report the acceptance count back through
        # `candidate_generator.update_candidate_strategy`. A usage sketch follows.
        ...
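    # --- Hedged usage sketch: `_assisted_decoding` is reached through `generate()`
    # when an `assistant_model` (or `prompt_lookup_num_tokens`) is supplied; the
    # assistant drafts tokens cheaply and this method verifies them in one forward
    # pass of the main model. Checkpoint names are placeholders for any compatible
    # pair that shares a tokenizer.
    #
    #     from transformers import AutoModelForCausalLM, AutoTokenizer
    #
    #     tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
    #     model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
    #     assistant = AutoModelForCausalLM.from_pretrained("gpt2")  # small drafter
    #     inputs = tokenizer("Assisted decoding drafts, then verifies:", return_tensors="pt")
    #     out = model.generate(**inputs, assistant_model=assistant, max_new_tokens=30)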

    def _prefill_chunking(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, **model_kwargs):
        # Compiled body, not recoverable as source. Recoverable outline: raise
        # "Cannot use prefill chunking without a cache" when no cache is present,
        # split the prompt minus its last token via `torch.split(input_ids[:, :-1],
        # chunk_size, dim=-1)`, forward each chunk with matching `attention_mask`
        # and `cache_position` slices, and carry `outputs.past_key_values` forward.
        # A sketch of the idea follows.
        ...
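    # --- Hedged sketch (not part of the original class): what chunked prefill buys.
    # Running the prompt through the model in fixed-size slices keeps peak
    # activation memory proportional to the chunk length while the KV cache still
    # ends up covering the whole prompt. `_demo_chunked_prefill` is a hypothetical
    # illustration using the public model-call convention; it relies on the
    # module-level `torch` import.
    @staticmethod
    @torch.no_grad()
    def _demo_chunked_prefill(model, input_ids, chunk_size: int):
        past_key_values = None
        # Stop before the last prompt token: decoding proper starts from it later.
        for chunk in torch.split(input_ids[:, :-1], chunk_size, dim=-1):
            outputs = model(input_ids=chunk, past_key_values=past_key_values, use_cache=True)
            past_key_values = outputs.past_key_values
        return past_key_values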


def _speculative_sampling(
    candidate_input_ids,
    candidate_logits,
    candidate_length,
    new_logits,
    is_done_candidate,
):
    """
    Applies sampling as in the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf, algorithm 1). Returns
    the selected tokens, as well as the number of candidate matches.

    NOTE: Unless otherwise stated, the variable names match those in the paper.
    """
    # Compiled body, not recoverable as source; it returns `(valid_tokens,
    # n_matches)`. The acceptance rule it implements is sketched below.
    ...
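

# --- Hedged sketch (not the original body): the acceptance rule of algorithm 1.
# `p` holds the target model's probabilities at each drafted position, `q` the
# assistant's; on the first rejection the token is resampled from the normalized
# residual max(0, p - q). Relies on the module-level `torch` import;
# `_demo_speculative_accept` is a hypothetical illustration (and assumes p != q at
# a rejected position, so the residual is non-degenerate).
def _demo_speculative_accept(p, q, draft):
    """p, q: (draft_len, vocab_size) probabilities; draft: (draft_len,) token ids."""
    accepted = []
    for i, token in enumerate(draft.tolist()):
        if torch.rand(()) < p[i, token] / q[i, token]:  # accept with prob min(1, p/q)
            accepted.append(token)
            continue
        residual = (p[i] - q[i]).clamp(min=0.0)  # first rejection: resample residual
        accepted.append(int(torch.multinomial(residual / residual.sum(), 1)))
        break
    return accepted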


def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_attention=False):
    """
    Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
    where each member corresponds to a single generated token.
    """
    ...  # compiled body not recoverable as source


def _ranking_fast(
    context_hidden: torch.FloatTensor,
    next_hidden: torch.FloatTensor,
    next_top_k_probs: torch.FloatTensor,
    cosine_matrix_mask: torch.LongTensor,
    alpha: float,
    beam_width: int,
):
    """
    Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
    in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
    row in the batch.
    """
    # Compiled body, not recoverable as source; the masked degeneration-penalty
    # reranking it performs is sketched below.
    ...
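

# --- Hedged sketch (not the original body): contrastive search's degeneration
# penalty. A candidate's score is its model confidence minus the maximum cosine
# similarity between its hidden state and all previous context hidden states;
# `_ranking_fast` additionally masks padded context positions via
# `cosine_matrix_mask`, which this simplified version omits. Relies on the
# module-level `torch` import; `_demo_degeneration_rank` is hypothetical.
def _demo_degeneration_rank(context_hidden, next_hidden, next_top_k_probs, alpha: float, beam_width: int):
    # context_hidden: (batch * beam_width, ctx_len, dim); next_hidden: (batch * beam_width, 1, dim)
    norm_ctx = context_hidden / context_hidden.norm(dim=2, keepdim=True)
    norm_next = next_hidden / next_hidden.norm(dim=2, keepdim=True)
    cosine = torch.matmul(norm_ctx, norm_next.transpose(1, 2)).squeeze(-1)  # (batch*k, ctx_len)
    degeneration_penalty, _ = cosine.max(dim=-1)  # (batch*k,)
    scores = (1.0 - alpha) * next_top_k_probs.reshape(-1) - alpha * degeneration_penalty
    return scores.view(-1, beam_width).argmax(dim=-1)  # best candidate per batch row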
r-  full_batch_size
split_sizec                    s    du rdg|  S t  tjr fddtd|D S t  ts/t  tr5t  jtr5 |S t  tr]t  d trO fddtd|D S  fddtd|D S t	dt
  )a  
    Takes care of three cases:
    1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
    2. data is a tuple: e.g. hidden_states, attentions etc. Keep the tuple as it is and split each tensor in it and
       return a list of tuples
    3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and
       return a list of tuples of tuples
    (see documentation of ModelOutput)
    """
    ...  # compiled body not recoverable as source; the splitting rule is sketched below
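

# --- Hedged sketch (not the original body): the recursive splitting rule the
# docstring above describes. Tensors are sliced on the batch dimension; (nested)
# tuples are split element-wise so each chunk keeps the original container shape.
# Relies on the module-level `torch` import; `_demo_split` is a hypothetical
# illustration.
def _demo_split(data, full_batch_size: int, split_size: int):
    num_chunks = full_batch_size // split_size
    if data is None:
        return [None] * num_chunks
    if isinstance(data, torch.Tensor):
        return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
    if isinstance(data, tuple):
        member_chunks = [_demo_split(member, full_batch_size, split_size) for member in data]
        # Re-zip: one tuple (of the original arity) per batch chunk.
        return [tuple(chunks[i] for chunks in member_chunks) for i in range(num_chunks)]
    raise TypeError(f"Unexpected attribute type: {type(data)}")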


def _split_model_inputs(
    model_input: Union[ModelOutput, Dict],
    split_size: int,
    full_batch_size: int,
    config: PretrainedConfig,
) -> List[Union[ModelOutput, Dict]]:
    """
    Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
    size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
    previous forward pass.
    """
    # Compiled body, not recoverable as source. Recoverable checks and behaviour:
    # raise "`full_batch_size` must be divisible by `split_size`" and "`split_size`
    # must be smaller or equal to `full_batch_size`"; bool entries and a small
    # `keys_to_ignore` set are carried over unsplit, `encoder_outputs` is split as a
    # ModelOutput, and everything else goes through `_split`. A usage note follows.
    ...
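

# --- Hedged usage sketch: these splitting/stacking helpers back `low_memory=True`,
# which runs beam candidates through the model sequentially in smaller batches and
# restitches the outputs with `stack_model_outputs` below.
#
#     out = model.generate(**inputs, num_beams=4, low_memory=True, max_new_tokens=20)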


def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConfig) -> ModelOutput:
    """
    Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
    specific ModelOutput subclass from the list provided.
    zInput list is empty.r   c                 3   s    | ]}t | V  qd S r   )r   )r   obj)r  ru   rv   r     r  z&stack_model_outputs.<locals>.<genexpr>z4All elements in the list should be of the same type.c                    s   t dd  D rdS t d tjrtj ddS t d tr&t S t d tr2t S t d trdt d d trSt fddt	t
 d D S t fddt	t
 d D S t d ttfrrt S tdt d  )	z5
        Reverse of `_split` function above.
        c                 s   s    | ]}|d u V  qd S r   ru   )r   r  ru   ru   rv   r     r+  z7stack_model_outputs.<locals>._concat.<locals>.<genexpr>Nr   r  c                 3   s8    | ] t  fd dttd d D V  qdS )c                 3   s.    | ] t j fd dD ddV  qdS )c                    s   g | ]}|   qS ru   ru   ri  )r  jru   rv   rl    rm  zLstack_model_outputs.<locals>._concat.<locals>.<genexpr>.<genexpr>.<listcomp>r   r  Nrq   r  r  )r  r  )r  rv   r     s   , zAstack_model_outputs.<locals>._concat.<locals>.<genexpr>.<genexpr>r   N)r  r  r   r  r  r  rv   r     s
    &
c                 3   s,    | ] t j fd dD ddV  qdS )c                    s   g | ]}|  qS ru   ru   ri  r  ru   rv   rl    r  zBstack_model_outputs.<locals>._concat.<locals>.<genexpr>.<listcomp>r   r  Nr  r  r  r  rv   r     s   * r  )r   r   rq   r   r  r   Zfrom_batch_splitsr   r  r  r   r   r  r  r  r3  r  ru   r  rv   _concat  s"   

"
z$stack_model_outputs.<locals>._concatc                    s$   i | ]   fd dD qS )c                    s   g | ]}t | qS ru   )r   )r   Zmodel_outputrQ  ru   rv   rl    rs  z2stack_model_outputs.<locals>.<dictcomp>.<listcomp>ru   r  )r  r^  rQ  rv   r     r  z'stack_model_outputs.<locals>.<dictcomp>Nru   )r   r3  r  r  r   )r^  r   Zconcatenated_dataru   )r  r  r^  rv   r,    s   r,  g?ZInfgMbPri   baseline_scoresrelative_topfilter_valuerE  c                 C   s   | j dd}|j dd}tj|dd\}}	|d|d f }
tj|ddj}|t| }t|
|}|d}||||k < ||||k < ||fS )a]  
    Reference: https://github.com/XiangLi1999/ContrastiveDecoding/blob/170e9142e92159c1237d731e240f5eb14aabf428/transformers/src/transformers/generation_logits_process.py#L235
    Apply filtering to only keep tokens with a probability above a certain threshold. The threshold is defined as `relative_top` * max probability in the distribution.
    """
    # Compiled body, not recoverable as source; it log-softmaxes both score sets,
    # derives a threshold from the max probability and `relative_top`, masks both
    # tensors below it, and returns the filtered pair. The rule is sketched below.
    ...


def _dola_select_contrast(
    candidate_premature_layers: List[int],
    candidate_premature_logits: Dict[int, torch.FloatTensor],
    final_logits: torch.FloatTensor,
) -> torch.FloatTensor:
    # Compiled body, not recoverable as source. Recoverable outline: pick the
    # premature layer whose distribution has the largest Jensen-Shannon divergence
    # from the final layer (two `kl_div` terms against their average distribution),
    # apply `_relative_top_filter`, and return final minus premature log-probs.
    ...
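

# --- Hedged sketch (not the original body): the relative-top rule in log space. A
# token survives when log p(token) >= log(max_token p) + log(relative_top). Relies
# on the module-level `torch` and `numpy` (as `np`) imports;
# `_demo_relative_top_mask` is a hypothetical illustration.
def _demo_relative_top_mask(log_probs, relative_top: float = 0.1):
    # log_probs: (batch, vocab_size) log-softmax scores; returns a bool keep-mask.
    threshold = log_probs.max(dim=-1, keepdim=True).values + np.log(relative_top)
    return log_probs >= threshold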