
    eTha                       S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	J
r
JrJrJrJr  S SKrS SKrS SKJr  S SKJr  S SKJr  S SKJr  S SKJr  SS	KJrJrJ r J!r!J"r"J#r#J$r$  SS
K%J&r&  SSK'J(r(J)r)J*r*J+r+  SSK,J-r-  SSK.J/r/  SSK0J1r1J2r2  SSK3J4r4  SSK5J6r6  SSK7J8r8J9r9J:r:J;r;J<r<J=r=  SSK>J?r?J@r@  SSKAJBrBJCrCJDrD  SSKEJFrFJGrGJHrHJIrIJJrJJKrKJLrLJMrMJNrNJOrO  SSK%JPrPJQrQJRrRJSrS  SSKTJUrUJVrVJWrWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_J`r`JaraJbrbJcrcJdrdJereJfrfJgrgJhrhJiriJjrjJkrkJlrlJmrmJnrn  SSKoJprpJqrqJrrrJsrsJtrtJuruJvrv  \(       a  SSKwJxrx  SSKyJzrz  SSK{J|r|  \=R                  " \~5      r\9" 5       (       a  S SKJrJr  / SQr\ " S S\85      5       r\ " S  S!\85      5       r\ " S" S#\85      5       r\ " S$ S%\85      5       r\r\r\r\r\r\r\r\r\r\r\\\4   r\\\4   r\\\4   r\\\4   r\\\4   r\\\4   r\\\4   r\\\4   r " S& S'5      rS( rSGS) jrS*\GR:                  S+\GR:                  S,\GR:                  S-\GR<                  S.\S/\S0\GR:                  4S1 jrS2\S3\4S4 jrS5\\8\
4   S3\S2\S6\&S0\\\8\
4      4
S7 jrS8\\8   S6\&S0\84S9 jrS:\" S;5      * S<S4S=\GR:                  S>\GR:                  S?\S@\SA\S0\GR:                  4SB jjrSC\\   SD\
\\GR:                  4   SE\GR:                  S0\GR:                  4SF jrg)H    N)	dataclass)TYPE_CHECKINGAnyCallableDictListOptionalTupleUnion)file_exists)version)nn)
functional   )CacheDynamicCacheEncoderDecoderCacheHybridChunkedCacheOffloadedCacheOffloadedHybridCacheQuantizedCacheConfig)PretrainedConfig)check_python_requirementsget_cached_module_fileget_class_in_moduleresolve_trust_remote_code)is_deepspeed_zero3_enabled)is_fsdp_managed_module)CausalLMOutputWithPastSeq2SeqLMOutput)isin_mps_friendly)ExtensionsTrie)ModelOutputis_accelerate_availableis_hqq_availableis_optimum_quanto_availableis_torchdynamo_exportinglogging   )DisjunctiveConstraintPhrasalConstraint)
BeamScorerBeamSearchScorerConstrainedBeamSearchScorer)
AssistantVocabTranslatorCacheAssistedCandidateGenerator-AssistedCandidateGeneratorDifferentTokenizersCandidateGeneratorEarlyExitCandidateGeneratorPromptLookupCandidateGenerator%UniversalSpeculativeDecodingGenerator_crop_past_key_values_prepare_attention_mask_prepare_token_type_ids) NEED_SETUP_CACHE_CLASSES_MAPPINGQUANT_BACKEND_CLASSES_MAPPINGGenerationConfigGenerationMode)#EncoderNoRepeatNGramLogitsProcessor'EncoderRepetitionPenaltyLogitsProcessorEpsilonLogitsWarperEtaLogitsWarperExponentialDecayLengthPenaltyForcedBOSTokenLogitsProcessorForcedEOSTokenLogitsProcessorHammingDiversityLogitsProcessorInfNanRemoveLogitsProcessorLogitNormalizationLogitsProcessorListMinLengthLogitsProcessor!MinNewTokensLengthLogitsProcessorMinPLogitsWarperNoBadWordsLogitsProcessorNoRepeatNGramLogitsProcessor PrefixConstrainedLogitsProcessor RepetitionPenaltyLogitsProcessorSequenceBiasLogitsProcessor$SuppressTokensAtBeginLogitsProcessorSuppressTokensLogitsProcessorTemperatureLogitsWarperTopKLogitsWarperTopPLogitsWarperTypicalLogitsWarper.UnbatchedClassifierFreeGuidanceLogitsProcessor)ConfidenceCriteriaEosTokenCriteriaMaxLengthCriteriaMaxTimeCriteriaStoppingCriteriaStoppingCriteriaListStopStringCriteria)PreTrainedModel)PreTrainedTokenizerBase)BaseStreamer)AlignDevicesHookadd_hook_to_module)past_key_valuescache_paramsstatememspast_buckets_statesc                   6   \ rS rSr% Sr\R                  \S'   Sr\	\
\R                        \S'   Sr\	\
\R                        \S'   Sr\	\
\
\R                           \S'   Sr\	\
\
\R                           \S'   Sr\	\
\
\
\R                              \S	'   S
rg)GenerateDecoderOnlyOutput   a  
Outputs of decoder-only generation models, when using non-beam methods.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
        Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
        Returns the model cache, used to speed up decoding. Different models have a different cache format, check
        the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
	sequencesNscoreslogits
attentionshidden_statesrc    )__name__
__module____qualname____firstlineno____doc__torch
LongTensor__annotations__rl   r	   r
   FloatTensorrm   rn   ro   rc   __static_attributes__rp       U/var/www/auris/envauris/lib/python3.13/site-packages/transformers/generation/utils.pyri   ri      s    4 15FHU5,,-.515FHU5,,-.5<@JuU%6%6789@?CM8E%(9(9":;<CHLOXeE%0A0A*B$CDELr{   ri   c                      \ rS rSr% Sr\R                  \S'   Sr\	\
\R                        \S'   Sr\	\
\R                        \S'   Sr\	\
\R                        \S'   Sr\	\
\R                        \S'   Sr\	\
\
\R                           \S	'   Sr\	\
\
\R                           \S
'   Sr\	\
\
\R                           \S'   Sr\	\
\
\
\R                              \S'   Srg)GenerateEncoderDecoderOutput   a  
Outputs of encoder-decoder generation models, when using non-beam methods.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
        Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
        sequence_length, sequence_length)`.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
    decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Returns the model cache, used to speed up decoding. Different models have a different cache format, check
        the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
rk   Nrl   rm   encoder_attentionsencoder_hidden_statesdecoder_attentionscross_attentionsdecoder_hidden_statesrc   rp   )rq   rr   rs   rt   ru   rv   rw   rx   rl   r	   r
   ry   rm   r   r   r   r   r   rc   rz   rp   r{   r|   r~   r~      s   !F 15FHU5,,-.515FHU5,,-.5=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDKHLOXeE%0A0A*B$CDELr{   r~   c                      \ rS rSr% Sr\R                  \S'   Sr\	\R                     \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\R                     \S'   Sr\	\\\R                           \S	'   Sr\	\\\R                           \S
'   Sr\	\\\\R                              \S'   Srg)GenerateBeamDecoderOnlyOutput   au
  
Outputs of decoder-only generation models, when using beam methods.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
        Final beam scores of the generated `sequences`.
    scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
        Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
        of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
        Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
        with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
    beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
        Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
        `(batch_size*num_return_sequences, sequence_length)`.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
    hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
    past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
        Returns the model cache, used to speed up decoding. Different models have a different cache format, check
        the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
rk   Nsequences_scoresrl   rm   beam_indicesrn   ro   rc   rp   )rq   rr   rs   rt   ru   rv   rw   rx   r   r	   ry   rl   r
   rm   r   rn   ro   rc   rz   rp   r{   r|   r   r      s    @ 48hu001815FHU5,,-.515FHU5,,-.5/3L(5++,3<@JuU%6%6789@?CM8E%(9(9":;<CHLOXeE%0A0A*B$CDELr{   r   c                      \ rS rSr% Sr\R                  \S'   Sr\	\R                     \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\R                     \S'   Sr\	\\R                        \S	'   Sr\	\\R                        \S
'   Sr\	\\\R                           \S'   Sr\	\\\R                           \S'   Sr\	\\\R                           \S'   Sr\	\\\\R                              \S'   Srg) GenerateBeamEncoderDecoderOutputi  a   
Outputs of encoder-decoder generation models, when using beam methods.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
        Final beam scores of the generated `sequences`.
    scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
        Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
        of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
        Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
        with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
    beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
        Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
        `(batch_size*num_return_sequences, sequence_length)`.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
        sequence_length, sequence_length)`.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
    decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
        sequence_length)`.
    cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
    past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
        Returns the model cache, used to speed up decoding. Different models have a different cache format, check
        the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
rk   Nr   rl   rm   r   r   r   r   r   r   rc   rp   )rq   rr   rs   rt   ru   rv   rw   rx   r   r	   ry   rl   r
   rm   r   r   r   r   r   r   rc   rz   rp   r{   r|   r   r     s/   (T 48hu001815FHU5,,-.515FHU5,,-.5/3L(5++,3=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDKHLOXeE%0A0A*B$CDELr{   r   c            "          \ rS rSrSr  SS\\\\R                  4      S\\
   S\4S jjrS\R                  S	\\R                     S
\\R                     S\\R                  \R                  4   4S jrS\R                  S	\\R                     S
\\R                     S\\R                  \R                  4   4S jr    SS\R                  S\\   S\\R                     S	\\R                     S
\\R                     4
S jjr   SS\\R*                     S\\R*                     S\\\\R*                  4      S\\R*                  \\   \\\R*                  4   4   4S jjr   SS\\R*                     S\\R*                     S\\\\R*                  4      S\R                  4S jjrS\R*                  S\S\\\4   S\R                  4S jrS\R*                  S\\   S\S\\\4   4S jr SS\S\S\\\R*                  4   S\R*                  S\\R<                     S\\R                  \\\R*                  4   4   4S jjr\    SS\S\
S\\R                     S\\R                  \\\4   4   4S  jj5       r!  SS!\"S\\\4   S\
S"\S\\\4   4
S# jjr#S$ r$S\S\R                  S\R*                  S%S&S'\%S(S)S*S)S\S\&4S+ jr'    SS\S,\S-\R                  S.\\\R*                  /\(\   4   S'\\%   S\\   S\\\\4      S/\\R*                     S0\\R*                     S\%4S1 jjr) SS\S2\\*   S3\S)   S\*4S4 jjr+S5\\%\*4   S6\\%\*4   S\\%\*4   4S7 jr,  SS8\R*                  S9\\R*                     S:\\R*                     S;\
S\R*                  4
S< jjr-S= r.S\\\4   4S> jr/S? r0S@ r1 SS\\   SA\\
   SB\S\\\4   4SC jjr2SD r3S\\\\\\4   4      4SE jr4SF\S\SG\S\R<                  S\4
SH jr5S\
4SI jr6S\S\S%S&S\SJ\S\R<                  S\
4SK jr7S\
4SL jr8  SS\SM\\
   S\\\R<                  \4      4SN jjr9S\S\S\
4SO jr:\Rv                  " 5                   SS\\R*                     S\\   S'\\%   S2\\*   S.\\\\R*                  /\(\   4      SP\\
   S%\S&   SQ\SR   S/\\R*                     S0\\R*                     SA\\
   SS\\   S\\<\R                  4   4ST jj5       r=SU\
SP\
S\R<                  S\
4SV jr> SS\R                  S3\S)   S\R                  4SW jjr?S\R                  SX\\\(\   4   S'\%S2\*S\SP\
SQSRS\\@\R                  4   4SY jrA\Rv                  " 5       S\R                  S'\%S2\*S\SP\
SQ\SR   S\\@\R                  4   4SZ j5       rBS\R                  S'\%S2\*S\SP\
SQ\SR   S\\@\R                  4   4S[ jrCS\ rD\ S]\R*                  S\R*                  4S^ j5       rE\ S]\R*                  S\S_\S\R*                  4S` j5       rF\ S]\R*                  S:\R*                  S\R*                  4Sa j5       rG\ Sb\R*                  Sc\R*                  Sd\R*                  Se\R*                  Sf\Sg\Sh\Si\\
\4   Sj\H4Sk j5       rISl\R*                  Sm\R*                  Sn\R*                  Sf\Sh\So\
Sp\S_\Sq\S\S\\R*                  \R*                  \R*                  4   4Sr jrJSs\R*                  St\R*                  Su\R*                  Se\R*                  S_\S\\R*                  \R*                  \R*                  4   4Sv jrKS8\R*                  St\R*                  Sc\R*                  Ss\R*                  S:\R*                  Su\R*                  Sd\R*                  Se\R*                  Sw\R*                  S_\Sf\Sh\Sj\HSi\\
\4   S\\R*                  \R*                  \R*                  \R*                  4   4Sx jrLS\R                  S'\%S2\*S\SP\
S\\M\R                  4   4Sy jrNS\R                  Sz\OS'\%S2\*S\SP\
4S{ jrPS\R                  S|\QS'\%S2\*S\SP\
S\\M\R                  4   4S} jrRS\R                  S~\&S'\%S2\*S\SP\
SQ\SR   S\\@\R                  4   4S jrSS\R                  S\4S jrTSrUg)GenerationMixinia  a)	  
A class containing all functions for auto-regressive text generation, to be used as a mixin in model classes.
Inheriting from this class causes the model to have special generation-related behavior, such as loading a
`GenerationConfig` at initialization time or ensuring `generate`-related tests are run in `transformers` CI.

A model class should inherit from `GenerationMixin` to enable calling methods like `generate`, or when it
has defined a custom `generate` method that relies on `GenerationMixin`, directly or indirectly, which
approximately shares the same interface to public methods like `generate`. Three examples:
    - `LlamaForCausalLM` should inherit from `GenerationMixin` to enable calling `generate` and other public
        methods in the mixin;
    - `BlipForQuestionAnswering` has a custom `generate` method that approximately shares the same interface as
       `GenerationMixin.generate` (it has a few extra arguments, and the same output). That function also calls
       `GenerationMixin.generate` indirectly, through an inner model. As such, `BlipForQuestionAnswering` should
       inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
    - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
        However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
        `BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
    - *greedy decoding* if `num_beams=1` and `do_sample=False`
    - *contrastive search* if `penalty_alpha>0` and `top_k>1`
    - *multinomial sampling* if `num_beams=1` and `do_sample=True`
    - *beam-search decoding* if `num_beams>1` and `do_sample=False`
    - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
    - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
    - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
    - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
Npretrained_model_name_or_pathtrust_remote_codereturnc                    [         R                  R                  U5      nSnU(       aE  [         R                  R                  [         R                  R                  US5      5      (       d  SnO[	        US5      (       d  SnU(       d  [        SU S35      eSU S3n[        UUUU(       + US9  [        U4S	S
0UD6  [        U4SS0UD6n[        SU5      nU$ )a  
Loads and returns a custom generate function, given a model repo.

Args:
    pretrained_model_name_or_path (`str` or `os.PathLike`):
         Can be either:
            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
    trust_remote_code (`bool`, *optional*):
        Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
        should only be set to `True` for repositories you trust and in which you have read the code, as it will
        execute code present on the Hub on your local machine.
    **kwargs:
        Additional keyword arguments for remote code loading.

Raises:
    OSError: If `pretrained_model_name_or_path` does not contain a `custom_generate` subdirectory.

Returns:
    A callable that can be used to generate text.
Tzcustom_generate/generate.pyF`zw` does not contain a `custom_generate` subdirectory with a `generate.py` file, can't load the custom generate function.zThe repository `zS` contains custom generation code that will override the default `generate` method.)has_local_codehas_remote_codeerror_messagerequirements_filez custom_generate/requirements.txtmodule_filegenerate)
ospathexistsjoinr   OSErrorr   r   r   r   )	selfr   r   kwargsis_local_codehas_custom_generate_folderr   modulecustom_generate_functions	            r|   load_custom_generate$GenerationMixin.load_custom_generate  s   : 'DE%)"77>>"'',,/LNk"lmm-2*<>[\\-2*)12 3O O  <= >- - 	 	")( --'	
 	")	
=_	
ci	
 ()
7T
X^
 $7z6#J ''r{   	input_idsinputs_embedscache_positionc                 j   [        5       (       a  U R                  XU5      $ Ub/  UR                  S   S:X  a  USS2UR                  S   * S24   nX!4$ Uc  US   UR                  S   :  a  USS2UR                  S   * S24   nX!4$ UR                  S   UR                  S   :w  a	  USS2U4   nX!4$ )a  
Generic cache-dependent input preparation
The code is put in a separate function to allow granular unit testing
as it needs a different implementation to be exportable.

If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
- Exception 1: when passing input_embeds, input_ids may be missing entries
- Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
- Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
- Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
  generate the first token for each sequence. Later use the generated Input ids for continuation.

The current implementation does not rely on ``self`` and could be
a class method. It is left as a standard method to be easily rewritten.
Nr)   r   )r'   ,_cache_dependant_input_preparation_exportingshape)r   r   r   r   s       r|   "_cache_dependant_input_preparation2GenerationMixin._cache_dependant_input_preparation  s    * $%%DDY_mnn$);q)@)!n.B.B1.E-E-G*GHM '' %r"iooa&88!!n&:&:1&=%=%?"?@I '' __Q>#7#7#::!!^"34I''r{   c                    ^^^ Uc  USS2U4   nX!4$ S mS mS m[         R                  " UR                  S   S:H  U4S jUU4S jXU/5      u  p!X!4$ )	z
This method implements method ``_cache_dependant_input_preparation``
with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
The code is put in a separate function to allow granular unit testing.
Nc                 4    U S S 2UR                   S   * S 24   $ Nr   r   )r   r   s     r|   branch_1NGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_1  s#    $Q)=)=a)@(@(B%BCCr{   c                 4    U S S 2UR                   S   * S 24   $ r   r   r   r   s     r|   branch_2NGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_2   s#     ^%9%9!%<$<$>!>??r{   c                     U S S 2U4   $ Nrp   r   s     r|   branch_3NGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_3  s     N!233r{   r)   r   c                    > T" X5      U 4$ r   rp   )r   r   r   r   s      r|   <lambda>NGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>	  s     ?!Fr{   c                 j   > U[         R                  " US   U R                  S   :  TU4S jX/5      4$ )Nr   r)   c                 t   > [         R                  " U R                  S   UR                  S   :g  TS X/5      $ )Nr)   r   c                     U $ r   rp   r   s     r|   r   rGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>  s    yr{   rv   condr   )r   r   r   s     r|   r   `GenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>.<locals>.<lambda>  s7     %

$-OOA$6.:N:Nq:Q$Q$,%P%.$?	!"r{   r   )r   r   r   r   r   s      r|   r   r     s>    %

*2.)//!2DD$ '7Fr{   r   )r   r   r   r   r   r   r   s       @@@r|   r   <GenerationMixin._cache_dependant_input_preparation_exporting  s{      !!^"34I` ''KD@4 (-zz"a'" >:5($M8 ''r{   rc   attention_maskc                    0 nU R                   (       a  XWS'   OXUcU  Ub  US   S   R                  S   OSn[        R                  " XR                  S   [        R                  UR
                  S9nUb  X'S'   U R                  XU5      u  pAU R                  R                  (       a  SOS	n	U R                  R                  (       dM  Ub%  [        U5      UR                  S   :X  a	  SXy'   XGS
'   ODUR                  [        R                  S9Xy'   SUS
'   OUR                  [        R                  S9Xy'   U R                  R                  (       a  UOSn
U R                  R                  (       a  UR                  SS5      OUnU R                  R                  (       a  SOSnU R                  R                  (       a  SOSnUb  UR                  U5      c  U[        [        R                   " U R"                  5      R$                  R'                  5       5      ;   a;  UR	                  5       R)                  S5      S-
  nUR+                  US:H  S5        XU'   S H~  nUR                  U5      nUc  M  Ub^  UR                  S
5      b  US
   R                  S   OXy   R                  S   nUSS2U* S24   nUR                  [        R                  S9nXU'   M     [-        U[.        5      (       Ga
  UR0                  (       a  Ub  UR2                  S:X  a  US
   b  US
   R                  u  nnnOXy   R                  SS u  nn[5        X R6                  U 5      n[9        US5      (       a  UR;                  5       OSn[5        USS5      nUc  Ub  [5        USS5      nUc-  [<        R?                  U R@                  RB                   S35        O/U" UUURE                  5       U RF                  UUU R                  US9nUb  X7U'   U
b  XS'   URI                  5        H  u  nnUU;  d  M  UUU'   M     UR                  SS5        U$ )aZ  
Prepare the model inputs for generation. In includes operations like computing the 4D attention mask or
slicing inputs given the existing cache.

See the forward pass in the model documentation for expected arguments (different models might have different
requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
r   Nr   r   r)   dtypedevicerc   decoder_input_idsr   r   )memory_formatdecoder_attention_maskr   decoder_position_idsposition_idsr   )r   token_type_idsr   get_decoder5_prepare_4d_causal_attention_mask_with_cache_positiona   has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.)sequence_lengthtarget_lengthr   r   
batch_sizeconfigrc   labels)%_supports_cache_classr   rv   arangelongr   r   r   is_encoder_decoderlenclonecontiguous_formatpopgetsetinspect	signatureforward
parameterskeyscumsummasked_fill_
isinstancer   is_compileablendimgetattrbase_model_prefixhasattrr   loggerwarning_once	__class__rq   get_max_cache_shaper   items)r   r   rc   r   r   r   r   model_inputspast_lengthinput_ids_keyencoder_attention_maskattention_mask_keyposition_ids_keyr   model_input_namemodel_inputcurrent_input_lengthr   r   _
base_modeldecodercausal_mask_creation_functionkeyvalues                            r|   prepare_inputs_for_generation-GenerationMixin.prepare_inputs_for_generation$  sC   $ %%-;)* #<K<W/!,Q/55a8]^K"\\+q7IQVQ[Q[dmdtdtuN &.=*+'+'N'N.($M
 04{{/M/M+S^{{--(S-@MDWDWXYDZ-Z.2+0=_- /8ooELcLco.d+04_-*3//H_H_/*`L' 48;;3Q3QW[:>++:X:XFJJ/6^l 	 :>9W9W5]m59[[5S5S1Yg&

+,4 C(9(9$,,(G(R(R(W(W(Y$ZZ)..077;a?L%%n&91='3#$ ![ **%56K&". (++O<H %_5;;A>)8>>qA )
 #.a2F1F1G.G"HK"-"3"3%BYBY"3"ZK1<-. ![  ....*##q(O,81=o1N1T1T.
OQ.:.I.O.OPRQR.S+
O !'='=tDJ29*m2T2Tj,,.Z^G,3SUY-) -49L07TVZ1- -4##~~../ 0' ' "?"$3"1"E"E"G**#1);;$3	" %/=+,!--C)* !,,.JC,&$)S! )
 	4(r{   inputsbos_token_idmodel_kwargsc                 `   U R                   R                  (       aL  [        U S5      (       a;  U R                  R                  U R                  :w  a  U R                  R                  nOU R                  nUR                  5        VVs0 s H  u  pVUc  XT:w  d  M  XV_M     nnnUR                  US5      nUb  Ub  [        SU SU SU SU S3	5      eUb  UnUS:X  a  S	U;   a  U R                   R                  (       d  S	[        [        R                  " U R                  5      R                  R                  5       5      ;   nU(       d#  [        S
U R                  R                   S35      eU R!                  XUS9US'   OUb  [        S5      eUS	   S	pAU R!                  XU5      nXU4$ s  snnf )zD
This function extracts the model-specific `inputs` for generation.
encoderNz
`inputs`: z` were passed alongside z0 which is not allowed. Make sure to either pass z or z=...r   r   zAYou passed `inputs_embeds` to `.generate()`, but the model class z doesn't have its forwarding implemented. See the GPT2 implementation for an example (https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!)r	  zMYou passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.)r   r   r   r  main_input_namer   r   
ValueErrorr   r   r   r  r   r   r   rq   *_maybe_initialize_input_ids_for_generation)	r   r  r  r	  
input_namekvinputs_kwarghas_inputs_embeds_forwardings	            r|   _prepare_model_inputs%GenerationMixin._prepare_model_inputs  s    KK**i((,,0D0DD55J--J)5););)=b)=RSRa)=b $''
D9#(:VH$<ZL I,,284
|4I  %!F $L)H;;11/>#%%d&H&HITTYY[C 0, 4$[\`\j\j\s\s[t ux x  -1,[,[| -\ -[) %$%tuu!-o!>J @@Wcd<//U cs   F*F*c                    Ub  U$ UR                  S5      nU R                  R                  (       aQ  UbN  UR                  R	                  5       SS n[
        R                  " U[
        R                  U R                  S9S-  $ SnUR                  5        H3  n[        U[
        R                  5      (       d  M$  UR                  S   n  O   SU;   a0  [
        R                  " US4[
        R                  U R                  S9$ Uc  [        S	5      e[
        R                  " US4[
        R                  U R                  S9U-  $ )
z3Initializes input ids for generation, if necessary.Nencoder_outputsr   r   ir)   r   r   zB`bos_token_id` has to be defined when no `input_ids` are provided.)r   r   r   last_hidden_statesizerv   onesr   r   valuesr   Tensorr   r  )r   r  r  r	  r  r   r   r  s           r|   r  :GenerationMixin._maybe_initialize_input_ids_for_generation  s    M&**+<=;;))o.I#55::<SbAE::e5::dkkJTQQ 
!((*E%.."[[^
 +
 l*::z1oUZZTTabbzz:q/DKKPS___r{   inputs_tensorgeneration_configc                    UR                   nUR                  nSU;   a  US   R                  S   S:  a  US   n[        R                  " UR                  S S [        R
                  UR                  S9nUc  U$ [        UR                  5      S:H  =(       a-    UR                  [        R                  [        R
                  4;   nU(       d  U$ US L=(       a    [        XS9R                  5       nUS L =(       d    [        XTS9R                  5       ) n	X-  n
UR                  U5      R                  5       nX-  Xj) -  -   nU$ )Nr   r)   r   r   r   elementstest_elements)_pad_token_tensor_eos_token_tensorr   rv   r  r   r   r   r   intr!   anyne)r   r  r  r	  pad_token_ideos_token_iddefault_attention_maskis_input_idsis_pad_token_in_inputs&is_pad_token_not_equal_to_eos_token_idcan_infer_attention_maskattention_mask_from_paddingr   s                r|   &_prepare_attention_mask_for_generation6GenerationMixin._prepare_attention_mask_for_generation  sR    )::(:: ,&<+D+J+J1+MPQ+Q(5M "'M,?,?,C5::^k^r^r!s))=../14g9L9LQVQZQZ\a\f\fPg9g))".d": "
}QUUW 	 3?$2F 2
|PTTVL
. $:#b &3&6&6|&D&I&I&K# (BE[^wEww 	 r{   r   c                   ^ U R                  5       n[        U S5      (       a6  [        US5      (       a  SUR                  l        O[	        U[        SS95        / SQnUR                  5        V^Vs0 s H%  u  mn[        U4S jU 5       5      (       a  M"  TU_M'     n	nn[        [        R                  " UR                  5      R                  5      n
SU
;   =(       d    SU
;   nU(       d+  U	R                  5        VVs0 s H  u  pxXz;   d  M  Xx_M     n	nnUR                  U	S	'   UR                  U	S
'   Ub  UOU R                  nSU	S'   XU'   U" S0 U	D6US'   U$ s  snnf s  snnf )Nhf_device_map_hf_hookT)io_same_device)decoder_
cross_attn	use_cachec              3   F   >#    U  H  nTR                  U5      v   M     g 7fr   )
startswith).0parguments     r|   	<genexpr>QGenerationMixin._prepare_encoder_decoder_kwargs_for_generation.<locals>.<genexpr>G  s!     I7H!x**1--7Hs   !r   r	  output_attentionsoutput_hidden_statesreturn_dictr  rp   )get_encoderr   r5  r6  rb   ra   r   r'  r   r   r   r   r   rA  rB  r  )r   r  r	  r   r  r  irrelevant_prefixr>  r  encoder_kwargsencoder_signatureencoder_accepts_wildcards          `    r|   ._prepare_encoder_decoder_kwargs_for_generation>GenerationMixin._prepare_encoder_decoder_kwargs_for_generation1  s{    ""$ 4))w
++26  /"7,<D,QR D $0#5#5#7
#7%I7HII HeO#7 	 

   1 1'// B M MN#+/@#@#gNVgDg '7E7K7K7M7MOHQYQn7M   /@.Q.Q*+1B1W1W-. 0@/K+QUQeQe(,}%+8'(7>7P7P&')
s   1!EE>E!E!r   decoder_start_token_idr   c                    Ub  SU;   a  UR                  S5      nO SU;   a  US:w  a  UR                  S5      nOSnUc  U R                  nUR                  S:X  aD  UR                  S   U:w  a  [	        SU SUR                  S    35      eUR                  SS5      nO)[        R                  " US4[        R                  US	9U-  nUc  UnXc4$ S
U R                  R                  R                  5       ;   dL  U R                  R                  S:X  a6  S
U R                  R                  R                  R                  5       ;   a   Xc4$ U R                  R                  S;   a   Xc4$ USS2S4   USS2S4   :g  R                  5       R!                  5       (       aY  [        R"                  " XF/SS9nSU;   a=  US   n[        R"                  " [        R$                  " U5      SS2SS24   U4SS9nXsS'   Xc4$ )zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNr   r   r)   r   z1`decoder_start_token_id` expected to have length z	 but got r   r   donutzvision-encoder-decoder)whisperdimr   )r   r   r   r   r  viewrv   r  r   r   rq   lowerr   
model_typer  allitemcat	ones_like)r   r   r   r	  rK  r   r   r   s           r|   )_prepare_decoder_input_ids_for_generation9GenerationMixin._prepare_decoder_input_ids_for_generationZ  s    #(;|(K , 0 01D EL(-=-L , 0 0 = $ >[[F!&&!+%++A.*< G
|S\]s]y]yz{]|\}~  &<%@%@Q%G" 

J?%**VLOee # $ 6, !..% //5577KK""&>>7dkkNaNaNlNlNrNrNtCt !.. [[##{2 !..  1%)?1)EEJJLQQSS %		+A*U[] ^'<7)56N)O&).__%;<QUCE[\*& :P56 ..r{   expand_sizer   c                    ^  T S:X  a  X#4$ U 4S jnUb  UR                  T SS9nU" U5      nU(       a+  UR                  S5      c  [        S5      eU" US   5      US'   X#4$ )zIExpands tensors from [batch_size, ...] to [batch_size * expand_size, ...]r)   c                    > U  HI  nUS:w  d  M  X   c  M  [        X   [        R                  5      (       d  M5  X   R                  TSS9X'   MK     U $ )Nr   r   rO  )r   rv   r  repeat_interleave)dict_to_expandr  rZ  s     r|   _expand_dict_for_generationRGenerationMixin._expand_inputs_for_generation.<locals>._expand_dict_for_generation  sX    %++&+7">#6EE*8*=*O*OP[ab*O*cN' & "!r{   r   rO  r  zMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)r]  r   r  )rZ  r   r   r	  r_  s   `    r|   _expand_inputs_for_generation-GenerationMixin._expand_inputs_for_generation  s     !**	"  !33KQ3GI2<@ 12: !pqq.I,WhJi.jL*+&&r{   outputsnum_new_tokensc                    [          H"  nXQ;   d  M
  US;   a  SnOUn[        X5      X&'     O   SU;   a4  US   n[        R                  " XwS S 2S4   R	                  S5      /SS9US'   U(       dC  SU;   a<  US   n[        R                  " XR                  UR                  S   S45      /SS9US'   OBS	U;   a<  US	   n	[        R                  " XR                  U	R                  S   S45      /SS9US	'   UR                  S
S5      (       a  US   SS  U-   US'   U$ UR                  S5      n
[        R                  " U
S   S-   U
S   U-   S-   U
R                  S9R                  U
R                  5      n[        R                  " X45      US'   U$ )N)rg   rf   rc   r   r   rO  r   r   r)   r   r9  Tr   r   )ALL_CACHE_NAMESr   rv   rV  	unsqueezenew_onesr   r   r   r   r   tor   )r   rc  r	  r   rd  possible_cache_name
cache_namer   r   r   past_positionsnew_positionss               r|   #_update_model_kwargs_for_generation3GenerationMixin._update_model_kwargs_for_generation  s    $3"-&*II!2J!4J+27+P( $3 |+)*:;N-2YYWXZ\W\H]HgHghjHk7lrt-uL)*!</!-.>!?16#%<%<n>R>RST>UWX=Y%Z[ac2-.
 (<7)56N)O&9>+-L-LNdNjNjklNmopMq-rs:56
 K..-9:J-KBC-PSa-aL)*  *--.>?N!LLr"Q&r(:^(Ka(OWeWkWkb&&'  .3YY7V-WL)*r{   c                 `    [        SU R                  R                   SU R                   35      e)NzGMake sure that a `_reorder_cache` function is correctly implemented in z to enable beam search for )NotImplementedErrorr   rr   )r   rc   beam_idxs      r|   _reorder_cacheGenerationMixin._reorder_cache  s9    !UVZVdVdVoVoUp q''+~~&68
 	
r{   assistant_modelr^   logits_processortarget_tokenizerr_   assistant_tokenizerc	                    [        S XFU4 5       5      n	UR                  b  [        UU UUUUS9n
U
$ UR                  b6  [	        UR
                  UR                  UR                  UR                  S9n
U
$ U	(       a  UR                  SL a^  [        R                  " UUU R                  R                  5       R                  USS9nSUR                  l        [!        UUUUUUUUUS9	n
U
$ UR                  SL a  [#        UUUUUUUUS	9n
U
$ [%        S
['        UR                  5      R(                   35      e[+        UUUUUUS9n
U
$ )zE
Returns the candidate generator to be used in `assisted_generation`
c              3   (   #    U  H  oS Lv   M
     g 7fr   rp   )r<  r  s     r|   r?  ;GenerationMixin._get_candidate_generator.<locals>.<genexpr>  s     "s:rQD=:r   N)r   rv  r  r	  r  rw  )r*  num_output_tokensmax_matching_ngram_size
max_lengthT)rv  assistant_prune_lm_head)	r   rv  r  r	  r  rw  rx  ry  atm_translatorF)r   rv  r  r	  r  rw  rx  ry  z7Invalid value for `do_sample`: expected a boolean, got )rT  assistant_early_exitr3   prompt_lookup_num_tokensr4   r%  r  r  	do_sampler/   get_translatorr   get_text_config
vocab_sizer  repetition_penaltyr5   r1   r  typerq   r0   )r   r  r   r  rv  rw  rx  ry  r	  different_tokenizerscandidate_generatorr  s               r|   _get_candidate_generator(GenerationMixin._get_candidate_generator  s     #"s?^q:r"ss11="=# $"3)+!1#z #"k 77C"@.@@"3"L"L(9(Q(Q,77	#h #"] " **d2!>!M!M$'KK//1<<$3,0" HL11D&K'$3&7!-"/%5%5(;#1
'#F #"1 #,,5&S'$3&7!-"/%5%5(;	'#. #" !MdSdSnSnNoNxNxMyz  #=# /"3)+!1# #"r{   input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnnegative_prompt_idsnegative_prompt_attention_maskc
           
         [        5       n
UR                  b@  UR                  S:w  a0  U
R                  [        UR                  U UU	UR                  S95        UR
                  b"  U
R                  [        UR
                  S95        UR                  bH  UR                  S:  a8  U
R                  [        UR                  UR                  UR                  S95        UR                  bh  UR                  S:w  aX  [        UR                  5      S:X  a$  U
R                  [        UR                  US95        O[        R                   " S	["        5        UR$                  b2  UR$                  S:w  a"  U
R                  ['        UR$                  S
95        UR(                  b4  UR(                  S:  a$  U
R                  [+        UR(                  5      5        UR,                  bj  UR,                  S:  aZ  [        UR                  5      S:X  a&  U
R                  [/        UR,                  U5      5        O[        R                   " S["        5        UR0                  b/  U
R                  [3        UR0                  UR4                  5      5        UR6                  bK  UR4                  b>  UR6                  S:  a.  U
R                  [9        UR6                  UR4                  US95        UR:                  bL  UR4                  b?  UR:                  S:  a/  U
R                  [=        UUR:                  UR4                  US95        Ub2  U
R                  [?        UUR                  UR                  -  5      5        UR@                  b$  U
R                  [C        UR@                  5      5        URD                  b.  U
R                  [G        URH                  URD                  US95        URJ                  SL a  U
R                  [M        5       5        URN                  b0  U
R                  [Q        URN                  UR4                  U5      5        URR                  b#  U
R                  [U        URR                  US95        URV                  b@  UnUS:  d  UR@                  c  UOUS-   nU
R                  [Y        URV                  UUS95        URZ                  b  []        S5      eU R_                  X5      n
UR`                  (       GaX  UR                  S:  a  [c        UR4                  [d        5      (       a  [        UR4                  5      S-   nOK[c        UR4                  [f        Rh                  5      (       a  UR4                  R                  S   S-   nOSnOSnURj                  b4  URj                  S:w  a$  U
R                  [m        URj                  5      5        URn                  b3  URn                  S:w  a#  U
R                  [q        URn                  US95        URr                  b3  URr                  S:  a#  U
R                  [u        URr                  US95        URv                  b#  U
R                  [y        URv                  US95        URz                  b3  URz                  S:  a#  U
R                  [}        URz                  US95        UR~                  b=  SUR~                  s=:  a  S:  a&  O  O#U
R                  [        UR~                  US95        UR                  b=  SUR                  s=:  a  S:  a&  O  O#U
R                  [        UR                  XS95        UR                  bM  U
R                  UR                  R                  U R                  R                  5       R                  U5      5        UR                  SL a  U
R                  [        5       5        U
$ )z
This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
instances used to modify the scores of the language model head.
r)   )unconditional_idsunconditional_attention_maskr9  sequence_bias        )diversity_penalty	num_beamsnum_beam_groups      ?r   )penaltyr  zyPassing `encoder_repetition_penalty` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.)r  r   z{Passing `encoder_no_repeat_ngram_size` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.r   TzYou have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument in favour of `input_ids` or `decoder_input_ids` respectively.)top_kmin_tokens_to_keep)top_pr  )min_pr  )massr  )epsilonr  )r  r  r   )JrG   guidance_scaleappendrV   r9  r  rO   r  rD   r  r  encoder_repetition_penaltyr   r   r>   warningswarnUserWarningr  rN   no_repeat_ngram_sizerL   encoder_no_repeat_ngram_sizer=   bad_words_idsrK   r%  
min_lengthrH   min_new_tokensrI   rM   forced_bos_token_idrB   forced_eos_token_idrC   r  remove_invalid_valuesrE    exponential_decay_length_penaltyrA   suppress_tokensrQ   begin_suppress_tokensrP   forced_decoder_idsr  _merge_criteria_processor_listr  r   listrv   r  temperaturerR   r  rS   r  rT   r  rJ   	typical_prU   epsilon_cutoffr?   
eta_cutoffr@   watermarking_configconstruct_processorr   r  r  renormalize_logitsrF   )r   r  r  r  r  rw  r   r	  r  r  
processorsbegin_indexr  s                r|   _get_logits_processor%GenerationMixin._get_logits_processor?  s@   " )*
++7<M<\<\`a<a>%44&91O/99 **69HYHgHghi..:?P?b?beh?h/&7&I&I/99$5$E$E 88D!<<C$**+q0!!; 1 L L*; 9
 //;@Q@d@dhk@k>GXGkGklm11=BSBhBhklBl:;L;a;abc::F!>>B$**+q0!!7)FF) 9
 **6)%33%77 ((4!33?!,,q0(%00%77! ,,8!33?!00141(%44%77!	 $/0,%//3D3T3TT 00<-%99
 00<-%00%99! 22d:9;<==I-%FF%77( ,,8-%55! 22>.K )1,0A0U0U0]  1_ 
 4%;;! //;P  88V
 &&& !**Q./AA4HH),->-P-P)QTU)U& 1 C CU\\RR):)L)L)R)RST)UXY)Y&)*&%&" !,,8=N=Z=Z^a=a!!"9:K:W:W"XY &&27H7N7NRS7S!!$+<+B+BWij !&&27H7N7NQT7T!!$+<+B+BWij !&&2!!$+<+B+BWij !**6;L;V;VY\;\!!'->-H-H]op !//;FWFfFf@lil@l!!' 1 @ @Ug
 !++7CBSB^B^<dad<d!!# 1 < <Qc 00<!55IIKK//1<<f //47023r{   stopping_criteria	tokenizerc                    [        5       nUR                  b:  [        U R                  SS 5      nUR	                  [        UR                  US95        UR                  b"  UR	                  [        UR                  S95        UR                  b1  Uc  [        S5      eUR	                  [        UR                  US95        UR                  b"  UR	                  [        UR                  S95        UR                  (       a?  UR                  b2  UR                  S:  a"  UR	                  [        UR                  S95        U R!                  XR5      nU$ )	Nmax_position_embeddings)r  r  )max_timea  There are one or more stop strings, either in the arguments to `generate` or in the model's generation config, but we could not locate a tokenizer. When generating with stop strings, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.)stop_stringsr  )r*  r   )assistant_confidence_threshold)r\   r  r   r   r  rY   r  rZ   r  r  r]   r%  rX   is_assistantr  rW   r  )r   r  r  r  r   criteriar  s          r|   _get_stopping_criteria&GenerationMixin._get_stopping_criteria(  s0    ()''3&-dkk;TVZ&[#OO!0;;,C %%1OOO5F5O5OPQ))5  s 
 OO.<M<Z<Zfopq..:OO,:K:]:]^_**!@@L!@@1DOO"BSBrBrs 66xSr{   default_listcustom_listc                    [        U5      S:X  a  U$ [        U5      " 5       nU H  nSnU H  n[        U5      [        U5      L d  M  [        U[        5      (       a  SOSn[        R                  SU S[        U5       S[        U5       S[        U5       S	3	5        UR                  U5        S
n  O   U(       a  M  UR                  U5        M     U H  nXc;  d  M
  UR                  U5        M     U$ )a  
Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same
processor/criteria is present on both lists, use the user-defined one.

(Note: up to v4.49.0, this function threw an exception is the same logit processor was found twice.)
r   Fzstopping criteriazlogits processorz	A custom z	 of type zt has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom z5 will take precedence. Please check the docstring of z$ to see related `.generate()` flags.T)r   r  r   r[   r   r   r  )r   r  r  
final_listdefaultusing_customcustomobject_types           r|   r  .GenerationMixin._merge_criteria_processor_listO  s    {q ,')
#G L%<4=09CFL\9]9]"5cuK''#K=	$v, Heeijpeqdr sOOSTZ|n ]// %%f-#'L &  <!!'* $" "F'!!&) " r{   rk   rl   r   normalize_logitsc                    Ucj  [         R                  " US   R                  S   5      R                  SS5      R	                  UR
                  5      nUR                  S[        U5      5      n[         R                  " U5      R                  [        U5      S5      R                  SS5      nU(       a  UR                  SU R                  R                  5       R                  UR                  S   5      n[         R                  R                  R!                  USS9nUR                  SUR                  S   5      nUS:  nSUR#                  5       -
  R%                  S5      R'                  5       nUR)                  5       SS2SU24   nUSS2SU24   nSX5'   X0R                  R                  5       R                  -  nUR                  S   U-
  nUSS2US24   U-   n	UR+                  SU	5      n
SX'   U
$ )ao  
Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
used). This is a convenient method to quickly obtain the scores of the selected tokens at generation time.

Parameters:
    sequences (`torch.LongTensor`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
        shorter if all batches finished early due to the `eos_token_id`.
    scores (`tuple(torch.FloatTensor)`):
        Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
        of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
        Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
        with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
    beam_indices (`torch.LongTensor`, *optional*):
        Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
        `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
        generate-time.
    normalize_logits (`bool`, *optional*, defaults to `False`):
        Whether to normalize the logits (which, for legacy reasons, may be unnormalized).

Return:
    `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
        the transition scores (logits)

Examples:

```python
>>> from transformers import GPT2Tokenizer, AutoModelForCausalLM
>>> import numpy as np

>>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
>>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
>>> tokenizer.pad_token_id = tokenizer.eos_token_id
>>> inputs = tokenizer(["Today is"], return_tensors="pt")

>>> # Example 1: Print the scores for each token generated with Greedy Search
>>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
>>> transition_scores = model.compute_transition_scores(
...     outputs.sequences, outputs.scores, normalize_logits=True
... )
>>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
>>> # encoder-decoder models, like BART or T5.
>>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
>>> generated_tokens = outputs.sequences[:, input_length:]
>>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
...     # | token | token string | log probability | probability
...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
|   262 |  the     | -1.414 | 24.33%
|  1110 |  day     | -2.609 | 7.36%
|   618 |  when    | -2.010 | 13.40%
|   356 |  we      | -1.859 | 15.58%
|   460 |  can     | -2.508 | 8.14%

>>> # Example 2: Reconstruct the sequence scores from Beam Search
>>> outputs = model.generate(
...     **inputs,
...     max_new_tokens=5,
...     num_beams=4,
...     num_return_sequences=4,
...     return_dict_in_generate=True,
...     output_scores=True,
... )
>>> transition_scores = model.compute_transition_scores(
...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
... )
>>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
>>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
>>> # use case, you might want to recompute it with `normalize_logits=True`.
>>> # Tip 2: the output length does NOT include the input length
>>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
>>> length_penalty = model.generation_config.length_penalty
>>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
>>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
True
```Nr   r   r)   rO  )rv   r   r   rQ  rj  r   expandr   stackreshape	transposer   r  r  r   r   log_softmaxr   summaxr   gather)r   rk   rl   r   r  beam_indices_maskmax_beam_lengthbeam_sequence_indicescut_idxindicestransition_scoress              r|   compute_transition_scores)GenerationMixin.compute_transition_scorest  s   h  <<q	(:;@@QGJJ9K[K[\L'..r3v;?L V$,,S["=GG1M ^^B(C(C(E(P(PRXR^R^_aRbcFXX((44V4CF^^BR(89F )1,05577<<R@DDF#))+A/?/?,?@-a1A/1A.AB +,' !-{{/J/J/L/W/W W //"%7AwxK(+@@ #MM!W5 01,  r{   c                   ^ ^ Tc  g T R                   R                  (       as  TR                   R                  (       dX  / SQn[        TR                   5       Vs/ s H  oUU;   d  M
  UPM     nn[        UU 4S jU 5       5      nU(       d  [	        S5      eSnT R                   R                  5       R                  TR                   R                  5       R                  :X  a  Ub  [	        SU S35      eg Ub  Uc  [	        SU S35      eg s  snf )N)encoder_attention_headsencoder_ffn_dimencoder_layersc              3   ~   >#    U  H2  n[        TR                  U5      [        TR                  U5      :H  v   M4     g 7fr   )r   r   )r<  attrrv  r   s     r|   r?  6GenerationMixin._validate_assistant.<locals>.<genexpr>  s3      `sX\T*go6L6Ld.SS`ss   :=zThe main model and the assistant don't have compatible encoder-dependent input shapes. Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper.zc(see https://huggingface.co/docs/transformers/en/generation_strategies#universal-assisted-decoding)z`assistant_tokenizer` is not required when the main and assistant models use the same tokenizer. Please omit `assistant_tokenizer` from `generate()` .zThe main and assistant moedels have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` )r   r   dirrT  r  r  r  )r   rv  r  ry  attributes_to_checkr  	are_equaldoc_references   ``      r|   _validate_assistant#GenerationMixin._validate_assistant  sB   ";;))/2H2H2[2["b478N8N4O"o4OD[nSn44O"o `s I  J  r 	 ;;&&(337M7M7]7]7_7j7jj".  l  mz  l{  {|  }  /
  $7$?  V  Wd  Ve  ef  g  %@' #ps   	D
%D
c                    [        UR                  SS5      [        5      (       a3  U R                  (       d"  [	        U R
                  R                   S35      eU R                  R                  (       a  S H  nUR                  US5        M     / n[        [        R                  " U R                  5      R                  5      nSU;   d  SU;   a6  U[        [        R                  " U R                  5      R                  5      -  nU R                  R                  (       a  [!        X R"                  S5      n[!        U SS5      nUc  Ub  [!        USS5      nUb7  [        [        R                  " UR                  5      R                  5      nXG-  n[!        U SS5      nUc  Ub  [!        USS5      nUbK  [        [        R                  " UR                  5      R                  5      n	XI V
s1 s H  n
S	U
 3iM
     sn
-  nUR%                  5        H"  u  p+Uc  M
  X$;  d  M  UR'                  U5        M$     U(       a  [	        S
U S35      egs  sn
f )zXValidates model kwargs for generation. Generate argument typos will also be caught here.rc   Nz does not support an instance of `Cache` as `past_key_values`. Please check the model documentation for supported cache formats.)r   r   r	  r  r  r7  z8The following `model_kwargs` are not used by the model: zG (note: typos in the generate arguments will also show up in this list))r   r   r   r   r  r   rq   r   r   r   r   r   r   r  r   r   r   r   r   r  )r   r	  r  unused_model_args
model_argsr   r  encoder_model_argsr  decoder_model_argsxr  s               r|   _validate_model_kwargs&GenerationMixin._validate_model_kwargs  s"    l&&'8$?GGPTPjPj>>**+ ,M M  ;;)),  d+ - **4+M+MNYYZ
 z!^z%A#g//=HHIIJ ;;)) '='=tDJ dIt4G :#9!*i>"%():):7??)K)V)V%W"0
 dIt4G:#9!*i>"%():):7??)K)V)V%W"7IJ7I!!~7IJJ
&,,.JC S%:!((- / JK\J] ^F F   Ks   +Ic           	         U(       aF  UR                   c9  UR                  S:X  a)  [        R                  " SUR                   S3[        5        X!R                  :  a>  U R
                  R                  (       a  SOSn[        SU SU S	UR                   S
35      eSnU(       a  USUR                   S3-  nUR                  bS  UR                  UR                  :  a9  [        R                  " SUR                   SUR                   S3U-   [        5        UR                  b\  UR                  U-   nXaR                  :  a=  [        R                  " SUR                   SU SUR                   S3U-   [        5        ggg)z=Performs validation related to the resulting generated lengthN   z0Using the model-agnostic default `max_length` (=zz) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.r   r   zInput length of z is z, but `max_length` is set to z}. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.z Generation will stop at the defined maximum length. You should decrease the minimum length and/or increase the maximum length.z" Note that `max_length` is set to z, its default value.z-Unfeasible length constraints: `min_length` (z.) is larger than the maximum possible length (z).z1Unfeasible length constraints: `min_new_tokens` (z$), when added to the prompt length (z/), is larger than the maximum possible length ()
max_new_tokensr  r  r  r  r   r   r  r  r  )r   r  input_ids_lengthhas_default_max_lengthinput_ids_stringmin_length_error_suffixr  s          r|   _validate_generated_length*GenerationMixin._validate_generated_lengthD  s    "&7&F&F&NSdSoSosuSuMMBCTC_C_B` a  	 ;;;6:kk6T6T2Ze"#3"4D9I8J K%001 2UU + 	  "#45F5Q5Q4RRfg# ''38I8T8TWhWsWs8sMM?@Q@\@\?] ^11B1M1M0NbRTkl
 ++7*99<LLJ888GHYHhHhGi j33C2D E55F5Q5Q4RRTVXop  	 9 8r{   c                    UR                   bY  U(       d=  UR                  b0  [        R                  SUR                    SUR                   S35        UR                   U-   Ul        OUS:X  aP  XVR                  S   :w  a>  U R
                  R                  (       d#  U=R                  UR                  S   -  sl        OrU(       ak  UR                  [        5       R                  :X  aI  UR                  U-   Ul        [        U R
                  SS5      nUb  [        UR                  U5      Ul        UR                  bM  U(       d0  [        R                  SUR                   S	UR                   S
35        UR                  U-   Ul        U$ US:X  aX  XVR                  S   :w  aF  U R
                  R                  (       d+  [        UR                  UR                  S   -
  S5      Ul        U$ )z]Prepared max and min length in generation configs to avoid clashes between similar attributesNzBoth `max_new_tokens` (=z) and `max_length`(=z) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)r   r)   r  zBoth `min_new_tokens` (=z) and `min_length`(=z) seem to have been set. `min_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)r   )r  r  r   warningr   r   r   r;   r   minr  r  r  )r   r  r  has_default_min_lengthr   r  r  r  s           r|   _prepare_generated_length)GenerationMixin._prepare_generated_lengthp  s    ++7).?.J.J.V./@/O/O.PPd(334 5ff ,=+K+KN^+^(
 / $7$7$::KK22((M,?,?,BB(# ++/?/A/L/LL/@/K/KN^/^!,*1$++?XZ^*_'*6367H7S7SUl3m%0 ++7)./@/O/O.PPd(334 5ff ,=+K+KN^+^( !  / $7$7$::KK22+./@/K/KmNaNabcNd/dfg+h(  r{   use_model_defaultsr   c                    SnUc  U R                   R                  (       a  U R                   R                  [        U R                   5      :X  aw  [	        U R
                  R                  5       5      S:  aP  [        R                  " U R
                  5      nXPR                   :w  a!  [        R                  " S[        5        XPl         U R                   nSn[        R                  " U5      nU(       Gd  [        R                  " [        R                  " U R                   R                   5      R"                  5      nUSL d  Uc  U[        R                  " S5      :  a  0 n[        5       nU R                   n	U	R$                  R'                  5        HY  u  pU
R)                  S5      (       d  U
S:X  a  M#  [+        XS5      n[+        XS5      nX:X  d  MB  X:w  d  MI  XU
'   [-        XU5        M[     Uc(  [	        U5      S:  a  [.        R1                  S	U S
35        OUR2                  c  U R                   R2                  Ul        UR4                  c  U R                   R4                  Ul        UR6                  c  U R                   R6                  Ul        UR8                  c  U R                   R8                  Ul        UR:                  " S0 UD6nX4$ )z
Prepares the base generation config, then applies any generation configuration options from kwargs. This
function handles retrocompatibility with respect to configuration files.
FNr   a?  You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed in v5. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )Tz4.50.0r   transformers_versionzX`generation_config` default values have been modified to match model-specific defaults: z=. If this is not desired, please set these values explicitly.rp   )r  _from_model_config_original_object_hashhashr   r   &_get_non_default_generation_parametersr;   from_model_configr  r  r  copydeepcopyr   parser  base_version__dict__r   r;  r   setattrr   r   r  r*  r)  rK  update)r   r  r  r   using_model_generation_confignew_generation_configmodel_base_versionmodified_values global_default_generation_configmodel_generation_configr  model_gen_config_valueglobal_default_valuecustom_gen_config_valuer	  s                  r|   _prepare_generation_config*GenerationMixin._prepare_generation_config  sv    ).%$ &&99**@@DI_I_D``JJLMPQQ(8(J(J4;;(W%(,B,BBMMB $ .C* $ 6 6,0) !MM*;<, ")w}}T=S=S=h=h/i/v/v!w!T)"*/AW]]S[E\/\"$3C3E0*.*@*@'3J3S3S3Y3Y3[/C~~c**c5K.K +23SZ^+_(.56Gd.S+/G2J/E, 18NO 4\ &-#o2F2J''r*++hj
 %119595K5K5X5X%2$119595K5K5X5X%2$119595K5K5X5X%2$;;C?C?U?U?l?l%< )//9&9 ..r{   c                    SU;   a  US   (       a  U$ SU;   a\  U R                   R                  (       dA  [        R                  " US   SSS2S4   [        R                  S9R                  S5      S-
  nOSU;   a\  U R                   R                  (       aA  [        R                  " US   SSS2S4   [        R                  S9R                  S5      S-
  nO6[        R                  " U[        R                  US9R                  S5      S-
  nSnUR                  S	5      bh  US	   nSn[        U[        5      (       d  US   S   R                  S
   nO2[        US5      (       a!  UR                  5       b  UR                  5       nXES nXCS'   U$ )zbCalculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past lengthr   r   r   Nrf  r)   decoder_inputs_embedsr   rc   r   get_seq_length)r   r   rv   rW  int64r   r  r   r   r   r   r   r,  )r   
seq_lengthr   r	  r   r   caches          r|   _get_initial_cache_position+GenerationMixin._get_initial_cache_position  sr    |+=M0Nl*4;;3Q3Q"__\/-J1aQR7-S[`[f[fgnnopqtuuN$49W9W-D EaAg NV[VaVabiijklopp  #ZZ
%++fU\\]^_bccN-.: !23EKeU++#Ahqk//2 011e6J6J6L6X#224+L9N)7%&r{   c                 T   Sn[        U S5      (       a  [        U R                  R                  5       5      S1:X  d)  [        U R                  R                  5       5      SS1:X  a  SnO5U R                  R                  5        Vs/ s H  o3S;  d  M
  UPM     snS   nU R                  R	                  5        VVs0 s H  u  pEXES;   a  UOU_M     nnnUc  gU R
                  R                  5       R                  n[        U5      S:X  a(  SU;   a"  [        R                  [        U5      US   5      $ 0 n[        U S	5      (       a  SnU R                  5        H  u  pIXR                  5       L d  M  Un  O   Uc  [        S
5      eUR                  5        V
s/ s H  oU
;   d  M
  U
PM     nn
[        U5      U:  a2  [        U5       H"  nU H  n
SU S3U
 S3;   d  M  X   X|'     M      M$     O X;   a"  [        R                  [        U5      X   5      nO^SU;   a  UR!                  SS5      S   nO[        SU S35      eMT  U H+  n[        U5       H  nSU S3U S3;   d  M  X   X|'     M)     M-     [        U5       H  nX;  d  M
  [        SU S35      e   U$ s  snf s  snnf s  sn
f )z
Returns the device map for each decoder layer, to allocate the cache on the right device.
Inspired from `dispatch_model` in accelerate.
Nr4  cpudisk)r3  r4  r   r)    r   zw`model.get_decoder()` is not returning a named module of the model. This is unexpected, please open an issue on GitHub.r  zDecoder name z" not found in execution device mapzlayer z! has not been mapped to a device.)r   r   r4  r  r   r   r  num_hidden_layersr   dictfromkeysrangenamed_modulesr   RuntimeErrorr   rsplit)r   execution_device_mapmain_devicednamer   r6  layer_device_mapdecoder_namer   module_namedecoder_mapped_modulesidxlayers                 r|   $_get_layer_device_map_for_cache_init4GenerationMixin._get_layer_device_map_for_cache_init  s   
  $4))4%%,,./E7:c$BTBTB[B[B]>^chjpbq>q#*.*<*<*C*C*Eb*EQRaIaq*Ebcde %)$6$6$<$<$>$$>LD %>kFJ$> ! $  ' !KK779KK#$)b4H.H=='8!9;OPR;STT 4''L $ 2 2 4--//#'L !5 #"/  0D/H/H/J&/J^iNi/J # &
 )*.?? !23C'=se1:K=)::4H4U,1! (> 4 #;+/==?P9QSgSu+v(,'3':':3'B1'E*]<.Hj+kll  . !23C3%qzwa[00D0K(- 4 . *+C*"VC50Q#RSS ,  C c$:&s   	JJ:J	J%#J%cache_implementationmax_cache_lenc                 &   US:X  a  S[        U R                  SS5      ;   a  Sn[        U   nU R                  R                  =(       d    UR	                  S5      SLn[        U S5      (       a)  U(       a  U R                  R                  OU R                  nUS	:X  a   [        U R                  R                  U5      n[        U S5      (       + =(       dD    [        WU5      (       + =(       d,    UR                  U:g  =(       d    [        U[        [        45      n	US
:w  a  U	=(       d    WR                  U:  n	U(       aP  [        U S5      (       a?  U	=(       d6    U R                  R                  R                  US   S   R                   S   :g  n	U	(       a  [        U R                  S5      (       a  U R                  R"                  n
OU R$                  n
U R'                  5       nU R                  R)                  5       UUU
UUS.nU" S0 UD6U l        U(       aI  UR+                  5       nUS   S   R                   S   US'   [-        U R                  U" S0 UD65      U l        U R                  $ U R                  R/                  5         U R                  $ )z
Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a
new `generate` call requires a larger cache or uses a different batch size.

Returns the resulting cache object.
hybridllama4rS  r5  hybrid_chunkedr  N_cachesliding_windowmambar   r)   _pre_quantization_dtype)r   max_batch_sizerJ  r   r   rA  rJ  rp   )r   r   r9   r   r   r   rO  self_attention_cacher  rP  r   rS  r   r   rJ  cross_attention_cacher   rR  r   rG  r  r  r   reset)r   rI  r   rJ  r   r	  	cache_clsrequires_cross_attention_cachecache_to_checkneed_new_cachecache_dtyperA  cache_kwargsrF  s                 r|   
_get_cacheGenerationMixin._get_cacheg  sN     8+GDKKQ]_a<b0b#3 ;<PQ	KK**]l.>.>?P.QY].] 	' 4""A_T[[==eiepepN#33 : :MJM h'' ~y99,,
: !35I J	 	  7*+[~/K/Km/[N)gdH.E.E r;;44BBlSdFefgFhFnFnopFqq 
 t{{$=>>"kkAA"jj#HHJ++557",!.$ $4L $3l3DK-!-!2!2!42>?P2QRS2T2Z2Z[\2]/1$++y?Z>?Z[ {{ KK{{r{   c                 .   U R                   =(       a    SU R                  R                  R                  5       ;  =(       aU    SU R                  R                  R                  5       ;  =(       a'    SU R                  R                  R                  5       ;  $ )a  
Return `True` if current model can use a `DynamicCache` instance when initializing the `past_key_values`.
This is mostly the same as `_supports_cache_class` attribute, but add exception for `Jamba` model which
uses its own `HybridMambaAttentionDynamicCache` and do not need to initialize the Cache in advance in
order to save memory (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
for `HybridMambaAttentionDynamicCache`).
jambazambabamba)r   r   rq   rR  r   s    r|   _supports_default_dynamic_cache/GenerationMixin._supports_default_dynamic_cache  st     && ?t~~66<<>>?t~~66<<>>? t~~66<<>>		
r{   max_cache_lengthc                    SU R                   R                  R                  5       ;  a  SOSnU R                  R                  =(       d    UR                  S5      SLnUR                  U5      n	U	b|  UR                  b  [        SU S35      e[        U	[        5      (       aJ  U R                  5       (       a5  U(       d  [        R                  " U	5      O[        R                  " U	5      X''   gUR                  SL a  gU R                  5       (       d7  UR                  b)  [        R                   " S	UR                   S
3["        5        gUb7  UR                  b*  [$        R'                  SUR                   S35        SUl        UR                  =(       d%    [)        U R                  R+                  5       SS5      Ul        UR                  Gb}  UR                  [,        ;   am  UR                  S:X  a  U R.                  (       d  [        S5      eU R1                  UR                  [3        UR4                  UR6                  5      U-  UUUS9X''   gUR                  S:X  a  U R8                  (       d  [        S5      eUR:                  b  UR:                  O	[=        5       n
[>        U
R@                     nU
R@                  S:X  a  [C        5       (       d  [E        S5      eU
R@                  S:X  a  [G        5       (       d  [E        S5      eU" U
5      X''   gUR                  S:X  a  [I        5       X''   gUR                  S:X  a  [        5       X''   ggU(       d
  [        5       O[        [        5       [        5       5      X''   g)z
Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is
instantiated, writes it to `model_kwargs`, under the name expected by the model.
rQ  rc   rd   r  NzMPassing both `cache_implementation` (used to initialize certain caches) and `zB` (a Cache object) is unsupported. Please use only one of the two.FzThis model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). `cache_implementation` (set to z) will be ignored.zRAn assistant model is provided, using a dynamic cache instead of a cache of type='z'.rI  staticzThis model does not support `cache_implementation='static'`. Please check the following issue: https://github.com/huggingface/transformers/issues/28981)rI  r   rJ  r   r	  	quantizedzThis model does not support the quantized cache. If you want your model to support quantized cache, please open an issue and tag @zucchini-nlp.quantozYou need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. Please install it via  with `pip install optimum-quanto`HQQzYou need to install `HQQ` in order to use KV cache quantization with HQQ backend. Please install it via  with `pip install hqq`	offloadeddynamic)%r   rq   rR  r   r   r   rI  r  r   tuplerd  r   from_legacy_cacher   r9  r  r  r  r   r   r   r  r9   _supports_static_cacher]  r  r  num_return_sequences_supports_quantized_cachecache_configr   r:   backendr&   ImportErrorr%   r   )r   r  r	  rv  r   rf  r   rl  rX  user_defined_cachers  cache_classs               r|   _prepare_cache_for_generation-GenerationMixin._prepare_cache_for_generation  sp    +29P9P9V9V9X*X&^l
KK**]l.>.>?P.QY].] 	' *--j9) 55A cdnco pT T  ,e449]9]9_9_ : !223EF,>>?QR (
  &&%/ 3355 55ABBSBhBhAi j  	  &+<+Q+Q+]%::;2? 6:21B1W1W 2
[bKK'')+A4\
. 11= 559YY$99XEdNiNi$Z  ,0??):)O)O"#4#>#>@Q@f@fgjtt"2!!- ,; ,( #77;F55$M  )55A &22-/ 
 <L<P<PQ''83<W<Y<Y%S  "))U2;K;M;M%H 
 ,7|+D("77;F+9+;("779D+7>( E 6 (H $r{   c                     S[        [        R                  " U R                  5      R                  R                  5       5      ;   $ )z
Return True if the current model supports the keyword argument `logits_to_keep` in forward()
to save memory. Checking it in this way allows to avoid using a new model attribute.
logits_to_keep)r   r   r   r   r   r   rc  s    r|   _supports_logits_to_keep(GenerationMixin._supports_logits_to_keep/  s2    
  3w'8'8'F'Q'Q'V'V'X#YYYr{   kwargs_has_attention_maskc                 L  ^  SU 4S jjnU" UR                   US9nU" UR                  US9nU" UR                  US9nU" UR                  US9nT R                  R
                  (       a  Ub  UOUnUb!  UR                  S:X  a  UR                  S5      nUc@  Ub=  Ub  U(       d  [        R                  S5        US   n[        R                  SU S35        T R                  R
                  (       a  Uc  [        S5      eUb;  [        XgS	9R                  5       (       a  Ub  U(       d  [        R                  S
5        UbL  [        R                  " U5      (       d  US:  R                  5       (       a  [        R                  SU S35        XQl        Xal        Xql        Xl        g)a  
Prepares the special tokens for generation, overwriting the generation config with their processed versions
converted to tensor.

Note that `generation_config` is changed in place and stops being serializable after this method is called.
That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
Nc                    > U c  U $ Ub  UOTR                   n[        U [        R                  5      (       a  U R	                  U5      $ [        R
                  " X[        R                  S9$ )Nr   r   )r   r   rv   r  rj  tensorr   )tokenr   r   s     r|   _tensor_or_none@GenerationMixin._prepare_special_tokens.<locals>._tensor_or_noneF  sR    }%1Vt{{F%..xx''<<EJJGGr{   r  r   zThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.z\`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.r!  zThe attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z;`eos_token_id` should consist of positive integers, but is zq. Your generation will not stop until the maximum length is reached. Depending on other flags, it may even crash.r   )r  r*  r)  rK  r   r   r   rh  r   r  r  r!   r'  r   rv   is_floating_point_bos_token_tensorr%  r$  _decoder_start_token_tensor)	r   r  r~  r   r  bos_token_tensoreos_token_tensorpad_token_tensordecoder_start_token_tensors	   `        r|   _prepare_special_tokens'GenerationMixin._prepare_special_tokens6  s    	H ++<+I+IRXY*+<+I+IRXY*+<+I+IRXY%45F5]5]fl%m" ;;)).H.T*Zj '
 ',<,A,AQ,F/99!< #(8(D(4=Vq  02NNFGWFXXqrs ;;)).H.Pn  (!+;\``bb(4=V##C
 '##$455:JQ:N9S9S9U9UNNMN^M_ `r r /?+.>+.>+8R5r{   c                    UR                   (       a  gU R                  R                  S:H  =(       d4    [        UR                  SL=(       a    UR                  R
                  5      n[        UR                  S5      [        5      =(       a    US   R                  nU=(       a    U=(       a    U R                  n[        U SS5      b  XPR                  R                  -  n[        U S5      (       aT  [        U R                  R!                  5       5      nSU;   =(       a    [#        U5      S:  nXW(       + -  nS	U;   nXX(       + -  nUR                  b  U(       d  [$        R'                  S
5        U$ )z`
Determines whether to trigger auto-compilation of the model's forward pass at generation time.
FcudaNrc   hf_quantizerr4  r3  r)   r4  zsYou have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.)disable_compiler   r  boolcompile_config_compile_all_devicesr   r   r   r   rp  r   r  r   r   r4  r  r   r   r   )	r   r	  r  valid_hardwareusing_compilable_cachecan_compileall_model_deviceshas_cpu_offloadhas_disk_offloads	            r|   _valid_auto_compile_criteria,GenerationMixin._valid_auto_compile_criteria  sO   
 ,, ))V3 
t,,D8r=N=]=]=r=r8
 |''(9:EBu|TeGfGuGu 	 %_)?_DD_D_ 4.:,,;;;K4)) #D$6$6$=$=$? @#'88WSAR=SVW=WO..K  &):://K ++7#
 r{   synced_gpusstreamerr`   custom_generatec                   ^ Ub|  UR                  SS5      nSS1n[        5       R                  5        VVs0 s H  u  nnUU;  d  M  UU_M     nnnUR                  U5        U R                  " U4SU0UD6nU" S:SU 0UD6$ UR                  SS5      nUR                  SS5      nU R
                  " TU40 UD6u  mnU R                  UR                  5       5        U R                  UUU5        Uc;  [        5       =(       d    [        U 5      =(       a    [        R                  " 5       S:  nUb  UO	[        5       nUb  UO	[        5       nS	[        [         R"                  " U R$                  5      R&                  R)                  5       5      ;   nS
U;  nUR+                  S	S5      SLnU R-                  UTR.                  U5      u  nnnUR0                  S   nUR2                  nU R5                  TUUS9  U R6                  R8                  (       do  TR:                  bb  US:  a\  [=        UR0                  5      S:X  aC  [>        R@                  " USS2S4   TR:                  :H  5      S:  a  [B        RE                  S5        U R6                  R8                  (       d  US:X  a  STl#        U(       d%  U(       a  U(       a  U RI                  UTU5      US	'   O4U(       a-  US:X  a'  [=        US	   R0                  5      S:  a  [K        S5      eU R6                  R8                  (       a  S
U;  a  U RM                  UUUT5      nU R6                  R8                  (       a+  U RO                  UUUTRP                  UR2                  S9u  nnOUS:X  a  UOUR                  S5      nTRR                  (       a  U RU                  UU5      nUb  URW                  URY                  5       5        UR0                  S   nUR+                  S5      SL =(       a    TRZ                  SLn UR+                  S5      SL =(       a    TR\                  SLn!U R_                  TU U!UUUS9mU Ra                  5       (       a  SU;  a  SUS'   U Rc                  TUU 5        TRZ                  S-
  n"UR0                  S   U:w  a3  US:X  a-  U R6                  R8                  (       d  U"UR0                  S   -  n"U Re                  TUUUU"U5        TRg                  U5      n#Ub  TRh                  S:  a  [K        S5      eU R2                  Rj                  UR2                  Rj                  :w  aa  [l        Rn                  " SUR2                  Rj                   SU R2                  Rj                   SU R2                  Rj                   S3[p        5        U Rs                  TUUUUUR2                  UU	U
S9	n$U Rt                  " S:TUUS.UD6n%TRF                  US '   U#[v        Rx                  :X  a  TRz                  S:  a  [K        S!TRz                   S"35      eUS:  a  [K        S#5      eUS    (       d  [K        S$5      eTR|                  S%;   a  [K        S&5      eU R~                  (       a"  [K        S'U R                  R                   35      eU R                  TUUUUUUUS(9n&U R                  " U4U&U$U%TUUS).UD6n'GOU#[v        R                  :X  aY  U R~                  (       a"  [K        S*U R                  R                   35      eU R                  " U4TR                  U$U%TUUS+.UD6n'GO@U#[v        R                  :X  ac  US    (       d  [K        S,5      eU R~                  (       a"  [K        S-U R                  R                   35      eU R                  " U4U$U%TUUS..UD6n'GOU#[v        R                  [v        R                  4;   aR  U R                  " S:UTRz                  U R6                  R8                  S/.UD6u  nnU R                  " U4U$U%TUUS..UD6n'GOSU#[v        R                  [v        R                  4;   aQ  U R                  " S:UTRh                  U R6                  R8                  S/.UD6u  nnU R                  " U4U$U%TUS0.UD6n'GOU#[v        R                  :X  a  [        UTRh                  UR2                  TR                  TR                  TRz                  TR                  TRZ                  S19n(U R                  " S:UTRh                  U R6                  R8                  S/.UD6u  nnU R                  " UU(4U$U%TUS0.UD6n'GO"U#[v        R                  :X  Ga  / n)TR                  b  TR                  n)TR                  GbH  U4S2 jn*[        TR                  [        5      (       a  [=        TR                  5      S:X  a  U*" 5         TR                   H  n+[        U+S   [        5      (       as  [        U+[        5      (       a  [=        U+5      S:X  a  U*" 5         [        S3 U+ 5       5      (       a  U*" 5         [        S4 U+ 5       5      (       a  U*" 5         [        U+5      n,OT[        U+[        5      (       a  [=        U+5      S:X  a  U*" 5         [        S5 U+ 5       5      (       a  U*" 5         [        U+5      n,U)R                  U,5        M     [        U)UTRh                  UR2                  TR                  TR                  TRz                  TRZ                  S69n-U R                  " S:UTRh                  U R6                  R8                  S/.UD6u  nnU R                  " U4U-U$U%TUS7.UD6n'TR                  SL aG  [        W'S85      (       a6  [        U'R                  S95      b  U'R                  R                  5       U'ld        W'$ s  snnf );a!  

Generates sequences of token ids for models with a language modeling head.

<Tip warning={true}>

Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
model's default generation configuration. You can override any `generation_config` by passing the corresponding
parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

For an overview of generation strategies and code examples, check out the [following
guide](../generation_strategies).

</Tip>

Parameters:
    inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
        The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
        method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
        should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
        `input_ids`, `input_values`, `input_features`, or `pixel_values`.
    generation_config ([`~generation.GenerationConfig`], *optional*):
        The generation configuration to be used as base parametrization for the generation call. `**kwargs`
        passed to generate matching the attributes of `generation_config` will override them. If
        `generation_config` is not provided, the default will be used, which has the following loading
        priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
        configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
        default values, whose documentation should be checked to parameterize generation.
    logits_processor (`LogitsProcessorList`, *optional*):
        Custom logits processors that complement the default logits processors built from arguments and
        generation config. If a logit processor is passed that is already created with the arguments or a
        generation config an error is thrown. This feature is intended for advanced users.
    stopping_criteria (`StoppingCriteriaList`, *optional*):
        Custom stopping criteria that complements the default stopping criteria built from arguments and a
        generation config. If a stopping criteria is passed that is already created with the arguments or a
        generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
        sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
        intended for advanced users.
    prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
        If provided, this function constraints the beam search to allowed tokens only at each step. If not
        provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
        `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
        on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
        for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
        Retrieval](https://arxiv.org/abs/2010.00904).
    synced_gpus (`bool`, *optional*):
        Whether to continue running the while loop until max_length. Unless overridden, this flag will be set
        to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid
        deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`.
    assistant_model (`PreTrainedModel`, *optional*):
        An assistant model that can be used to accelerate generation. The assistant model must have the exact
        same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
        is much faster than running generation with the model you're calling generate from. As such, the
        assistant model should be much smaller.
    streamer (`BaseStreamer`, *optional*):
        Streamer object that will be used to stream the generated sequences. Generated tokens are passed
        through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
    negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The negative prompt needed for some processors such as CFG. The batch size must match the input batch
        size. This is an experimental feature, subject to breaking API changes in future versions.
    negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Attention_mask for `negative_prompt_ids`.
    use_model_defaults (`bool`, *optional*):
        When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
        generation configuration (`model.generation_config`), as opposed to the global defaults
        (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
        `True`.
    custom_generate (`str`, *optional*):
        A string containing the name of a huggingface.co repository. If provided, the custom `generate`
        function defined in that reposity's `custom_generate/generate.py` file will be executed instead of the
        standard `generate` method. Note that the logic is for generation is entirely defined in that
        repository, and the return type may be different from the standard `generate` method.
    kwargs (`Dict[str, Any]`, *optional*):
        Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
        forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
        specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

Return:
    [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
    or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.

        If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
        [`~utils.ModelOutput`] types are:

            - [`~generation.GenerateDecoderOnlyOutput`],
            - [`~generation.GenerateBeamDecoderOnlyOutput`]

        If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
        [`~utils.ModelOutput`] types are:

            - [`~generation.GenerateEncoderDecoderOutput`],
            - [`~generation.GenerateBeamEncoderDecoderOutput`]
Nr   r   r   modelr  ry  r)   r   r  r   r  r   r   zA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.r   Tr   z1`attention_mask` passed to `generate` must be 2D.)r   r   r	  rK  r   r  r  )r  r  r  r   r  r  r{  zZ`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1.z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.)	r  r  r  r  rw  r   r	  r  r  )r  r  r  r9  zFnum_return_sequences has to be 1 when doing assisted generate, but is r  z6assisted generate is only supported for batch_size = 1z+assisted generate requires `use_cache=True`)rh  rL  rP  z=assisted generate is not supported with Static cache classes`zCassisted generation is not supported with stateful models, such as )r  r   r  rv  rw  rx  ry  r	  )r  rw  r  r  r  r  z=dola decoding is not supported with stateful models, such as )dola_layersrw  r  r  r  r  z,Contrastive search requires `use_cache=True`zBcontrastive search is not supported with stateful models, such as )rw  r  r  r  r  r   rZ  r   )rw  r  r  r  )r   r  r   length_penaltydo_early_stoppingnum_beam_hyps_to_keepr  r  c                  6   > [        ST R                   S35      e)Nzo`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` of positive integers, but is r  )r  force_words_idsr  s   r|   	typeerror+GenerationMixin.generate.<locals>.typeerrorf
  s*    $88I8Y8Y7ZZ[] r{   c              3   L   #    U  H  n[        U[        5      (       + v   M     g 7fr   )r   r  r<  	token_idss     r|   r?  +GenerationMixin.generate.<locals>.<genexpr>v
  s     YPX9:i#>>>PXs   "$c              3   F   #    U  H  n[        S  U 5       5      v   M     g7f)c              3   d   #    U  H&  n[        U[        5      (       + =(       d    US :  v   M(     g7fr   Nr   r&  r<  token_ids     r|   r?  5GenerationMixin.generate.<locals>.<genexpr>.<genexpr>y
  s)     j`iT\Z#%>!>!N(Q,!N`i   .0N)r'  r  s     r|   r?  r  x
  s&      -5	  j`ijjj-5s   !c              3   d   #    U  H&  n[        U[        5      (       + =(       d    US :  v   M(     g7fr  r  r  s     r|   r?  r  
  s)     h_gS[Jx$= = MA M_gr  )constraintsr   r  r   r  r  r  r  )constrained_beam_scorerrw  r  r  r  rc   to_legacy_cacherp   )fr   localsr   r  r   r(  r  r  r  r   r   distget_world_sizerG   r\   r   r   r   r   r   r   r   r  r  r   r   r  r   r   r$  r   rv   r  r   r  r9  r1  r  rI  rX  r  token_healingheal_tokensputr3  r  r  r  r|  r  rx  get_generation_moder  r  r  r  r  r  r  r<   ASSISTED_GENERATIONrq  rI  _is_statefulr   rq   r  _assisted_decodingDOLA_GENERATION_dola_decodingr  CONTRASTIVE_SEARCH_contrastive_searchSAMPLEGREEDY_SEARCHra  _sampleBEAM_SAMPLEBEAM_SEARCH_beam_searchGROUP_BEAM_SEARCHr-   r  early_stoppingr  _group_beam_searchCONSTRAINED_BEAM_SEARCHr  r  r   r  r'  r*   r+   r  r.   _constrained_beam_searchreturn_legacy_cacher   r   rc   r  ).r   r  r  rw  r  r  r  rv  r  r  r  r  r  r   r   global_keys_to_excluder  r  generate_argumentsr   r  ry  r	  accepts_attention_maskrequires_attention_maskr~  r  r   r   r   r   r  r  r  rf  generation_modeprepared_logits_processorprepared_stopping_criteriar  resultbeam_scorerfinal_constraintsr  word_ids
constraintr  s.     `                                           r|   r   GenerationMixin.generate  sp   ^ & &

+> E '-h%7"?Ex~~?O!u?OeSV^tSt*#u*?O!u%%f-'+'@'@(3D(HN($ ,M$M:LMM JJ{D1	$jj)>E*.*I*I1+
5;+
'< 	##L$5$5$78  )=PQ 57W;QRV;Wv]a]p]p]ruv]vK/?/K+QdQf1B1N-ThTj!1S9J9J4<<9X9c9c9h9h9j5k!k"3<"G$0$4$45Et$LTX$X! 9=8R8R%22L9
5' #((+
%%$$%68QZ`$a {{-- "33?N++,1IImArE26G6Y6YYZ]^^l {{--2Bo2U*.'(-DI_-1-X-X0,.L)* ';.3|DT7U7[7[3\_`3` !TUU;;)).?|.SNN|-=?PL
 ;;))&*&T&T%!1)'8'T'T$++ 'U '#I| *:[)HlN^N^_jNkI**((I>ILL) %??1-!'L!9T!A!nFWFbFbjnFn!'L!9T!A!nFWFbFbjnFn ::/#9#9-'- ; 
 ((**/?|/S-.L)*''(9;KMcd -77!;"&66 O3KK22 3 3A 66**|_jJZ\b	

 ,??P%6%@%@1%Dl  ;;y//444MM@@I@P@P@U@U?V W++**+ ,TTXT_T_TdTdSe f*	*  %)$>$>/!1+%=- ''% 3+I %? 
%
! &*%@%@ &
/CT`i&
ms&
"
 %6$?$?[! n@@@ 559 /DDEQH  A~ !YZZ, !NOO 559__ !`aa   !YZ^ZhZhZqZqYrs 
 #'"?"?"3#+ /!1!*$7) #@ 	# ,,	$7!:"<"3'!	 	F  > >>   STXTbTbTkTkSlm  ((	-99!:"<"3'!	 	F  A AA, !OPP   XY]YgYgYpYpXqr  --!:"<"3'! F !6!68T8T UU&*&H&H '#-BB#';;#A#A' 	'#I| \\!:"<"3'! F !;!;^=W=W XX&*&H&H '#-77#';;#A#A' 	'#I| &&!:"<"3' F  @ @@*%+55$++0??"3"B"B&7&L&L 1 A A,77	K '+&H&H '#-77#';;#A#A' 	'#I| ,, ";"<"3' F  F FF " ,,8$5$A$A! 00< ##4#D#DdKK,<<=BK 1 A AH!(1+t44)(D99S]a=O%KYPXYYY%K -5   &K%:8%D
)(D99S]a=O%Kh_ghhh%K%6x%@
%,,Z8) !B. 'B-%+55$++0??"3"B"B&7&L&L,77	'# '+&H&H '#-77#';;#A#A' 	'#I| 22(?!:"<"3' F 11T9 122..0ABN%+%;%;%K%K%MF"M "vs   oothis_peer_finishedc                     U(       ab  [         R                  " U(       a  SOSUS9n[        R                  " U[        R                  R
                  S9  UR                  5       S:X  a  g gU(       a  gg)z
Returns whether there are still unfinished sequences in the device. The existence of unfinished sequences is
fed through `this_peer_finished`. ZeRO stage 3-friendly.
r  r  r  )opFT)rv   r  r  
all_reduceReduceOpSUMrU  )r   r  r  r   this_peer_finished_flags        r|   _has_unfinished_sequences)GenerationMixin._has_unfinished_sequences
  sc    
  ',ll:L3RU^d&e#OO38I8IJ&++-4 5   r{   c                 X  ^^ Tc  [        S5      eTR                  TR                  pC[        TR	                  5       5      n[        SUS9nTR                  USS9 Vs/ s H  owR                  5       PM     nnT" USSS9R                  R                  UR                  5      n[        R                  " X:H  XA5      n UR                  5       S	:X  a  U$ USS2S
4   R                  5       n	TR                  TR!                  S5      5      S	   mUU4S jU	 5       n
[#        [%        X5      5       H  u  nu  pX   n[        R&                  " X:H  5      R)                  5       (       a  M9   UR+                  US9 Vs0 s H  nTR!                  U5      4S_M     nn[-        U5      S:X  a  M{  UU4==   S-  ss'   UR/                  US9  USS
 n UR                  5       S	:X  a  M  [-        XU:g     5      S:X  a  UUS
'   U R1                  UR3                  S	5      US9X'   M     U$ s  snf s  snf )a  
Generates sequences of token ids for models with a language modeling head.
Parameters:
    input_ids (`torch.LongTensor`): The sequence used as a prompt for the generation.
    tokenizer (`PreTrainedTokenizerBase`, *optional*): The tokenizer used to decode the input ids.
Return:
    `torch.LongTensor` where each sequence has its tail token replaced with its appropriate extension.
Nzs When generating with token healing, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.r)   )r  r)  T)skip_special_tokenspt)return_tensorspaddingr   r    c              3   f   >#    U  H&  nTR                  U5      R                  S T5      v   M(     g7f)r  N)decodereplace)r<  t	space_tokr  s     r|   r?  .GenerationMixin.heal_tokens.<locals>.<genexpr>
  s,     S(QY%%a(00i@@(s   .1)prefixg      $@r  r  r  )r  r  r)  r"   	get_vocabr;   batch_decodestripr   rj  r   rv   wherenumeltolistconvert_ids_to_tokensconvert_tokens_to_ids	enumerateziprT  rU  
extensionsr   r  r   rh  )r   r   r  r  r)  
vocab_trier  r=  promptstail_ids	tail_toks	batch_idxtail_idtail_tok	batch_idsalt_tokseq_biastrimmed_idsr  s     `               @r|   r  GenerationMixin.heal_tokens
  sa    * 
 &/%;%;Y=S=Sl#I$7$7$9:
,ALY '0&<&<Y\`&<&ab&a779&ab
 )BBy''(	 	 KK	 9<S		 ??!QU#**,33I4S4STW4XYZ[\	 T(S	.7H8P.Q*I*!,Iyy2388::
 R\QfQfnvQfQwQwg009;TAQw   8}! gZ C' $$8$<#CR.K   "a' 9,678A=".B#'==1F1Fq1I]n=#oI E /RH y cDs   H"9H'r  c           
         U R                   R                  (       a  [        S5      eUR                  n	UR                  n
UR
                  nUR                  nUR                  nUR                  n[        S U 5       5      nUR                  nU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U
(       a  SOSnU(       a	  U
(       a  SOSnU(       a	  U(       a  SOSnUR                  SS u  nn[        R                  " U[        R                  UR                  S9nU R!                  UUR                  U5      nSnU R                   R#                  5       R$                  nU R                   R&                  (       d  SnOUS:  a  SnOUS:X  a  S	nOSn[)        U[*        5      (       aI  US
:X  aC  UUS-  :X  a  U/nOUS::  a  [-        [/        UUS-  S5      5      O[-        [/        USS5      5      nO[)        U[*        5      (       a?  US:X  a9  US::  a  [-        [/        US-  US5      5      O[-        [/        US-
  US5      5      nO;[)        U[,        5      (       a  U Vs/ s H  nUU:  d  M  UPM     nnO[        S5      eU R1                  5       nUc  [        S5      eU R3                  UXaR                  S9(       Ga  U R4                  " U40 UD6nU " S0 UDSU
SS.D6n U R6                  SS2SSS24   R9                  5       R;                  S[        R<                  S9n!U R6                  SS2SSS24   R?                  5       n"0 n#U H>  n$U" U R@                  U$   SS2SSS24   5      R;                  U"R                  5      U#U$'   M@     U RC                  U UU R                   R                  S9nU(       a
  U(       a  GM!  [E        UU#U"5      n%U%R;                  UR                  5      n%U" UU%5      n&U(       a  U(       a  UU&4-  nU(       a  UU!4-  nU
(       ac  UU R                   R                  (       a  U RF                  4OU RH                  4-  nU R                   R                  (       a  UU RJ                  4-  nU(       a8  UU R                   R                  (       a  U RL                  4OU R@                  4-  nU(       aC  [N        RP                  RS                  U&SS9n'[        RT                  " U'S	S9RW                  S	5      n(O[        RX                  " U&SS9n(U(       a  U(U-  U	S	U-
  -  -   n([        RZ                  " UU(SS2S4   /SS9nUb  UR]                  U(R_                  5       5        UU" UU5      ) -  nURa                  5       S:H  nU R3                  UXaR                  S9(       a  GM  Ub  URc                  5         U(       a  [e        UUUUUURg                  S5      S9$ U$ s  snf )a	  
Generates sequences of token ids for models with a language modeling head using **dola decoding** and can be
used for decoder-only text models.
The method is based on the paper "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language
Models" (https://arxiv.org/abs/2309.03883) in ICLR 2024.

Parameters:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The sequence used as a prompt for the generation.
    dola_layers (`Union[str, List[int]]`):
        The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
        means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
        to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
    logits_processor (`LogitsProcessorList`):
        An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
        used to modify the prediction scores of the language modeling head applied at each generation step.
    stopping_criteria (`StoppingCriteriaList`, *optional*):
        An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
        used to tell if the generation loop should stop.
    generation_config ([`~generation.GenerationConfig`]):
        The generation configuration to be used as parametrization of the decoding method.
    synced_gpus (`bool`):
        Whether to continue running the while loop until max_length (needed to avoid deadlocking with
        `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
    streamer (`BaseStreamer`, *optional*):
        Streamer object that will be used to stream the generated sequences. Generated tokens are passed
        through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
    model_kwargs:
        Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
        If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

Return:
    [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
    or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
    [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
    `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
    `model.config.is_encoder_decoder=True`.
z8DoLa decoding is only available for decoder-only models.c              3   :   #    U  H  n[        US 5      v   M     g7fr*  Nr   r<  r  s     r|   r?  1GenerationMixin._dola_decoding.<locals>.<genexpr>Q       'lZkh.(I(IZk   rp   Nr   r   Fr   r)   low(   r  highz?dola_layers must be either 'low', 'high' or a list of integers.zCDoLa is not supported for models that don't have output embeddings.r  T)rC  rA  rB  r   )r  r   r   rO  num_samplesrc   rk   rl   rm   rn   ro   rc   )4r   r   r  r$  rA  rB  output_scoresoutput_logitsreturn_dict_in_generater'  r  r   rv   r  r   r   r0  r  r6  tie_word_embeddingsr   strr  r9  get_output_embeddingsr  r  rm   detachrj  float32floatro   ro  _dola_select_contrastr   rn   r   r   r   r   softmaxmultinomialsqueezeargmaxrV  r  r3  r  endri   r   ))r   r   r  rw  r  r  r  r  r	  r)  rA  rB  r   r!  r"  has_eos_stopping_criteriar  rl   
raw_logitsr   r   r   r   
cur_lengthunfinished_sequencesr  final_layerstart_layercandidate_premature_layersilm_headr   rc  final_layer_next_token_logitsfinal_logitscandidate_premature_logitscandidate_premature_layernext_token_logitsnext_token_scoresprobsnext_tokenss)                                            r|   r  GenerationMixin._dola_decoding  s   d ;;))WXX )::-??0EE)77)77"3"K"K$''lZk'l$l!%//	 0M3RD
$;@QRX\"9>O2VZ'>CW^b "+!!4
J$zz*EJJyO_O_`77
IDTDTVbc" kk113EE
 {{..K1_KAKK k3''K5,@kQ...9]* #b( {K1,<a@AeKQ78 +
 S))kV.C "$ U;!+[!<=%b 0+qAB ' T**5@)T[AO![&)T&^__,,.?bcc,,-?UeUe,f==iX<XL   "3%)	G -4NN1b!8,D,K,K,M,P,PVZbgbobo,P,p)">>!R(399;L)+&-G)HO))*CDQAXNI"\(() ++DE .H  CC#';;#A#A D L
 1 5*,F! !2 4 4Y5E5E F 0<M N ' 022F #@"BBJ$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) --.?R-H#//1EMMaP#ll+<"E )),@@<STWkSkCll 		9k!T'.B"CLI#[__./ $8;LYX^;_:_#_ !5!9!9!;q!@U ,,-?UeUe,ffX LLN",#!-3 , 0 01B C  C *Us   '
W25W2c                 n   [        S U 5       5      nUR                  n	UR                  n
UR                  nUR                  nUR
                  nUR                  nUR                  nUR                  nUR                  nU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       aU  U R                  R                  (       a:  U(       a  US   R                  S5      OSnU(       a  US   R                  S5      OSnUR                  SS u  nn[        R                  " U[        R                   UR"                  S9nU R%                  UUR"                  U5      n[        R&                  " U[        R                   S	9nU R                  R                  (       a  S
U;   a  US
   b  US
   nOUS   nUR)                  U	SS9nSnU R+                  UXQR"                  S9(       G
aN  UR                  S5      b7  [-        US   [.        [0        45      (       Ga  US   R3                  5       S:X  Ga  SUS'   U R4                  " U40 UD6nU " S0 UDSSUS.D6nU R                  R                  (       a  UR6                  S   n OUR8                  S   n UR:                  SS2SSS24   R=                  S[        R>                  UR"                  S9n!U RA                  UUU R                  R                  S9nU(       d-  U RB                  " SUU	U R                  R                  S.UD6u  n"nUR                  S5      n#U#c"  [E        U RF                  RH                   S35      e[-        U#S   [J        [        RL                  45      (       a  U#S   S   R                  S   U:w  a"  [E        U RF                  RH                   S35      eU" UW!5      n$[N        RP                  RS                  U$SS9n%[        RT                  " U%SU	S9u  n&n'U(       a  U(       a  UU!4-  nU(       a  UU$4-  nU(       ac  UU R                  R                  (       a  WRV                  4OWRX                  4-  nU R                  R                  (       a  UURZ                  4-  nU(       a8  UU R                  R                  (       a  WR6                  4OWR8                  4-  nAU(       d  US   n([-        U([\        5      (       d4  [-        U([0        5      (       a1  [-        U(R^                  [\        5      (       a  U(Ra                  U	5        OZ/ n)U( HG  n*/ n+U* H"  n,U+Rc                  U,R)                  U	SS95        M$     U)Rc                  [K        U+5      5        MI     [K        U)5      n(U(US'   U(       a  / n-[e        U	5       H  n.U R4                  " U'SS2U.4   Rg                  SS5      40 UD6n/U " S0 U/DSSUS.D6n[-        US   [\        5      (       d:  [-        US   [0        5      (       a;  [-        US   R^                  [\        5      (       a  SUS'   US   Ri                  S5        U-Rc                  U5        M     [k        U-U R                  Rm                  5       5      nO1U R4                  " U'Rg                  SS5      40 UD6n/U " S0 U/DSSUS.D6nA/U R                  R                  (       a  UR6                  S   n0UR6                  n1OUR8                  S   n0UR8                  n1UR:                  SS2SSS24   Ro                  5       n2W R)                  U	SS9n3[q        U3U0U&UX5      n4[        Rr                  " UURu                  UR                  S   S45      /SS9nU4R=                  S5      n4[        Rv                  " [y        U45       V.V5s/ s H  u  n.n5U5U.U	-  -   PM     sn5n.5      n6U'[e        [{        U'5      5      U44   n7[        R|                  " [        R~                  " U0R                  SS9U	5      5      n0U0[e        U5      U4SS24   n0[        Rr                  " U U0R                  S5      /SS9n Sn8U1 HE  n*[        R|                  " [        R~                  " U*U	5      5      [e        U5      U4SS24   n*U8U*4-  n8MG     U(       a>  U R4                  " U'SS2U44   Rg                  SS5      40 UD6n9U " S0 U9DSSSS.D6n:U:S   n;OSn;[         H  n<U;=(       d    [        UU<S5      n;M     [-        U;[\        5      (       d4  [-        U;[0        5      (       a1  [-        U;R^                  [\        5      (       a  U;R                  U65        OQ/ n)U; H>  n*/ n+U* H  n,U+Rc                  U,U6S4   5        M     U)Rc                  [K        U+5      5        M@     [K        U)5      n;[        R|                  " [        R~                  " U2U	5      5      [e        U5      U4SS24   n!U!R=                  UR"                  5      n!U R                  R                  (       a  Sn=Sn>U(       a  URZ                   HB  n*[        R|                  " [        R~                  " U*U	SS95      [e        U5      U4S4   n*U=U*4-  n=MD     URV                   HB  n*[        R|                  " [        R~                  " U*U	SS95      [e        U5      U4S4   n*U>U*4-  n>MD     [        U;U8U>=(       d    SU==(       d    SS9nOoSn?U(       aR  URX                   HB  n*[        R|                  " [        R~                  " U*U	SS95      [e        U5      U4S4   n*U?U*4-  n?MD     [        U;U8U?=(       d    SS9nU RA                  UUU R                  R                  S9nU(       a
  U(       a  G	M  U(       a  U7U-  USU-
  -  -   n7[        Rr                  " UU7SS2S4   /SS9nUb  UR                  U7R                  5       5        UU" UU5      ) -  nUR                  5       S:H  nU R+                  UXQR"                  S9(       a  G
MN  Ub  UR                  5         U(       Ga-  UR                  S5      b  [-        US   [\        5      (       d:  [-        US   [0        5      (       a7  [-        US   R^                  [\        5      (       a  US   Ri                  S5        O\/ n#US    HC  n*/ n@U* H  n,W@Rc                  U,SSS2SS24   5        M      U#Rc                  [K        W@5      5        ME     [K        U#5      US'   U R                  R                  (       a   [        UUUWWUUUUR                  S5      S 9	$ [        UUUUUUR                  S5      S!9$ U$ s  sn5n.f )"a  
Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

Parameters:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The sequence used as a prompt for the generation.
    logits_processor (`LogitsProcessorList`):
        An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
        used to modify the prediction scores of the language modeling head applied at each generation step.
    stopping_criteria (`StoppingCriteriaList`):
        An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
        used to tell if the generation loop should stop.
    generation_config ([`~generation.GenerationConfig`]):
        The generation configuration to be used as parametrization of the decoding method.
    synced_gpus (`bool`):
        Whether to continue running the while loop until max_length (needed to avoid deadlocking with
        `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
    streamer (`BaseStreamer`, *optional*):
        Streamer object that will be used to stream the generated sequences. Generated tokens are passed
        through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
    model_kwargs:
        Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
        If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

Return:
    [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
    or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
    [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
    `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
    `model.config.is_encoder_decoder=True`.
c              3   :   #    U  H  n[        US 5      v   M     g7fr  r  r  s     r|   r?  6GenerationMixin._contrastive_search.<locals>.<genexpr>  r  r  rp   Nr  rn   ro   r   r   rf  r   r   r   rO  Fr  rc   Tr9  )rC  rB  rA  r   r  r   r   r  r  zQ does not support caching and therefore **can't** be used for contrastive search.z| does not have a standard cache format and therefore **can't** be used for contrastive search without further modifications.)rP  r  r)   r3  .)rc   r   r   r   )rc   ro   rn   	rk   rl   rm   r   r   r   r   r   rc   r  )Mr'  r  penalty_alphar$  rA  rB  r   r!  r"  
low_memoryr   r   r   r   rv   r  r   r   r0  rW  r]  r  r   r   r   r,  r  r   ro   rm   rj  r'  ro  ra  r  r   rq   rn  r  r   r   r*  topkr   rn   r   r   rT  batch_repeat_interleaver  r9  rQ  cropstack_model_outputsr  r(  _ranking_fastrV  ri  r  r  r   r  splitr,  rh  rg  r   batch_select_indicesr    r   r  r3  r  r.  r~   ri   )Ar   r   rw  r  r  r  r  r	  r/  r  rF  r)  rA  rB  r   r!  r"  
sequentialr0  rl   r   r   r   r   r   r   cur_lenr2  cosine_matrix_maskr  r   rc  last_hidden_stateslogit_for_next_stepr   rc   processed_logit_for_next_step
next_probstop_k_probs	top_k_idspastnew_key_valuesrF  r   rU  all_outputsr6  next_model_inputsnext_hiddenfull_hidden_statesrm   context_hiddenselected_idxr  augmented_idxr?  next_decoder_hidden_statesnext_model_inputselected_outputsnext_past_key_valuesrk  next_step_cross_attentionsnext_step_decoder_attentionsnext_step_attentionslayer_past_key_valuessA                                                                    r|   r  #GenerationMixin._contrastive_search  s   X %('lZk'l$l!!'')77(::-??0EE)77)77"3"K"K&11
 4RD
/M$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G$zz*EJJyO_O_`77AQAQS_` #__YejjI;;))'<7LIa<b<n%12J%K"!-.>!?/AA%QAO",,-?UeUe,f  12:<(9:UDW<XYY !23BBDI -1[)#AA)\|\  "044ct ;;11)0)F)Fr)J&)0)>)>r)B& '.nnQAX&>&A&AU]]9;K;K 'B '#  $GG '+{{'E'E  H   " '+&H&H '"+$)+/;;+I+I' '	'OA| #/"2"23D"E"*$>>223 42 2 
 #?1#5u||7LMM&q)!,2215C$>>223 4U U  -=YH[,\)../LRT.UJ%*ZZ
e%L"K ' #6"88J <>>F$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) #$56dL11t%899jIbIbdp>q>q007%'N!% "$)D!LL)?)?1)?)MN %*&--eEl; "& !0D26./ uA(,(J(J9UVXYUY?K_K_`bdeKf(wjv(w%" +$(-1*;	G "'*;"<lKK"7+<#=?RSS&w/@'A'V'VXdee 6: 12$%67<<R@&&w/' &( .k4;;;V;V;XY
 %)$F$Fy~~VXZ[G\$m`l$m! ' $)-&7	 " {{--%;;B?%,%B%B"%33B7%,%:%:" ^^Ar1H-335F/AA%QAON
 )[:LmL "'#%7%@%@BTBZBZ[\B]_`Aa%bcik" (??51L "LLIlD[)\D[DAq!a%i-D[)\]M
 $E#i.$9<$GHK++ekk+2E2E!2E2Le&TUK%eJ&7q&HIK!&,>@U@UVW@X+Y_`!a)+&+EKKu$=>uZ?PR^`a?ab*uh6* ,
 #'#E#Eao.33B:$>J$  $( $&$ $).&+	$  (88I'J$ (,$+:'+?+n77TgimCn( ,; 2LAA35HII"#7#L#Ll[[(==mL%'N!5 "$)D!LLmS.@)AB %*&--eEl; "6 ,1+@("'++ekk&%.H"I%PZJ[]iklJl"m"5"8"89I9I"J {{---/*/1,$!(!9!9 %EKKu!,L MeT^N_amorNr s2uh>2 ": ")!;!; %EKKu!,L MeT^N_amorNr s4@4 "< *$8*D'C'Kt%?%G4	 (*$$!(!3!3 %EKKu!,L MeT^N_amorNr s,8, "4 1$8"<3;t  CC#';;#A#A D L
 1 )),@@<STWkSkCll 		9k!T'.B"CLI#[__./ $8;LYX^;_:_#_ !5!9!9!;q!@w ,,-?UeUe,ffz LLN"  12>l+<=|LL|,=>@STT"<0A#B#W#WYeff !2388<&(O!-.?!@02-$)D188c3B3k9JK %*'..u5J/KL	 "A
 7<O6LL!23{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  i *]s   !v1
c                 j
   UR                   nUR                  n	UR                  n
UR                  nUR                  nUR
                  n[        S U 5       5      nUR                  nU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U	(       a  SOSnU(       a	  U	(       a  SOSnU(       a	  U
(       a  SOSnU(       aU  U R                  R                  (       a:  U	(       a  US   R                  S5      OSnU
(       a  US   R                  S5      OSnUR                  SS u  nnSn[        R                  " U[        R                  UR                  S	9nU R!                  UUR                  U5      nU R"                  nU R%                  Xt5      nU(       a.  S
[&        R(                  S'   U R+                  UR,                  5      nUR.                  b  U R0                  " X40 UD6nSnOSnU R3                  UXQR                  S9(       Ga  U R4                  " U40 UD6nUR7                  U	(       a  SU	0O0 5        UR7                  U
(       a  SU
0O0 5        U(       a  U " S0 UDSS0D6nSnOU" S0 UDSS0D6nU R9                  UUU R                  R                  S9nU(       a	  U(       a  M  UR:                  SS2SSS24   R=                  S[        R>                  UR                  S9n U" UU 5      n!U(       a  U(       a  UU!4-  nU(       a  UU 4-  nU	(       ac  UU R                  R                  (       a  UR@                  4OURB                  4-  nU R                  R                  (       a  UURD                  4-  nU
(       a8  UU R                  R                  (       a  URF                  4OURH                  4-  nU(       aC  [J        RL                  RO                  U!SS9n"[        RP                  " U"SS9RS                  S5      n#O[        RT                  " U!SS9n#U(       a  U#U-  USU-
  -  -   n#[        RV                  " UU#SS2S4   /SS9nUb  URY                  U#R[                  5       5        UU" UU5      ) -  nUR]                  5       S:H  nUS-  nAU R3                  UXQR                  S9(       a  GM  Ub  UR_                  5         U(       aX  U R                  R                  (       a   [a        UUUWWUUUUR                  S5      S9	$ [c        UUUUUUR                  S5      S9$ U$ )a  
Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

Parameters:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The sequence used as a prompt for the generation.
    logits_processor (`LogitsProcessorList`):
        An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
        used to modify the prediction scores of the language modeling head applied at each generation step.
    stopping_criteria (`StoppingCriteriaList`):
        An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
        used to tell if the generation loop should stop.
    generation_config ([`~generation.GenerationConfig`]):
        The generation configuration to be used as parametrization of the decoding method.
    synced_gpus (`bool`):
        Whether to continue running the while loop until max_length (needed to avoid deadlocking with
        `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
    streamer (`BaseStreamer`, *optional*):
        Streamer object that will be used to stream the generated sequences. Generated tokens are passed
        through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
    model_kwargs:
        Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
        an encoder-decoder model the kwargs should include `encoder_outputs`.

Return:
    [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
    A `torch.LongTensor` containing the generated tokens (default behaviour) or a
    [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
    `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
    `model.config.is_encoder_decoder=True`.
c              3   :   #    U  H  n[        US 5      v   M     g7fr  r  r  s     r|   r?  *GenerationMixin._sample.<locals>.<genexpr>  r  r  rp   Nr  rn   ro   r   Fr   0TOKENIZERS_PARALLELISMTr  rA  rB  rC  r  r   rD  rO  r)   r  r   rc   rE  r  )2r$  rA  rB  r   r!  r"  r'  r  r   r   r   r   rv   r  r   r   r0  __call__r  r   environget_compiled_callr  prefill_chunk_size_prefill_chunkingr  r  r  ro  rm   rj  r'  r   rn   r   r   ro   r   r   r*  r+  r,  r-  rV  r  r3  r  r.  r~   ri   )$r   r   rw  r  r  r  r  r	  r)  rA  rB  r   r!  r"  r/  r  rl   r0  r   r   r   r   r   r   rP  r  r2  model_forwardcompile_forward
is_prefillr   rc  r<  r=  r>  r?  s$                                       r|   r  GenerationMixin._sample  s    V )::-??0EE)77)77"3"K"K$''lZk'l$l!%//	 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G"$zz*EJJyO_O_`77AQAQS_`;;L\36BJJ/0 223D3S3STM//;11)_R^_LJJ,,-?UeUe,f==iX<XL L]!46G HcefRf!79M Nlno@@4@"
'I,IDI  CC#';;#A#A D L
 1 !(q"ax 8 ; ;U]]clcscs ; t !1<M N ' 022F #4"66J$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) --.?R-H#//1EMMaP#ll+<"E )),@@<STWkSkCll 		9k!T'.B"CLI#[__./#7;LYX^;_:_#_ !5!9!9!;q!@qLG S ,,-?UeUe,ffV LLN"{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  r{   c                 ~   U R                   R                  R                  5       n[        U[        [
        45      (       a  U R                  X5      nU$ SU;   aS  [        U[        [        45      (       d  [        SU S35      eU R                  X5      n[        R                  " U5      nU$ UR                  U5        U$ )a#  
Temporary function to handle the different types of cache reordering processes while we roll out `Cache`.

TODO: standardize cache formats and make all models compatible with `Cache`. It would remove the need
for this function, with `Cache.reorder_cache` being the sole remaining code path

gptbigcodez'Using an unsupported cache format with zG. Currently, it only supports the legacy tuple format or `DynamicCache`)r   rq   rR  r   rn  r  rt  r   r   r  ro  reorder_cache)r   rc   rs  model_classs       r|   _temporary_reorder_cache(GenerationMixin._temporary_reorder_cacheD  s     nn--335ot}55"11/LO  [(o>Q/RSS =k] K< <  #11/LO*<<_MO  ))(3r{   r  c                 x    [        U R                  5      n[        R                  " XS   US   -  /USS -   5      $ )z=[batch_size, num_beams, ...] -> [batch_size * num_beams, ...]r   r)   r   Nr  r   rv   r  )r  r   s     r|   _flatten_beam_dim!GenerationMixin._flatten_beam_dim^  s<     V\\"}}VAhq&9%:U12Y%FGGr{   r  c                 h    [        U R                  5      n[        R                  " XU/USS -   5      $ )z=[batch_size * num_beams, ...] -> [batch_size, num_beams, ...]r)   Nr  )r  r   r  r   s       r|   _unflatten_beam_dim#GenerationMixin._unflatten_beam_dimd  s1     V\\"}}V)%<uQRy%HIIr{   c                    [        UR                  5      [        U R                  5      :  a?  UR                  S5      n[        UR                  5      [        U R                  5      :  a  M?  [        R                  " XSS9nU$ )a  
Gathers the beam slices indexed by beam_indices into new beam array.

Args:
    tensor (`torch.Tensor`): A tensor containing data to be gathered. The tensor is a 2D or a 3D tensor
        with the two first dimensions depicting the batch and the beam dimensions.
    beam_indices (`torch.Tensor` of shape `(batch_size, num_beams_to_select)`): The indices of the beams to
        select .

Returns:
    A tensor with the selected beams
r   r)   )inputr  rP  )r   r   rh  rv   take_along_dim)r  r   gathered_tensors      r|   _gather_beamsGenerationMixin._gather_beamsj  sg     ,$$%FLL(99'11"5L ,$$%FLL(99..VWXYr{   running_beam_scoresbeam_scoresis_sent_finished!next_token_hits_stopping_criteriarP  r  decoder_prompt_lenr  r  c	           	      N   US:X  a  US:  a  XV-
  n	OXF-
  n	U SS2SS24   X-  -  n
[         R                  " U[         R                  " USSS9S   S5      n[         R                  " X:  5      n[         R                  " U5      USL -  ) n[         R                  " U5      ) nX-  U-  $ )	zf
Beam Search stopping condition -- halts the generation loop if any of these conditions becomes False
neverr  Nr)   TrP  keepdimr       e)rv   r  r  r'  rT  )r  r  r  r  rP  r  r  r  r  best_hypothetical_lengthbest_possible_running_scoreworst_finished_scoreimprovement_possibleexists_open_beamvalid_continuationss                  r|   %_beam_search_has_unfinished_sequences5GenerationMixin._beam_search_has_unfinished_sequences~  s    , W$#)='1'F$'.'C$&9!RaR%&@D\Dl&m#${{+;UYY{XYcg=hij=kmst$yy)D)[\ #YY'78Nd<RST  %yy)JKK#69LLLr{   accumulated_log_probsrunning_sequencesrunning_beam_indicesr  beams_to_keepr  c                    U(       aH  [         R                  " [        R                  R	                  USS9US9n[         R
                  " USUS9nO[         R                  " XS9u  pX-  nU R                  X=5      nU R                  X-5      nX-  nUUSS2SS2U4'   [         R                  " U
UR                  S9R                  SS5      U-  nUU-   nUUSS2SS2XE-
  4'   XU4$ )	a  
Get top-K continuations given the accumulated log probs on the next token.

A few notes to understand what's going on:
1. Each item in batch has `num_beams` * `vocab_size` candidate continuations. For each item, get the
top K [K = (number of EOS tokens + 1) * `num_beams`] candidates with the highest accumulated
log-probabilities, or sample them without replacement using the accumulated scores
2. We gather the top K (as opposed to `num_beams`, or any number lower than K) here so that we have at
least `num_beams` sequences remaining to continue the live beam search.
3. Note that other stopping criteria might result in impossible to continue beams, i.e. all continuations
selected in this step hit the stopping criteria.
r   rO  r  r)   )r  rP  indexr  Nr  )rv   r+  r   r   r*  r  rH  r  r   r   rQ  )r   r  r  r  rP  r  r  r  r  r  r   topk_indicestopk_log_probstopk_current_beam_indicestopk_running_beam_indicestopk_running_sequencestopk_idsbatch_offsetbatch_modified_indicess                      r|   _get_top_k_continuations(GenerationMixin._get_top_k_continuations  s    <  ,,%%&;%DR_L #\\0E1T`aN+0::6K+](N %1$>!$($6$67K$g!!%!3!34E!a, 19q!W}- ||JxGLLRQRSV__!:\!IH^!!Q(D"DE7PPPr{   r  r  r  c                     XR                  [        R                  5      S-  -   n[        R                  " XeS9S   nU R	                  X'5      nU R	                  Xg5      n	U R	                  X75      n
XU
4$ )z
Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the
best non-finished beams to continue beam search in the next iteration.
r  r  r)   )rj  rv   r'  rH  r  )r   r  r  r  r  r  topk_running_log_probsnext_topk_indicesr  r  r  s              r|   %_get_running_beams_for_next_iteration5GenerationMixin._get_running_beams_for_next_iteration  s|     "02V2VW\WdWd2ehn2n!n!JJ'=KAN ../EY"001G[#112K_ 7KKKr{   top_num_beam_maskc                 :   XSSS24   -  nXKS-   U-
  U-  -  n[         R                  " USSS9USL -  nUUR                  [         R                  5      S-  -  nXO) S-  -  n[         R                  " X4SS9n[         R                  " X44SS9n[         R                  " XV4SS9n[         R                  " X4SS9n[         R
                  " UU
S9S   nU R                  UU5      nU R                  UU5      nU R                  UU5      nU R                  UU5      nXXW4$ )	z
Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
the current finished sequences.
Nr)   r   T)axiskeepdimsr  rO  r  )rv   rT  rj  r'  rV  rH  r  )r   rk   r  r  r  r   r  r  r  r  r  rP  r  r  r  did_top_num_beams_just_finishedbeams_in_batch_are_fullmerged_sequencesmerged_scoresmerged_beam_indicesmerged_is_sent_finishedtopk_merged_indicess                         r|   _update_finished_beams&GenerationMixin._update_finished_beams  sI   0 +L`dfg`gNh*h' (aK:L,LQ_+_`"')),<2PT"UYgkoYo"p144U]]CfLL;vEE
 !99i%HaP		;"?QG#ii(QWXY"'))-=,_ef"g#jj)DQG&&'79LM	((8KL))*=?RS--.EGZ[|EEr{   c                    UR                   nUR                  nUR                  n	UR                  n
UR                  nUR
                  nUR                  nUR                  nUR                  nUR                  nUR                  nUR                  nUR                  nUR                  SS u  nnUU-  nU R                  R                  S:X  a  U R                   R"                  nOYU R                  R                  S:X  a  U R%                  5       R&                  nO$U R                   R)                  5       R*                  nUnSnUb  UR                  S   OSn[-        SSU-   5      U-  n[.        R0                  " [.        R2                  " U[.        R4                  S9[.        R6                  " UU-
  [.        R4                  S94SS	9R9                  UR:                  5      nU R=                  UUR:                  U5      nUR>                  nU(       a  [A        S
5      eU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U(       a  SOSn U(       a	  U	(       a  SOSn!U(       a	  U	(       a  SOSn"U(       a	  U
(       a  SOSn#U(       aU  U R                   RB                  (       a:  U	(       a  US   RE                  S5      OSn$U
(       a  US   RE                  S5      OSn%Ub  U=(       d    US   OSn&[.        RF                  " UUU4U&[.        RH                  UR:                  S9n'U RK                  UUU5      U'SS2SS2SU24'   U'RM                  5       RO                  5       n([.        R6                  " UU4[.        RP                  UR:                  S9n)SU)SS2SS24'   [.        RF                  " UU4S[.        RP                  UR:                  S9n*[.        R6                  " UU4[.        R4                  UR:                  S9n+[.        R6                  " UU4[.        R4                  UR:                  S9n,[.        RF                  " UUUU-
  4S[.        RR                  UR:                  S9n-U-RM                  5       RO                  5       n U RU                  UXQR:                  S9(       GaL  U RW                  U'SS2SS2SU24   5      n.U RX                  " U.40 UD6n/U/R[                  U	(       a  SU	0O0 5        U/R[                  U
(       a  SU
0O0 5        U " S0 U/DSS0D6n0U R]                  U0UU R                   RB                  S9nU(       a	  U(       a  M  U0R^                  SS2SSS24   R9                  S[.        R`                  UR:                  S9n1[b        Rd                  Rg                  U1SS	9n2U" U.U25      n2U(       a  U(       a  UU1RO                  5       4-  nU(       a  U(       a  UU2RO                  5       4-  nU	(       ac  U!U R                   RB                  (       a  U0Rh                  4OU0Rj                  4-  n!U R                   RB                  (       a  U"U0Rl                  4-  n"U
(       a8  U#U R                   RB                  (       a  U0Rn                  4OU0Rp                  4-  n#A0U RK                  U2UU5      n2U2U)SS2SS2S4   -   n2[.        Rr                  " U2UUU-  45      n2U Ru                  U2U'U-UUUUUUUS9
u  n3n4n5U" U RW                  U4SS2SS2SUS-   24   5      U5      n,U RK                  U,UU5      n,U Rw                  U3U4U5U,US9u  n'n)n-U Ry                  U(U4U*U3U U5U+U,UUUUUUS9u  n(n*n n+URE                  SS5      b-  U R{                  US   U RW                  U-SUU-
  4   5      S9US'   US-   nU R}                  U)U*U+U,UUUUU5	      (       + nU RU                  UXQR:                  S9(       a  GML  U RW                  U(SS2SU2SS24   5      n(U RW                  U*SS2SU24   5      n*U RW                  U SS2SU2SS24   5      n U S-   R5                  5       R                  SS	9R-                  5       n6UU6-   n7U(SS2SU724   n(U SS2SU624   n U(       ae  U(       d  Sn*U R                   RB                  (       a"  [        U(U*UUU W$W%U!U"U#URE                  S5      S 9$ [        U(U*UUU U!U#URE                  S5      S!9$ U($ )"a  
Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

If it's the first time you're diving into Beam Search, we recommend you read the following blog post:
https://huggingface.co/blog/how-to-generate (especially the beam search section).

You can recompute the sequence scores from the individual scores using the `compute_transition_scores` function
(https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.compute_transition_scores)

Parameters:
    input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
        The sequence used as a prompt for the generation.
    logits_processor (`LogitsProcessorList`):
        An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
        used to modify the prediction scores of the language modeling head applied at each generation step.
    stopping_criteria (`StoppingCriteriaList`:
        An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
        used to tell if the generation loop should stop.
    generation_config ([`~generation.GenerationConfig`]):
        The generation configuration to be used as parametrization of the decoding method.
    synced_gpus (`bool`):
        Whether to continue running the while loop until max_length (needed to avoid deadlocking with
        `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
    model_kwargs:
        Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
        an encoder-decoder model the kwargs should include `encoder_outputs`.

Return:
    [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
    `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
    [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
    `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
    `model.config.is_encoder_decoder=True`.
Nr   MoshiDepthDecoderImageGPTForCausalImageModelingFr   r)   rf  rO  z`low_memory=True` is not supported after the beam search refactor. Please check the discussion in #35802 *after the PR got merged*, and add a comment there if your questions are not yet answered.rp   r  rn   ro   r   )
fill_valuer   r   r   r  r  rA  rB  rC  Tr  rD  )
r  r  r  rP  r  r  r  r  r  r   )r  r  r  r  r  )rk   r  r  r  r   r  r  r  r  r  rP  r  r  r  rc   .)rc   rs  rk   r   rl   rm   r   r   r   r   r   r   rc   rk   r   rl   rm   r   rn   ro   rc   )Br$  r%  rA  rB  r   r!  r"  r  r  r  r  r  rq  r   r   rq   r   audio_vocab_sizer%  out_featuresr  r  r  rv   rV  r  r  zerosrj  r   r0  rG  r  r   r   fullr-  r  r&  r   r(  int32r  r  r  r  ro  rm   r'  r   r   r  r   rn   r   r   ro   r  r  r  r  r|  r  r  r   r   )8r   r   rw  r  r  r  r	  r)  r*  rA  rB  r   r!  r"  r  r  r  r  r  rq  batch_size_unflattenedrP  r   r  r  r  n_eos_tokensr  r  rO  
all_scoresr0  r   r   r   r   r   r   output_fill_valuer  rk   r  r  r  r  r  flat_running_sequencesr   model_outputsrm   	log_probsr  r  r  max_generated_lengthoutput_lengths8                                                           r|   r  GenerationMixin._beam_search%  sc	   \ )::(::-??0EE)77)77"3"K"K%//	*99*99&11
%//	0EE*3//"1*='+y8
>>""&9955J^^$$(HH335BBJ446AAJ$" 1=0H|))!,aAq</09<!IIZZ5::6]U^E^glgqgq8rs
 "Y
 	
 77AQAQS_` '11
t  4RD
3RD
5-rd$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf " @L?WL;LO]_!JJJ/(++##	
 -1,D,DYPZ\e,f!Q.)%,,.446	
 $kk:y*A]f]m]mn%)AqrE"jj*i!8TQVQ\Q\eneueuv !;;
I'>ejjYbYiYij -2KK#5::i>N>N-
)
  %zzJ$89bPUP[P[dmdtdt 
 ,224::< ,,-?UeUe,f%)%;%;<MaQRT\U\T\n<]%^"==>TeXdeL L]!46G HcefRf!79M Nlno B<BTBM  CC#';;#A#A D L
 1 #))!R(366D^g^n^n6oF 11&b1AI()?KI ' 6<<>"33J*}9??#4"66J$&;;99 '99;+668&
 {{55(]-K-K,MM(');;99 '<<>+99;) 00J	RI!$71d
$CCIi*i*>T1UVI QUPmPm&/"3%9#5#+#%% Qn QMN24M 1B&&'=aMgPQkM>Q'RS1- 150H0H1:}1-
 LPKuKu-'=*C2S# Lv LH24H FJE`E`#'='-)*C!12S"3##5-- Fa FBI{L2B*  148D262O2O$01B$C!334HgXjNjIj4kl 3P 3./
 kG%)%O%O# 1"
& 
"s ,,-?UeUe,ffN **9Q8M9M8Mq5P+QR	,,[<Q=Q<Q9Q-RS--l1>S?S>SUV;V.WX ".!1 7 7 9>>1>EIIK*-AAa-/0	#A'<(<'<$<=" "{{--7'%0%%!-'9*?'9%5*?$0$4$45F$G  5'%0%%!-1"7$0$4$45F$G	 	 r{   r  c                 f  ^:^;^< UR                   nUR                  n	UR                  n
UR                  nUR                  nUR
                  nUR                  nUR                  nUR                  nUU-  n[        UR                  5      U-  nUR                  nUR                  u  nnU R                  UUR                  U5      nU(       a@  U(       a9  [        U5       Vs/ s H!  n[        S [        UU-  5       5       5      PM#     snm<OSm<UU-  U:w  a  [!        SUU-   SU S35      eU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U
(       a  SOSnU(       a	  U
(       a  SOSnU(       a	  U(       a  SOSnU(       aU  U R"                  R$                  (       a:  U
(       a  US   R'                  S5      OSnU(       a  US   R'                  S	5      OSn[(        R*                  " UU4S
[(        R,                  US9nSUSS2SSU24'   UR/                  UU-  45      nSnUR                  S   n U R1                  UXaR                  S9(       Ga  [(        R2                  " UU-  UR4                  US9n![(        R2                  " UU-  [(        R6                  US9n"U R8                  " U40 UD6n#U#R;                  U
(       a  SU
0O0 5        U#R;                  U(       a  SU0O0 5        U " S0 U#DSS0D6n$U R=                  U$UU R"                  R$                  S9nU(       a  U(       a  US-   nM  U(       a*  [(        R>                  " U$R@                  SS2SSS24   5      n%U(       a.  U$R@                  SS2SSS24   RC                  SUR                  S9n&[        U5       GHX  m:T:U-  n'[E        U'U-   U5      n(U(U'-
  n)/ n*[        U5       H5  n+U*RG                  [        U'U(5       V,s/ s H  n,U+U-  U,-   PM     sn,5        M7     UU*   n-U$R@                  U*SSS24   RC                  [(        RH                  UR                  S9n.[J        RL                  RO                  U.SS9n/U/R                  S   n0U" U-U/U!T:S9n1U1UU*   RQ                  S5      -   n/U/RS                  U15      n/U(       a  U1W%U*'   U/R/                  UU)U0-  5      n/U	b  U	R                  S   OSn2[(        RT                  " U/[W        SSU2-   5      U)-  SSSS9u  n/n3[(        RX                  " U3U0SS9n4U3U0-  n3T<b  [[        T<S5      OSn5UR]                  U-U/U3U4UU	U5T:U S9	n6U6S   UU*'   U6S   n7U6S    m;U(       a6  U(       a/  [        U:U;U<4S! j[        [        T<S   5      5       5       5      T<T:'   U-T;   UU*'   [(        R^                  " U-T;SS24   U7RQ                  S5      /SS9n-U-SS2S4   U!U*'   U[(        RX                  " T;U)SS9-  U'-   T;U)-  -   U"U*'   GM[     U(       a  U(       a  UW%4-  nU(       a  UW&4-  nU
(       ac  UU R"                  R$                  (       a  U$R`                  4OU$Rb                  4-  nU R"                  R$                  (       a  UU$Rd                  4-  nU(       a8  UU R"                  R$                  (       a  U$Rf                  4OU$Rh                  4-  n[(        R^                  " UU!RQ                  S5      /SS9nA$UR'                  S"S5      b  U Rk                  US"   U"5      US"'   US-   nURl                  (       d  [o        U" UU5      5      (       a  SnU R1                  UXaR                  S9(       a  GM  T<b  [[        T<S5      OSn8URq                  UUW3W4UU	URr                  U8U S#9	n9U(       az  U(       d  SU9S$'   U R"                  R$                  (       a+  [u        U9S%   U9S$   UUU9S&   WWUUUUR'                  S"5      S'9$ [w        U9S%   U9S$   UUU9S&   UUUR'                  S"5      S(9$ U9S%   $ s  snf s  sn,f ))a'  
Generates sequences of token ids for models with a language modeling head using **diverse beam search
decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

Parameters:
    input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
        The sequence used as a prompt for the generation.
    beam_scorer (`BeamScorer`):
        An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
        sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
    logits_processor (`LogitsProcessorList`):
        An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
        used to modify the prediction scores of the language modeling head applied at each generation step.
    stopping_criteria (`StoppingCriteriaList`):
        An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
        used to tell if the generation loop should stop.
    generation_config ([`~generation.GenerationConfig`]):
        The generation configuration to be used as parametrization of the decoding method.
    synced_gpus (`bool`):
        Whether to continue running the while loop until max_length (needed to avoid deadlocking with
        `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
    model_kwargs:
        Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
        model is an encoder-decoder model the kwargs should include `encoder_outputs`.

Return:
    [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
    `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
    [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
    `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
    `model.config.is_encoder_decoder=True`.
c              3   &   #    U  H  nS v   M	     g7frp   Nrp   r<  r   s     r|   r?  5GenerationMixin._group_beam_search.<locals>.<genexpr>  s     !P.O".O   N)Batch dimension of `input_ids` should be 	, but is r  rp   r  rn   ro   r  r   r   Fr)   r  rA  rB  rC  Tr  r   )r  r   rO  )current_tokensbeam_group_idxr   rP  largestsortedfloor)rounding_mode)r)  r*  r   group_indexr  next_beam_scoresnext_beam_tokensnext_beam_indicesc              3   H   >#    U  H  nTT   TU      TU   4-   v   M     g 7fr   rp   )r<  r6  r  rs  r   s     r|   r?  r  (  s/      9\wWX^4Xa[AXa[NR\ws   "rc   r)  r*  r  r   r  sequence_scoresrk   r   r  r  )<r$  r%  rA  rB  r   r!  r"  r  r  r   
_beam_hypsr   r   r0  r9  rn  r  r   r   r   rv   r  r(  rQ  r  r  r   r   r  r  ro  
zeros_likerm   rj  r  extendr'  r   r   r  rh  	expand_asrH  r  divr  processrV  r   rn   r   r   ro   r|  is_donerT  finalizer  r   r   )=r   r   r  rw  r  r  r  r	  r)  r*  rA  rB  r   r!  r"  r  r  num_sub_beamsr   r   batch_beam_sizerP  r   rl   r0  r   r   r   r   r   r  r  r  r  reordering_indicesr   rc  processed_scoreraw_logit_scoregroup_start_idxgroup_end_idx
group_sizebatch_group_indicesr	  rE  group_input_idsr<  r=  r  next_token_scores_processedr  r?  next_indicesprocess_beam_indicesbeam_outputsbeam_next_tokensfinal_beam_indicessequence_outputsr  rs  r   s=                                                             @@@r|   r  "GenerationMixin._group_beam_searchi  s   V )::(::-??0EE)77)77"3"K"K))	%55!_4//0OC
!!#,?? 77AQAQS_`"}Z_`oZpqZpUVE!PeMJ4N.O!PPZpqLLz!_4;I
<R;SS\]l\mmno 
 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf " jj*i!8$ekkZ`a*+A&'!&&
Y(>'@A"&__Q/,,-?UeUe,f"[[i)?y_efN "'Z)-C5::^d!e  ==iX<XL L]!46G HcefRf!79M Nlno<\<t<G  CC#';;#A#A D L
 1!A+"'"2"27>>!R(3K"L #*..B":"="=4PYP`P`"="a"'"8"0="@ #Om$CY O*_<
 ')#!&z!2I'..@EoWd@ef@eY.4@ef "3 #,,?"@
 %,NN3FA3M$N$Q$Q--	0@0@ %R %! %'MM$=$=%2 %> %! /44R8
.>#%6~ft/+ %@+NaBbBlBlmoBp$p!$5$?$?@[$\! ;VO$78 %6$:$::zT^G^$_! 9E8P|11!4VW16%s1a,.>'?*'LRS]ajn2.!;  %yyjPWX)J6 AM@Xs<'<^b$*22#% !-!-!5 .'9  3 
  4@@R3S/0#/0B#C '(;<*}38 9\abefrstfubv\w9 4L0 2A1J	-."'))_Xq[-IK[KeKefhKi,jpr"s6Eae6L23
 		(Jg VV%&*,. ##67W #9d ' 00F ?"44J$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) 		9n.F.Fr.J"KQSTI  148D262O2O !235G3./
 kG""c*;Iv*N&O&O%)"o ,,-?UeUe,ffr 7C6NSr2TX&//%%(33+1 0 

 # 6: !23{{--7.{;%56G%H!%!1.!A'9*?'9%5*?$0$4$45F$G  5.{;%56G%H!%!1.!A1"7$0$4$45F$G	 	 $K00I rV gs    (`)-`.r  c                 r  ^*^+ UR                   nUR                  n	UR                  n
UR                  nUR                  nUR
                  nUR                  n[        UR                  5      nUR                  nUR                  SS u  nnU R                  UUR                  U5      nUU-  U:w  a  [        SUU-   SU S35      eU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a"  U(       a  [        S [        U5       5       5      OSm+U(       a	  U
(       a  SOSnU(       a	  U
(       a  SOSnU(       a	  U(       a  SOSnU(       aU  U R                   R"                  (       a:  U
(       a  US   R%                  S	5      OSnU(       a  US   R%                  S
5      OSn[&        R(                  " UU4[&        R*                  UR                  S9nSUSS2SS24'   UR-                  UU-  45      nSnUR                  S   nU R/                  UXaR                  S9(       Gak  U R0                  " U40 UD6nUR3                  U
(       a  SU
0O0 5        UR3                  U(       a  SU0O0 5        U " S0 UDSS0D6nU R5                  UUU R                   R"                  S9nU(       a  U(       a  US-   nM  UR6                  SS2SSS24   R9                  S[&        R:                  UR                  S9n[<        R>                  RA                  USS9n U" UU 5      n!U!USS2S4   RC                  U!5      -   n U RE                  5       n"U(       a  U(       a  UU 4-  nU(       a  UU4-  nU
(       ac  UU R                   R"                  (       a  URF                  4OURH                  4-  nU R                   R"                  (       a  UURJ                  4-  nU(       a8  UU R                   R"                  (       a  URL                  4OURN                  4-  nU R                  S   n#U R-                  UUU#-  5      n U	b  U	R                  S   OSn$[&        RP                  " U [S        SSU$-   5      U-  SSSS9u  n n%U%U#-  RU                  5       n&U%U#-  n%URW                  UU U%U&U"UU	T+US9	n'U'S   nU'S   n(U'S   m*[&        RX                  " UT*SS24   U(R[                  S5      /SS9nAUR%                  SS5      b  U R]                  US   T*5      US'   U(       a/  U(       a(  [        U*U+4S j[        [        T+5      5       5       5      m+US-   nUR^                  (       d  [a        U" UU5      5      (       a  SnU R/                  UXaR                  S9(       a  GMk  URc                  UUW%W&UU	URd                  T+US 9	n)U(       az  U(       d  SU)S!'   U R                   R"                  (       a+  [g        U)S"   U)S!   UUU)S#   WWUUUUR%                  S5      S$9$ [i        U)S"   U)S!   UUU)S#   UUUR%                  S5      S%9$ U)S"   $ )&a  
Generates sequences of token ids for models with a language modeling head using **constrained beam search
decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

Parameters:
    input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
        The sequence used as a prompt for the generation.
    constrained_beam_scorer (`ConstrainedBeamSearchScorer`):
        A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
        sorted during generation, while satisfying a list of positive constraints. For more information, the
        documentation of [`ConstrainedBeamSearchScorer`] should be read.
    logits_processor (`LogitsProcessorList`):
        An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
        used to modify the prediction scores of the language modeling head applied at each generation step.
    stopping_criteria (`StoppingCriteriaList`):
        An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
        used to tell if the generation loop should stop.
    generation_config ([`~generation.GenerationConfig`]):
        The generation configuration to be used as parametrization of the decoding method.
    synced_gpus (`bool`):
        Whether to continue running the while loop until max_length (needed to avoid deadlocking with
        `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
    model_kwargs:
        Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
        an encoder-decoder model the kwargs should include `encoder_outputs`.

Return:
    [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
    `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
    [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
    `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
    `model.config.is_encoder_decoder=True`.
Nr   r  r  r  rp   c              3   &   #    U  H  nS v   M	     g7fr  rp   r  s     r|   r?  ;GenerationMixin._constrained_beam_search.<locals>.<genexpr>  s     54"4r  r  rn   ro   r   r  r)   Fr  rA  rB  rC  Tr  r   rD  rO  r   r  )r)  r*  r   r  r  r  r  rc   c              3   B   >#    U  H  nTTU      TU   4-   v   M     g 7fr   rp   )r<  r6  rs  r   s     r|   r?  r  G  s(     %sZrUVl8A;&?8A;.&PZrs   r  r  rk   r   r  r  )5r$  r%  rA  rB  r   r!  r"  r   r  r  r   r0  r   r  rn  r9  r   r   r   rv   r  r(  rQ  r  r  r  ro  rm   rj  r'  r   r   r  r  r   r   rn   r   r   ro   rH  r  r   r  rV  rh  r|  r  rT  r  r  r   r   ),r   r   r  rw  r  r  r  r	  r)  r*  rA  rB  r   r!  r"  r   r  r  rP  rl   r0  r   r   r   r   r   r  r  r  r   rc  r<  r=  r  scores_for_all_vocabr  r  r?  r  r   r  r  rs  r   s,                                             @@r|   r  (GenerationMixin._constrained_beam_search  s   X )::(::-??0EE)77)77"3"K"K0;;<
+55	#,??2A#6 77AQAQS_`z!_4;I
<R;SS\]l\mmno 
 0M3RD
:QVcE5eO455jn 	 %<@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf " kk:y"9U^UeUef!AqrE!&&
Y(>'@A"&__Q/,,-?UeUe,f==iX<XL L]!46G HcefRf!79M Nlno<\<t<G  CC#';;#A#A D L
 1!A+
 !(q"ax 8 ; ;U]]clcscs ; t " 9 9!r !: ! +;9FW*X' ;k!T'>R>\>\+? ! $5#:#:#<  ' 022F #4"66J$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) +004J 1 6 6z9zCY Z 5A4L<--a0RSL-2ZZ!3q!l*:#;i#GQX\ei.*{ (*4::<L%
2K 3::!$)))#5 ; 
L ''9:K+,>?#$78H		9Xq[#9;K;U;UVX;Y"Z`bcI  148D262O2O !23X3./ '=$%sZ_`cdp`qZr%st kG&..#6G	SY6Z2[2[%)"Q ,,-?UeUe,ffT 3;;%%(33%1 < 

 # 6: !23{{--7.{;%56G%H!%!1.!A'9*?'9%5*?$0$4$45F$G  5.{;%56G%H!%!1.!A1"7$0$4$45F$G	 	 $K00r{   r  c                   ^+^, UR                   n	UR                  n
UR                  nUR                  nUR                  nUR
                  nU(       a	  U(       a  SOSnU(       a	  U(       a  SOSnU(       a	  U
(       a  SOSnU(       a	  U
(       a  SOSnU(       a	  U(       a  SOSnU(       aU  U R                  R                  (       a:  U
(       a  US   R                  S5      OSnU(       a  US   R                  S5      OSnUR                  SS u  nn[        R                  " U[        R                  UR                  S9nU R                  UUR                  U5      nSnS	nU R                  UXaR                  S
9(       Ga.  UR                  S   nUR!                  U5      u  nnUR#                  U R                  5      nUb  UR#                  U R                  5      nUR                  S   UR                  S   -
  nU" US5      n[$        R$                  " U5      n['        UUR                  S   U R                  R                  5      n[)        UUR                  S   5      nSU;   aM  [        R*                  " US   [        R,                  " UUU-   UR                  [        R                  S94SS9US'   U R.                  " U40 UD6n SU ;   a  US-   U S'   U R1                  U
(       a  SU
0O0 5        U R1                  U(       a  SU0O0 5        U " S0 U D6n!U!R2                  SS2U* S-
  S24   R#                  [        R4                  UR                  S9m+T+R7                  5       m,[9        U5      S:  a>  [;        US-   5       H,  n"U" USS2SUU"-   24   T+SS2U"SS24   5      T+SS2U"SS24'   M.     U	(       a  Ub  [=        UUUT+U5      u  n#n$OU	(       aE  T+R?                  SS9n%[        R@                  " U%SSS2SS24   SS9RC                  S5      SSS24   n&OT+RE                  SS9n&USS2US24   n'U'U&SS2SS24   :H  ) RG                  SS9S:  RI                  5       n$U(       a  U$U:X  a  U$S-  n$U&SS2SU$S-   24   n#[        R*                  " UU#4SS9nUb  URK                  U#RM                  5       5        UR                  S   n(U(S-
  n)[O        U U!RP                  U)5      U!l(        URS                  UT+U$5        U RU                  U!UU R                  R                  U$S-   S9nU(       a
  U(       a  GM  U(       Ga,  U$S-   n*U(       a!  U[W        U+4S j[;        U*5       5       5      -  nU(       a!  U[W        U,4S j[;        U*5       5       5      -  nU(       a  U(OU*n*U
(       ar  U R                  R                  (       a0  [Y        UU!RZ                  UU*5      n[Y        UU!R\                  UU*S	S9nO'U!R^                  S   b  [Y        UU!R^                  UU*S	S9nU(       aL  U R                  R                  (       a  [Y        UU!R`                  UU*5      nO[Y        UU!Rb                  UU*5      nUU" X5      ) -  nURe                  5       S:H  nSnU R                  UXaR                  S
9(       a  GM.  Ub  URg                  5         [i        US5      (       aI  URj                  Rl                  Rn                  S:X  a%  URp                  URj                  Rl                  l8        U(       aX  U R                  R                  (       a   [s        UUUWWUUUUR                  S5      S9	$ [u        UUUUUUR                  S5      S9$ U$ )a	  
Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
**sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
models.

Parameters:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The sequence used as a prompt for the generation.
    candidate_generator (`CandidateGenerator`):
        A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
        more information, the documentation of [`CandidateGenerator`] should be read.
    logits_processor (`LogitsProcessorList`):
        An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
        used to modify the prediction scores of the language modeling head applied at each generation step.
    stopping_criteria (`StoppingCriteriaList`):
        An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
        used to tell if the generation loop should stop.
    generation_config ([`~generation.GenerationConfig`]):
        The generation configuration to be used as parametrization of the decoding method.
    synced_gpus (`bool`):
        Whether to continue running the while loop until max_length (needed to avoid deadlocking with
        `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
    streamer (`BaseStreamer`, *optional*):
        Streamer object that will be used to stream the generated sequences. Generated tokens are passed
        through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
    model_kwargs:
        Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
        If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

Return:
    [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or
    `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
    [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
    `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
    `model.config.is_encoder_decoder=True`.
rp   Nr  rn   ro   r   r   FTr  r)   r   r  r   rO  r{  rA  rB  r   r  )r   rd  c              3   <   >#    U  H  nTS S 2US S 24   v   M     g 7fr   rp   )r<  r6  
new_logitss     r|   r?  5GenerationMixin._assisted_decoding.<locals>.<genexpr>:  s     #[AZAJq!Qw$7AZ   c              3   <   >#    U  H  nTS S 2US S 24   v   M     g 7fr   rp   )r<  r6  r<  s     r|   r?  r  <  s     'fLeq(9!Q'(BLer  )is_decoder_attentionrv  	heuristicrc   rE  r  );r  rA  rB  r   r!  r"  r   r   r   r   rv   r  r   r   r0  r  get_candidatesrj  r  r7   r8   rV  r   r  r  rm   r'  r   r   r9  _speculative_samplingr*  r+  r,  r-  r   r  r  r3  r6   rc   update_candidate_strategyro  rn  _split_model_outputsr   r   rn   r   ro   r  r.  r   rv  r  num_assistant_tokens_schedulenum_assistant_tokensr~   ri   )-r   r   r  rw  r  r  r  r  r	  r  rA  rB  r   r!  r"  rl   r0  r   r   r   r   r   r   rP  r2  r  is_first_iterationcandidate_input_idscandidate_logitscandidate_lengthis_done_candidatecandidate_kwargsr   rc  r6  valid_tokens	n_matchesr>  selected_tokenscandidate_new_tokensnew_cur_lennew_cache_sizenewly_added_lengthr  r<  s-                                              @@r|   r  "GenerationMixin._assisted_decodingz  s   b &//	-??0EE)77)77"3"K"K 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G$zz*EJJyO_O_`77AQAQS_`"!,,-?UeUe,fooa(G 5H4V4VW`4a1!1"5"8"8"E+#3#6#6t{{#C 288;iooa>PP 12Et L  $yy66 "5";";A">@^@^   77GI\IbIbcdIef#335:YY()9:Wg8H.HQZQaQainisist 6 !12  ==>QfUefL</1AA1E-. L]!46G HcefRf!79M Nlno*\*G !,<+<q+@+B(BCFFmmI,<,< G J !+ 0 0 2#$q(/!34A*:;NqR_T[^_T_R_O_;`blmnpqstmtbu*vJq!Qw' 5 -9*?'$$%+'i &..2.6E&+&7&7aAgTU&V&^&^_`&abfhibi&jO&0&7&7B&7&?O':1gh;'G$ 43B38O OPXX]_X`cddiik	 %6F)FNI.q/IM//AB 		9l";DI#\--/0#//!,K )1_N&;D'BYBY[i&jG#  99)ZQZ[  CC#';;#A#A(1}	 D L 1 '%.]" e#[GYAZ#[[[F %'fERdLe'f"ffJ4F[L^"${{55+?,g.F.FQc,( .B.#66#.15.* !++A.:-A.#..#.15.* ({{550D173P3PRY[m1- 1E173H3H'Se1- $8;LY;_:_#_ !5!9!9!;q!@!&s ,,-?UeUe,ffv LLN '):;;#33EEccgrr $88  //AAV #{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  r{   c                    S[         R                  R                  l        UR                  n[         R
                  " US S 2S S24   USS9nSU;  a  [        S5      eU R                  nU R                  X25      nU(       a  U R                  UR                  5      nUR                  SS 5      nSn	U H  n
XR                  S   -   nUb  US S 2S U24   US'   [         R                  " X[         R                  U
R                  S9US	'   US	   R!                  S5      US
'   U R"                  " U
40 UD6nU" S0 UDSS0D6nUR$                  US'   Un	M     XS'   US	   SS  S-   US	'   UR                  S
S 5      nU$ )N@   r   rO  rc   z+Cannot use prefill chunking without a cacher   r   r   r   r   rC  Tr)   rp   )rv   _dynamor   cache_size_limitrr  rM  r  r   r  rq  r  r   r   r   r   r   rh  r  rc   )r   r   r  r	  
chunk_sizeinput_chunksrt  ru  r   r   input_chunkcurrent_lengthr   rc  r   s                  r|   rs  !GenerationMixin._prefill_chunking  s    13-&99
 {{9QV#4jbIL0JKK;;L\ 223D3S3STM%))*:DA'K(+<+<R+@@N)1??N?@R1S-.-2\\5::kFXFX.L)* ,88H+I+S+STU+VL(==kZ\ZL#ElEEG.5.E.EL*+(K (  *8%&)56F)G)Lq)P%&^T2r{   )rO  r  )NN)NNNN)NNNr   )r)   FN)Fr)   )NF)NNNNNNNNNNNN)Vrq   rr   rs   rt   ru   r	   r   r$  r   PathLiker  r   r   rv   rw   ry   r
   r   r   r   r  r  r   r  r  r;   r   r1  rI  r&  r   rX  staticmethodra  r#   ro  rt  rG   r2   r  r   r  r\   r  r  r  r  r  r  r  r(  r0  rG  r]  rd  rx  r|  r  r  no_gradGenerateOutputr   r  r  GenerateNonBeamOutputr  r  r  r|  r  r  r  r(  r  r  r  r  GenerateBeamOutputr  r,   r  r.   r  r  rs  rz   rp   r{   r|   r   r   a  s   B LP,0A('/c2;;6F0G'HA( $D>A(
 
A(F (## (   1 12 ( !!1!12	 (
 
u  %"2"22	3 (D<(##<(   1 12<( !!1!12	<(
 
u  %"2"22	3<(B ,0595959G##G "%G !!1!12	G
   1 12G !!1!12GV *./3:>	>0&>0 u||,>0 tC$567	>0
 
u||Xc]Dell1B,CC	D>0D *./3:>	`&` u||,` tC$567	`
 
		`@"||" ," 38n	"
 
		"H'||' #3-	'
 ,' 
c3h'^ *.9/9/ 9/ 3,-	9/
 !&9/ &9/ 
uc5<<&7!88	99/v #(04 ' '  ' E,,- '
 
uc3h/	0 '  'L $)// 38n/ !	/
 / 
c3h/b
N#+N# ##N# ||	N#
 +N# .N# 4N# 7N# N# 
N#n !%156:AEg+g "g !++	g
 #+C+>S	+I"Jg ##67g g tCH~.g &ell3g )1(>g 
gZ :>	%+% $$89% 56	% 
%N#/1EEF# .0DDE# 
"$88	9	#R 04!&x!<<x! ell#x! u||,	x!
 x! 
x!t<64S> 6p*X6!r cgS/!)*:!;S/QYZ^Q_S/rvS/	%	&S/j8L htCsTWxDX?Y6Z L \>$'>58>IL>V[VbVb>	>@
 
w+w w +	w
 w w w 
wrZ$ Z 5959	PS+PS $,D>PS u||S012	PSd' 'Rb 'gk 'R ]]_ *.8<:><@W[&*7;-16:AE-1)-z&z $$45z ##67	z
 $$89z #+8S%,,4Gc4R+S"Tz d^z ""34z >*z &ell3z )1(>z %TNz "#z 
~u///	0z zxD t ]b]i]i nr & ]aS))S6>?X6YS			SjR##R 3S	>*R .	R
 0R ,R R !R 
$e&6&66	7Rh ]]_\##\ .\ 0	\
 ,\ \ >*\ 
$e&6&66	7\ \|y##y .y 0	y
 ,y y >*y 
$e&6&66	7yx4 H%,, H5<< H H
 JELL Jc Jc JV[VbVb J J
 ell %,, 5<<  & %M"\\%M\\%M  ,,%M ,1<<	%M
 %M %M  %M dCi(%M %M %MN4Q$||4Q !<<4Q $ll	4Q
 4Q  4Q 4Q 4Q 4Q 4Q 4Q 
u||U\\5<<7	84QlLL !&L $)<<	L
 ,1<<L L 
u||U\\5<<7	8L,/F<</F !&/F \\	/F
 /F ll/F $)<</F  ,,/F ,1<</F !<</F /F /F  /F /F dCi(/F  
u||U\\5<<E	F!/FfB##B .B 0	B
 ,B B 
!5#3#33	4BH
a1##a1  a1 .	a1
 0a1 ,a1 a1F	l1##l1 "=l1 .	l1
 0l1 ,l1 l1 
!5#3#33	4l1\I##I 0I .	I
 0I ,I I >*I 
$e&6&66	7IV*5+;+; *P` *r{   r   c                 N   U SS2U* S24   nUR                  SS9nUSS2[        R                  " U5      U4   R                  SS5      nUR                  SS9nUSS2[        R                  " U5      U4   R                  SS5      n	X-  n
[        R                  " U
5      nX:*  nU) R                  SS9S:  R                  5       nU(       a  X:X  a  US-  nUSS2SUS-   24   nX4$ UR                  S   nUSS2USS24   nX:  aD  USS2USS24   n[        R                  " UU-
  SS9nUR                  UR                  5       5        OUn[        R                  " USS9R                  S5      SSS24   nUS:  a#  [        R                  " USS2SU24   U4SS9nX4$ UnX4$ )a  
Applies sampling as in the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf, algorithm 1). Returns
the selected tokens, as well as the number of candidate matches.

NOTE: Unless otherwise stated, the variable names match those in the paper.
Nr   rO  r   r)   )r  r  )r*  rv   r   r,  	rand_liker   r  r   clampdiv_r+  rV  )r  r  r  r  r  new_candidate_input_idsqq_ir=  p_iprobability_ratior_iis_acceptedr   r  gamma
p_n_plus_1
q_n_plus_1p_primer  s                       r|   r  r    s    2!6F5F5G2GH 	  R (A
Au||,-/FF
G
O
OPQST
UCr"A
Au||,-/FF
G
O
OPQST
UC	
 //+
,C*K,&&2&.2779I Y: 	Q	.q/IM//AB& ""! !&&q)q)Q'
1i?+Jkk:
#:CGLL' Gg15==a@qI q= 99&=a)m&La%PVXYL "" L""r{   c                 >   [        U 5      S:X  a@  SnU H*  nU(       a  UOUR                  S   nXVSSU2SU24   4-  nM,     X4-  n US-  nX2-  n[        U5       H>  nSnU H.  nU(       a  X(-   OUR                  S   nXVSXS-   2SU24   4-  nM0     X4-  n M@     U $ )z
Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
where each member corresponds to a single generated token.
r   rp   r   .Nr)   )r   r   r9  )	rc  new_outputsrP  	added_lenr  	new_tuplerF  last_dim_sizer6  s	            r|   r  r    s     7|q	 E';GRMXgX~~ =>@@I ! 	<1	9	 E+?GKU[[QS_MQQY >?AAI ! 	<  Nr{   r^  r\  next_top_k_probsrQ  alpha
beam_widthr   c                 2   X R                  SSS9-  nXR                  SSS9-  n[        R                  " XgR                  SS5      5      R	                  S5      nUR                  UR                  S9nSU-
  [        R                  " UR                  5      R                  -  nX-   n[        R                  " USS9u  pUR                  S5      nSU-
  U-  XI-  -
  n[        R                  " [        R                  " X5      5      nUR                  SS9u  pU$ )	z
Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
row in the batch.
r   Tr  r)   r   rf  rO  r  )normrv   matmulr  r,  rj  r   finfor  r  rQ  r  rM  )r^  r\  rJ  rQ  rK  rL  norm_context_hiddennorm_next_hiddencosine_matrixdegeneration_penaltyr   contrastive_scorer_  s                r|   rL  rL    s    )+>+>1d+>+SS"%5%5!T%5%JJLL!46P6PQRTU6VW__`bcM ,..]5H5H.I00EKK@S@S4T4X4XX!6M#ii2>',,R0u(885;WWEKK0A$NO'+++3OAr{   full_batch_size
split_sizec                   ^^ U c	  S/UT-  -  $ [        U [        R                  5      (       a"  [        SUT5       Vs/ s H	  o0X3T-    PM     sn$ [        U [        5      (       d4  [        U [
        5      (       a1  [        U R                  [        5      (       a  U R                  UT5      $ [        U [        5      (       a~  [        U S   [        5      (       a3  [        SUT5       V^s/ s H  m[        UU4S jU  5       5      PM     sn$ [        SUT5       V^s/ s H  m[        UU4S jU  5       5      PM     sn$ [        S[        U 5       35      es  snf s  snf s  snf )a  
Takes care of three cases:
1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
2. data is a tuple: e.g. hidden_states, attentions etc. Keep the tuple as it is and split each tensor in it and
   return a list of tuples
3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and
   return a list of tuples of tuples
(see documentation of ModelOutput)
Nr   c              3   P   >#    U  H  n[        UU4S  jU 5       5      v   M     g7f)c              3   2   >#    U  H  oTTT-    v   M     g 7fr   rp   )r<  r  r6  rW  s     r|   r?  #_split.<locals>.<genexpr>.<genexpr>:  s     Q[61q:~6[   N)rn  )r<  inner_tupler6  rW  s     r|   r?  _split.<locals>.<genexpr>:  s"     jeiVaeQ[QQQeis   #&c              3   2   >#    U  H  oTTT-    v   M     g 7fr   rp   )r<  
sub_tensorr6  rW  s     r|   r?  r^  @  s     LtQ^4tr\  Unexpected attribute type: )r   rv   r  r9  r   r   rT  batch_splitrn  	TypeErrorr  )datarV  rW  r6  s     ``r|   _splitre  #  sI    |vJ677$%%27?J2WX2WQQZ(2WXX	D,	'	'4,--*T=V=VXd2e2e<<	D%	 	 d1gu%% q/:>>A jeijj>  q/:>>A LtLL> 
 5d4j\BCC) Ys   E( E Er   r   c                    U c  U /X!-  -  $ [        U 5      nX!-  S:w  a  [        S5      eX:  a  [        S5      e[        U S5      (       a  U R                  R	                  5       OU R	                  5       nU Vs/ s H  ofU ;   d  M
  UPM     nnU Vs/ s H$  n[        X   [        5      (       d  US:X  d  M"  UPM&     nn/ SQnU Vs/ s H%  n[        X   [        5      (       a  M  Xh;  d  M#  UPM'     n	n[        X!-  5       V
Vs/ s H%  n
U	 Vs0 s H  of[        X   X!5      U
   _M     snPM'     nn
nU Vs0 s H  ofX   _M	     nnSU ;   aE  [        U S   XUR                  5       5      n[        U5       V
Vs/ s H  u  p0 UESX   0EPM     nn
nSU ;   a  U Vs/ s H  n0 UESU S   0EPM     nnU Vs/ s H  o" S	0 UDUD6PM     nnU$ s  snf s  snf s  snf s  snf s  snn
f s  snf s  snn
f s  snf s  snf )
z
Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
previous forward pass.
r   z3`full_batch_size` must be divisible by `split_size`z:`split_size` must be smaller or equal to `full_batch_size`__dataclass_fields__r   )r   r  r{  r  r{  rp   )r  r  r   rg  r   r   r  r9  re  _split_model_inputsr  r  )r   rW  rV  r   model_output_clsr   r  	bool_keyskeys_to_ignorenon_bool_keysr6  data_split_list	bool_dataencoder_outputs_split
data_splitsplit_model_inputss                   r|   rh  rh  G  s^    } =>>K($*NOO#UVV 4;;H^3_3_((--/epeueuew 	 0t!K/AtD0 !^DqJ{~t$D$DM]H]DI^LN $i1J{~t,LQQRQhQMi
 455A MZZMqF;>??B	BMZ5  
 -66IqKN"II6K' 3)*JI_I_Ia!
 ^ggv]w
]wMAGzG,.C.FG]w 	 
 ;&^m
^mPZKzK+[9I-JK^m 	 
 GV:FU
3:33o  : E 1 _i 	[
 7



:sZ   9	GG!G7GG&G-G
G#G*G#8G):G.G48G9G#r  c                 R  ^ U (       d  [        S5      e[        U S   5      m[        U4S jU  5       5      (       d  [        S5      eS nTR                  R	                  5        VVs0 s H%  nX2" U  Vs/ s H  n[        XC5      PM     sn5      _M'     nnnT" S0 UD6$ s  snf s  snnf )z
Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
specific ModelOutput subclass from the list provided.
zInput list is empty.r   c              3   <   >#    U  H  n[        UT5      v   M     g 7fr   )r   )r<  objri  s     r|   r?  &stack_model_outputs.<locals>.<genexpr>  s     JMSz#/00Mr  z4All elements in the list should be of the same type.c           	        ^  [        S T  5       5      (       a  g[        T S   [        R                  5      (       a  [        R                  " T SS9$ [        T S   [
        5      (       a  [
        R                  " T 5      $ [        T S   [        5      (       a  [        R                  " T 5      $ [        T S   [        5      (       ao  [        T S   S   [        5      (       a*  [        U 4S j[        [        T S   5      5       5       5      $ [        U 4S j[        [        T S   5      5       5       5      $ [        T S   [        [        45      (       a  [        R                  " T 5      $ [        S[        T S   5       35      e)z%
Reverse of `_split` function above.
c              3   (   #    U  H  oS L v   M
     g 7fr   rp   )r<  rd  s     r|   r?  7stack_model_outputs.<locals>._concat.<locals>.<genexpr>  s     -t|r}  Nr   rO  c           
   3      >^#    U  H3  m[        UU4S  j[        [        TS   S   5      5       5       5      v   M5     g7f)c              3      >#    U  H0  n[         R                  " T Vs/ s H
  o"T   U   PM     snS S9v   M2     gs  snf 7fr   rO  Nrv   rV  )r<  jr  rd  r6  s      r|   r?  Astack_model_outputs.<locals>._concat.<locals>.<genexpr>.<genexpr>  s6     jSia%))D$ADD!WQZD$AqISi$As   A ;
A r   N)rn  r9  r   )r<  r6  rd  s    @r|   r?  rx    s?      0 jSXY\]abc]def]gYhSijjj0s   ;?c              3   ~   >#    U  H-  n[         R                  " T Vs/ s H  o"U   PM	     snS S9v   M/     gs  snf 7fr{  r|  )r<  r6  r  rd  s      r|   r?  rx    s2     gSfaUYYD'ADDQD'AqISf'As   =8
=ra  )r'  r   rv   r  rV  r   from_batch_splitsr   rn  r9  r   r&  r(  r  rc  r  )rd  s   `r|   _concat$stack_model_outputs.<locals>._concat  s6    ----d1gu||,,99Tq))Q..11$77Q!455&88>>Q''$q'!*e,, "3tAw<0  
 gSXY\]abc]dYeSfgggQ#u..<<%%9$tAw-IJJr{   rp   )r  r  rT  rg  r   r   )r  r   r  r  model_outputconcatenated_datari  s         @r|   rK  rK    s    
 /00 M!,- JMJJJOPPK> "66;;==A 	
7OGL,OPP=   0/00 Ps   'B#3BB#B#g?InfgMbPrl   baseline_scoresrelative_topfilter_valuer  c                 \   U R                  SS9nUR                  SS9n[        R                  " USS9u  pUSUS-
  4   n
[        R                  " USS9R                  nU[
        R                  " U5      -   n[        R                  " X5      nUR                  S5      nXGXl:  '   X6Xl:  '   Xg4$ )aQ  
Reference: https://github.com/XiangLi1999/ContrastiveDecoding/blob/170e9142e92159c1237d731e240f5eb14aabf428/transformers/src/transformers/generation_logits_process.py#L235
Apply filtering to only keep tokens with a probability above a certain threshold. The threshold is defined as `relative_top` * max probability in the distribution.
r   rO  T)
descending.r)   )	r  rv   sortr  r  nplogr  rh  )rl   r  r  r  base_filter_valuer  scores_normalizedbaseline_scores_normalizedsorted_logitssorted_indices
min_thresh	probs_maxprobs_threshs                r|   _relative_top_filterr    s     **r*2!0!<!<!<!D$)JJ/@T$R!Ms$6$::;J		+4;;Irvvl33L99Z6L))"-LCT0?@:F'6788r{   r5  r:  r9  c                    [        U 5      S:X  a  XS      n[        X#5      u  p#X#-
  nU$ [        R                  " U  Vs/ s H  oQU   PM	     snSS9n[        R
                  " USS9n[        R
                  " USS9nSUS S S 2S S 24   U-   -  n	[        R                  " USS9n
[        R                  " USS9n[        R                  " U
S S S 2S S 24   U	SS9R                  S5      n[        R                  " XSS9R                  S5      nSX-   -  nUR                  S5      nU [        UR                  5       R                  5       5         nX   n[        X#5      u  p#X#-
  nU$ s  snf )Nr)   r   rO  r   g      ?none)	reduction)r   r  rv   r  Fr*  r  kl_divmeanr&  r-  rU  )r5  r:  r9  base_logitsrm   r6  stacked_premature_layerssoftmax_mature_layersoftmax_premature_layersavg_distlog_softmax_mature_layerlog_softmax_premature_layerskl1kl2js_divspremature_layers                   r|   r)  r)    sw   
 %&!+0A1NO$8$S!+  %{{Sm+nSmaq,ISm+ntuv 99\r: yy)ArJ *4A:69QQRH  !}}\rB#$==1Ir#R  ((+D!QJ7V
T
Y
YZ\
]C
((/V
L
Q
QRT
UCSYG ll2G0W^^5E5J5J5L1MNO,=K 4\ OL'FMA ,os   E$)F)r  r   r   r  dataclassesr   typingr   r   r   r   r   r	   r
   r   numpyr  rv   torch.distributeddistributedr  huggingface_hubr   	packagingr   r   torch.nnr   r  cache_utilsr   r   r   r   r   r   r   configuration_utilsr   dynamic_module_utilsr   r   r   r   integrations.deepspeedr   integrations.fsdpr   modeling_outputsr   r    pytorch_utilsr!   tokenization_utilsr"   utilsr#   r$   r%   r&   r'   r(   beam_constraintsr*   r+   beam_searchr,   r-   r.   r  r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   logits_processr=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   r  rW   rX   rY   rZ   r[   r\   r]   modeling_utilsr^   tokenization_utils_baser_   	streamersr`   
get_loggerrq   r   accelerate.hooksra   rb   rg  ri   r~   r   r   GreedySearchDecoderOnlyOutput"ContrastiveSearchDecoderOnlyOutputSampleDecoderOnlyOutput%ContrastiveSearchEncoderDecoderOutput GreedySearchEncoderDecoderOutputSampleEncoderDecoderOutputBeamSearchDecoderOnlyOutputBeamSampleDecoderOnlyOutputBeamSearchEncoderDecoderOutputBeamSampleEncoderDecoderOutputGreedySearchOutputSampleOutputBeamSearchOutputBeamSampleOutputContrastiveSearchOutputr4  r5  r3  r   r  r  ry   rw   r(  r&  rL  re  rh  rK  r  r)  rp   r{   r|   <module>r     s      	  ! S S S     '   $   3  @ 6 F - /  G R R         8   0A'			H	%E  M  M  MF ,M; ,M ,M^ (MK (M (MV 5M{ 5M 5Mt !: %> "3 (D %#?  9 ; ; !A !A ;=ZZ[ /1HHI79TTU 79TTU  EGi ij  79UUV 8:ZZ[ ,.@@ANI NIbR5#p4%%"" '' ((	
   >!D# !D3 !DH={D()=7:=MP=Zj=	%T!
"#=@31tK'8 31BR 31Wb 31r  <-99&&9 9 	9 9 92, $S	, $S%*;*;%; <, ##, 	,r{   