
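"""
Flax model output classes: `flax.struct.dataclass` subclasses of `ModelOutput`
that standardize what the Flax models in the library return (final hidden
states, per-layer hidden states, attention weights, cached key/value states,
and task-specific logits).
"""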
from typing import Dict, Optional, Tuple

import flax
import jax.numpy as jnp

from .utils import ModelOutput


@flax.struct.dataclass
class FlaxBaseModelOutput(ModelOutput):
    """
Base class for model's outputs, with potential hidden states and attentions.

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None
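
# A minimal usage sketch (illustrative, not part of the library): since every
# class in this file is a `ModelOutput`, an instance behaves both like a
# dataclass and like an ordered dict, and fields left as `None` are dropped
# from the dict and tuple views. Assuming the standard `ModelOutput` semantics
# from `transformers.utils`:
#
#     out = FlaxBaseModelOutput(last_hidden_state=jnp.ones((1, 5, 8)))
#     out.last_hidden_state       # attribute access
#     out["last_hidden_state"]    # dict-style access
#     out[0]                      # tuple-style access
#     out.to_tuple()              # -> (last_hidden_state,) here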


@flax.struct.dataclass
class FlaxBaseModelOutputWithNoAttention(ModelOutput):
    """
Base class for model's outputs, with potential hidden states.

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
        for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the
        model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
    """
Base class for model's outputs that also contains a pooling of the last hidden states.

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state after a pooling operation on the spatial dimensions.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
        for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the
        model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    pooler_output: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxImageClassifierOutputWithNoAttention(ModelOutput):
    """
Base class for outputs of image classification models.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when
    `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
        for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
        called feature maps) of the model at the output of each stage.
    """

    logits: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxBaseModelOutputWithPast(ModelOutput):
    """
Base class for model's outputs, with potential hidden states and attentions.

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    past_key_values (`Dict[str, jnp.ndarray]`):
        Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
        auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    past_key_values: Optional[Dict[str, jnp.ndarray]] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None
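
# Illustrative note (typical usage, an assumption rather than a contract of
# this class): for Flax decoder models, `past_key_values` is the cache dict
# created by the model's `init_cache(batch_size, max_length)` and threaded
# back into every forward call during auto-regressive decoding, so each step
# only has to encode the newest token.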


@flax.struct.dataclass
class FlaxBaseModelOutputWithPooling(ModelOutput):
    """
Base class for model's outputs that also contains a pooling of the last hidden states.

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) further processed by a
        Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
        prediction (classification) objective during pretraining.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    pooler_output: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
    """
Base class for model's outputs that also contains a pooling of the last hidden states.

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g., for BERT-family models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
        for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    pooler_output: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None
    cross_attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
    """
Base class for model's outputs that may also contain past key/values (to speed up sequential decoding).

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None
    cross_attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxSeq2SeqModelOutput(ModelOutput):
    """
Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
decoding.

Args:
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
        `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
        blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    """

    last_hidden_state: Optional[jnp.ndarray] = None
    past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
    decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    decoder_attentions: Optional[Tuple[jnp.ndarray]] = None
    cross_attentions: Optional[Tuple[jnp.ndarray]] = None
    encoder_last_hidden_state: Optional[jnp.ndarray] = None
    encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    encoder_attentions: Optional[Tuple[jnp.ndarray]] = None
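
# Illustrative note (an assumption about typical encoder-decoder usage, not
# enforced by this class): the `encoder_*` fields are produced once per input
# sequence, while the decoder fields and `past_key_values` are refreshed on
# every auto-regressive step that reuses those same encoder outputs.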


@flax.struct.dataclass
class FlaxCausalLMOutputWithCrossAttentions(ModelOutput):
    """
Base class for causal language model (or autoregressive) outputs.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Cross-attention weights after the attention softmax, used to compute the weighted average in the
        cross-attention heads.
    past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `jnp.ndarray` tuples of length `config.n_layers`, with each tuple containing the cached key, value
        states of the self-attention and the cross-attention layers if model is used in encoder-decoder setting.
        Only relevant if `config.is_decoder = True`.

        Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    """

    logits: Optional[jnp.ndarray] = None
    past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None
    cross_attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxMaskedLMOutput(ModelOutput):
    """
Base class for masked language model outputs.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    logits: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None
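

# `FlaxCausalLMOutput` carries exactly the same fields as `FlaxMaskedLMOutput`
# (logits, hidden_states, attentions), so it is defined as the plain alias
# that follows rather than as a separate dataclass.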
FlaxCausalLMOutput = FlaxMaskedLMOutput


@flax.struct.dataclass
class FlaxSeq2SeqLMOutput(ModelOutput):
    """
Base class for sequence-to-sequence language model outputs.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
        `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
        blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    """

    logits: Optional[jnp.ndarray] = None
    past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
    decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    decoder_attentions: Optional[Tuple[jnp.ndarray]] = None
    cross_attentions: Optional[Tuple[jnp.ndarray]] = None
    encoder_last_hidden_state: Optional[jnp.ndarray] = None
    encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    encoder_attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxNextSentencePredictorOutput(ModelOutput):
    """
Base class for outputs of models predicting if two sentences are consecutive or not.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    logits: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxSequenceClassifierOutput(ModelOutput):
    """
Base class for outputs of sentence classification models.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    logits: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxSeq2SeqSequenceClassifierOutput(ModelOutput):
    """
Base class for outputs of sequence-to-sequence sentence classification models.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
        `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
        blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    """

    logits: Optional[jnp.ndarray] = None
    past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
    decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    decoder_attentions: Optional[Tuple[jnp.ndarray]] = None
    cross_attentions: Optional[Tuple[jnp.ndarray]] = None
    encoder_last_hidden_state: Optional[jnp.ndarray] = None
    encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    encoder_attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxMultipleChoiceModelOutput(ModelOutput):
    """
Base class for outputs of multiple choice models.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, num_choices)`):
        *num_choices* is the second dimension of the input tensors (see *input_ids* above).

        Classification scores (before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    logits: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxTokenClassifierOutput(ModelOutput):
    """
Base class for outputs of token classification models.

Args:
    logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.num_labels)`):
        Classification scores (before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    logits: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxQuestionAnsweringModelOutput(ModelOutput):
    """
Base class for outputs of question answering models.

Args:
    start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
        Span-end scores (before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    start_logits: Optional[jnp.ndarray] = None
    end_logits: Optional[jnp.ndarray] = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None


@flax.struct.dataclass
class FlaxSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
Base class for outputs of sequence-to-sequence question answering models.

Args:
    start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
        Span-end scores (before SoftMax).
    past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
        `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
        blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    """

    start_logits: Optional[jnp.ndarray] = None
    end_logits: Optional[jnp.ndarray] = None
    past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None
    decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    decoder_attentions: Optional[Tuple[jnp.ndarray]] = None
    cross_attentions: Optional[Tuple[jnp.ndarray]] = None
    encoder_last_hidden_state: Optional[jnp.ndarray] = None
    encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None
    encoder_attentions: Optional[Tuple[jnp.ndarray]] = None
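

if __name__ == "__main__":
    # A minimal smoke test, illustrative only. It assumes the standard
    # `ModelOutput` semantics from `transformers.utils`: attribute, key, and
    # index access address the same fields, and `None` fields are dropped from
    # the tuple view. The shapes are made up for the example.
    batch_size, seq_len = 2, 7
    qa = FlaxQuestionAnsweringModelOutput(
        start_logits=jnp.zeros((batch_size, seq_len)),
        end_logits=jnp.zeros((batch_size, seq_len)),
    )
    assert qa.start_logits is qa["start_logits"]
    assert len(qa.to_tuple()) == 2  # hidden_states and attentions are None
    # A best answer span can be read off the logits with a per-example argmax.
    starts = jnp.argmax(qa.start_logits, axis=-1)
    ends = jnp.argmax(qa.end_logits, axis=-1)
    print("predicted spans:", list(zip(starts.tolist(), ends.tolist())))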