o
    Zhg                     @   s$  d Z ddlmZmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ e( rddl,m-Z- ddl.m/Z/ e r	 e)0e1Z2G dd de	j3Z4	d5de	j5dej6dej6dej6deej6 de7de7fddZ8G d d! d!e	j5Z9G d"d# d#e	j5Z:e&G d$d% d%e!Z;G d&d' d'e;Z<e&G d(d) d)e;Z=G d*d+ d+ee%Z>G d,d- d-e;eZ?e&d.d/G d0d1 d1e;Z@e&G d2d3 d3e;ZAg d4ZBdS )6zPyTorch OPT model.    )CallableListOptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargsis_flash_attn_available)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )	OPTConfig)	BlockMask)make_flex_block_causal_maskc                       sR   e Zd ZdZdedef fddZ		ddejd	ed
eej f fddZ	  Z
S )OPTLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                    s   d| _ t || j  | d S N   )offsetsuper__init__)selfr$   r%   	__class__ S/var/www/auris/lib/python3.10/site-packages/transformers/models/opt/modeling_opt.pyr*   ;   s   z&OPTLearnedPositionalEmbedding.__init__r   Nattention_maskpast_key_values_lengthposition_idsc                    sL   |du rt j|dd}|| d  }|dd|df }t || j S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr)   forwardr(   )r+   r0   r1   r2   r,   r.   r/   r8   A   s
   z%OPTLearnedPositionalEmbedding.forwardr   N)__name__
__module____qualname____doc__intr*   r5   
LongTensorr   r8   __classcell__r.   r.   r,   r/   r#   6   s    	r#           modulequerykeyvaluer0   scalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )N)r4   dtypeptrainingr   r'   )r5   matmul	transposer   
functionalZsoftmaxZfloat32torJ   rG   rM   
contiguous)
rB   rC   rD   rE   r0   rF   rG   kwargsattn_weightsattn_outputr.   r.   r/   eager_attention_forwardS   s   
rV   c                       s   e Zd ZdZ	ddedee f fddZ					ddej	d	ee
ej	  d
eej	 deej	 dedeej	 de
ej	eej	 ee f fddZ  ZS )OPTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    s  t    || _|j| _|j| _|j| _|j	| _	|| _
|d u r*td| jj d | j| j | _d| _| j| j | jkrJtd| j d| j d| jd | _tj| j| j| j	d| _tj| j| j| j	d| _tj| j| j| j	d| _tj| j| j| j	d| _d S )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r)   r*   rX   hidden_size	embed_dimZnum_attention_heads	num_headsZattention_dropoutrG   enable_biasrY   loggerwarning_oncer-   r:   head_dimZ	is_causal
ValueErrorrF   r   Lineark_projv_projq_projout_proj)r+   rX   rY   rS   r,   r.   r/   r*   m   s0   

zOPTAttention.__init__Fhidden_statespast_key_valuer0   layer_head_maskoutput_attentionscache_positionreturnc                 K   s>  |  \}}	}
| || j }||d| j| jdd}| |}| |}||d| j| jdd}||d| j| jdd}|durT|	||| j
d|i\}}t}| jjdkrp| jjdkrj|rjtd nt| jj }|| ||||f| js|d	n| j| jd
|\}}|||	d }| |}|sd}|||fS )z#Input shape: Batch x Time x ChannelrH   r   r'   Nrm   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rA   )rG   rF   )sizerg   rF   viewr^   rb   rO   re   rf   updaterY   rV   rX   _attn_implementationr`   ra   r   rM   rG   reshaperR   rh   )r+   ri   rj   r0   rk   rl   rm   rS   ZbszZtgt_len_Zquery_statesZ
key_statesZvalue_statesZattention_interfacerU   rT   r.   r.   r/   r8      sF   




zOPTAttention.forwardN)NNNFN)r:   r;   r<   r=   r    r   r>   r*   r5   Tensorr   boolr   r8   r@   r.   r.   r,   r/   rW   j   s8    &	rW   c                       s   e Zd Zddedee f fddZ							ddejdeej d	eej d
ee	ej  dee
 dee
 deej deej dee de	ejee	ejejf  f fddZ  ZS )OPTDecoderLayerNrX   rY   c                    s   t    |j| _t||d| _|j| _|j| _t|j	 | _
tj| j|jd| _tj| j|j|jd| _tj|j| j|jd| _tj| j|jd| _d S )N)rX   rY   Zelementwise_affinerZ   )r)   r*   r\   r]   rW   	self_attndo_layer_norm_beforerG   r   Zactivation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normrd   Zffn_dimr_   fc1fc2final_layer_norm)r+   rX   rY   r,   r.   r/   r*      s   
zOPTDecoderLayer.__init__Fri   r0   rk   rj   rl   	use_cacher2   rm   rS   rn   c	              
   K   s  |}
| j r
| |}| jd|||||||d|	\}}}tjj|| j| jd}|
| }| j s4| |}|j}|d|	d}|}
| j rJ| 
|}| |}| |}| |}tjj|| j| jd}|
| |}| j ss| 
|}|f}|r}||f7 }|r||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence..
        )ri   rj   r2   r0   rk   rl   rm   rK   rH   Nr.   )r}   r   r|   r   rP   rG   rM   shaperu   rq   r   r   r~   r   rr   )r+   ri   r0   rk   rj   rl   r   r2   rm   rS   ZresidualZself_attn_weightsZpresent_key_valueZhidden_states_shapeoutputsr.   r.   r/   r8      sH   









zOPTDecoderLayer.forwardrw   )NNNFFNN)r:   r;   r<   r    r   r>   r*   r5   rx   r   ry   r?   r   r   FloatTensorr8   r@   r.   r.   r,   r/   rz      s<    	
rz   c                   @   sB   e Zd ZeZdZdZdgZdZdZ	dZ
dZdZdZdZdd ZdS )OPTPreTrainedModelmodelTrz   c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tjrX|jjd |jj	  d S d S )NrA   )meanstdg      ?)rX   Zinit_std
isinstancer   rd   weightdataZnormal_r[   Zzero_	Embeddingpadding_idxr   Zfill_)r+   rB   r   r.   r.   r/   _init_weightsA  s   

z OPTPreTrainedModel._init_weightsN)r:   r;   r<   r    Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_attention_backendZ_supports_flash_attn_2Z_supports_sdpaZ_supports_flex_attnZ_supports_cache_classZ_supports_quantized_cacheZ_supports_static_cacher   r.   r.   r.   r/   r   3  s    r   c                       s6  e Zd ZdZdef fddZdd Zdd Z		d$d
ee	j
df de	j
de	j
dedef
ddZed
e	j
dedede	jde	j
defddZe											d%dee	j d
ee	j
 dee	j
 deee	j  dee	j dee dee dee dee dee	j dee	j
 d ee d!eeef fd"d#Z  ZS )&
OPTDecoderz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    rX   c                    s  t     j| _ j| _ j| _ j| _ j| _t	
 j j| j| _t j j| _ j jkr?t	j j jdd| _nd | _ j jkrTt	j j jdd| _nd | _ jrh jsht	j j jd| _nd | _t	 fddt jD | _d| _|   d S )NFrZ   r{   c                    s   g | ]}t  |d qS ))rY   )rz   ).0irX   r.   r/   
<listcomp>w  s    z'OPTDecoder.__init__.<locals>.<listcomp>)r)   r*   rG   	layerdroppad_token_idr   Zmax_position_embeddingsZmax_target_positions
vocab_sizer   r   word_embed_proj_dimembed_tokensr#   r\   embed_positionsrd   project_out
project_inr}   Z_remove_final_layer_normr   r   r   Z
ModuleListrangeZnum_hidden_layerslayersgradient_checkpointing	post_initr+   rX   r,   r   r/   r*   X  s,   
 zOPTDecoder.__init__c                 C      | j S rw   r   r+   r.   r.   r/   get_input_embeddings}     zOPTDecoder.get_input_embeddingsc                 C   
   || _ d S rw   r   r+   rE   r.   r.   r/   set_input_embeddings     
zOPTDecoder.set_input_embeddingsFr0   r!   input_tensorrm   past_key_valuesrl   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )NZflash_attention_2rA   Zflex_attentionr   Frp   )inputs_embedsr1   Zis_trainingr   rH   )sequence_lengthtarget_lengthrJ   rm   
batch_size)cudaZxpuZnpu)rX   rt   anyr   r5   rx   r"   get_seq_lengthZis_compileabler   Z_ignore_causal_mask_sdparM   rJ   r   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiondevicetypefinfominZ_unmask_unattended)r+   r0   r   rm   r   rl   past_seen_tokensZusing_compilable_cacherJ   r   r   causal_mask	min_dtyper.   r.   r/   _update_causal_mask  sT   




zOPTDecoder._update_causal_maskr   r   rJ   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )Z
fill_valuerJ   r   r   )Zdiagonalr   rH   r   )r4   r5   r   r   fullr   Ztriuarangeru   expandcloner   rQ   Zmasked_fill)r0   r   r   rJ   rm   r   rS   r   r   Zmask_lengthZpadding_maskr.   r.   r/   r     s,    $
6  z@OPTDecoder._prepare_4d_causal_attention_mask_with_cache_positionN	input_ids	head_maskr   r   output_hidden_statesreturn_dictr2   rS   rn   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|	dur$|	n| j j}	|du |duA r4td| jrC| jrC|rCt	d d}|durP|
d|jd }|du rY| |}d}|rrt|tsrd}t|}|du rrt	d |durz| nd}|du rtj|||jd	  |jd
}|du r||jd	  }tj|jd ||jd
}| |||||}|
du rtj|d	d}
|
| d	  }
|
dd|df }
| j|||
d}| jdur| |}|||j }|rdnd}|rdnd}d}t|gdgD ]*\}}|dur$| d t| jkr$td| dt| j d| d  dqt | jD ]q\}}|r6||f7 }| jrGt!g }|| j"k rGq*| jrg| jrg| #|j$|||dur^|| ndd|||
|	}n||f||
|duru|| nd||||d|}|d }|r||rdnd	 }|r||d	 f7 }q*| j%dur| %|}| j&dur| &|}|r||f7 }|r|nd}|r|' }t(||||dS )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`. for padding use -1.

                [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
                the complete sequence length.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrH   TzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r   r   r   r3   )r2   r.   r   zThe `z` should be specified for z layers, but it is for .)r0   r2   rk   rj   rl   r   rm   r'   last_hidden_stater   ri   
attentions))rX   rl   r   r   use_return_dictrc   r   rM   r`   ra   rr   r   r   r   r   r   Zfrom_legacy_cacher   r5   r   r   Zonesr   r6   r7   r   r   rQ   ziprq   lenr   	enumerateZrandr   Z_gradient_checkpointing_func__call__r   r   Zto_legacy_cacher   )r+   r   r0   r   r   r   r   rl   r   r   r2   rm   rS   Zreturn_legacy_cacher   Z
seq_lengthr   Z
pos_embedsri   Zall_hidden_statesZall_self_attnsZnext_decoder_cacheZ	attn_maskZ	mask_nameidxZdecoder_layerZdropout_probabilityZlayer_outputsZ
next_cacher.   r.   r/   r8      s   H








	


zOPTDecoder.forward)FNNNNNNNNNNN)r:   r;   r<   r=   r    r*   r   r   r   r5   rx   r   ry   r   staticmethodr>   rJ   r   r   r   r?   r   r   r   r   r   r   r8   r@   r.   r.   r,   r/   r   P  s    %

D6	

r   c                       s   e Zd Zdef fddZdd Zdd Zdd	 Zee		
	
	
	
	
	
	
	
	
	
	
dde
ej de
ej de
ej de
eeej ef  de
ej de
e de
e de
e de
e de
ej de
ej dee deeef fddZ  ZS )OPTModelrX   c                    s"   t  | t|| _|   d S rw   )r)   r*   r   decoderr   r   r,   r.   r/   r*     s   
zOPTModel.__init__c                 C      | j jS rw   r   r   r   r.   r.   r/   r        zOPTModel.get_input_embeddingsc                 C      || j _d S rw   r   r   r.   r.   r/   r        zOPTModel.set_input_embeddingsc                 C   r   rw   )r   r   r.   r.   r/   get_decoder  r   zOPTModel.get_decoderNr   r0   r   r   r   r   rl   r   r   r2   rm   rS   rn   c                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	| jd|||
||||||d|d|}t|j|j|j	|j
dS )NTr   r0   r2   r   r   r   r   rl   r   r   rm   r   r.   )rX   rl   r   r   r   r   r   r   r   ri   r   )r+   r   r0   r   r   r   r   rl   r   r   r2   rm   rS   Zdecoder_outputsr.   r.   r/   r8     s4   zOPTModel.forwardr   )r:   r;   r<   r    r*   r   r   r   r   r   r   r5   r?   rx   r   r   r   r   ry   r   r   r   r   r8   r@   r.   r.   r,   r/   r     sZ    	

r   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r:   r;   r<   r.   r.   r.   r/   r     s    r   c                !       s  e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
ee												d#deej deej deej deeeej ef  deej deej dee dee dee dee deej deej dee deeef fdd Zed!d" Z  ZS )$OPTForCausalLMzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFrZ   )
r)   r*   r   r   r   rd   r   r   lm_headr   r   r,   r.   r/   r*     s   
zOPTForCausalLM.__init__c                 C   
   | j jjS rw   r   r   r   r   r.   r.   r/   r   '  r   z#OPTForCausalLM.get_input_embeddingsc                 C      || j j_d S rw   r   r   r.   r.   r/   r   *     z#OPTForCausalLM.set_input_embeddingsc                 C   r   rw   r   r   r.   r.   r/   get_output_embeddings-  r   z$OPTForCausalLM.get_output_embeddingsc                 C   r   rw   r   )r+   Znew_embeddingsr.   r.   r/   set_output_embeddings0  r   z$OPTForCausalLM.set_output_embeddingsc                 C   r   rw   r   r   )r+   r   r.   r.   r/   set_decoder3  r   zOPTForCausalLM.set_decoderc                 C   r   rw   r   r   r.   r.   r/   r   6  r   zOPTForCausalLM.get_decoderNr   r0   r   r   r   labelsr   rl   r   r   r2   rm   rS   rn   c                 K   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
| jjd|||||||||	d|d|}| |d  }d}|durV||j	}| j
||fd| j ji|}t|||j|j|jdS )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```NTr   r   r   losslogitsr   ri   r   r.   )rX   rl   r   r   r   r   r   rR   rQ   r   Zloss_functionr   r   r   ri   r   )r+   r   r0   r   r   r   r   r   rl   r   r   r2   rm   rS   r   r   r   r.   r.   r/   r8   9  sL   )zOPTForCausalLM.forwardc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr.   c                 3   s$    | ]}| d  |jV  qdS r9   )Zindex_selectrQ   r   )r   Z
past_statebeam_idxr.   r/   	<genexpr>  s   " z0OPTForCausalLM._reorder_cache.<locals>.<genexpr>)tuple)r   r   Zreordered_pastZ
layer_pastr.   r   r/   _reorder_cache  s   zOPTForCausalLM._reorder_cacheNNNNNNNNNNNN)r:   r;   r<   Z_tied_weights_keysr*   r   r   r   r   r   r   r   r   r   r5   r?   rx   r   r   r   r   ry   r   r   r   r   r8   r   r   r@   r.   r.   r,   r/   r     sl    
	

Rr   a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )Zcustom_introc                       s   e Zd Zdef fddZe											ddeej deej	 deej	 dee
eej	 ef  d	eej	 d
eej dee dee dee dee deej de
eef fddZdd Zdd Z  ZS )OPTForSequenceClassificationrX   c                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r   )
r)   r*   
num_labelsr   r   r   rd   r   scorer   r   r,   r.   r/   r*     s
   
z%OPTForSequenceClassification.__init__Nr   r0   r   r   r   r   r   rl   r   r   r2   rn   c                 C   sL  |
dur|
n| j j}
| j|||||||||	|
d
}|d }| |}|dur0|jdd \}}n	|jdd \}}| j jdu rG|dkrGtd| j jdu rPd}n1|duru|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d	 |t	j||jd
|f }d}|dur| j jdu r| jdkrd| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jdkrt }| jdkr|| | }n,|||}n&| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	r   r0   r2   r   r   r   rl   r   r   r   r'   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rH   )r   rJ   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   Z
regressionZsingle_label_classificationZmulti_label_classificationr   )rX   r   r   r   r   r   rc   rQ   r   r5   Zint32r   Zargmaxr`   ra   r-   r:   Zproblem_typer   rJ   r7   r>   r
   squeezer	   rr   r   r   r   ri   r   )r+   r   r0   r   r   r   r   r   rl   r   r   r2   transformer_outputsri   r   r   r   Zlast_non_pad_tokenZnon_pad_maskZtoken_indicesZpooled_logitsr   loss_fctoutputr.   r.   r/   r8     sx   



"


z$OPTForSequenceClassification.forwardc                 C   r   rw   r   r   r.   r.   r/   r     r   z1OPTForSequenceClassification.get_input_embeddingsc                 C   r   rw   r   r   r.   r.   r/   r     r   z1OPTForSequenceClassification.set_input_embeddingsr   )r:   r;   r<   r    r*   r   r   r5   r?   r   r   r   r   ry   r   r   r8   r   r   r@   r.   r.   r,   r/   r     sR    		

^r   c                       s   e Zd Zdef fddZe												ddeej deej	 deej	 dee
eej	 ef  d	eej	 d
eej deej dee dee dee dee deej de
eef fddZdd Zdd Z  ZS )OPTForQuestionAnsweringrX   c                    s2   t  | t|| _t|jd| _|   d S r&   )	r)   r*   r   r   r   rd   r   
qa_outputsr   r   r,   r.   r/   r*     s   
z OPTForQuestionAnswering.__init__Nr   r0   r   r   r   start_positionsend_positionsr   rl   r   r   r2   rn   c                 C   sZ  |dur|n| j j}| j||||||||	|
|d
}|d }| |}|jddd\}}|d }|d }d}|dur|durt| dkrP|d}t| dkr]|d}|d}|	d|
|j}|	d|
|j}t|d}|||}|||}|| d }|s||f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```Nr   r   r   rH   r3   )Zignore_indexr'   )r   start_logits
end_logitsri   r   )rX   r   r   r   splitr   rR   r   rq   clamprQ   r   r	   r   ri   r   )r+   r   r0   r   r   r   r   r  r   rl   r   r   r2   r   ri   r   r  r  Z
total_lossZignored_indexr   Z
start_lossZend_lossr   r.   r.   r/   r8     sR   0






zOPTForQuestionAnswering.forwardc                 C   r   rw   r   r   r.   r.   r/   r     r   z,OPTForQuestionAnswering.get_input_embeddingsc                 C   r   rw   r   r   r.   r.   r/   r     r   z,OPTForQuestionAnswering.set_input_embeddingsr   )r:   r;   r<   r    r*   r   r   r5   r?   r   r   r   r   ry   r   r   r8   r   r   r@   r.   r.   r,   r/   r     sX    	

ar   )r   r   r   r   r   )rA   )Cr=   typingr   r   r   r   r   r5   Ztorch.utils.checkpointr   Ztorch.nnr   r	   r
   Zactivationsr   Zcache_utilsr   r   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_flash_attention_utilsr   r   Zmodeling_outputsr   r   r   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   Zconfiguration_optr    Z!torch.nn.attention.flex_attentionr!   Zintegrations.flex_attentionr"   Z
get_loggerr:   r`   r   r#   Modulerx   floatrV   rW   rz   r   r   r   r   r   r   r   __all__r.   r.   r.   r/   <module>   st   
$
ah   	@}pr