# coding=utf-8
"""PyTorch OPT model."""

from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs, is_flash_attn_available
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    LossKwargs,
    auto_docstring,
    can_return_tuple,
    is_torch_flex_attn_available,
    logging,
)
from .configuration_opt import OPTConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class OPTLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
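
    OPT (following fairseq) reserves the first two slots of the table, so the underlying
    `nn.Embedding` is allocated with `num_embeddings + 2` rows (see `self.offset` below).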
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models don't have this hack.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        if position_ids is None:
            # Positions are derived from the attention mask: padded positions do not
            # advance the counter, and real tokens are numbered 0, 1, 2, ...
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_key_values_length:]

        return super().forward(position_ids + self.offset)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Scaled dot-product attention with an optional additive mask; the softmax is computed
    # in float32 for numerical stability before casting back to the query dtype.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class OPTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config: OPTConfig,
        layer_idx: Optional[int] = None,
        **kwargs,
    ):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.dropout = config.attention_dropout
        self.enable_bias = config.enable_bias
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.head_dim = self.embed_dim // self.num_heads
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scaling = self.head_dim**-0.5

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            # save all key/value states to the cache so they can be re-used for fast auto-regressive generation
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
            )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class OPTDecoderLayer(nn.Module):
    def __init__(self, config: OPTConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = OPTAttention(config=config, layer_idx=layer_idx)

        self.do_layer_norm_before = config.do_layer_norm_before
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]

        self.self_attn_layer_norm = nn.LayerNorm(
            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
        )
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
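                (For example, an additive mask holds `0.0` at positions that may be attended to and a value
                close to the dtype minimum, such as `torch.finfo(dtype).min`, at masked positions.)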
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        residual = hidden_states

        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            position_ids=position_ids,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Fully Connected
        hidden_states_shape = hidden_states.shape
        hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
        residual = hidden_states

        if self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        hidden_states = (residual + hidden_states).view(hidden_states_shape)

        if not self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


@auto_docstring
class OPTPreTrainedModel(PreTrainedModel):
    config_class = OPTConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["OPTDecoderLayer"]
    _supports_attention_backend = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


class OPTDecoder(OPTPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    """

    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
        self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
        else:
            self.project_out = None

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
        else:
            self.project_in = None

        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
        # with checkpoints that were fine-tuned before transformers v4.20.1
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm(
                config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
            )
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList([OPTDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA
        # will fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output_attentions is True, the sdpa implementation falls back to eager, which needs the full mask
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case a provided 2D attention mask needs to be expanded, build the 4D causal mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, e.g. the relevant first rows when using left padding.
            # This is required by F.scaled_dot_product_attention's memory-efficient attention path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuerS   r   r    )diagonalr   rQ   r   )r9   r:   r  r  fullr   triuaranger   expandcloner   r\   masked_fill)r4   r   r   rS   r   r   r^   r  r  mask_lengthpadding_masks              r1   r   @OPTDecoder._prepare_4d_causal_attention_mask_with_cache_position  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r3   	input_ids	head_maskr   r   output_hidden_statesreturn_dictr6   r^   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	USL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUb  UR                  SUR                  S   5      nUc  U R                  U5      nSnU(       aE  [        U[        5      (       d0  Sn[        R                   " U5      nUc  [        R                  S5        Ub  UR#                  5       OSnUc/  [$        R&                  " XUR                  S	   -   UR(                  S
9nUc=  XR                  S	   -   n[$        R*                  " UR                  S   XR(                  S
9nU R-                  X%XU5      nU
c5  [$        R.                  " US	S9n
X-  S	-
  R1                  5       n
U
SS2US24   n
U R3                  X.U
S9nU R4                  b  U R5                  U5      nUUR7                  UR(                  5      -   nU(       a  SOSnU(       a  SOSnSn[9        U/S/5       Hn  u  nnUc  M  UR;                  5       S   [=        U R>                  5      :w  d  M7  [        SU S[=        U R>                  5       SUR;                  5       S    S35      e   [A        U R>                  5       H  u  nnU(       a  UU4-  nU R                  (       a(  [$        RB                  " / 5      nUU RD                  :  a  ML  U R                  (       a=  U R                  (       a,  U RG                  URH                  UUUb  UU   OSSUUU
U5	      nOU" U4UU
Ub  UU   OSUUUUS.UD6nUS   nU(       a  UU(       a  SOS	   nU(       d  M  UUS	   4-  nM     U RJ                  b  U RK                  U5      nU RL                  b  U RM                  U5      nU(       a  UU4-  nU(       a  UOSnU(       a  URO                  5       n[Q        UUUUS9$ )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
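                For example, `attention_mask = [[1, 1, 1, 0, 0]]` marks the last two positions of a
                length-5 sequence as padding.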
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used
                (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
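                In other words, at generation step `t` only the newly produced token needs to be fed in; the
                cached key/value states supply positions `0..t-1`.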

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`. For padding use -1.

                [What are position IDs?](../glossary#position-ids)
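                If not given, positions are computed from `attention_mask` with a cumulative sum, so padded
                positions do not advance the position counter.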
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to
                `position_ids`, this tensor is not affected by padding. It is used to update the cache in the correct
                position and to infer the complete sequence length.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if input_ids is not None:
            input_ids = input_ids.view(-1, input_ids.shape[-1])

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. "
                "You should pass an instance of `DynamicCache` instead, e.g. "
                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
            )

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if attention_mask is None:
            seq_length = past_seen_tokens + inputs_embeds.shape[1]
            attention_mask = torch.ones(inputs_embeds.shape[0], seq_length, device=inputs_embeds.device)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        if position_ids is None:
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_seen_tokens:]

        pos_embeds = self.embed_positions(attention_mask, past_seen_tokens, position_ids=position_ids)

        if self.project_in is not None:
            inputs_embeds = self.project_in(inputs_embeds)

        hidden_states = inputs_embeds + pos_embeds.to(inputs_embeds.device)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        # check if head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
            if attn_mask is not None and attn_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {attn_mask.size()[0]}."
                )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    head_mask[idx] if head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                    position_ids,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    layer_head_mask=head_mask[idx] if head_mask is not None else None,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    **kwargs,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)

        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


@auto_docstring
class OPTModel(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.decoder = OPTDecoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.decoder.embed_tokens = value

    def get_decoder(self):
        return self.decoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Tuple[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return BaseModelOutputWithPast(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
        )


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = OPTModel(config)

        # the lm_head weight is automatically tied to the embed tokens weight
        self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Tuple[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
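            The labels are shifted inside the loss function (logits at position `i` are scored against the
            token at position `i + 1`), so `labels` can simply be a copy of `input_ids`.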

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

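        >>> # generate() defaults to greedy decoding here; `max_length` counts the prompt tokens as well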
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        logits = self.lm_head(outputs[0]).contiguous()

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@auto_docstring(
    custom_intro="""
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.
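    Concretely, the score head pools `logits[i, last_non_pad_token[i]]` for each sequence `i` in the batch.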

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class OPTForSequenceClassification(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = OPTModel(config)
        self.score = nn.Linear(config.word_embed_proj_dim, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Tuple[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value


@auto_docstring
class OPTForQuestionAnswering(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.model = OPTModel(config)
        self.qa_outputs = nn.Linear(config.word_embed_proj_dim, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Tuple[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

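        >>> # start_logits and end_logits each have shape (batch_size, sequence_length)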
        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index).to(start_logits.device)
            end_positions = end_positions.clamp(0, ignored_index).to(end_logits.device)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value


__all__ = [
    "OPTForCausalLM",
    "OPTModel",
    "OPTPreTrainedModel",
    "OPTForSequenceClassification",
    "OPTForQuestionAnswering",
]