
    fThr                        S r SSKrSSKJrJrJr  SSKrSSKrSSKJr  SSK	J
r
JrJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSKJrJrJ r J!r!  SSK"J#r#  \" 5       (       a  SSK$J%r%  SSK&J'r'  \!RP                  " \)5      r* " S S\RV                  5      r, " S S\RV                  5      r- " S S\R\                  5      r/ " S S\/5      r0\/\0S.r1 " S S\R\                  5      r2\ " S S\5      5       r3\ " S S \35      5       r4\" S!S"9 " S# S$\3\5      5       r5\ " S% S&\35      5       r6\" S'S"9 " S( S)\35      5       r7/ S*Qr8g)+zPyTorch BioGPT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )BioGptConfig)	BlockMask)make_flex_block_causal_maskc                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\S\	\R                     4U 4S	 jjjrS
rU =r$ ) BioGptLearnedPositionalEmbedding:   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g )N   )offsetsuper__init__)selfr    r!   	__class__s      b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/biogpt/modeling_biogpt.pyr&   )BioGptLearnedPositionalEmbedding.__init__?   s"     ++5}E    attention_maskpast_key_values_lengthposition_idsc                    > UcS  UR                  5       n[        R                  " USS9R                  U5      U-  R                  5       S-
  nUSS2US24   n[        TU ]  X0R                  -   5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)longtorchcumsumtype_asr%   forwardr$   )r'   r,   r-   r.   	positionsr(   s        r)   r6   (BioGptLearnedPositionalEmbedding.forwardE   sv     +002N n!<DD^TWeekkmpqqI %Q(>(?%?@Lw|kk9::r+   )r$   r   N)__name__
__module____qualname____firstlineno____doc__intr&   r3   
LongTensorr   Tensorr6   __static_attributes____classcell__r(   s   @r)   r   r   :   s[    Fs F3 F '(/3	;((; !$; u||,	; ;r+   r   c            
       r   ^  \ rS rSrSrSS\S\S\S\\   4U 4S jjjrS\	R                  4U 4S	 jjrS
rU =r$ )BioGptScaledWordEmbeddingY   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r    r!   padding_idxembed_scalec                 2   > [         TU ]  XU5        X@l        g N)r%   r&   rI   )r'   r    r!   rH   rI   r(   s        r)   r&   "BioGptScaledWordEmbedding.__init__^   s    D&r+   	input_idsc                 <   > [         TU ]  U5      U R                  -  $ rK   )r%   r6   rI   )r'   rM   r(   s     r)   r6   !BioGptScaledWordEmbedding.forwardb   s    wy)D,<,<<<r+   rI   )      ?)r:   r;   r<   r=   r>   r?   r   floatr&   r3   rA   r6   rB   rC   rD   s   @r)   rF   rF   Y   sJ    's '3 'S '_ghm_n ' '= = =r+   rF   c                     ^  \ rS rSrSr      SS\S\S\S\S\S\S	\\	   S
\\   4U 4S jjjr
      SS\R                  S\\R                     S\\   S\\R                     S\\R                     S\S\\R                     S\\R                  \\R                     \\\R                        4   4S jjrSrU =r$ )BioGptAttentiong   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbias	is_causalconfig	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.rZ   )r%   r&   rV   rW   rX   head_dimr\   
ValueErrorscalingrY   r[   r]   loggerwarning_oncer(   r:   r   Lineark_projv_projq_projout_proj)
r'   rV   rW   rX   rY   rZ   r[   r\   r]   r(   s
            r)   r&   BioGptAttention.__init__j   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr+   hidden_stateskey_value_statespast_key_valuer,   layer_head_maskoutput_attentionscache_positionreturnc                 
   USLnUR                  5       u  pnU R                  U5      R                  U	SU R                  U R                  5      R                  SS5      nXR                  -  nUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       a=  Ub:  W(       a3  WR                  U R                     nUR                  U R                     nOU R!                  U5      nU R#                  U5      nUR                  U	SU R                  U R                  5      R                  SS5      nUR                  U	SU R                  U R                  5      R                  SS5      nUbN  U(       d  UOSnWR%                  UUU R                  SU05      u  nnU(       a  SUR                  U R                  '   XR                  -  SU R                  4nUR&                  " U6 nUR&                  " U6 nUR&                  " U6 nUR                  S5      n[(        R*                  " UUR                  SS5      5      nUR                  5       XR                  -  U
U4:w  a.  [-        SXR                  -  U
U4 SUR                  5        35      eUb]  USS2SS2SS2SUR.                  S	   24   nUR                  XR                  U
U5      U-   nUR                  XR                  -  U
U5      n[0        R2                  R5                  USS
9nUb  UR                  5       U R                  4:w  a*  [-        SU R                  4 SUR                  5        35      eUR                  SSSS5      UR                  XR                  U
U5      -  nUR                  XR                  -  U
U5      nU(       a=  UR                  XR                  U
U5      nUR                  XR                  -  U
U5      nOSn[0        R2                  R7                  UU R6                  U R8                  S9n[(        R*                  " UU5      nUR                  5       XR                  -  XR                  4:w  a7  [-        SXR                  -  XR                  4 SUR                  5        35      eUR                  XR                  XR                  5      nUR                  SS5      nUR'                  XU R:                  5      nU R=                  U5      nUUU4$ )#Input shape: Batch x Time x ChannelNr   r#   rp   Tz$Attention weights should be of size z	, but is r0   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )sizerh   viewrW   r`   	transposerb   
isinstancer   
is_updatedgetr]   cross_attention_cacheself_attention_cache	key_cachevalue_cacherf   rg   updatereshaper3   bmmra   shaper   
functionalsoftmaxrX   rx   rV   ri   )r'   rk   rl   rm   r,   rn   ro   rp   is_cross_attentionbsztgt_len_query_statesr}   curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                           r)   r6   BioGptAttention.forward   s    .T9',,.a {{=166sBPTP]P]^hhijlmn#ll2%.*=>>+66::4>>J
%*8*N*N'*8*M*M'&4#-?)]."<,66t~~FJ.::4>>JL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn=NN*B>
#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(* 
 %+Aq!5Kz7G7G7K5K,KLN',,S..'7SVddL',,S>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2C..4H'S`S`3a2b c$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK01>AAr+   )r\   rX   rV   r`   r[   rY   rf   r]   rW   ri   rh   rb   rg   )        FTFNNNNNNFN)r:   r;   r<   r=   r>   r?   rR   boolr   r   r&   r3   rA   r   r   r6   rB   rC   rD   s   @r)   rT   rT   g   sY   G  )-#'%C%C %C 	%C
 %C %C %C &%C C=%C %CT 48*.1526"'15pB||pB #5<<0pB !	pB
 !.pB "%,,/pB  pB !.pB 
u||Xell3XeELL>Q5RR	SpB pBr+   rT   c                   8  ^  \ rS rSr      SS\R
                  S\\R
                     S\\   S\\R
                     S\\R
                     S\S\\R
                     S	\	\R
                  \\R
                     \\	\R
                        4   4U 4S
 jjjr
SrU =r$ )BioGptSdpaAttentioni  rk   rl   rm   r,   rn   ro   rp   rq   c           	        > U(       a'  [         R                  S5        [        TU ]  UUUUUUS9$ USLnUR	                  5       u  pnU R                  U5      R                  U	SU R                  U R                  5      R                  SS5      nUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                   nOUnU(       a  UOUnU(       a=  Ub:  W(       a3  WR"                  U R                     nUR$                  U R                     nOU R'                  U5      nU R)                  U5      nUR                  U	SU R                  U R                  5      R                  SS5      nUR                  U	SU R                  U R                  5      R                  SS5      nUbN  U(       d  UOSnWR+                  UUU R                  SU05      u  nnU(       a  SUR                  U R                  '   SnUb  USS2SS2SS2SUR,                  S	   24   nUR.                  R0                  S
:X  a3  Ub0  UR3                  5       nUR3                  5       nUR3                  5       nU R4                  (       a  Uc  U
S:  a  SOSn[6        R8                  R:                  R=                  UUUUU R>                  (       a  U R@                  OSUS9nUR                  SS5      R3                  5       nUR                  XU RB                  5      nU RE                  U5      nUSU4$ )rs   a  BioGptModel is using BioGptSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` . Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)rl   rm   r,   ro   rp   Nrt   r   r#   rp   Tru   cudaFr   )	attn_mask	dropout_pr[   )#rc   rd   r%   r6   ry   rh   rz   rW   r`   r{   r|   r   r}   r~   r]   r   r   r   r   rf   rg   r   r   devicetype
contiguousr[   r3   r   r   scaled_dot_product_attentionrx   rX   rV   ri   )r'   rk   rl   rm   r,   rn   ro   rp   r   r   r   r   r   r}   r   r   r   r   causal_maskr[   r   r(   s                        r)   r6   BioGptSdpaAttention.forward  s    l 7?!1--"3- #   .T9',,.a {{=166sBPTP]P]^hhijlmn%.*=>>+66::4>>J
%*8*N*N'*8*M*M'&4#-?)]."<,66t~~FJ.::4>>JL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn=%(Aq2HJ4D4DR4H2H)HIK ##v-+2I'224L#..0J'224L
 !NN{/BwQR{DX]	 hh))FF!&*mmdll G 
 "++Aq1<<> "&&sT^^DmmK0D.00r+    r   )r:   r;   r<   r=   r3   rA   r   r   r   r   r6   rB   rC   rD   s   @r)   r   r     s     48*.1526"'15e1||e1 #5<<0e1 !	e1
 !.e1 "%,,/e1  e1 !.e1 
u||Xell3XeELL>Q5RR	Se1 e1r+   r   )eagersdpac                   B  ^  \ rS rSrSS\S\\   4U 4S jjjr      SS\R                  S\\R                     S\\R                     S\\
   S	\\   S
\\   S\\R                     S\\R                  \\\R                  \R                  4      4   4S jjrSrU =r$ )BioGptDecoderLayerit  r\   r]   c           	        > [         TU ]  5         UR                  U l        [        UR
                     " U R                  UR                  UR                  SSUS9U l        UR                  U l
        [        UR                     U l        UR                  U l        [        R                   " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                   " U R                  5      U l        g )NT)rV   rW   rX   rY   r[   r]   )r%   r&   hidden_sizerV   BIOGPT_ATTENTION_CLASSES_attn_implementationnum_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrX   r   
hidden_actactivation_fnactivation_dropoutr   	LayerNormself_attn_layer_normre   intermediate_sizefc1fc2final_layer_norm)r'   r\   r]   r(   s      r)   r&   BioGptDecoderLayer.__init__u  s    ++1&2M2MNnn0077
 11#F$5$56"(";";$&LL$@!99T^^V-E-EF99V55t~~F "T^^ <r+   rk   r,   rn   rm   ro   	use_cacherp   rq   c           	      X   UnU R                  U5      nU R                  UUUUUUS9u  pn[        R                  R	                  XR                  U R
                  S9nX-   nUnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU4n
U(       a  X4-  n
U(       a  X4-  n
U
$ )ay  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
        cache in the correct position and to infer the complete sequence length.
)rk   rm   r,   rn   ro   rp   rv   )r   r   r   r   rX   rx   r   r   r   r   r   )r'   rk   r,   rn   rm   ro   r   rp   residualself_attn_weightsoutputss              r)   r6   BioGptDecoderLayer.forward  s3   8 !11-@ <@>>'))+/) <J <
8. --m||VZVcVc-d 0 !--m</**=9--m?V?Vaeanan-o/--m||VZVcVc-d 0 "++G((Gr+   )	r   r   rX   rV   r   r   r   r   r   rK   )NNNFTN)r:   r;   r<   r=   r   r   r?   r&   r3   rA   r   r   r   FloatTensorr6   rB   rC   rD   s   @r)   r   r   t  s    =| = = =2 2626*.,1$(15>||> !.> "%,,/	>
 !> $D>> D>> !.> 
u  (51B1BEDUDU1U+V"WW	X> >r+   r   c                      \ rS rSr\rSrSrSrSr	Sr
S r SS\\R                  S4   S\R                  S\R                  S	\S
\4
S jjr\S\R                  S\S\S\R(                  S\R                  S\4S j5       rSrg)BioGptPreTrainedModeli  biogptTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weightsr   )meanstdNrQ   )r|   r   re   weightdatanormal_r\   initializer_rangerZ   zero_	EmbeddingrH   r   fill_)r'   modules     r)   _init_weights#BioGptPreTrainedModel._init_weights  s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r+   r,   r   input_tensorrp   past_key_valuesro   c           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r   flex_attentionr   Fr   )inputs_embedsr-   is_trainingr   rt   )sequence_lengthtarget_lengthdtyperp   
batch_size)r   xpunpu)r\   r   anyr|   r3   rA   r   get_seq_lengthis_compileabler   _ignore_causal_mask_sdparx   r   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r   finfomin_unmask_unattended)r'   r,   r   rp   r   ro   past_seen_tokensusing_compilable_cacher   r   r   r   	min_dtypes                r)   _update_causal_mask)BioGptPreTrainedModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr+   r   r   r   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuer   r   r   )diagonalr   rt   r   )r1   r3   r   r   fullr   triuaranger   expandcloner   tomasked_fill)r,   r   r   r   rp   r   kwargsr   r   mask_lengthpadding_masks              r)   r   KBioGptPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position*  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r+   r   N)F)r:   r;   r<   r=   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_sdpa_supports_cache_class_supports_static_cacher   r   r3   rA   r   r   r   staticmethodr?   r   r   rB   r   r+   r)   r   r     s    L &*#N !*. #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r+   r   c                   ~  ^  \ rS rSrS\4U 4S jjrS rS r\           SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\\\
R                           S\	\   S\	\
R                     S\	\   S\	\   S\	\   S\	\
R                     S\\\4   4S jj5       rSrU =r$ )BioGptModelic  r\   c           
        > [         TU ]  U5        Xl        UR                  U l        UR                  U l        UR                  U l        UR                  U l	        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  U R                  U R                  US9U l        [!        UR"                  U R                  5      U l        [&        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        [&        R2                  " U R                  5      U l        SU l        UR8                  S:H  U l        U R=                  5         g s  snf )NrQ   rP   )r]   Fr   )r%   r&   r\   	layerdropr   rX   r   rV   pad_token_idrH   scale_embeddingmathsqrtrF   
vocab_sizeembed_tokensr   max_position_embeddingsembed_positionsr   
ModuleListrangenum_hidden_layersr   layersr   
layer_normgradient_checkpointingr   	_use_sdpa	post_init)r'   r\   rI   ir(   s       r)   r&   BioGptModel.__init__e  s!    ))11++!..7=7M7Mdii 2 23SV5t~~t/?/?[
  @@^@^`d`n`nommV[\b\t\tVu$vVuQR%7%LVu$vw,,t~~6&+#44> %ws   	E9c                     U R                   $ rK   r  r'   s    r)   get_input_embeddings BioGptModel.get_input_embeddings{  s       r+   c                     Xl         g rK   r  r'   values     r)   set_input_embeddings BioGptModel.set_input_embeddings~  s    !r+   rM   r,   	head_maskr   r   r   r.   ro   output_hidden_statesreturn_dictrp   rq   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eUb  UR                  SUR                  S   5      nUc  U R                  U5      nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnSnU(       aB  [        U[        5      (       d-  Sn[        R                  S5        [        R                   " U5      nUR#                  5       S S u  pUb  UR%                  5       OSnUc#  [&        R(                  " UUU-   UR*                  S9nUc4  [-        5       (       d%  UU-   n[&        R.                  " UUUR*                  S9n[        U[        5      (       a  UR0                  OUnU R3                  UUUUU5      nUc  UR5                  S5      nU R7                  UUUS	9nXG-   n[8        R:                  R=                  UU R<                  U R                  S
9nU	(       a  SOS nU(       a  SOS nS nS n[?        U R@                  5       H  u  nnU	(       a  UU4-  nU R                  (       a(  [&        RB                  " / 5      nUU RD                  :  a  ML  U R                  (       a<  U R                  (       a+  U RG                  URH                  UUUb  UU   OS S UUU5      nOU" UUUb  UU   OS UUUUS9nUS   nU(       a  UU(       a  SOS   nU(       d  M  UUS   4-  nM     U	(       a  UU4-  nU RK                  U5      nU(       a  UOS nU(       a  URM                  5       nU
(       d  [O        S UUUUU4 5       5      $ [Q        UUUUUS9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timert   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   )r.   rv   r   )r,   rn   rm   ro   r   rp   r#   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frK   r   ).0vs     r)   	<genexpr>&BioGptModel.forward.<locals>.<genexpr>  s      mA ms   	)last_hidden_stater   rk   
attentionscross_attentions))r\   ro   r)  r   use_return_dictra   rz   r   r  r  rx   rc   rd   r|   r   r   from_legacy_cachery   r   r3   r   r   r   onesr   r   	unsqueezer  r   r   rX   	enumerater  randr  _gradient_checkpointing_func__call__r  to_legacy_cachetupler   )r'   rM   r,   r(  r   r   r   r.   ro   r)  r*  rp   r   return_legacy_cacher   
seq_lengthr-   mask_seq_lengthself_attn_cacher   rk   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cacheidxdecoder_layerdropout_probabilitylayer_outputs
next_caches                                 r)   r6   BioGptModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] -t";<cdd !r9??2+>?I  --i8M&&4==##p "	 $Z??"&\
 2CCOTO!.!3!3!5cr!:
ETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D*D4zAO"ZZ
OML`L`aN /+>?? 00  	
 ..
 )33A6L++N<Ram+n%4--mt||VZVcVc-d"6BD0d#!"+DKK"8C#!m%55!}}&+jjn#&7**t}} $ A A!**!&/&;IcN%"	! !.!#.7@7LYs^RV#2&7'#1! *!,M%28I1q%Q"  =#3"55K #9P  -!116+4'$
(88:J '5FXlm  
 9+&+%1
 	
r+   )r  r\   rX   rV   r  r  r  r  r  r  rH   NNNNNNNNNNN)r:   r;   r<   r=   r   r&   r!  r&  r   r   r3   r@   r   r   rA   r   r   r   r6   rB   rC   rD   s   @r)   r	  r	  c  sB   | ,!"  156:1559@D$(37,0/3&*15V
E,,-V
 !!2!23V
 E--.	V

   1 12V
 "%ell(;"<=V
 D>V
 u//0V
 $D>V
 'tnV
 d^V
 !.V
 
u??	@V
 V
r+   r	  zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                     ^  \ rS rSrS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\\\
R                           S\	\
R                     S\	\   S\	\
R                     S\	\   S\	\   S\	\   S\	\
R                     S\\\4   4S jj5       r\S 5       rSrU =r$ )BioGptForCausalLMi  zoutput_projection.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFr_   )
r%   r&   r	  r   r   re   r   r  output_projectionr  r'   r\   r(   s     r)   r&   BioGptForCausalLM.__init__#  sJ     !&)!#6+=+=v?P?PW\!] 	r+   c                     U R                   $ rK   rR  r   s    r)   get_output_embeddings'BioGptForCausalLM.get_output_embeddings,  s    %%%r+   c                     Xl         g rK   rV  )r'   new_embeddingss     r)   set_output_embeddings'BioGptForCausalLM.set_output_embeddings/  s    !/r+   rM   r,   r(  r   r   labelsr   r.   ro   r)  r*  rp   rq   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   nU R                  U5      nSnUb*  U R                  " UU4SU R                   R
                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)
r,   r(  r   r   r   r.   ro   r)  r*  rp   r   r  r   )losslogitsr   rk   r2  r3  )r\   r4  r   rR  loss_functionr  r   r   rk   r2  r3  )r'   rM   r,   r(  r   r   r]  r   r.   ro   r)  r*  rp   r   r   sequence_outputprediction_scoreslm_lossoutputs                      r)   r6   BioGptForCausalLM.forward2  s   . &1%<k$++B]B]++)'+%/!5#)  
 "!* 22?C((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r+   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nr   c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7fr9   )index_selectr   r   )r-  
past_statebeam_idxs     r)   r/  3BioGptForCausalLM._reorder_cache.<locals>.<genexpr>w  s1     ncmU_--aZ=N=N1OPPcms   7:)r=  )r   rk  reordered_past
layer_pasts    `  r)   _reorder_cache BioGptForCausalLM._reorder_cacher  s8    )Jncmnn N * r+   )r   rR  )NNNNNNNNNNNN)r:   r;   r<   r=   _tied_weights_keysr&   rW  r[  r   r   r3   r@   r   r   rA   r   r   r   r6   r  ro  rB   rC   rD   s   @r)   rO  rO    se    55&0  156:1559@D-1$(37,0/3&*15=
E,,-=
 !!2!23=
 E--.	=

   1 12=
 "%ell(;"<==
 ))*=
 D>=
 u//0=
 $D>=
 'tn=
 d^=
 !.=
 
u77	8=
 =
~  r+   rO  c                   j  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\
\
\R                           S\\R                     S	\\R                     S
\\   S\\   S\\   S\\   S\\
\4   4S jj5       rSrU =r$ )BioGptForTokenClassificationi|  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nOUR                  n[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropout)r%   r&   
num_labelsr	  r   hasattrru  r   r   DropoutrX   re   r   
classifierr  )r'   r\   ru  r(   s      r)   r&   %BioGptForTokenClassification.__init__~  s      ++!&)6/00V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr+   rM   token_type_idsr,   r(  r   r   r]  r   ro   r)  r*  rq   c                    Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nU R	                  U5      nSnUb  [        5       nUb  UR                  S5      S:H  nUR                  SU R                  5      n[        R                  " UUR                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )e  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r,   r(  r   r   ro   r)  r*  r   rt   r   r#   )r_  r`  rk   r2  )r\   r4  r   rX   ry  r   rz   rv  r3   wheretensorignore_indexr5   r   rk   r2  )r'   rM   r{  r,   r(  r   r   r]  r   ro   r)  r*  transformer_outputsrk   r`  r_  loss_fctactive_lossactive_logitsactive_labelsre  s                        r)   r6   $BioGptForTokenClassification.forward  so   * &1%<k$++B]B]"kk+)'/!5# * 

 ,A.]3/')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r+   )r   ry  rX   rv  rL  )r:   r;   r<   r=   r&   r   r   r3   r@   r   r   rA   r   r   r   r6   rB   rC   rD   s   @r)   rs  rs  |  s%     15596:15@D59-1$(,0/3&*=
E,,-=
 !!1!12=
 !!2!23	=

 E--.=
 "%ell(;"<==
   1 12=
 ))*=
 D>=
 $D>=
 'tn=
 d^=
 
u++	,=
 =
r+   rs  a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   ^  ^  \ rS rSrS\4U 4S jjr\          SS\\R                     S\\R                     S\\R                     S\\\\R                           S\\R                     S	\\R                     S
\\   S\\   S\\   S\\   S\\\4   4S jj5       rS rS rSrU =r$ )BioGptForSequenceClassificationi  r\   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g rQ  )
r%   r&   rv  r	  r   r   re   r   scorer  rS  s     r)   r&   (BioGptForSequenceClassification.__init__  sS      ++!&)YYv114??O
 	r+   rM   r,   r(  r   r   r]  r   ro   r)  r*  rq   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nUb  UR                  SS u  pOUR                  SS u  pU R                   R
                  c  SnOUbV  [        R                  " XR                   R
                  5      R                  S5      S-
  R                  UR                  5      nO.Sn[        R                  U R                  R                   S35        U[        R                  " XR                  S9U4   nSnUGb  U R                   R                   c  U R"                  S:X  a  S	U R                   l        OoU R"                  S:  aN  UR$                  [        R&                  :X  d  UR$                  [        R(                  :X  a  S
U R                   l        OSU R                   l        U R                   R                   S	:X  aJ  [+        5       nU R"                  S:X  a&  U" UR-                  5       UR-                  5       5      nOU" UU5      nOU R                   R                   S
:X  a=  [/        5       nU" UR1                  SU R"                  5      UR1                  S5      5      nO-U R                   R                   S:X  a  [3        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [5        UUUR6                  UR8                  UR:                  S9$ )r}  Nr~  r   r#   rt   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r_  r`  r   rk   r2  )r\   r4  r   r  r   r  r3   nesumr   r   rc   rd   r(   r:   r   problem_typerv  r   r2   r?   r	   squeezer   rz   r   r   r   rk   r2  )r'   rM   r,   r(  r   r   r]  r   ro   r)  r*  r  rk   r`  r   r   pooled_logitsr_  r  re  s                       r)   r6   'BioGptForSequenceClassification.forward  s   ( &1%<k$++B]B]"kk+)'/!5# * 

 ,A.M* *3//"1*='J*7*=*=bq*A'J;;##+ O$#(88I{{7O7O#P#T#TUW#X[\#\"`"`aganan"o"$##~~../ 0^ ^
 u||J}}M^_{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r+   c                 .    U R                   R                  $ rK   r   r  r   s    r)   r!  4BioGptForSequenceClassification.get_input_embeddings>  s    {{'''r+   c                 $    XR                   l        g rK   r  r$  s     r)   r&  4BioGptForSequenceClassification.set_input_embeddingsA  s    #( r+   )r   rv  r  )
NNNNNNNNNN)r:   r;   r<   r=   r   r&   r   r   r3   r@   r   r   rA   r   r   r   r6   r!  r&  rB   rC   rD   s   @r)   r  r    s1   |   156:15@D59-1$(,0/3&*V
E,,-V
 !!2!23V
 E--.	V

 "%ell(;"<=V
   1 12V
 ))*V
 D>V
 $D>V
 'tnV
 d^V
 
u66	7V
 V
p() )r+   r  )rO  rs  r  r	  r   )9r>   r  typingr   r   r   r3   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   configuration_biogptr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerr:   rc   r   r   rF   ModulerT   r   r   r   r   r	  rO  rs  r  __all__r   r+   r)   <module>r     s     ) )    A A ! 5 )  .  /  !!;J 
		H	%
;r|| ;>
= 
=ZBbii ZB|f1/ f1T  U Up SO S Sl t
' t
 t
n 
Y- Y
Yx M
#8 M
 M
` g)&; g)g)Tr+   