"""PyTorch BioGPT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
from .configuration_biogpt import BioGptConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class BioGptLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # BioGPT offsets the embedding ids by 2 and adjusts num_embeddings accordingly.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.Tensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        if position_ids is None:
            attention_mask = attention_mask.long()

            # create positions depending on attention_mask
            positions = torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask - 1

            # cut positions if `past_key_values_length` is > 0
            positions = positions[:, past_key_values_length:]
        else:
            positions = position_ids

        return super().forward(positions + self.offset)


class BioGptScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BioGptConfig] = None,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states * self.scaling

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k,v, cross_attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = self.v_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that this layer's cross-attention is already cached so it can be re-used
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            attention_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # reshape twice so that the returned attention weights keep their gradient
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class BioGptSdpaAttention(BioGptAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        if output_attentions:
            logger.warning_once(
                "BioGptModel is using BioGptSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does"
                " not support `output_attentions=True` . Falling back to the manual attention implementation, but"
                " specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This"
                ' warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                key_value_states=key_value_states,
                past_key_value=past_key_value,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k,v, cross_attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = self.v_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that this layer's cross-attention is already cached so it can be re-used
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        causal_mask = None
        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA's memory-efficient backend requires contiguous inputs when a custom attn_mask is used on CUDA
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # dispatch to SDPA's causal kernels only when no explicit mask is given and more than one query token is present
        is_causal = True if self.is_causal and causal_mask is None and tgt_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, None, past_key_value


BIOGPT_ATTENTION_CLASSES = {
    "eager": BioGptAttention,
    "sdpa": BioGptSdpaAttention,
}


class BioGptDecoderLayer(nn.Module):
    def __init__(self, config: BioGptConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.embed_dim = config.hidden_size

        self.self_attn = BIOGPT_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
            is_causal=True,
            layer_idx=layer_idx,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)

        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        """
        residual = hidden_states

        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, past_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (past_key_value,)

        return outputs


@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
    config_class = BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, rely on its `is_causal` argument instead of an explicit `attn_mask`,
        # in order to dispatch to the fused kernels.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, as required by the memory-efficient SDPA path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
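

# Illustrative sketch (not part of the upstream module): what the static mask helper above produces
# for a tiny, hand-picked case. Positions above the diagonal and padded positions are filled with
# the most negative representable value so that softmax assigns them zero probability. All sizes
# below are arbitrary assumptions chosen only for the demonstration.
def _causal_mask_demo():
    attention_mask = torch.tensor([[1, 1, 1, 0]])  # one padded position at the end
    mask = BioGptPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask,
        sequence_length=4,
        target_length=4,
        dtype=torch.float32,
        cache_position=torch.arange(4),
        batch_size=1,
    )
    # `mask` has shape (1, 1, 4, 4); entry [0, 0, i, j] is 0.0 where query token i may attend to
    # key token j, and torch.finfo(torch.float32).min where it may not (j > i or j is padding).
    return mask.shape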


@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        self._use_sdpa = config._attn_implementation == "sdpa"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")

        if input_ids is not None:
            input_ids = input_ids.view(-1, input_ids.shape[-1])

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        batch_size, seq_length = inputs_embeds.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = (
            past_key_values.self_attention_cache
            if isinstance(past_key_values, EncoderDecoderCache)
            else past_key_values
        )

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, self_attn_cache, output_attentions
        )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # embed positions
        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None
        next_decoder_cache = None

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    head_mask[idx] if head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
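

# Illustrative sketch (not part of the upstream module): using the bare `BioGptModel` as a feature
# extractor. `microsoft/biogpt` is the published BioGPT checkpoint on the Hub; the input sentence
# is an arbitrary assumption chosen only for the demonstration.
def _biogpt_model_usage_sketch():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
    model = BioGptModel.from_pretrained("microsoft/biogpt")

    inputs = tokenizer("COVID-19 is a respiratory disease.", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # `last_hidden_state` has shape (batch_size, sequence_length, hidden_size).
    return outputs.last_hidden_state.shape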


@auto_docstring(
    custom_intro="""
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
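
        Example (an illustrative sketch rather than a tested snippet; it assumes the publicly
        available `microsoft/biogpt` checkpoint and an arbitrary prompt):

        ```python
        >>> from transformers import AutoTokenizer, BioGptForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        >>> model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

        >>> inputs = tokenizer("COVID-19 is", return_tensors="pt")

        >>> # greedy decoding of a short continuation; passing `labels=inputs.input_ids` instead
        >>> # would return a language-modeling loss in `outputs.loss`
        >>> generated = model.generate(**inputs, max_new_tokens=20)
        >>> tokenizer.decode(generated[0], skip_special_tokens=True)
        ```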
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@auto_docstring
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
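
        Example (an illustrative sketch rather than a tested snippet; it assumes the
        `microsoft/biogpt` checkpoint, whose token-classification head is newly initialized and
        therefore needs fine-tuning before its predictions are meaningful):

        ```python
        >>> from transformers import AutoTokenizer, BioGptForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        >>> model = BioGptForTokenClassification.from_pretrained("microsoft/biogpt", num_labels=2)

        >>> inputs = tokenizer("Aspirin inhibits platelet aggregation.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_classes = logits.argmax(-1)  # shape (batch_size, sequence_length)
        ```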
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
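
        Example (an illustrative sketch rather than a tested snippet; it assumes the
        `microsoft/biogpt` checkpoint, whose sequence-classification head is newly initialized and
        therefore needs fine-tuning before its predictions are meaningful). Because classification
        is done on the last non-padding token, padded batches rely on `config.pad_token_id` as
        described above:

        ```python
        >>> from transformers import AutoTokenizer, BioGptForSequenceClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        >>> model = BioGptForSequenceClassification.from_pretrained("microsoft/biogpt", num_labels=2)

        >>> inputs = tokenizer(
        ...     ["Metformin lowers blood glucose.", "Aspirin."], padding=True, return_tensors="pt"
        ... )
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits  # shape (batch_size, num_labels)

        >>> predicted_class_ids = logits.argmax(-1)
        ```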
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_length = -1
        else:
            if input_ids is not None:
                sequence_length = torch.ne(input_ids, self.config.pad_token_id).sum(-1).to(logits.device) - 1
            else:
                sequence_length = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_length]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value


__all__ = [
    "BioGptForCausalLM",
    "BioGptForTokenClassification",
    "BioGptForSequenceClassification",
    "BioGptModel",
    "BioGptPreTrainedModel",
]