"""PyTorch Persimmon model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_persimmon import PersimmonConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class PersimmonRotaryEmbedding(nn.Module):
    def __init__(self, config: PersimmonConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
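

# The short sketch below is illustrative only and is not part of the upstream module. Assuming a
# small config (hidden_size=64, 4 heads) and the default partial_rotary_factor of 0.5, it shows how
# the rotary embedding is applied to just the first `rotary_ndims` channels of the query/key
# states, mirroring what PersimmonAttention.forward does further down.
def _partial_rotary_usage_sketch():  # pragma: no cover - documentation-only example
    config = PersimmonConfig(hidden_size=64, num_attention_heads=4)
    rope = PersimmonRotaryEmbedding(config)

    batch, heads, seq = 2, 4, 10
    head_dim = config.hidden_size // config.num_attention_heads  # 16
    rotary_ndims = int(head_dim * config.partial_rotary_factor)  # 8 with the defaults assumed here

    q = torch.randn(batch, heads, seq, head_dim)
    k = torch.randn(batch, heads, seq, head_dim)
    position_ids = torch.arange(seq)[None, :].expand(batch, -1)

    cos, sin = rope(q, position_ids)  # each has shape [batch, seq, rotary_ndims]

    # Rotate only the first `rotary_ndims` channels; pass the remaining channels through unchanged.
    q_rot, q_pass = q[..., :rotary_ndims], q[..., rotary_ndims:]
    k_rot, k_pass = k[..., :rotary_ndims], k[..., rotary_ndims:]
    q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin)
    q = torch.cat((q_rot, q_pass), dim=-1)
    k = torch.cat((k_rot, k_pass), dim=-1)
    return q.shape, k.shape  # both torch.Size([2, 4, 10, 16])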


class PersimmonMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size)
        self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size)
        self.act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dense_4h_to_h(hidden_states)
        return hidden_states


class PersimmonAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.rope_theta = config.rope_theta
        self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor)
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
        self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)
        self.qk_layernorm = config.qk_layernorm

        if self.qk_layernorm:
            self.q_layernorm = nn.LayerNorm(
                config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )
            self.k_layernorm = nn.LayerNorm(
                config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )
        self.attention_dropout = nn.Dropout(config.attention_dropout)
        self.rotary_emb = PersimmonRotaryEmbedding(config=self.config)

    def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
        storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
            value: [batch_size, seq_length, num_heads, head_dim]
        """
        batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
        fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
        return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        # [batch_size, seq_length, 3 x hidden_size]
        fused_qkv = self.query_key_value(hidden_states)

        # 3 x [batch_size, seq_length, num_heads, head_dim]
        (query_states, key_states, value_states) = self._split_heads(fused_qkv)

        if self.qk_layernorm:
            query_states = self.q_layernorm(query_states)
            key_states = self.k_layernorm(key_states)

        # [batch_size, seq_length, num_heads, head_dim] -> [batch_size, num_heads, seq_length, head_dim]
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        cos, sin = position_embeddings

        # Partial rotary embedding
        query_rot, query_pass = (
            query_states[..., : self.rotary_ndims],
            query_states[..., self.rotary_ndims :],
        )
        key_rot, key_pass = (
            key_states[..., : self.rotary_ndims],
            key_states[..., self.rotary_ndims :],
        )
        query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)

        # [batch_size, num_heads, seq_length, head_dim]
        query_states = torch.cat((query_rot, query_pass), dim=-1)
        key_states = torch.cat((key_rot, key_pass), dim=-1)

        if past_key_value is not None:
            # Specific to RoPE models with partial rotation
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_ndims,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query_states.dtype)
        attn_weights = self.attention_dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.dense(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
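

# Illustrative sketch (not part of the upstream file): a standalone check of the fused-QKV layout
# that `PersimmonAttention._split_heads` assumes, i.e. that the projection output is reshaped as
# [..., num_heads, (q, k, v), head_dim] so the three results are views over the same storage.
def _split_heads_sketch():  # pragma: no cover - documentation-only example
    batch_size, seq_length, num_heads, head_dim = 2, 5, 4, 16
    fused_qkv = torch.randn(batch_size, seq_length, num_heads * 3 * head_dim)

    reshaped = fused_qkv.view(batch_size, seq_length, num_heads, 3, head_dim)
    query, key, value = reshaped[..., 0, :], reshaped[..., 1, :], reshaped[..., 2, :]

    # All three are [batch_size, seq_length, num_heads, head_dim] views; no copy is made.
    assert query.shape == key.shape == value.shape == (batch_size, seq_length, num_heads, head_dim)
    assert query.data_ptr() == fused_qkv.data_ptr()  # the query view starts at the fused buffer
    return query, key, value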


class PersimmonDecoderLayer(nn.Module):
    def __init__(self, config: PersimmonConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = PersimmonAttention(config=config, layer_idx=layer_idx)
        self.mlp = PersimmonMLP(config)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`.
                [What are position IDs?](../glossary#position-ids)
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
                cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)

        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + residual

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
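

# Illustrative sketch (not part of the upstream file): running a single decoder layer end to end
# with a tiny, assumed configuration. It mainly shows that the rotary `position_embeddings` are
# computed once (normally by PersimmonModel) and then handed to every layer. The attention mask is
# omitted here, so attention is not causal; PersimmonModel builds the causal mask itself.
def _decoder_layer_sketch():  # pragma: no cover - documentation-only example
    config = PersimmonConfig(hidden_size=64, intermediate_size=256, num_attention_heads=4, num_hidden_layers=2)
    layer = PersimmonDecoderLayer(config, layer_idx=0)
    rope = PersimmonRotaryEmbedding(config)

    batch, seq = 2, 10
    hidden_states = torch.randn(batch, seq, config.hidden_size)
    position_ids = torch.arange(seq)[None, :].expand(batch, -1)
    position_embeddings = rope(hidden_states, position_ids)  # (cos, sin), shared across layers

    (hidden_states,) = layer(hidden_states, position_embeddings=position_embeddings)
    return hidden_states.shape  # torch.Size([2, 10, 64])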


@auto_docstring
class PersimmonPreTrainedModel(PreTrainedModel):
    config_class = PersimmonConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PersimmonDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


@auto_docstring
class PersimmonModel(PersimmonPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersimmonDecoderLayer`]

    Args:
        config: PersimmonConfig
    """

    def __init__(self, config: PersimmonConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [PersimmonDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.rotary_emb = PersimmonRotaryEmbedding(config=config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


class PersimmonForCausalLM(PersimmonPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = PersimmonModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PersimmonForCausalLM

        >>> model = PersimmonForCausalLM.from_pretrained("adept/persimmon-8b-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-base")

        >>> prompt = "human: Hey, what should I eat for dinner?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'human: Hey, what should I eat for dinner?\n\ncat: 🐱\n\nhuman: 😐\n\n'
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
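

# Illustrative sketch (not part of the upstream file): what the `logits_to_keep` argument of
# `PersimmonForCausalLM.forward` does to the hidden states before the LM head is applied. An `int`
# keeps only the last `logits_to_keep` positions (0 keeps everything), which saves memory when only
# the next-token logits are needed, e.g. during generation.
def _logits_to_keep_sketch():  # pragma: no cover - documentation-only example
    batch, seq, hidden = 2, 7, 16
    hidden_states = torch.randn(batch, seq, hidden)

    for logits_to_keep, expected_positions in [(0, 7), (1, 1), (3, 3)]:
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        kept = hidden_states[:, slice_indices, :]
        assert kept.shape == (batch, expected_positions, hidden)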


@auto_docstring(
    custom_intro="""
    The Persimmon transformer with a sequence classification head on top (linear layer).

    [`PersimmonForSequenceClassification`] uses the last token in order to do the classification, as other causal
    models (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class PersimmonForSequenceClassification(PersimmonPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = PersimmonModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
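

# Illustrative sketch (not part of the upstream file): how the classification head picks the last
# non-padding token per row when a `pad_token_id` is configured — the pooling rule the forward pass
# above relies on, which works for both left- and right-padded batches.
def _last_non_pad_token_sketch():  # pragma: no cover - documentation-only example
    pad_token_id = 0
    input_ids = torch.tensor(
        [
            [5, 7, 9, 0, 0],  # right-padded: last real token at index 2
            [0, 0, 3, 4, 6],  # left-padded: last real token at index 4
        ]
    )
    non_pad_mask = (input_ids != pad_token_id).to(torch.int32)
    token_indices = torch.arange(input_ids.shape[-1], dtype=torch.int32)
    last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
    return last_non_pad_token  # tensor([2, 4])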


@auto_docstring
class PersimmonForTokenClassification(PersimmonPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = PersimmonModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> TokenClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "PersimmonForCausalLM",
    "PersimmonModel",
    "PersimmonPreTrainedModel",
    "PersimmonForSequenceClassification",
    "PersimmonForTokenClassification",
]