o
    Zh                    @   s  d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZ ddl m!Z!m"Z" ddl#m$Z$ e"%e&Z'g dZ(eG dd deZ)dd Z*G dd dej+Z,G dd dej+Z-G dd dej+Z.G dd dej+Z/G dd dej+Z0G dd dej+Z1G d d! d!ej+Z2G d"d# d#ej+Z3G d$d% d%ej+Z4G d&d' d'ej+Z5G d(d) d)ej+Z6G d*d+ d+ej+Z7G d,d- d-ej+Z8G d.d/ d/ej+Z9e!G d0d1 d1eZ:e!G d2d3 d3e:Z;e!d4d5G d6d7 d7e:Z<e!G d8d9 d9e:Z=e!G d:d; d;e:Z>e!G d<d= d=e:Z?g d>Z@dS )?zPyTorch CANINE model.    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputModelOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )CanineConfig)   +   ;   =   I   a   g   q                           c                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )CanineModelOutputWithPoolinga  
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
            shallow Transformer encoder).
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
            Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
            weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
            encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
            config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
            initial input to each Transformer encoder. The hidden states of the shallow encoders have length
            `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
            `config.downsampling_rate`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
            num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
            config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    Nlast_hidden_statepooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r+   r   torchFloatTensor__annotations__r,   r-   r   r.    r6   r6   Y/var/www/auris/lib/python3.10/site-packages/transformers/models/canine/modeling_canine.pyr*   2   s   
 r*   c                 C   s*  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]5\}
}|
d}
tdd	 |
D r{t	d
d|
  q\|
d dkrd|
d< n>|
d dkr|
|
d  n0|
d dkrd|
d< n%|
d dkrdg|
dd  }
n|
d dkr|
d dv rdg|
dd  }
| }|
D ]x}|d|rd|vr|d|}n|g}|d dks|d dkrt|d}n?|d dks|d dkrt|d}n,|d d krt|d}nz	t||d }W n ty.   t	d
d|
  Y qw t|d!kr@t|d }|| }q|d"d d#krPt|d}n |d$d d%d& td'D v rft|d}n
|dkrp||}|j|jkrtd(|j d)|j d*t	d+|
  t||_q\| S ),z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape /c                 s   s    | ]}|d v V  qdS ))Zadam_vZadam_mZAdamWeightDecayOptimizerZAdamWeightDecayOptimizer_1Zglobal_stepclsZautoregressive_decoderZchar_output_weightsNr6   ).0nr6   r6   r7   	<genexpr>s   s    
z,load_tf_weights_in_canine.<locals>.<genexpr>z	Skipping Zbertencoderr   
embeddingsZsegment_embeddingstoken_type_embeddingsinitial_char_encoderchars_to_moleculesfinal_char_encoder)	LayerNormconv
projectionz[A-Za-z]+_\d+ZEmbedderz_(\d+)ZkernelgammaweightZoutput_biasbetabiasZoutput_weights   iZ_embeddingsic                 S   s   g | ]}d | qS )Z	Embedder_r6   )r:   ir6   r6   r7   
<listcomp>   s    z-load_tf_weights_in_canine.<locals>.<listcomp>   zPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpyZ
tensorflowImportErrorloggererrorospathabspathinfotrainZlist_variablesZload_variableappendzipsplitanyjoinremove	fullmatchgetattrAttributeErrorlenintrange	transposeshape
ValueErrorr3   Z
from_numpydata)modelconfigZtf_checkpoint_pathrO   nptfZtf_pathZ	init_varsnamesZarraysnamerf   arrayZpointerZm_nameZscope_namesnumr6   r6   r7   load_tf_weights_in_canineU   s   



 

rq   c                       s   e Zd ZdZ fddZdedefddZdededefd	d
Z				ddee	j
 dee	j
 dee	j
 dee	j de	jf
ddZ  ZS )CanineEmbeddingsz<Construct the character, position and token_type embeddings.c                    s   t    || _|j|j }t|jD ]}d| }t| |t|j	| qt|j	|j| _
t|j|j| _tj|j|jd| _t|j| _| jdt|jddd t|dd| _d S )	NHashBucketCodepointEmbedder_Zepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__rj   hidden_sizenum_hash_functionsrd   setattrr   	Embeddingnum_hash_bucketschar_position_embeddingsZtype_vocab_sizer?   rD   layer_norm_epsDropouthidden_dropout_probdropoutZregister_bufferr3   arangemax_position_embeddingsexpandr`   rx   )selfrj   Zshard_embedding_sizerL   rn   	__class__r6   r7   r{      s   

zCanineEmbeddings.__init__
num_hashesnum_bucketsc                 C   sV   |t tkrtdt t td| }g }|D ]}|d | | }|| q|S )a  
        Converts ids to hash bucket ids via multiple hashing.

        Args:
            input_ids: The codepoints or other IDs to be hashed.
            num_hashes: The number of hash functions to use.
            num_buckets: The number of hash buckets (i.e. embeddings in each table).

        Returns:
            A list of tensors, each of which is the hash bucket IDs from one hash function.
        z`num_hashes` must be <= Nr   )rb   _PRIMESrg   rY   )r   	input_idsr   r   ZprimesZresult_tensorsprimehashedr6   r6   r7   _hash_bucket_tensors   s   z%CanineEmbeddings._hash_bucket_tensorsembedding_sizec                 C   sx   || dkrt d| d| d| j|||d}g }t|D ]\}}d| }	t| |	|}
||
 qtj|ddS )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)r   r   rs   rv   dim)rg   r   	enumerater`   rY   r3   cat)r   r   r   r   r   Zhash_bucket_tensorsZembedding_shardsrL   Zhash_bucket_idsrn   Zshard_embeddingsr6   r6   r7   _embed_hash_buckets   s   
z$CanineEmbeddings._embed_hash_bucketsNr   token_type_idsru   inputs_embedsreturnc           
      C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u rE| || jj| jj	| jj
}| |}|| }| jdkr\| |}	||	7 }| |}| |}|S )Nrv   r   dtypedevicery   )sizeru   r3   zeroslongr   r   rj   r|   r}   r   r?   rx   r   rD   r   )
r   r   r   ru   r   input_shape
seq_lengthr?   r>   Zposition_embeddingsr6   r6   r7   forward   s(   





zCanineEmbeddings.forward)NNNN)r/   r0   r1   r2   r{   rc   r   r   r   r3   
LongTensorr4   r   __classcell__r6   r6   r   r7   rr      s(    rr   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )CharactersToMoleculeszeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                    sJ   t    tj|j|j|j|jd| _t|j | _	tj
|j|jd| _
d S )NZin_channelsZout_channelskernel_sizestridert   )rz   r{   r   Conv1dr|   downsampling_raterE   r   
hidden_act
activationrD   r   r   rj   r   r6   r7   r{     s   
zCharactersToMolecules.__init__char_encodingr   c                 C   s   |d d ddd d f }t |dd}| |}t |dd}| |}|d d ddd d f }t j||gdd}| |}|S )Nr   r   rK   rv   r   )r3   re   rE   r   r   rD   )r   r   Zcls_encodingZdownsampledZdownsampled_truncatedresultr6   r6   r7   r   -  s   


zCharactersToMolecules.forward)	r/   r0   r1   r2   r{   r3   Tensorr   r   r6   r6   r   r7   r     s    r   c                       sD   e Zd ZdZ fddZ	d
dejdeej dejfdd	Z  Z	S )ConvProjectionz
    Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
    characters.
    c                    s`   t    || _tj|jd |j|jdd| _t|j	 | _
tj|j|jd| _t|j| _d S )NrK   r   r   rt   )rz   r{   rj   r   r   r|   upsampling_kernel_sizerE   r   r   r   rD   r   r   r   r   r   r   r6   r7   r{   O  s   
zConvProjection.__init__Ninputsfinal_seq_char_positionsr   c           
      C   s   t |dd}| jjd }|d }|| }t||fd}| ||}t |dd}| |}| |}| 	|}|}|d urDt
d|}	|	S )Nr   rK   r   z,CanineForMaskedLM is currently not supported)r3   re   rj   r   r   ZConstantPad1drE   r   rD   r   NotImplementedError)
r   r   r   Z	pad_totalZpad_begZpad_endpadr   Zfinal_char_seqZ	query_seqr6   r6   r7   r   ^  s   


zConvProjection.forwardN)
r/   r0   r1   r2   r{   r3   r   r   r   r   r6   r6   r   r7   r   I  s    r   c                       sr   e Zd Z fddZdd Z			ddejdejd	eej d
eej dee	 de
ejeej f fddZ  ZS )CanineSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _t|dd| _| jdksf| jd	kry|j| _t	d
|j d | j| _d S d S )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads ()rx   ry   relative_keyrelative_key_queryrK   r   )rz   r{   r|   num_attention_headshasattrrg   rc   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   Zattention_probs_dropout_probr   r`   rx   r   r   distance_embeddingr   r   r6   r7   r{     s&   

zCanineSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nrv   r   rK   r   r
   )r   r   r   viewpermute)r   xZnew_x_shaper6   r6   r7   transpose_for_scores  s   
z(CanineSelfAttention.transpose_for_scoresNFfrom_tensor	to_tensorattention_mask	head_maskoutput_attentionsr   c                 C   s  |  |}| | |}| | |}| |}	t|	|dd}
| jdks.| jdkr| d }tj	|tj
|jddd}tj	|tj
|jddd}|| }| || j d }|j|	jd}| jdkrvtd|	|}|
| }
n| jdkrtd|	|}td	||}|
| | }
|
t| j }
|d ur|jd
krtj|dd}d|  t|
jj }|
| }
tjj|
dd}| |}|d ur|| }t||}|dddd
 }| d d | j f }|j| }|r||f}|S |f}|S )Nrv   rB   r   r   r   r   )r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   r         ?r   rK   )!r   r   r   r   r3   matmulre   rx   r   r   r   r   r   r   r   tor   Zeinsummathsqrtr   ndimZ	unsqueezefloatZfinfominr   Z
functionalZsoftmaxr   r   
contiguousr   )r   r   r   r   r   r   Zmixed_query_layerZ	key_layerZvalue_layerZquery_layerZattention_scoresr   Zposition_ids_lZposition_ids_rZdistanceZpositional_embeddingZrelative_position_scoresZrelative_position_scores_queryZrelative_position_scores_keyZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr6   r6   r7   r     sJ   







zCanineSelfAttention.forwardNNF)r/   r0   r1   r{   r   r3   r   r   r4   boolr   r   r   r6   r6   r   r7   r     s&    	r   c                       sF   e Zd Z fddZdeej dejdeejejf fddZ  ZS )CanineSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nrt   )rz   r{   r   r   r|   denserD   r   r   r   r   r   r   r6   r7   r{        
zCanineSelfOutput.__init__r-   input_tensorr   c                 C   &   |  |}| |}| || }|S r   r   r   rD   r   r-   r   r6   r6   r7   r     s   

zCanineSelfOutput.forward	r/   r0   r1   r{   r   r3   r4   r   r   r6   r6   r   r7   r     s    r   c                       s   e Zd ZdZ							ddededededed	ef fd
dZdd Z			ddee	j
 dee	j
 dee	j
 dee dee	j
ee	j
 f f
ddZ  ZS )CanineAttentionav  
    Additional arguments related to local attention:

        - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
        - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to
          attend
        to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`,
        *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all
        positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The
        width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to
        128) -- The number of elements to skip when moving to the next block in `from_tensor`. -
        **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
        *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to
        skip when moving to the next block in `to_tensor`.
    F   always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	           	         st   t    t|| _t|| _t | _|| _||k rt	d||k r&t	d|| _
|| _|| _|| _|| _|| _d S )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)rz   r{   r   r   r   outputsetpruned_headslocalrg   r   r   r   r   r   r   	r   rj   r   r   r   r   r   r   r   r   r6   r7   r{     s&   



zCanineAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )rb   r   r   r   r   r   r   r   r   r   r   r   r   union)r   headsindexr6   r6   r7   prune_heads)  s   zCanineAttention.prune_headsNr-   r   r   r   r   c                 C   sF  | j s| |||||}|d }n|jd  }}| }	}
g }| jr)|d d}nd}t||| jD ]}t||| j }|||f q2g }| jrP|d|f td|| j	D ]}t||| j
 }|||f qWt|t|kr|td| d| dg }g }t||D ]w\\}}\}}|	d d ||d d f }|
d d ||d d f }|d d ||||f }| jr|d d ||ddf }tj||gdd}|
d d ddd d f }tj||gdd}| |||||}||d  |r||d  qtj|dd}| ||}|f}| j s||dd   }|S |t| }|S )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.rK   r   )r   r   rf   r   rY   rd   r   r   r   r   r   rb   rg   rZ   r   r3   r   r   tuple)r   r-   r   r   r   Zself_outputsattention_outputfrom_seq_lengthto_seq_lengthr   r   Zfrom_chunksZ
from_startZchunk_startZ	chunk_endZ	to_chunksZattention_output_chunksZattention_probs_chunksZfrom_endZto_startZto_endZfrom_tensor_chunkZto_tensor_chunkZattention_mask_chunkZcls_attention_maskZcls_positionZattention_outputs_chunkr   r6   r6   r7   r   ;  sf   


zCanineAttention.forwardFFFr   r   r   r   r   )r/   r0   r1   r2   r   rc   r{   r   r   r3   r4   r   r   r   r6   r6   r   r7   r     sJ    	!r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )CanineIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )rz   r{   r   r   r|   intermediate_sizer   
isinstancer   strr   intermediate_act_fnr   r   r6   r7   r{     s
   
zCanineIntermediate.__init__r-   r   c                 C      |  |}| |}|S r   )r   r   r   r-   r6   r6   r7   r        

zCanineIntermediate.forward)r/   r0   r1   r{   r3   r4   r   r   r6   r6   r   r7   r     s    r   c                       s<   e Zd Z fddZdeej dejdejfddZ  ZS )CanineOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )rz   r{   r   r   r   r|   r   rD   r   r   r   r   r   r   r6   r7   r{     r   zCanineOutput.__init__r-   r   r   c                 C   r   r   r   r   r6   r6   r7   r     s   

zCanineOutput.forwardr   r6   r6   r   r7   r    s    (r  c                       sp   e Zd Z fddZ			ddeej deej deej dee d	eejeej f f
d
dZ	dd Z
  ZS )CanineLayerc	           	   	      sH   t    |j| _d| _t||||||||| _t|| _t|| _	d S Nr   )
rz   r{   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater  r   r   r   r6   r7   r{     s   


zCanineLayer.__init__NFr-   r   r   r   r   c           	      C   sH   | j ||||d}|d }|dd  }t| j| j| j|}|f| }|S )N)r   r   r   )r  r   feed_forward_chunkr  r  )	r   r-   r   r   r   Zself_attention_outputsr   r   layer_outputr6   r6   r7   r     s   
zCanineLayer.forwardc                 C   s   |  |}| ||}|S r   )r	  r   )r   r   Zintermediate_outputr  r6   r6   r7   r
    s   
zCanineLayer.feed_forward_chunkr   )r/   r0   r1   r{   r   r3   r4   r   r   r   r
  r   r6   r6   r   r7   r    s"    
r  c                       s   e Zd Z							d fdd	Z					ddeej deej d	eej d
ee dee dee de	ee
f fddZ  ZS )CanineEncoderFr   c	           	   
      sH   t    | _t fddtjD | _d| _d S )Nc                    s"   g | ]}t  qS r6   )r  )r:   _r   r   r   r   r   rj   r   r   r6   r7   rM     s    z*CanineEncoder.__init__.<locals>.<listcomp>F)	rz   r{   rj   r   Z
ModuleListrd   num_hidden_layerslayergradient_checkpointingr   r   r  r7   r{     s   

zCanineEncoder.__init__NTr-   r   r   r   output_hidden_statesreturn_dictr   c                 C   s   |rdnd }|r
dnd }t | jD ]:\}	}
|r||f }|d ur$||	 nd }| jr7| jr7| |
j||||}n|
||||}|d }|rK||d f }q|rS||f }|satdd |||fD S t|||dS )Nr6   r   r   c                 s       | ]	}|d ur|V  qd S r   r6   r:   vr6   r6   r7   r<   !      z(CanineEncoder.forward.<locals>.<genexpr>)r+   r-   r.   )r   r  r  ZtrainingZ_gradient_checkpointing_func__call__r   r   )r   r-   r   r   r   r  r  all_hidden_statesall_self_attentionsrL   Zlayer_moduleZlayer_head_maskZlayer_outputsr6   r6   r7   r     s8   	

zCanineEncoder.forwardr   )NNFFT)r/   r0   r1   r{   r   r3   r4   r   r   r   r   r   r   r6   r6   r   r7   r    s:    !
r  c                       6   e Zd Z fddZdeej dejfddZ  ZS )CaninePoolerc                    s*   t    t|j|j| _t | _d S r   )rz   r{   r   r   r|   r   ZTanhr   r   r   r6   r7   r{   *  s   
zCaninePooler.__init__r-   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r   r-   Zfirst_token_tensorpooled_outputr6   r6   r7   r   /  s   

zCaninePooler.forwardr   r6   r6   r   r7   r  )  s    "r  c                       r  )CaninePredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r   )rz   r{   r   r   r|   r   r   r   r   r   transform_act_fnrD   r   r   r   r6   r7   r{   9  s   
z&CaninePredictionHeadTransform.__init__r-   r   c                 C   s"   |  |}| |}| |}|S r   )r   r  rD   r  r6   r6   r7   r   B  s   


z%CaninePredictionHeadTransform.forwardr   r6   r6   r   r7   r  8  s    "	r  c                       r  )CanineLMPredictionHeadc                    sL   t    t|| _tj|j|jdd| _t	t
|j| _| j| j_d S )NF)rJ   )rz   r{   r  	transformr   r   r|   Z
vocab_sizedecoder	Parameterr3   r   rJ   r   r   r6   r7   r{   J  s
   

zCanineLMPredictionHead.__init__r-   r   c                 C   r   r   )r!  r"  r  r6   r6   r7   r   W  r  zCanineLMPredictionHead.forwardr   r6   r6   r   r7   r   I  s    "r   c                       s:   e Zd Z fddZdeej deej fddZ  ZS )CanineOnlyMLMHeadc                    s   t    t|| _d S r   )rz   r{   r   predictionsr   r   r6   r7   r{   ^  s   
zCanineOnlyMLMHead.__init__sequence_outputr   c                 C   s   |  |}|S r   )r%  )r   r&  Zprediction_scoresr6   r6   r7   r   b  s   
zCanineOnlyMLMHead.forward)	r/   r0   r1   r{   r   r3   r   r   r   r6   r6   r   r7   r$  ]  s    r$  c                   @   s$   e Zd ZeZeZdZdZdd Z	dS )CaninePreTrainedModelcanineTc                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjrF|jjjd| jjd |jdurD|jj|j 
  dS dS t |tjr[|j	j
  |jjd dS dS )zInitialize the weightsg        )meanZstdNr   )r   r   r   r   rH   rh   Znormal_rj   Zinitializer_rangerJ   Zzero_r   Zpadding_idxrD   Zfill_)r   moduler6   r6   r7   _init_weightsq  s   

z#CaninePreTrainedModel._init_weightsN)
r/   r0   r1   r   Zconfig_classrq   Zload_tf_weightsZbase_model_prefixZsupports_gradient_checkpointingr+  r6   r6   r6   r7   r'  j  s    r'  c                       s   e Zd Zd fdd	Zdd Zdd Zdejd	efd
dZ	dejdedejfddZ
e									ddeej deej deej deej deej deej dee dee dee deeef fddZ  ZS )CanineModelTc              
      s   t  | || _t|}d|_t|| _t|ddd|j	|j	|j	|j	d| _
t|| _t|| _t|| _t|| _|rAt|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   TF)r   r   r   r   r   r   r   N)rz   r{   rj   copydeepcopyr  rr   char_embeddingsr  Zlocal_transformer_strider@   r   rA   r=   r   rF   rC   r  pooler	post_init)r   rj   Zadd_pooling_layerZshallow_configr   r6   r7   r{     s*   






zCanineModel.__init__c                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr=   r  r  r   )r   Zheads_to_pruner  r   r6   r6   r7   _prune_heads  s   zCanineModel._prune_headsc                 C   s\   |j d |j d }}|j d }t||d|f }tj||dftj|jd}|| }|S )aP  
        Create 3D attention mask from a 2D tensor mask.

        Args:
            from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
            to_mask: int32 Tensor of shape [batch_size, to_seq_length].

        Returns:
            float Tensor of shape [batch_size, from_seq_length, to_seq_length].
        r   r   )r   r   r   )rf   r3   reshaper   onesZfloat32r   )r   r   Zto_mask
batch_sizer   r   Zbroadcast_onesmaskr6   r6   r7   )_create_3d_attention_mask_from_input_mask  s   
z5CanineModel._create_3d_attention_mask_from_input_maskchar_attention_maskr   c                 C   sF   |j \}}t||d|f}tjj||d| }tj|dd}|S )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   rv   r   )rf   r3   r4  r   Z	MaxPool1dr   squeeze)r   r9  r   r6  Zchar_seq_lenZpoolable_char_maskZpooled_molecule_maskmolecule_attention_maskr6   r6   r7   _downsample_attention_mask  s   
z&CanineModel._downsample_attention_mask	moleculeschar_seq_lengthr   c           	      C   sz   | j j}|ddddddf }tj||dd}|ddddddf }|| }tj||| dd}tj||gddS )zDRepeats molecules to make them the same length as the char sequence.Nr   rB   )Zrepeatsr   rv   r   )rj   r   r3   Zrepeat_interleaver   )	r   r=  r>  ZrateZmolecules_without_extra_clsZrepeatedZlast_moleculeZremainder_lengthZremainder_repeatedr6   r6   r7   _repeat_molecules  s   zCanineModel._repeat_moleculesNr   r   r   ru   r   r   r   r  r  c
           "      C   s  |d ur|n| j j}|d ur|n| j j}|rdnd }
|rdnd }|	d ur&|	n| j j}	|d ur6|d ur6td|d urE| || | }n|d urR| d d }ntd|\}}|d ura|jn|j}|d u rqtj	||f|d}|d u r~tj
|tj|d}| ||}| j|| j jd}| |||jd f}| || j j}| j||||d}| |d ur|n||}| j||||d	}|j}| |}| j||||||	d
}|d }| jd ur| |nd }| j||d d}tj||gdd}| |}| j||||d	}|j}|r|	r|jn|d }|
|j | |j }
|r2|	r$|jn|d } ||j |  |j }|	sH||f}!|!tdd |
|fD 7 }!|!S t |||
|dS )Nr6   zDYou cannot specify both input_ids and inputs_embeds at the same timerv   z5You have to specify either input_ids or inputs_embeds)r   r   )r   )r   ru   r   r   )r   r   r  )r   r   r   r  r  r   )r>  r   r   c                 s   r  r   r6   r  r6   r6   r7   r<     r  z&CanineModel.forward.<locals>.<genexpr>)r+   r,   r-   r.   )!rj   r   r  use_return_dictrg   Z%warn_if_padding_and_no_attention_maskr   r   r3   r5  r   r   Zget_extended_attention_maskr<  r   rf   Zget_head_maskr  r/  r8  r@   r+   rA   r=   r0  r?  r   rF   rC   r-   r.   r   r*   )"r   r   r   r   ru   r   r   r   r  r  r  r  r   r6  r   r   Zextended_attention_maskr;  Z extended_molecule_attention_maskZinput_char_embeddingsr9  Zinit_chars_encoder_outputsZinput_char_encodingZinit_molecule_encodingZencoder_outputsZmolecule_sequence_outputr  Zrepeated_moleculesconcatr&  Zfinal_chars_encoder_outputsZdeep_encoder_hidden_statesZdeep_encoder_self_attentionsr   r6   r6   r7   r     s   
	


zCanineModel.forward)T)	NNNNNNNNN)r/   r0   r1   r{   r3  r8  r3   r   rc   r<  r?  r   r   r   r4   r   r   r   r*   r   r   r6   r6   r   r7   r,    sJ    "	

r,  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )Zcustom_introc                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eef fddZ  ZS )CanineForSequenceClassificationc                    J   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r   rz   r{   
num_labelsr,  r(  r   r   r   r   r   r|   
classifierr1  r   r   r6   r7   r{        
z(CanineForSequenceClassification.__init__Nr   r   r   ru   r   r   labelsr   r  r  r   c                 C   sr  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur| j jdu rV| jdkr<d| j _n| jdkrR|jtj	ksM|jtj
krRd| j _nd| j _| j jdkrtt }| jdkrn|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   ru   r   r   r   r  r  r   Z
regressionZsingle_label_classificationZmulti_label_classificationrv   rK   losslogitsr-   r.   )rj   r@  r(  r   rG  Zproblem_typerF  r   r3   r   rc   r	   r:  r   r   r   r   r-   r.   )r   r   r   r   ru   r   r   rI  r   r  r  r   r  rM  rL  loss_fctr   r6   r6   r7   r     sV   



"


z'CanineForSequenceClassification.forward
NNNNNNNNNN)r/   r0   r1   r{   r   r   r3   r   r4   r   r   r   r   r   r   r6   r6   r   r7   rC    sH    	

rC  c                       rB  )CanineForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S r  )rz   r{   r,  r(  r   r   r   r   r   r|   rG  r1  r   r   r6   r7   r{     s
   
z CanineForMultipleChoice.__init__Nr   r   r   ru   r   r   rI  r   r  r  r   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rv   rB   rJ  rK   rK  )rj   r@  rf   r   r   r(  r   rG  r   r   r-   r.   )r   r   r   r   ru   r   r   rI  r   r  r  Znum_choicesr   r  rM  Zreshaped_logitsrL  rN  r   r6   r6   r7   r     sL   ,


zCanineForMultipleChoice.forwardrO  )r/   r0   r1   r{   r   r   r3   r   r4   r   r   r   r   r   r   r6   r6   r   r7   rP    sH    
	

rP  c                       rB  )CanineForTokenClassificationc                    rD  r   rE  r   r   r6   r7   r{   X  rH  z%CanineForTokenClassification.__init__Nr   r   r   ru   r   r   rI  r   r  r  r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CanineForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
        >>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

        >>> inputs = tokenizer(
        ...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
        ... )

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_class_ids = logits.argmax(-1)

        >>> # Note that tokens are classified rather then input words which means that
        >>> # there might be more predicted token classes than words.
        >>> # Multiple token classes might account for the same word
        >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        >>> predicted_tokens_classes  # doctest: +SKIP
        ```

        ```python
        >>> labels = predicted_token_class_ids
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)  # doctest: +SKIP
        ```NrJ  r   rv   rK   rK  )rj   r@  r(  r   rG  r   r   rF  r   r-   r.   )r   r   r   r   ru   r   r   rI  r   r  r  r   r&  rM  rL  rN  r   r6   r6   r7   r   c  s8   0

z$CanineForTokenClassification.forwardrO  )r/   r0   r1   r{   r   r   r3   r   r4   r   r   r   r   r   r   r6   r6   r   r7   rQ  V  sH    	

rQ  c                       s   e Zd Z fddZe											ddeej deej deej deej deej d	eej d
eej deej dee	 dee	 dee	 de
eef fddZ  ZS )CanineForQuestionAnsweringc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r   )
rz   r{   rF  r,  r(  r   r   r|   
qa_outputsr1  r   r   r6   r7   r{     s
   
z#CanineForQuestionAnswering.__init__Nr   r   r   ru   r   r   start_positionsend_positionsr   r  r  r   c                 C   s@  |d ur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d}|d}d }|d ur~|d ur~t| dkrK|d}t| dkrX|d}|d}|d| |d| t	|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t
||||j|jdS )	NrJ  r   r   rv   r   )Zignore_indexrK   )rL  start_logits
end_logitsr-   r.   )rj   r@  r(  rS  r[   r:  rb   r   Zclamp_r   r   r-   r.   )r   r   r   r   ru   r   r   rT  rU  r   r  r  r   r&  rM  rV  rW  Z
total_lossZignored_indexrN  Z
start_lossZend_lossr   r6   r6   r7   r     sP   








z"CanineForQuestionAnswering.forward)NNNNNNNNNNN)r/   r0   r1   r{   r   r   r3   r   r4   r   r   r   r   r   r   r6   r6   r   r7   rR    sN    
	

rR  )rP  rR  rC  rQ  r  r,  r'  rq   )Ar2   r-  r   rT   dataclassesr   typingr   r   r   r3   Ztorch.utils.checkpointr   Ztorch.nnr   r   r	   Zactivationsr   Zmodeling_outputsr   r   r   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   utilsr   r   Zconfiguration_caniner   Z
get_loggerr/   rR   r   r*   rq   Modulerr   r   r   r   r   r   r   r  r  r  r  r  r   r$  r'  r,  rC  rP  rQ  rR  __all__r6   r6   r6   r7   <module>   sj    
"ae.:d :L  Ug`M