o
    Zh                     @   s  d dl Z d dlmZmZmZ d dlmZ d dlZd dl	m
Z d dl
Zd dlmZmZmZ d dlmZmZ d dlmZ ddlmZmZmZmZmZmZ ddlmZmZmZm Z  dd	l!m"Z"m#Z#m$Z$ d
dl%m&Z& e$'e(Z)dZ*dZ+dZ,dZ-dd Z.dd Z/G dd dej0Z1G dd dej0Z2G dd dej0Z3G dd dej0Z4G dd dej0Z5G dd dej0Z6G d d! d!ej0Z7G d"d# d#eZ8G d$d% d%ej0Z9e"d&e,G d'd( d(e8Z:ee:e*de+ G d)d* d*ej0Z;e"d+e,G d,d- d-e8Z<ee<e*ee+ G d.d/ d/ej0Z=e"d0e,G d1d2 d2e8Z>ee>e*ee+ G d3d4 d4ej0Z?e"d5e,G d6d7 d7e8Z@e e@e-Ad8 ee@e*ee+ G d9d: d:ej0ZBe"d;e,G d<d= d=e8ZCeeCe*ee+ G d>d? d?ej0ZDe"d@e,G dAdB dBe8ZEeeEe*ee+ g dCZFdS )D    N)CallableOptionalTuple)
FrozenDictfreezeunfreeze)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DistilBertConfigzdistilbert-base-uncasedr   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                 C   s*   dt dd|d  t |  }| | S )Nr   i'     )nppowerfloat32)posid_modelZangle_rates r"   f/var/www/auris/lib/python3.10/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py
get_angles`   s   "r$   c                 C   s   t t| d d tjf t|tjd d f |}t|d d dd df |d d dd df< t|d d dd df |d d dd df< |tjdf }t|S )Nr   r   r   .)r$   r   arangeZnewaxissincosjnparray)positionr!   Z
angle_radspos_encodingr"   r"   r#   positional_encodinge   s
   4..
r,   c                   @   sB   e Zd ZU dZeed< ejZejed< dd Z	dde
fdd	Zd
S )FlaxEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                 C   s   t j| jj| jjtj jj| jjdd| _	| jj
s/t j| jj| jjtj jj| jjdd| _n
t| jj| jj| _t jd| jd| _t j| jjd| _d S )NZstddev)Zembedding_init-q=epsilonr/   Zrate)nnZEmbedr.   
vocab_sizedimjaxinitializersnormalinitializer_rangeword_embeddingssinusoidal_pos_embdsZmax_position_embeddingsposition_embeddingsr,   r+   	LayerNormr/   Dropoutdropoutselfr"   r"   r#   setupz   s   
zFlaxEmbeddings.setupTdeterministicc           	      C   s   |j \}}| |d}| jjs+t|d}tj|||fd}| |d}n| j	d d d |d d f }||j
}|| }| |}| j||d}|S )Ni4)shaperE   )rG   r<   astyper.   r=   r(   r%   Zbroadcast_tor>   r+   r/   r?   rA   )	rC   	input_idsrE   Z
batch_sizeZ
seq_lengthZinputs_embedsZposition_idsZposition_embedshidden_statesr"   r"   r#   __call__   s   

zFlaxEmbeddings.__call__NT)__name__
__module____qualname____doc__r   __annotations__r(   r   r/   rD   boolrL   r"   r"   r"   r#   r-   t   s   
 r-   c                   @   F   e Zd ZU eed< ejZejed< dd Z		dde	de	fd	d
Z
dS )FlaxMultiHeadSelfAttentionr.   r/   c                 C   s   | j j| _| j j| _tj| j jd| _| j| j dks'td| j d| j tj| j| j	t
jjj| j jdd| _tj| j| j	t
jjj| j jdd| _tj| j| j	t
jjj| j jdd| _tj| j| j	t
jjj| j jdd| _d S )Nr4   r   Hidden size " not dividable by number of heads r0   r/   Zkernel_init)r.   n_headsr7   r5   r@   Zattention_dropoutrA   
ValueErrorDenser/   r8   r9   r:   r;   q_link_linv_linout_linrB   r"   r"   r#   rD      s2   

z FlaxMultiHeadSelfAttention.setupTFrE   output_attentionsc              	      s  |j \ }}|j d }	jj  dd|	f}
 fdd} fdd}||}||}||}|t }t	||
dddd}t||
}||j}|d	d
|   }tj|dd}j||d}t	||}||}|}|r||fS |fS )Nr   c                    s   |   djddddS )zseparate headsr   r   r   r   )reshaperY   	transposexbsZdim_per_headrC   r"   r#   rG      s   z2FlaxMultiHeadSelfAttention.__call__.<locals>.shapec                    s    |  dddd dj S )zgroup headsr   r   r   r   ra   )rc   rb   rY   rd   rf   r"   r#   unshape   s    z4FlaxMultiHeadSelfAttention.__call__.<locals>.unshaper   r   r   gꌠ9Y>)Fg      ?ra   ZaxisrH   )rG   r7   rY   r\   r]   r^   mathsqrtr(   matmulrc   rb   rI   r/   r5   ZsoftmaxrA   r_   )rC   querykeyvaluemaskrE   r`   Zq_lenr7   Zk_lenZ
mask_reshprG   rh   qkvZscoresweightscontextr"   rf   r#   rL      s,   	

z#FlaxMultiHeadSelfAttention.__call__N)TFrN   rO   rP   r   rR   r(   r   r/   rD   rS   rL   r"   r"   r"   r#   rU      s   
 #rU   c                   @   s>   e Zd ZU eed< ejZejed< dd Zd
de	fddZ
d	S )FlaxFFNr.   r/   c                 C   s   t j| jjd| _| jj| _d| _t j| jj| jt	j j
j| jjdd| _t j| jj| jt	j j
j| jjdd| _t| jj | _d S )Nr4   r   r0   rX   )r5   r@   r.   rA   Zchunk_size_feed_forwardZseq_len_dimr[   Z
hidden_dimr/   r8   r9   r:   r;   lin1r7   lin2r   
activationrB   r"   r"   r#   rD      s   
zFlaxFFN.setupTrE   c                 C   s0   |  |}| |}| |}| j||d}|S )NrH   )rx   rz   ry   rA   )rC   rK   rE   r"   r"   r#   rL   	  s
   


zFlaxFFN.__call__NrM   rv   r"   r"   r"   r#   rw      s
   
 rw   c                   @   rT   )FlaxTransformerBlockr.   r/   c                 C   s|   | j j| j j dksJ d| j j d| j j t| j | jd| _tjd| jd| _t	| j | jd| _
tjd| jd| _d S )Nr   rV   rW   r/   r1   r2   )r.   r7   rY   rU   r/   	attentionr5   r?   sa_layer_normrw   ffnoutput_layer_normrB   r"   r"   r#   rD     s   zFlaxTransformerBlock.setupFTr`   rE   c           	      C   s~   | j ||||||d}|r|\}}nt|tu sJ |d }| || }| j||d}| || }|f}|r=|f| }|S )N)rm   rn   ro   rp   r`   rE   r   rH   )r}   typetupler~   r   r   )	rC   rK   	attn_maskr`   rE   Z	sa_outputZ
sa_weightsZ
ffn_outputoutputr"   r"   r#   rL      s&   

zFlaxTransformerBlock.__call__N)FTrv   r"   r"   r"   r#   r{     s   
 r{   c                	   @   R   e Zd ZU eed< ejZejed< dd Z				dde	de	d	e	d
e	fddZ
dS )FlaxTransformerr.   r/   c                    s     fddt  jjD  _d S )Nc                    s"   g | ]}t  jt| jd qS ))namer/   )r{   r.   strr/   ).0r    rB   r"   r#   
<listcomp>E  s    z)FlaxTransformer.setup.<locals>.<listcomp>)ranger.   Zn_layerslayersrB   r"   rB   r#   rD   D  s   

zFlaxTransformer.setupFTr`   output_hidden_statesrE   return_dictc                 C   s   |rdnd }|r
dnd }| j D ]1}	|r||f }|	||||d}
|
d }|r8t|
dks.J |
d }||f }qt|
dks@J q|rH||f }|sVtdd |||fD S t|||d	S )
Nr"   )rK   r   r`   rE   ra   r   r   r   c                 s   s    | ]	}|d ur|V  qd S Nr"   )r   rs   r"   r"   r#   	<genexpr>m  s    z+FlaxTransformer.__call__.<locals>.<genexpr>)Zlast_hidden_staterK   
attentions)r   lenr   r   )rC   rK   attention_maskr`   r   rE   r   Zall_hidden_statesZall_attentionsZlayer_moduleZlayer_outputsr   r"   r"   r#   rL   I  s0   	


zFlaxTransformer.__call__NFFTFrv   r"   r"   r"   r#   r   @  "   
 	r   c                	   @   r   )FlaxTransformerEncoderr.   r/   c                 C   s   t | j| jd| _d S Nr|   )r   r.   r/   layerrB   r"   r"   r#   rD   w  s   zFlaxTransformerEncoder.setupFTr`   r   rE   r   c                 C   s   | j ||||||dS )N)rK   r   r`   r   rE   r   )r   )rC   rK   r   r`   r   rE   r   r"   r"   r#   rL   z  s   	zFlaxTransformerEncoder.__call__Nr   rv   r"   r"   r"   r#   r   s  s"   
 r   c                   @   sR   e Zd ZU eed< ejZejed< ej	j
jZedejf ed< dd Zdd Zd	S )
FlaxDistilBertLMDecoderr.   r/   .	bias_initc                 C   s   |  d| j| jjf| _d S )Nbias)paramr   r.   r6   r   rB   r"   r"   r#   rD     s   zFlaxDistilBertLMDecoder.setupc                 C   sV   t || j}t || j}t|||jd fdfdf}t | j| j}|| }|S )Nr   )r   )r"   r"   )r(   Zasarrayr/   r
   Zdot_generalndimr   )rC   ZinputsZkernelyr   r"   r"   r#   rL     s   z FlaxDistilBertLMDecoder.__call__N)rN   rO   rP   r   rR   r(   r   r/   r8   r5   r9   zerosr   r   r   ZndarrayrD   rL   r"   r"   r"   r#   r     s   
 r   c                       s   e Zd ZU dZeZdZdZej	e
d< ddejdfded	ed
edejdef
 fddZddejjd	ededefddZeed								ddee dejjdedee dee dee fddZ  ZS )FlaxDistilBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    
distilbertNmodule_class)r   r   r   Tr.   input_shapeseedr/   _do_initc                    s2   | j d||d|}t j||||||d d S )Nr.   r/   )r   r   r/   r   r"   )r   super__init__)rC   r.   r   r   r/   r   kwargsmodule	__class__r"   r#   r     s   	z&FlaxDistilBertPreTrainedModel.__init__rngparamsreturnc                 C   s   t j|dd}t |}tj|\}}||d}| jj|||ddd }	|d urKtt	|	}	tt	|}| j
D ]}
|	|
 ||
< q8t | _
tt|S |	S )NrF   r|   )r   rA   F)r   r   )r(   r   	ones_liker8   randomsplitr   initr   r   Z_missing_keyssetr   r	   )rC   r   r   r   rJ   r   Z
params_rngdropout_rngrngsZrandom_paramsZmissing_keyr"   r"   r#   init_weights  s   


z*FlaxDistilBertPreTrainedModel.init_weightszbatch_size, sequence_lengthFr   trainr`   r   r   c
              
   C   s   |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d u r't|}i }
|d ur1||
d< | jjd|p9| jitj	|ddtj	|dd| |||	|
dS )NrA   r   rF   r|   )r   )
r.   r`   r   r   r(   r   r   applyr   r)   )rC   rJ   r   Z	head_maskr   r   r   r`   r   r   r   r"   r"   r#   rL     s&   
z&FlaxDistilBertPreTrainedModel.__call__r   )NNNNFNNN)rN   rO   rP   rQ   r   Zconfig_classZbase_model_prefixr   r5   ModulerR   r(   r   r   intr/   rS   r   r8   r   ZPRNGKeyr   r   r   DISTILBERT_INPUTS_DOCSTRINGformatr   dictrL   __classcell__r"   r"   r   r#   r     sV   
  	
r   c                	   @   R   e Zd ZU eed< ejZejed< dd Z				dde	de	d	e	d
e	fddZ
dS )FlaxDistilBertModuler.   r/   c                 C   s(   t | j| jd| _t| j| jd| _d S r   )r-   r.   r/   
embeddingsr   transformerrB   r"   r"   r#   rD     s   zFlaxDistilBertModule.setupTFrE   r`   r   r   c                 C   s`   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||d}| j||||||dS )NrH   )rK   r   rE   r`   r   r   )r.   r`   r   r   r   r   )rC   rJ   r   rE   r`   r   r   Zinput_embedsr"   r"   r#   rL     s   	zFlaxDistilBertModule.__call__NTFFTrv   r"   r"   r"   r#   r     s"   
 r   zdThe bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.c                   @      e Zd ZeZdS )FlaxDistilBertModelN)rN   rO   rP   r   r   r"   r"   r"   r#   r     s    r   c                	   @   r   )FlaxDistilBertForMaskedLMModuler.   r/   c                 C   s   t | j| jd| _tj| jj| jtjjj	| jj
dd| _tjd| jd| _| jjr5t| j| jd| _d S tj| jj| jtjjj	| jj
dd| _d S )Nr|   r0   rX   r1   r2   )r   r.   r/   r   r5   r[   r7   r8   r9   r:   r;   vocab_transformr?   vocab_layer_normtie_word_embeddingsr   vocab_projectorr6   rB   r"   r"   r#   rD      s"   z%FlaxDistilBertForMaskedLMModule.setupTFrE   r`   r   r   c                 C   s   |d ur|n| j j}| j||||||d}|d }| |}	t| j j |	}	| |	}	| j jrC| jjd d d d }
| 	|	|
j
}	n| 	|	}	|sU|	f|dd   }|S t|	|j|jdS )	N)rJ   r   r`   r   rE   r   r   r   r   r<   Z	embeddingr   logitsrK   r   )r.   use_return_dictr   r   r   rz   r   r   	variablesr   Tr   rK   r   )rC   rJ   r   rE   r`   r   r   Zdlbrt_outputrK   Zprediction_logitsZshared_embeddingr   r"   r"   r#   rL   4  s2   	


z(FlaxDistilBertForMaskedLMModule.__call__Nr   rv   r"   r"   r"   r#   r     s"   
 r   z8DistilBert Model with a `language modeling` head on top.c                   @   r   )FlaxDistilBertForMaskedLMN)rN   rO   rP   r   r   r"   r"   r"   r#   r   ]  s    r   c                	   @   r   )-FlaxDistilBertForSequenceClassificationModuler.   r/   c                 C   sf   t | j| jd| _tj| jj| jtjjj	| jj
dd| _tj| jjd| _tj| jj| jd| _d S )Nr   r0   rX   r4   r|   )r   r.   r/   r   r5   r[   r7   r8   r9   r:   r;   pre_classifierr@   seq_classif_dropoutrA   
num_labels
classifierrB   r"   r"   r#   rD   i  s   z3FlaxDistilBertForSequenceClassificationModule.setupTFrE   r`   r   r   c                 C   s   |d ur|n| j j}| j||||||d}|d }|d d df }	| |	}	td |	}	| j|	|d}	| |	}
|sC|
f|dd   S t|
|j|j	dS )NrE   r`   r   r   r   relurH   r   r   )
r.   r   r   r   r   rA   r   r   rK   r   )rC   rJ   r   rE   r`   r   r   distilbert_outputhidden_statepooled_outputr   r"   r"   r#   rL   v  s,   	

z6FlaxDistilBertForSequenceClassificationModule.__call__Nr   rv   r"   r"   r"   r#   r   e  "   
 r   z
    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   @   r   )'FlaxDistilBertForSequenceClassificationN)rN   rO   rP   r   r   r"   r"   r"   r#   r         r   c                	   @   r   )%FlaxDistilBertForMultipleChoiceModuler.   r/   c                 C   sb   t | j| jd| _tj| jj| jtjjj	| jj
dd| _tj| jjd| _tjd| jd| _d S )Nr   r0   rX   r4   r   r|   )r   r.   r/   r   r5   r[   r7   r8   r9   r:   r;   r   r@   r   rA   r   rB   r"   r"   r#   rD     s   z+FlaxDistilBertForMultipleChoiceModule.setupTFrE   r`   r   r   c                 C   s   |d ur|n| j j}|jd }|d ur|d|jd nd }|d ur+|d|jd nd }| j||||||d}|d }	|	d d df }
| |
}
td |
}
| j|
|d}
| |
}|d|}|sl|f|dd   S t	||j
|jdS )	Nr   ra   r   r   r   rH   r   r   )r.   r   rG   rb   r   r   r   rA   r   r   rK   r   )rC   rJ   r   rE   r`   r   r   Znum_choicesoutputsr   r   r   Zreshaped_logitsr"   r"   r#   rL     s4   	
	

z.FlaxDistilBertForMultipleChoiceModule.__call__Nr   rv   r"   r"   r"   r#   r     r   r   z
    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    c                   @   r   )FlaxDistilBertForMultipleChoiceN)rN   rO   rP   r   r   r"   r"   r"   r#   r     r   r   z(batch_size, num_choices, sequence_lengthc                	   @   r   )*FlaxDistilBertForTokenClassificationModuler.   r/   c                 C   s>   t | j| jd| _tj| jjd| _tj| jj| jd| _	d S )Nr   r4   r|   )
r   r.   r/   r   r5   r@   rA   r[   r   r   rB   r"   r"   r#   rD     s   z0FlaxDistilBertForTokenClassificationModule.setupTFrE   r`   r   r   c           
      C   sr   |d ur|n| j j}| j||||||d}|d }| j||d}| |}	|s0|	f|dd   S t|	|j|jdS )Nr   r   rH   r   r   )r.   r   r   rA   r   r   rK   r   )
rC   rJ   r   rE   r`   r   r   r   rK   r   r"   r"   r#   rL     s&   		
z3FlaxDistilBertForTokenClassificationModule.__call__Nr   rv   r"   r"   r"   r#   r     r   r   z
    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    c                   @   r   )$FlaxDistilBertForTokenClassificationN)rN   rO   rP   r   r   r"   r"   r"   r#   r   *  r   r   c                	   @   r   )(FlaxDistilBertForQuestionAnsweringModuler.   r/   c                 C   sN   t | j| jd| _tj| jj| jd| _| jjdksJ tj| jj	d| _
d S )Nr   r|   r   r4   )r   r.   r/   r   r5   r[   r   
qa_outputsr@   Z
qa_dropoutrA   rB   r"   r"   r#   rD   A  s   z.FlaxDistilBertForQuestionAnsweringModule.setupTFrE   r`   r   r   c                 C   s   |d ur|n| j j}| j||||||d}|d }| j||d}| |}	tj|	| j jdd\}
}|
d}
|d}|sG|
|f|dd   S t	|
||j
|jdS )Nr   r   rH   ra   ri   r   )start_logits
end_logitsrK   r   )r.   r   r   rA   r   r(   r   r   Zsqueezer   rK   r   )rC   rJ   r   rE   r`   r   r   r   rK   r   r   r   r"   r"   r#   rL   G  s.   		


z1FlaxDistilBertForQuestionAnsweringModule.__call__Nr   rv   r"   r"   r"   r#   r   =  s"   
 
r   z
    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   @   r   )"FlaxDistilBertForQuestionAnsweringN)rN   rO   rP   r   r   r"   r"   r"   r#   r   o  r   r   )r   r   r   r   r   r   r   )Grj   typingr   r   r   Z
flax.linenZlinenr5   r8   Z	jax.numpynumpyr(   r   Zflax.core.frozen_dictr   r   r   Zflax.traverse_utilr   r	   r
   Zmodeling_flax_outputsr   r   r   r   r   r   Zmodeling_flax_utilsr   r   r   r   utilsr   r   r   Zconfiguration_distilbertr   Z
get_loggerrN   loggerZ_CHECKPOINT_FOR_DOCZ_CONFIG_FOR_DOCZFLAX_DISTILBERT_START_DOCSTRINGr   r$   r,   r   r-   rU   rw   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __all__r"   r"   r"   r#   <module>   s    
-S/3Q"A5<
+2