from typing import Callable, Optional, Tuple

import flax
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax

from ...modeling_flax_outputs import (
    FlaxBaseModelOutput,
    FlaxBaseModelOutputWithPooling,
    FlaxMaskedLMOutput,
    FlaxMultipleChoiceModelOutput,
    FlaxQuestionAnsweringModelOutput,
    FlaxSequenceClassifierOutput,
    FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_call_sample_docstring,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_albert import AlbertConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
_CONFIG_FOR_DOC = "AlbertConfig"


@flax.struct.dataclass
class FlaxAlbertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`FlaxAlbertForPreTraining`].

    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    prediction_logits: jnp.ndarray = None
    sop_logits: jnp.ndarray = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None


ALBERT_START_DOCSTRING = r"""

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matters related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""

ALBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

"""


class FlaxAlbertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.word_embeddings = nn.Embed(
            self.config.vocab_size,
            self.config.embedding_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
        )
        self.position_embeddings = nn.Embed(
            self.config.max_position_embeddings,
            self.config.embedding_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
        )
        self.token_type_embeddings = nn.Embed(
            self.config.type_vocab_size,
            self.config.embedding_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, input_ids, token_type_ids, position_ids, deterministic: bool = True):
        # Embed
        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
        position_embeds = self.position_embeddings(position_ids.astype("i4"))
        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))

        # Sum all embeddings
        hidden_states = inputs_embeds + token_type_embeddings + position_embeds

        # Layer Norm
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        return hidden_states


class FlaxAlbertSelfAttention(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        if self.config.hidden_size % self.config.num_attention_heads != 0:
            raise ValueError(
                f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of "
                f"`config.num_attention_heads`: {self.config.num_attention_heads}"
            )

        self.query = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.key = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.value = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, hidden_states, attention_mask, deterministic=True, output_attentions: bool = False):
        head_dim = self.config.hidden_size // self.config.num_attention_heads

        query_states = self.query(hidden_states).reshape(
            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
        )
        value_states = self.value(hidden_states).reshape(
            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
        )
        key_states = self.key(hidden_states).reshape(
            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
        )

        # Convert the boolean attention mask to an additive attention bias.
        if attention_mask is not None:
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
            attention_bias = lax.select(
                attention_mask > 0,
                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
            )
        else:
            attention_bias = None

        dropout_rng = None
        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
            dropout_rng = self.make_rng("dropout")

        attn_weights = dot_product_attention_weights(
            query_states,
            key_states,
            bias=attention_bias,
            dropout_rng=dropout_rng,
            dropout_rate=self.config.attention_probs_dropout_prob,
            broadcast_dropout=True,
            deterministic=deterministic,
            dtype=self.dtype,
            precision=None,
        )

        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))

        projected_attn_output = self.dense(attn_output)
        projected_attn_output = self.dropout(projected_attn_output, deterministic=deterministic)
        layernormed_attn_output = self.LayerNorm(projected_attn_output + hidden_states)
        outputs = (layernormed_attn_output, attn_weights) if output_attentions else (layernormed_attn_output,)
        return outputs


class FlaxAlbertLayer(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.attention = FlaxAlbertSelfAttention(self.config, dtype=self.dtype)
        self.ffn = nn.Dense(
            self.config.intermediate_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.activation = ACT2FN[self.config.hidden_act]
        self.ffn_output = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.full_layer_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        attention_outputs = self.attention(
            hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions
        )
        attention_output = attention_outputs[0]
        ffn_output = self.ffn(attention_output)
        ffn_output = self.activation(ffn_output)
        ffn_output = self.ffn_output(ffn_output)
        ffn_output = self.dropout(ffn_output, deterministic=deterministic)
        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attention_outputs[1],)
        return outputs


class FlaxAlbertLayerCollection(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.layers = [
            FlaxAlbertLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.inner_group_num)
        ]

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ):
        layer_hidden_states = ()
        layer_attentions = ()

        for layer_index, albert_layer in enumerate(self.layers):
            layer_output = albert_layer(
                hidden_states,
                attention_mask,
                deterministic=deterministic,
                output_attentions=output_attentions,
            )
            hidden_states = layer_output[0]

            if output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

            if output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        if output_attentions:
            outputs = outputs + (layer_attentions,)
        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)


class FlaxAlbertLayerCollections(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    layer_index: Optional[str] = None

    def setup(self):
        self.albert_layers = FlaxAlbertLayerCollection(self.config, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ):
        outputs = self.albert_layers(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        return outputs


class FlaxAlbertLayerGroups(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.layers = [
            FlaxAlbertLayerCollections(self.config, name=str(i), layer_index=str(i), dtype=self.dtype)
            for i in range(self.config.num_hidden_groups)
        ]

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        all_attentions = () if output_attentions else None
        all_hidden_states = (hidden_states,) if output_hidden_states else None

        for i in range(self.config.num_hidden_layers):
            # Index of the hidden group: several layers can share one group of parameters
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
            layer_group_output = self.layers[group_idx](
                hidden_states,
                attention_mask,
                deterministic=deterministic,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
            hidden_states = layer_group_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return FlaxBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )


class FlaxAlbertEncoder(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.embedding_hidden_mapping_in = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.albert_layer_groups = FlaxAlbertLayerGroups(self.config, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Project from embedding_size up to hidden_size before the shared layer groups
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
        return self.albert_layer_groups(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class FlaxAlbertOnlyMLMHead(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32
    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros

    def setup(self):
        self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype)
        self.activation = ACT2FN[self.config.hidden_act]
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))

    def __call__(self, hidden_states, shared_embedding=None):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)

        if shared_embedding is not None:
            # Tie the decoder kernel to the (transposed) input word embedding matrix
            hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
        else:
            hidden_states = self.decoder(hidden_states)

        hidden_states += self.bias
        return hidden_states


class FlaxAlbertSOPHead(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.dropout = nn.Dropout(self.config.classifier_dropout_prob)
        self.classifier = nn.Dense(2, dtype=self.dtype)

    def __call__(self, pooled_output, deterministic=True):
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        logits = self.classifier(pooled_output)
        return logits
class FlaxAlbertPreTrainedModel(FlaxPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = AlbertConfig
    base_model_prefix = "albert"
    module_class: nn.Module = None

    def __init__(
        self,
        config: AlbertConfig,
        input_shape: Tuple = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        # init input tensors
        input_ids = jnp.zeros(input_shape, dtype="i4")
        token_type_ids = jnp.zeros_like(input_ids)
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
        attention_mask = jnp.ones_like(input_ids)

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        random_params = self.module.init(
            rngs, input_ids, attention_mask, token_type_ids, position_ids, return_dict=False
        )["params"]

        if params is not None:
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            return random_params

    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def __call__(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # init input tensors if not passed
        if token_type_ids is None:
            token_type_ids = jnp.zeros_like(input_ids)

        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(token_type_ids, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            output_attentions,
            output_hidden_states,
            return_dict,
            rngs=rngs,
        )


class FlaxAlbertModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    add_pooling_layer: bool = True

    def setup(self):
        self.embeddings = FlaxAlbertEmbeddings(self.config, dtype=self.dtype)
        self.encoder = FlaxAlbertEncoder(self.config, dtype=self.dtype)
        if self.add_pooling_layer:
            self.pooler = nn.Dense(
                self.config.hidden_size,
                kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
                dtype=self.dtype,
                name="pooler",
            )
            self.pooler_activation = nn.tanh
        else:
            self.pooler = None
            self.pooler_activation = None

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids: Optional[np.ndarray] = None,
        position_ids: Optional[np.ndarray] = None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # make sure `token_type_ids` is correctly initialized when not passed
        if token_type_ids is None:
            token_type_ids = jnp.zeros_like(input_ids)

        # make sure `position_ids` is correctly initialized when not passed
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        hidden_states = self.embeddings(input_ids, token_type_ids, position_ids, deterministic=deterministic)

        outputs = self.encoder(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        if self.add_pooling_layer:
            pooled = self.pooler(hidden_states[:, 0])
            pooled = self.pooler_activation(pooled)
        else:
            pooled = None

        if not return_dict:
            # if pooled is None, don't return it
            if pooled is None:
                return (hidden_states,) + outputs[1:]
            return (hidden_states, pooled) + outputs[1:]

        return FlaxBaseModelOutputWithPooling(
            last_hidden_state=hidden_states,
            pooler_output=pooled,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertModel(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertModule


append_call_sample_docstring(FlaxAlbertModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
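# Usage sketch (editor's illustration, not upstream code; the checkpoint is the
# same one referenced by `_CHECKPOINT_FOR_DOC` above):
#
#   from transformers import AutoTokenizer, FlaxAlbertModel
#
#   tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#   model = FlaxAlbertModel.from_pretrained("albert/albert-base-v2")
#   inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
#   outputs = model(**inputs)
#   outputs.last_hidden_state.shape  # (1, sequence_length, config.hidden_size)
#   outputs.pooler_output.shape      # (1, config.hidden_size)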
class FlaxAlbertForPreTrainingModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
        self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)
        self.sop_classifier = FlaxAlbertSOPHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.tie_word_embeddings:
            shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        hidden_states = outputs[0]
        pooled_output = outputs[1]

        prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding)
        sop_scores = self.sop_classifier(pooled_output, deterministic=deterministic)

        if not return_dict:
            return (prediction_scores, sop_scores) + outputs[2:]

        return FlaxAlbertForPreTrainingOutput(
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForPreTrainingModule


FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.sop_logits
    ```
"""

overwrite_call_docstring(
    FlaxAlbertForPreTraining,
    ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING,
)
append_replace_return_docstrings(
    FlaxAlbertForPreTraining, output_type=FlaxAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
)


class FlaxAlbertForMaskedLMModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.albert = FlaxAlbertModule(config=self.config, add_pooling_layer=False, dtype=self.dtype)
        self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.tie_word_embeddings:
            shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        # Compute the prediction scores
        logits = self.predictions(hidden_states, shared_embedding=shared_embedding)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxMaskedLMOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class FlaxAlbertForMaskedLM(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForMaskedLMModule


append_call_sample_docstring(
    FlaxAlbertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/11"
)
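# Usage sketch (editor's illustration, not upstream code): filling a masked token
# with the MLM head defined above; checkpoint name is assumed.
#
#   import jax.numpy as jnp
#   from transformers import AutoTokenizer, FlaxAlbertForMaskedLM
#
#   tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#   model = FlaxAlbertForMaskedLM.from_pretrained("albert/albert-base-v2")
#   inputs = tokenizer("The capital of France is [MASK].", return_tensors="np")
#   logits = model(**inputs).logits
#   mask_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
#   predicted_id = int(jnp.argmax(logits[0, mask_index]))
#   tokenizer.decode([predicted_id])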
class FlaxAlbertForSequenceClassificationModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
        classifier_dropout = (
            self.config.classifier_dropout_prob
            if self.config.classifier_dropout_prob is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.classifier = nn.Dense(
            self.config.num_labels,
            dtype=self.dtype,
        )

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        logits = self.classifier(pooled_output)

        if not return_dict:
            return (logits,) + outputs[2:]

        return FlaxSequenceClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForSequenceClassification(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForSequenceClassificationModule


append_call_sample_docstring(
    FlaxAlbertForSequenceClassification,
    _CHECKPOINT_FOR_DOC,
    FlaxSequenceClassifierOutput,
    _CONFIG_FOR_DOC,
)
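# Usage sketch (editor's illustration, not upstream code): the pooled [CLS]
# representation above feeds a `num_labels`-way classifier, shown here with a
# hypothetical 3-class setup.
#
#   from transformers import AutoTokenizer, FlaxAlbertForSequenceClassification
#
#   tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#   model = FlaxAlbertForSequenceClassification.from_pretrained(
#       "albert/albert-base-v2", num_labels=3
#   )
#   inputs = tokenizer("A fine movie.", return_tensors="np")
#   logits = model(**inputs).logits  # shape (1, 3)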
class FlaxAlbertForMultipleChoiceModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
        self.classifier = nn.Dense(1, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        num_choices = input_ids.shape[1]
        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
        token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
        position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None

        # Model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        logits = self.classifier(pooled_output)

        reshaped_logits = logits.reshape(-1, num_choices)

        if not return_dict:
            return (reshaped_logits,) + outputs[2:]

        return FlaxMultipleChoiceModelOutput(
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForMultipleChoice(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForMultipleChoiceModule


overwrite_call_docstring(
    FlaxAlbertForMultipleChoice, ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
    FlaxAlbertForMultipleChoice,
    _CHECKPOINT_FOR_DOC,
    FlaxMultipleChoiceModelOutput,
    _CONFIG_FOR_DOC,
)
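# Shape note (editor's illustration, not upstream code): multiple-choice inputs
# are rank-3 `(batch_size, num_choices, sequence_length)`; the module above
# flattens them to `(batch_size * num_choices, sequence_length)` for the encoder,
# then folds the per-choice scores back to `(batch_size, num_choices)`.
#
#   import numpy as np
#
#   batch_size, num_choices, seq_len = 2, 4, 16
#   input_ids = np.zeros((batch_size, num_choices, seq_len), dtype="i4")
#   flat = input_ids.reshape(-1, input_ids.shape[-1])  # (8, 16), as in __call__ above
#   scores = np.zeros((flat.shape[0], 1))              # one score per flattened choice
#   scores.reshape(-1, num_choices).shape              # (2, 4)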
class FlaxAlbertForTokenClassificationModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
        classifier_dropout = (
            self.config.classifier_dropout_prob
            if self.config.classifier_dropout_prob is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        logits = self.classifier(hidden_states)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxTokenClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForTokenClassification(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForTokenClassificationModule


append_call_sample_docstring(
    FlaxAlbertForTokenClassification,
    _CHECKPOINT_FOR_DOC,
    FlaxTokenClassifierOutput,
    _CONFIG_FOR_DOC,
)
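# Usage sketch (editor's illustration, not upstream code): token classification
# emits one logit vector per token, which is why no pooling layer is instantiated
# in the module above.
#
#   from transformers import AutoTokenizer, FlaxAlbertForTokenClassification
#
#   tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#   model = FlaxAlbertForTokenClassification.from_pretrained("albert/albert-base-v2")
#   inputs = tokenizer("Sylvain works at Hugging Face in Brooklyn.", return_tensors="np")
#   logits = model(**inputs).logits  # (1, sequence_length, config.num_labels)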
class FlaxAlbertForQuestionAnsweringModule(nn.Module):
    config: AlbertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.albert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]

        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if not return_dict:
            return (start_logits, end_logits) + outputs[1:]

        return FlaxQuestionAnsweringModelOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ALBERT_START_DOCSTRING,
)
class FlaxAlbertForQuestionAnswering(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForQuestionAnsweringModule


append_call_sample_docstring(
    FlaxAlbertForQuestionAnswering,
    _CHECKPOINT_FOR_DOC,
    FlaxQuestionAnsweringModelOutput,
    _CONFIG_FOR_DOC,
)


__all__ = [
    "FlaxAlbertForMaskedLM",
    "FlaxAlbertForMultipleChoice",
    "FlaxAlbertForPreTraining",
    "FlaxAlbertForQuestionAnswering",
    "FlaxAlbertForSequenceClassification",
    "FlaxAlbertForTokenClassification",
    "FlaxAlbertModel",
    "FlaxAlbertPreTrainedModel",
]