# coding=utf-8
"""PyTorch Funnel Transformer model."""

import os
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_funnel import FunnelConfig


logger = logging.get_logger(__name__)

INF = 1e6


def load_tf_weights_in_funnel(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    _layer_map = {
        "k": "k_head",
        "q": "q_head",
        "v": "v_head",
        "o": "post_proj",
        "layer_1": "linear_1",
        "layer_2": "linear_2",
        "rel_attn": "attention",
        "ff": "ffn",
        "kernel": "weight",
        "gamma": "weight",
        "beta": "bias",
        "lookup_table": "weight",
        "word_embedding": "word_embeddings",
        "input": "embeddings",
    }

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used by the AdamWeightDecayOptimizer and are not needed at inference.
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        if name[0] == "generator":
            continue
        pointer = model
        skipped = False
        for m_name in name[1:]:
            if not isinstance(pointer, FunnelPositionwiseFFN) and re.fullmatch(r"layer_\d+", m_name):
                layer_index = int(re.search(r"layer_(\d+)", m_name).groups()[0])
                if layer_index < config.num_hidden_layers:
                    block_idx = 0
                    while layer_index >= config.block_sizes[block_idx]:
                        layer_index -= config.block_sizes[block_idx]
                        block_idx += 1
                    pointer = pointer.blocks[block_idx][layer_index]
                else:
                    layer_index -= config.num_hidden_layers
                    pointer = pointer.layers[layer_index]
            elif m_name == "r" and isinstance(pointer, FunnelRelMultiheadAttention):
                pointer = pointer.r_kernel
                break
            elif m_name in _layer_map:
                pointer = getattr(pointer, _layer_map[m_name])
            else:
                try:
                    pointer = getattr(pointer, m_name)
                except AttributeError:
                    print(f"Skipping {'/'.join(name)}", array.shape)
                    skipped = True
                    break
        if not skipped:
            if len(pointer.shape) != len(array.shape):
                array = array.reshape(pointer.shape)
            if m_name == "kernel":
                array = np.transpose(array)
            pointer.data = torch.from_numpy(array)

    return model


class FunnelEmbeddings(nn.Module):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)

    def forward(
        self, input_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        embeddings = self.layer_norm(inputs_embeds)
        embeddings = self.dropout(embeddings)
        return embeddings
class FunnelAttentionStructure(nn.Module):
    """
    Contains helpers for `FunnelRelMultiheadAttention`.
    """

    cls_token_type_id: int = 2

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.config = config
        self.sin_dropout = nn.Dropout(config.hidden_dropout)
        self.cos_dropout = nn.Dropout(config.hidden_dropout)
        # Track by how much the sequence length has been divided by pooling so far.
        self.pooling_mult = None

    def init_attention_inputs(
        self,
        inputs_embeds: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor]:
        """Returns the attention inputs associated to the inputs of the model."""
        # inputs_embeds has shape batch_size x seq_len x d_model
        # attention_mask and token_type_ids have shape batch_size x seq_len
        self.pooling_mult = 1
        self.seq_len = seq_len = inputs_embeds.size(1)
        position_embeds = self.get_position_embeds(seq_len, inputs_embeds.dtype, inputs_embeds.device)
        token_type_mat = self.token_type_ids_to_mat(token_type_ids) if token_type_ids is not None else None
        cls_mask = (
            nn.functional.pad(inputs_embeds.new_ones([seq_len - 1, seq_len - 1]), (1, 0, 1, 0))
            if self.config.separate_cls
            else None
        )
        return (position_embeds, token_type_mat, attention_mask, cls_mask)

    def token_type_ids_to_mat(self, token_type_ids: torch.Tensor) -> torch.Tensor:
        """Convert `token_type_ids` to `token_type_mat`."""
        token_type_mat = token_type_ids[:, :, None] == token_type_ids[:, None]
        # Treat <cls> as being in the same segment as both A & B.
        cls_ids = token_type_ids == self.cls_token_type_id
        cls_mat = cls_ids[:, :, None] | cls_ids[:, None]
        return cls_mat | token_type_mat

    def get_position_embeds(
        self, seq_len: int, dtype: torch.dtype, device: torch.device
    ) -> Union[Tuple[torch.Tensor], List[List[torch.Tensor]]]:
        """
        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
        are using the factorized or the relative shift attention:

        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
        final formula.

        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
        formula.

        Paper link: https://arxiv.org/abs/2006.03236
        """
        d_model = self.config.d_model
        if self.config.attention_type == "factorized":
            # Notations from the paper, appendix A.2.2, final formula.
            # We need to create and return the matrices phi, psi, pi and omega.
            pos_seq = torch.arange(0, seq_len, 1.0, dtype=torch.int64, device=device).to(dtype)
            freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=torch.int64, device=device).to(dtype)
            inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
            sinusoid = pos_seq[:, None] * inv_freq[None]
            sin_embed = torch.sin(sinusoid)
            sin_embed_d = self.sin_dropout(sin_embed)
            cos_embed = torch.cos(sinusoid)
            cos_embed_d = self.cos_dropout(cos_embed)
            phi = torch.cat([sin_embed_d, sin_embed_d], dim=-1)
            psi = torch.cat([cos_embed, sin_embed], dim=-1)
            pi = torch.cat([cos_embed_d, cos_embed_d], dim=-1)
            omega = torch.cat([-sin_embed, cos_embed], dim=-1)
            return (phi, pi, psi, omega)
        else:
            # Notations from the paper, appendix A.2.1, final formula.
            # We need to create and return all the possible vectors R for all blocks and shifts.
            freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=torch.int64, device=device).to(dtype)
            inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
            # Maximum relative positions for the first input
            rel_pos_id = torch.arange(-seq_len * 2, seq_len * 2, 1.0, dtype=torch.int64, device=device).to(dtype)
            zero_offset = seq_len * 2
            sinusoid = rel_pos_id[:, None] * inv_freq[None]
            sin_embed = self.sin_dropout(torch.sin(sinusoid))
            cos_embed = self.cos_dropout(torch.cos(sinusoid))
            pos_embed = torch.cat([sin_embed, cos_embed], dim=-1)

            pos = torch.arange(0, seq_len, dtype=torch.int64, device=device).to(dtype)
            pooled_pos = pos
            position_embeds_list = []
            for block_index in range(0, self.config.num_blocks):
                # For each block with block_index > 0, we need two types of position embeddings:
                #   - Attention(pooled-q, unpooled-kv)
                #   - Attention(pooled-q, pooled-kv)
                # For block_index = 0 we only need the second one and leave the first one as None.

                # First type
                if block_index == 0:
                    position_embeds_pooling = None
                else:
                    pooled_pos = self.stride_pool_pos(pos, block_index)

                    # construct rel_pos_id
                    stride = 2 ** (block_index - 1)
                    rel_pos = self.relative_pos(pos, stride, pooled_pos, shift=2)
                    rel_pos = rel_pos[:, None] + zero_offset
                    rel_pos = rel_pos.expand(rel_pos.size(0), d_model)
                    position_embeds_pooling = torch.gather(pos_embed, 0, rel_pos)

                # Second type
                pos = pooled_pos
                stride = 2**block_index
                rel_pos = self.relative_pos(pos, stride)

                rel_pos = rel_pos[:, None] + zero_offset
                rel_pos = rel_pos.expand(rel_pos.size(0), d_model)
                position_embeds_no_pooling = torch.gather(pos_embed, 0, rel_pos)

                position_embeds_list.append([position_embeds_no_pooling, position_embeds_pooling])
            return position_embeds_list

    def stride_pool_pos(self, pos_id: torch.Tensor, block_index: int):
        """
        Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
        """
        if self.config.separate_cls:
            # Under separate <cls>, we treat the <cls> as the first token of the previous block of the 1st real
            # block. Since the 1st real block always has position 1, the previous block sits at `1 - 2**block_index`.
            cls_pos = pos_id.new_tensor([-(2**block_index) + 1])
            pooled_pos_id = pos_id[1:-1] if self.config.truncate_seq else pos_id[1:]
            return torch.cat([cls_pos, pooled_pos_id[::2]], 0)
        else:
            return pos_id[::2]

    def relative_pos(self, pos: torch.Tensor, stride: int, pooled_pos=None, shift: int = 1) -> torch.Tensor:
        """
        Build the relative positional vector between `pos` and `pooled_pos`.
        """
        if pooled_pos is None:
            pooled_pos = pos

        ref_point = pooled_pos[0] - pos[0]
        num_remove = shift * len(pooled_pos)
        max_dist = ref_point + num_remove * stride
        min_dist = pooled_pos[0] - pos[-1]

        return torch.arange(max_dist, min_dist - 1, -stride, dtype=torch.long, device=pos.device)
    def stride_pool(
        self,
        tensor: Union[torch.Tensor, Tuple[torch.Tensor], List[torch.Tensor]],
        axis: Union[int, Tuple[int], List[int]],
    ) -> torch.Tensor:
        """
        Perform pooling by stride slicing the tensor along the given axis.
        """
        if tensor is None:
            return None

        # Do the stride pool recursively if axis is a list or a tuple of ints.
        if isinstance(axis, (list, tuple)):
            for ax in axis:
                tensor = self.stride_pool(tensor, ax)
            return tensor

        # Do the stride pool recursively if tensor is a list or tuple of tensors.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.stride_pool(x, axis) for x in tensor)

        # Deal with negative axis
        axis %= tensor.ndim

        axis_slice = (
            slice(None, -1, 2) if self.config.separate_cls and self.config.truncate_seq else slice(None, None, 2)
        )
        enc_slice = [slice(None)] * axis + [axis_slice]
        if self.config.separate_cls:
            cls_slice = [slice(None)] * axis + [slice(None, 1)]
            tensor = torch.cat([tensor[cls_slice], tensor], axis=axis)
        return tensor[enc_slice]

    def pool_tensor(
        self, tensor: Union[torch.Tensor, Tuple[torch.Tensor], List[torch.Tensor]], mode: str = "mean", stride: int = 2
    ) -> torch.Tensor:
        """Apply 1D pooling to a tensor of size [B x T (x H)]."""
        if tensor is None:
            return None

        # Do the pool recursively if tensor is a list or tuple of tensors.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.pool_tensor(x, mode=mode, stride=stride) for x in tensor)

        if self.config.separate_cls:
            suffix = tensor[:, :-1] if self.config.truncate_seq else tensor
            tensor = torch.cat([tensor[:, :1], suffix], dim=1)

        ndim = tensor.ndim
        if ndim == 2:
            tensor = tensor[:, None, :, None]
        elif ndim == 3:
            tensor = tensor[:, None, :, :]

        # Stride is applied on the second-to-last dimension.
        stride = (stride, 1)
        if mode == "mean":
            tensor = nn.functional.avg_pool2d(tensor, stride, stride=stride, ceil_mode=True)
        elif mode == "max":
            tensor = nn.functional.max_pool2d(tensor, stride, stride=stride, ceil_mode=True)
        elif mode == "min":
            tensor = -nn.functional.max_pool2d(-tensor, stride, stride=stride, ceil_mode=True)
        else:
            raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.")

        if ndim == 2:
            return tensor[:, 0, :, 0]
        elif ndim == 3:
            return tensor[:, 0]
        return tensor

    def pre_attention_pooling(
        self, output, attention_inputs: Tuple[torch.Tensor]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
        """Pool `output` and the proper parts of `attention_inputs` before the attention layer."""
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
        if self.config.pool_q_only:
            if self.config.attention_type == "factorized":
                position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:]
            token_type_mat = self.stride_pool(token_type_mat, 1)
            cls_mask = self.stride_pool(cls_mask, 0)
            output = self.pool_tensor(output, mode=self.config.pooling_type)
        else:
            self.pooling_mult *= 2
            if self.config.attention_type == "factorized":
                position_embeds = self.stride_pool(position_embeds, 0)
            token_type_mat = self.stride_pool(token_type_mat, [1, 2])
            cls_mask = self.stride_pool(cls_mask, [1, 2])
            attention_mask = self.pool_tensor(attention_mask, mode="min")
            output = self.pool_tensor(output, mode=self.config.pooling_type)
        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
        return output, attention_inputs

    def post_attention_pooling(self, attention_inputs: Tuple[torch.Tensor]) -> Tuple[torch.Tensor]:
        """Pool the proper parts of `attention_inputs` after the attention layer."""
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
        if self.config.pool_q_only:
            self.pooling_mult *= 2
            if self.config.attention_type == "factorized":
                position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0)
            token_type_mat = self.stride_pool(token_type_mat, 2)
            cls_mask = self.stride_pool(cls_mask, 1)
            attention_mask = self.pool_tensor(attention_mask, mode="min")
        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
        return attention_inputs
def _relative_shift_gather(positional_attn: torch.Tensor, context_len: int, shift: int) -> torch.Tensor:
    batch_size, n_head, seq_len, max_rel_len = positional_attn.shape
    # max_rel_len = 2 * context_len + shift - 1 is the number of possible relative positions i - j.

    positional_attn = torch.reshape(positional_attn, [batch_size, n_head, max_rel_len, seq_len])
    positional_attn = positional_attn[:, :, shift:, :]
    positional_attn = torch.reshape(positional_attn, [batch_size, n_head, seq_len, max_rel_len - shift])
    positional_attn = positional_attn[..., :context_len]
    return positional_attn


class FunnelRelMultiheadAttention(nn.Module):
    def __init__(self, config: FunnelConfig, block_index: int) -> None:
        super().__init__()
        self.config = config
        self.block_index = block_index
        d_model, n_head, d_head = config.d_model, config.n_head, config.d_head

        self.hidden_dropout = nn.Dropout(config.hidden_dropout)
        self.attention_dropout = nn.Dropout(config.attention_dropout)

        self.q_head = nn.Linear(d_model, n_head * d_head, bias=False)
        self.k_head = nn.Linear(d_model, n_head * d_head)
        self.v_head = nn.Linear(d_model, n_head * d_head)

        self.r_w_bias = nn.Parameter(torch.zeros([n_head, d_head]))
        self.r_r_bias = nn.Parameter(torch.zeros([n_head, d_head]))
        self.r_kernel = nn.Parameter(torch.zeros([d_model, n_head, d_head]))
        self.r_s_bias = nn.Parameter(torch.zeros([n_head, d_head]))
        self.seg_embed = nn.Parameter(torch.zeros([2, n_head, d_head]))

        self.post_proj = nn.Linear(n_head * d_head, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=config.layer_norm_eps)
        self.scale = 1.0 / (d_head**0.5)

    def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
        """Relative attention score for the positional encodings"""
        # q_head has shape batch_size x seq_len x n_head x d_head
        if self.config.attention_type == "factorized":
            # Notations from the paper, appendix A.2.2, final formula (https://arxiv.org/abs/2006.03236)
            phi, pi, psi, omega = position_embeds
            u = self.r_r_bias * self.scale
            w_r = self.r_kernel

            q_r_attention = torch.einsum("binh,dnh->bind", q_head + u, w_r)
            q_r_attention_1 = q_r_attention * phi[:, None]
            q_r_attention_2 = q_r_attention * pi[:, None]

            # Shape batch_size x n_head x seq_len x context_len
            positional_attn = torch.einsum("bind,jd->bnij", q_r_attention_1, psi) + torch.einsum(
                "bind,jd->bnij", q_r_attention_2, omega
            )
        else:
            shift = 2 if q_head.shape[1] != context_len else 1
            # Notations from the paper, appendix A.2.1, final formula (https://arxiv.org/abs/2006.03236)
            r = position_embeds[self.block_index][shift - 1]
            v = self.r_r_bias * self.scale
            w_r = self.r_kernel

            r_head = torch.einsum("td,dnh->tnh", r, w_r)
            # Shape batch_size x n_head x seq_len x max_rel_len
            positional_attn = torch.einsum("binh,tnh->bnit", q_head + v, r_head)
            # Shape batch_size x n_head x seq_len x context_len
            positional_attn = _relative_shift_gather(positional_attn, context_len, shift)

        if cls_mask is not None:
            positional_attn *= cls_mask
        return positional_attn

    def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
        """Relative attention score for the token_type_ids"""
        if token_type_mat is None:
            return 0
        batch_size, seq_len, context_len = token_type_mat.shape
        # q_head has shape batch_size x seq_len x n_head x d_head
        r_s_bias = self.r_s_bias * self.scale

        # Shape batch_size x n_head x seq_len x 2
        token_type_bias = torch.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed)
        # Shape batch_size x n_head x seq_len x context_len
        token_type_mat = token_type_mat[:, None].expand([batch_size, q_head.shape[2], seq_len, context_len])
        diff_token_type, same_token_type = torch.split(token_type_bias, 1, dim=-1)
        token_type_attn = torch.where(
            token_type_mat, same_token_type.expand(token_type_mat.shape), diff_token_type.expand(token_type_mat.shape)
        )

        if cls_mask is not None:
            token_type_attn *= cls_mask
        return token_type_attn

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_inputs: Tuple[torch.Tensor],
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, ...]:
        # query has shape batch_size x seq_len x d_model
        # key and value have shapes batch_size x context_len x d_model
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs

        batch_size, seq_len, _ = query.shape
        context_len = key.shape[1]
        n_head, d_head = self.config.n_head, self.config.d_head

        q_head = self.q_head(query).view(batch_size, seq_len, n_head, d_head)
        k_head = self.k_head(key).view(batch_size, context_len, n_head, d_head)
        v_head = self.v_head(value).view(batch_size, context_len, n_head, d_head)

        q_head = q_head * self.scale
        r_w_bias = self.r_w_bias * self.scale
        # Shapes batch_size x n_head x seq_len x context_len
        content_score = torch.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head)
        positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask)
        token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)

        # merge attention scores
        attn_score = content_score + positional_attn + token_type_attn

        # perform masking in full precision
        dtype = attn_score.dtype
        attn_score = attn_score.float()
        if attention_mask is not None:
            attn_score = attn_score - INF * (1 - attention_mask[:, None, None].float())
        # attention probability
        attn_prob = torch.softmax(attn_score, dim=-1, dtype=dtype)
        attn_prob = self.attention_dropout(attn_prob)

        # attention output, shape batch_size x seq_len x n_head x d_head
        attn_vec = torch.einsum("bnij,bjnd->bind", attn_prob, v_head)

        # Shape batch_size x seq_len x d_model
        attn_out = self.post_proj(attn_vec.reshape(batch_size, seq_len, n_head * d_head))
        attn_out = self.hidden_dropout(attn_out)

        output = self.layer_norm(query + attn_out)
        return (output, attn_prob) if output_attentions else (output,)


class FunnelPositionwiseFFN(nn.Module):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(config.d_model, config.d_inner)
        self.activation_function = ACT2FN[config.hidden_act]
        self.activation_dropout = nn.Dropout(config.activation_dropout)
        self.linear_2 = nn.Linear(config.d_inner, config.d_model)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        h = self.linear_1(hidden)
        h = self.activation_function(h)
        h = self.activation_dropout(h)
        h = self.linear_2(h)
        h = self.dropout(h)
        return self.layer_norm(hidden + h)


class FunnelLayer(nn.Module):
    def __init__(self, config: FunnelConfig, block_index: int) -> None:
        super().__init__()
        self.attention = FunnelRelMultiheadAttention(config, block_index)
        self.ffn = FunnelPositionwiseFFN(config)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_inputs,
        output_attentions: bool = False,
    ) -> Tuple:
        attn = self.attention(query, key, value, attention_inputs, output_attentions=output_attentions)
        output = self.ffn(attn[0])
        return (output, attn[1]) if output_attentions else (output,)


class FunnelEncoder(nn.Module):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.config = config
        self.attention_structure = FunnelAttentionStructure(config)
        self.blocks = nn.ModuleList(
            [
                nn.ModuleList([FunnelLayer(config, block_index) for _ in range(block_size)])
                for block_index, block_size in enumerate(config.block_sizes)
            ]
        )

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[Tuple, BaseModelOutput]:
        # The pooling is not implemented on long tensors, so we convert this mask.
        attention_mask = attention_mask.type_as(inputs_embeds)
        attention_inputs = self.attention_structure.init_attention_inputs(
            inputs_embeds, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        hidden = inputs_embeds

        all_hidden_states = (inputs_embeds,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for block_index, block in enumerate(self.blocks):
            pooling_flag = hidden.size(1) > (2 if self.config.separate_cls else 1)
            pooling_flag = pooling_flag and block_index > 0
            if pooling_flag:
                pooled_hidden, attention_inputs = self.attention_structure.pre_attention_pooling(
                    hidden, attention_inputs
                )
            for layer_index, layer in enumerate(block):
                for repeat_index in range(self.config.block_repeats[block_index]):
                    do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag
                    if do_pooling:
                        query = pooled_hidden
                        key = value = hidden if self.config.pool_q_only else pooled_hidden
                    else:
                        query = key = value = hidden
                    layer_output = layer(query, key, value, attention_inputs, output_attentions=output_attentions)
                    hidden = layer_output[0]
                    if do_pooling:
                        attention_inputs = self.attention_structure.post_attention_pooling(attention_inputs)

                    if output_attentions:
                        all_attentions = all_attentions + layer_output[1:]
                    if output_hidden_states:
                        all_hidden_states = all_hidden_states + (hidden,)

        if not return_dict:
            return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)
def upsample(
    x: torch.Tensor, stride: int, target_len: int, separate_cls: bool = True, truncate_seq: bool = False
) -> torch.Tensor:
    """
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
    """
    if stride == 1:
        return x
    if separate_cls:
        cls = x[:, :1]
        x = x[:, 1:]
    output = torch.repeat_interleave(x, repeats=stride, dim=1)
    if separate_cls:
        if truncate_seq:
            output = nn.functional.pad(output, (0, 0, 0, stride - 1, 0, 0))
        output = output[:, : target_len - 1]
        output = torch.cat([cls, output], dim=1)
    else:
        output = output[:, :target_len]
    return output


class FunnelDecoder(nn.Module):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.config = config
        self.attention_structure = FunnelAttentionStructure(config)
        self.layers = nn.ModuleList([FunnelLayer(config, 0) for _ in range(config.num_decoder_layers)])

    def forward(
        self,
        final_hidden: torch.Tensor,
        first_block_hidden: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[Tuple, BaseModelOutput]:
        upsampled_hidden = upsample(
            final_hidden,
            stride=2 ** (len(self.config.block_sizes) - 1),
            target_len=first_block_hidden.shape[1],
            separate_cls=self.config.separate_cls,
            truncate_seq=self.config.truncate_seq,
        )

        hidden = upsampled_hidden + first_block_hidden
        all_hidden_states = (hidden,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        attention_inputs = self.attention_structure.init_attention_inputs(
            hidden, attention_mask=attention_mask, token_type_ids=token_type_ids
        )

        for layer in self.layers:
            layer_output = layer(hidden, hidden, hidden, attention_inputs, output_attentions=output_attentions)
            hidden = layer_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_output[1:]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden,)

        if not return_dict:
            return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)


class FunnelDiscriminatorPredictions(nn.Module):
    """Prediction module for the discriminator, made up of two dense layers."""

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.dense_prediction = nn.Linear(config.d_model, 1)

    def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(discriminator_hidden_states)
        hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
        logits = self.dense_prediction(hidden_states).squeeze(-1)
        return logits


class FunnelPreTrainedModel(PreTrainedModel):
    config_class = FunnelConfig
    load_tf_weights = load_tf_weights_in_funnel
    base_model_prefix = "funnel"

    def _init_weights(self, module):
        classname = module.__class__.__name__
        if classname.find("Linear") != -1:
            if getattr(module, "weight", None) is not None:
                if self.config.initializer_std is None:
                    fan_out, fan_in = module.weight.shape
                    std = np.sqrt(1.0 / float(fan_in + fan_out))
                else:
                    std = self.config.initializer_std
                nn.init.normal_(module.weight, std=std)
            if getattr(module, "bias", None) is not None:
                nn.init.constant_(module.bias, 0.0)
        elif classname == "FunnelRelMultiheadAttention":
            nn.init.uniform_(module.r_w_bias, b=self.config.initializer_range)
            nn.init.uniform_(module.r_r_bias, b=self.config.initializer_range)
            nn.init.uniform_(module.r_kernel, b=self.config.initializer_range)
            nn.init.uniform_(module.r_s_bias, b=self.config.initializer_range)
            nn.init.uniform_(module.seg_embed, b=self.config.initializer_range)
        elif classname == "FunnelEmbeddings":
            std = 1.0 if self.config.initializer_std is None else self.config.initializer_std
            nn.init.normal_(module.word_embeddings.weight, std=std)
            if module.word_embeddings.padding_idx is not None:
                module.word_embeddings.weight.data[module.word_embeddings.padding_idx].zero_()


class FunnelClassificationHead(nn.Module):
    def __init__(self, config: FunnelConfig, n_labels: int) -> None:
        super().__init__()
        self.linear_hidden = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.linear_out = nn.Linear(config.d_model, n_labels)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        hidden = self.linear_hidden(hidden)
        hidden = torch.tanh(hidden)
        hidden = self.dropout(hidden)
        return self.linear_out(hidden)


@dataclass
class FunnelForPreTrainingOutput(ModelOutput):
    """
    Output type of [`FunnelForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss of the ELECTRA-style objective.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
@auto_docstring(
    custom_intro="""
    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
    decoder) or any task-specific head on top.
    """
)
class FunnelBaseModel(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)

        self.embeddings = FunnelEmbeddings(config)
        self.encoder = FunnelEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
        self.embeddings.word_embeddings = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        inputs_embeds = self.embeddings(input_ids, inputs_embeds=inputs_embeds)

        encoder_outputs = self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


@auto_docstring
class FunnelModel(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)
        self.config = config
        self.embeddings = FunnelEmbeddings(config)
        self.encoder = FunnelEncoder(config)
        self.decoder = FunnelDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
        self.embeddings.word_embeddings = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        inputs_embeds = self.embeddings(input_ids, inputs_embeds=inputs_embeds)

        encoder_outputs = self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        decoder_outputs = self.decoder(
            final_hidden=encoder_outputs[0],
            first_block_hidden=encoder_outputs[1][self.config.block_sizes[0]],
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            idx = 0
            outputs = (decoder_outputs[0],)
            if output_hidden_states:
                idx += 1
                outputs = outputs + (encoder_outputs[1] + decoder_outputs[idx],)
            if output_attentions:
                idx += 1
                outputs = outputs + (encoder_outputs[2] + decoder_outputs[idx],)
            return outputs

        return BaseModelOutput(
            last_hidden_state=decoder_outputs[0],
            hidden_states=(encoder_outputs.hidden_states + decoder_outputs.hidden_states)
            if output_hidden_states
            else None,
            attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None,
        )
@auto_docstring(
    custom_intro="""
    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
    generated tokens.
    """
)
class FunnelForPreTraining(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)

        self.funnel = FunnelModel(config)
        self.discriminator_predictions = FunnelDiscriminatorPredictions(config)
        # Initialize weights and apply final processing
        self.post_init()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FunnelForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
            docstring). Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, FunnelForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> logits = model(**inputs).logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        discriminator_hidden_states = self.funnel(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        discriminator_sequence_output = discriminator_hidden_states[0]

        logits = self.discriminator_predictions(discriminator_sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            if attention_mask is not None:
                active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1
                active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss]
                active_labels = labels[active_loss]
                loss = loss_fct(active_logits, active_labels.float())
            else:
                loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float())

        if not return_dict:
            output = (logits,) + discriminator_hidden_states[1:]
            return ((loss,) + output) if loss is not None else output

        return FunnelForPreTrainingOutput(
            loss=loss,
            logits=logits,
            hidden_states=discriminator_hidden_states.hidden_states,
            attentions=discriminator_hidden_states.attentions,
        )
@auto_docstring
class FunnelForMaskedLM(FunnelPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)

        self.funnel = FunnelModel(config)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Embedding) -> None:
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.funnel(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict,
        )

        last_hidden_state = outputs[0]
        prediction_logits = self.lm_head(last_hidden_state)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
    first timestep of the last hidden state) e.g. for GLUE tasks.
    """
)
class FunnelForSequenceClassification(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.funnel = FunnelBaseModel(config)
        self.classifier = FunnelClassificationHead(config, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.funnel(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict,
        )

        last_hidden_state = outputs[0]
        pooled_output = last_hidden_state[:, 0]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class FunnelForMultipleChoice(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)

        self.funnel = FunnelBaseModel(config)
        self.classifier = FunnelClassificationHead(config, 1)
        # Initialize weights and apply final processing
        self.post_init()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.funnel(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict,
        )

        last_hidden_state = outputs[0]
        pooled_output = last_hidden_state[:, 0]
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class FunnelForTokenClassification(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels

        self.funnel = FunnelModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.funnel(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict,
        )

        last_hidden_state = outputs[0]
        last_hidden_state = self.dropout(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class FunnelForQuestionAnswering(FunnelPreTrainedModel):
    def __init__(self, config: FunnelConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels

        self.funnel = FunnelModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.funnel(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict,
        )

        last_hidden_state = outputs[0]

        logits = self.qa_outputs(last_hidden_state)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "FunnelBaseModel",
    "FunnelForMaskedLM",
    "FunnelForMultipleChoice",
    "FunnelForPreTraining",
    "FunnelForQuestionAnswering",
    "FunnelForSequenceClassification",
    "FunnelForTokenClassification",
    "FunnelModel",
    "FunnelPreTrainedModel",
    "load_tf_weights_in_funnel",
]