
    fThl                     >   S r SSKrSSKrSSKJr  SSKJrJrJrJ	r	  SSK
r
SSKr
SSK
Jr  SSKJrJrJr  SSKJrJr  SS	KJrJrJrJrJrJr  SS
KJr  SSKJrJrJ r   SSK!J"r"J#r#  SSK$J%r%  \#RL                  " \'5      r(S r) " S S\RT                  5      r+\" " S S\5      5       r, " S S\RT                  5      r- " S S\RT                  5      r. " S S\RT                  5      r/ " S S\RT                  5      r0 " S S\RT                  5      r1 " S S\RT                  5      r2 " S  S!\RT                  5      r3 " S" S#\RT                  5      r4 " S$ S%\RT                  5      r5 " S& S'\RT                  5      r6 " S( S)\RT                  5      r7\" " S* S+\,5      5       r8 " S, S-\RT                  5      r9\" " S. S/\,5      5       r: " S0 S1\RT                  5      r;\"" S2S39 " S4 S5\,5      5       r<\" " S6 S7\,5      5       r=\" " S8 S9\,5      5       r>\" " S: S;\,5      5       r?/ S<Qr@g)=zPyTorch ConvBERT model.    N)
attrgetter)CallableOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )ConvBertConfigc                     SSK n[        R
                  R                  U5      n[        R                  SU 35        UR                  R                  U5      n0 nU H?  u  px[        R                  SU SU 35        UR                  R                  XG5      n	XU'   MA     SSS	S
SSSS.n
UR                  S:  a  SnOSn[        UR                  5       GH;  nSU S3U
SU S3'   SU S3U
SU S3'   SU S3U
SU S3'   SU S3U
SU S3'   SU S3U
SU S3'   SU S3U
SU S3'   SU S 3U
SU S!3'   SU S"3U
SU S#3'   SU S$3U
SU S%3'   SU S&3U
SU S'3'   SU S(3U
SU S)3'   SU S*3U
SU S+3'   SU S,3U
SU S-3'   SU S.3U
SU S/3'   SU S03U
SU S13'   SU S23U
SU S33'   SU S43U
SU S53'   SU S6U S73U
SU S83'   SU S6U S93U
SU S:3'   SU S;U S73U
SU S<3'   SU S;U S93U
SU S=3'   SU S>3U
SU S?3'   SU S@3U
SU SA3'   GM>     U R                  5        GH#  nUS   n[        U5      nU" U 5      nX   n[         R"                  " UU   5      n[        R                  SBU SCU SD35        UR%                  S75      (       a8  UR%                  SE5      (       d"  UR%                  SF5      (       d  UR&                  nUR%                  SG5      (       a  UR)                  SSHS5      nUR%                  SI5      (       a  UR)                  SHSS5      nUR%                  SJ5      (       a  UR+                  SK5      nUUl        GM&     U $ ! [         a    [        R                  S5        e f = f)Lz'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape z"electra/embeddings/word_embeddingsz&electra/embeddings/position_embeddingsz(electra/embeddings/token_type_embeddingsz"electra/embeddings/LayerNorm/gammaz!electra/embeddings/LayerNorm/betaz!electra/embeddings_project/kernelzelectra/embeddings_project/bias)z!embeddings.word_embeddings.weightz%embeddings.position_embeddings.weightz'embeddings.token_type_embeddings.weightzembeddings.LayerNorm.weightzembeddings.LayerNorm.biaszembeddings_project.weightzembeddings_project.biasr   g_densedensezelectra/encoder/layer_z/attention/self/query/kernelzencoder.layer.z.attention.self.query.weightz/attention/self/query/biasz.attention.self.query.biasz/attention/self/key/kernelz.attention.self.key.weightz/attention/self/key/biasz.attention.self.key.biasz/attention/self/value/kernelz.attention.self.value.weightz/attention/self/value/biasz.attention.self.value.biasz./attention/self/conv_attn_key/depthwise_kernelz4.attention.self.key_conv_attn_layer.depthwise.weightz./attention/self/conv_attn_key/pointwise_kernelz4.attention.self.key_conv_attn_layer.pointwise.weightz"/attention/self/conv_attn_key/biasz(.attention.self.key_conv_attn_layer.biasz'/attention/self/conv_attn_kernel/kernelz(.attention.self.conv_kernel_layer.weightz%/attention/self/conv_attn_kernel/biasz&.attention.self.conv_kernel_layer.biasz&/attention/self/conv_attn_point/kernelz%.attention.self.conv_out_layer.weightz$/attention/self/conv_attn_point/biasz#.attention.self.conv_out_layer.biasz/attention/output/dense/kernelz.attention.output.dense.weightz!/attention/output/LayerNorm/gammaz".attention.output.LayerNorm.weightz/attention/output/dense/biasz.attention.output.dense.biasz /attention/output/LayerNorm/betaz .attention.output.LayerNorm.biasz/intermediate/z/kernelz.intermediate.dense.weightz/biasz.intermediate.dense.biasz/output/z.output.dense.weightz.output.dense.biasz/output/LayerNorm/gammaz.output.LayerNorm.weightz/output/LayerNorm/betaz.output.LayerNorm.biaszTF: z, PT:  z/intermediate/g_dense/kernelz/output/g_dense/kernelz/depthwise_kernel   z/pointwise_kernelz/conv_attn_key/bias)
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variable
num_groupsrangenum_hidden_layersnamed_parametersr   torch
from_numpyendswithTpermute	unsqueezedata)modelconfigtf_checkpoint_pathtftf_path	init_varstf_datanameshapearrayparam_mappinggroup_dense_namejparam
param_name	retrieverresulttf_namevalues                      f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/convbert/modeling_convbert.pyload_tf_weights_in_convbertrM   0   sf    ggoo01G
KK8	BC''0IG (l5'BC&&w5 ! .R1Y3]'K%H%H#DM 1$"6++,$QC'CD 	qc)EFG %QC'AB 	qc)CDE %QC'AB 	qc)CDE %QC'?@ 	qc)ABC %QC'CD 	qc)EFG %QC'AB 	qc)CDE %QC'UV 	qc)]^_ %QC'UV 	qc)]^_ %QC'IJ 	qc)QRS %QC'NO 	qc)QRS %QC'LM 	qc)OPQ %QC'MN 	qc)NOP %QC'KL 	qc)LMN %QC'EF 	qc)GHI %QC'HI 	qc)KLM %QC'CD 	qc)EFG %QC'GH 	qc)IJK %QC~6F5GwO 	qc)CDE %QC~6F5GuM 	qc)ABC %QCx0@/AI 	qc)=>? %QCx0@/AG 	qc);<= %QC'>? 	qc)ABC G]]^\__uDvqc)?@AG -J '')1X
z*	5!+  !12d7)6*Q78I&&##$BCC''(@AA!GGE/00MM!Q*E/00MM!Q*E122OOB'E# *$ Lk  Q	
 	s   M !M.c                      ^  \ rS rSrSrU 4S jr    SS\\R                     S\\R                     S\\R                     S\\R                     S\R                  4
S	 jjr
S
rU =r$ )ConvBertEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.c                 
  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  S9SS9  g )	N)padding_idxepsposition_ids)r   r"   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr2   arangeexpandzerosrU   sizelongselfr:   	__class__s     rL   r[   ConvBertEmbeddings.__init__   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`" f&;&;AVAVWzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsrW   rU   inputs_embedsreturnc                 >   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	U R                  U5      n
XI-   U
-   nU R                  U5      nU R                  U5      nU$ )Nr"   r   rW   r   rY   device)rn   rU   hasattrrW   rl   r2   rm   ro   rz   r`   rb   rd   re   ri   )rq   ru   rW   rU   rv   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrb   rd   
embeddingss               rL   forwardConvBertEmbeddings.forward   s,     #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M"66|D $ : :> J"8;PP
^^J/
\\*-
rt   )re   ri   rb   rd   r`   )NNNN)__name__
__module____qualname____firstlineno____doc__r[   r   r2   
LongTensorFloatTensorr   __static_attributes____classcell__rr   s   @rL   rO   rO      s    Q
( 15593759$E,,-$ !!1!12$ u//0	$
   1 12$ 
		$ $rt   rO   c                   *    \ rS rSr\r\rSrSr	S r
Srg)ConvBertPreTrainedModel   convbertTc                    [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        g[        U[        5      (       a%  UR                  R
                  R                  5         g[        U[         5      (       a]  UR                  R
                  R                  SU R                  R                  S9  UR                  R
                  R                  5         gg)zInitialize the weights        meanstdNg      ?)
isinstancer   LinearConv1dweightr8   normal_r:   initializer_rangebiaszero_r\   rR   re   fill_SeparableConv1DGroupedLinearLayer)rq   modules     rL   _init_weights%ConvBertPreTrainedModel._init_weights   s   fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S)00KK""$ 233MM&&CT[[5R5R&SKK""$ 4rt    N)r   r   r   r   r   config_classrM   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr   r   r   rt   rL   r   r      s    !L1O"&*#%rt   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )r   i  zSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc           	        > [         TU ]  5         [        R                  " UUUUUS-  SS9U l        [        R                  " X#SSS9U l        [        R                  " [        R                  " US5      5      U l	        U R                  R                  R                  R                  SUR                  S9  U R
                  R                  R                  R                  SUR                  S9  g )Nr!   F)kernel_sizegroupspaddingr   r   )r   r   r   r   )rZ   r[   r   r   	depthwise	pointwise	Parameterr2   rm   r   r   r8   r   r   )rq   r:   input_filtersoutput_filtersr   kwargsrr   s         rL   r[   SeparableConv1D.__init__  s    # 1$
 =aV[\LL^Q!?@	""**9Q9Q*R""**9Q9Q*Rrt   hidden_statesrw   c                 f    U R                  U5      nU R                  U5      nX R                  -  nU$ N)r   r   r   )rq   r   xs      rL   r   SeparableConv1D.forward  s.    NN=)NN1	YYrt   )r   r   r   r   r   r   r   r   r[   r2   Tensorr   r   r   r   s   @rL   r   r     s,    ]S U\\ ell  rt   r   c                      ^  \ rS rSrU 4S jrS r    SS\R                  S\\R                     S\\R                     S\\R                     S\\
   S	\\R                  \\R                     4   4S
 jjrSrU =r$ )ConvBertSelfAttentioni  c                 ~  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  UR                  -  nUS:  a  UR                  U l        SU l        OX l        UR                  U l        UR                  U l        UR                  U R                  -  S:w  a  [        S5      eUR                  U R                  -  S-  U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        XR                  U R                  U R                  5      U l        [        R                  " U R                  U R                  U R                  -  5      U l        [        R                  " UR                  U R                  5      U l        [        R&                  " U R                  S/[)        U R                  S-
  S-  5      S/S	9U l        [        R,                  " UR.                  5      U l        g )
Nr   r^   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsr!   )r   r   )rZ   r[   hidden_sizenum_attention_headsr{   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   r   querykeyrK   r   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldrg   attention_probs_dropout_probri   )rq   r:   new_num_attention_headsrr   s      rL   r[   ConvBertSelfAttention.__init__   s=    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 #)"<"<@Q@Q"Q"Q&$88DO'(D$'>$$//DO & 7 7 8 88A=UVV$*$6$6$:R:R$RWX#X !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
#2&&(:(:D<Q<Q$
  "$4+=+=t?W?WZ^ZoZo?o!p ii(:(:D<N<NOii..2S$BWBWZ[B[_`A`=acd<e
 zz&"E"EFrt   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nr"   r   r!   r   r   )rn   r   r   viewr6   )rq   r   new_x_shapes      rL   transpose_for_scores*ConvBertSelfAttention.transpose_for_scoresG  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$rt   r   attention_mask	head_maskencoder_hidden_statesoutput_attentionsrw   c                    U R                  U5      nUR                  S5      nUb#  U R                  U5      nU R                  U5      n	O"U R                  U5      nU R                  U5      n	U R	                  UR                  SS5      5      n
U
R                  SS5      n
U R                  U5      nU R                  U5      nU R                  U	5      n[        R                  " X5      nU R                  U5      n[        R                  " USU R                  S/5      n[        R                  " USS9nU R                  U5      n[        R                  " UUSU R                  /5      nUR                  SS5      R                  5       R!                  S5      n["        R$                  R'                  UU R                  S/SU R                  S-
  S-  S/SS9nUR                  SS5      R                  USU R                  U R                  5      n[        R                  " USU R(                  U R                  /5      n[        R*                  " UU5      n[        R                  " USU R                  /5      n[        R*                  " XR                  SS5      5      nU[,        R.                  " U R(                  5      -  nUb  UU-   n["        R$                  R                  USS9nU R1                  U5      nUb  UU-  n[        R*                  " UU5      nUR3                  SSSS5      R                  5       n[        R                  " UUSU R4                  U R(                  /5      n[        R6                  " UU/S5      nUR                  5       S S U R4                  U R(                  -  S-  4-   nUR8                  " U6 nU(       a  UU4nU$ U4nU$ )	Nr   r   r!   r"   dim)r   dilationr   strider   )r   rn   r   rK   r   	transposer   r2   multiplyr   reshaper   softmaxr   r   
contiguousr7   r   
functionalr   r   matmulmathsqrtri   r6   r   catr   )rq   r   r   r   r   r   mixed_query_layer
batch_sizemixed_key_layermixed_value_layermixed_key_conv_attn_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapeoutputss                          rL   r   ConvBertSelfAttention.forwardL  s    !JJ}5"''*
 !,"hh'<=O $

+@ A"hh}5O $

= 9$($<$<]=T=TUVXY=Z$[!$=$G$G1$M!//0AB--o>	//0AB..)BV 22?C!MM*;b$BWBWYZ=[\!MM*;C,,];~
BHZHZ7[\'11!Q7BBDNNrR--..2++a/A5q9 . 
 (11!Q7??D..0E0E
 ~D<T<TVZVkVk7lmn6GH~D<N<N7OP !<<5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF==*b$BZBZ\`\t\t1uv		=(";Q? #0"4"4"6s";$$t'?'??!C?
 #
 &**,CD6G=/2 O\M]rt   )r   r   r   r   r   ri   r   r   r   r   r   r   rK   NNNF)r   r   r   r   r[   r   r2   r   r   r   boolr   r   r   r   r   s   @rL   r   r     s    %GN% 7;158<,1P||P !!2!23P E--.	P
  (5P $D>P 
u||Xell33	4P Prt   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ConvBertSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrS   )rZ   r[   r   r   r   r   re   rf   rg   rh   ri   rp   s     rL   r[   ConvBertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rt   r   input_tensorrw   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   ri   re   rq   r   r  s      rL   r   ConvBertSelfOutput.forward  5    

=1]3}'CDrt   re   r   ri   
r   r   r   r   r[   r2   r   r   r   r   r   s   @rL   r   r     s6    >U\\  RWR^R^  rt   r   c                      ^  \ rS rSrU 4S jrS r    SS\R                  S\\R                     S\\R                     S\\R                     S\\
   S	\\R                  \\R                     4   4S
 jjrSrU =r$ )ConvBertAttentioni  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )rZ   r[   r   rq   r   outputsetpruned_headsrp   s     rL   r[   ConvBertAttention.__init__  s0    )&1	(0Ert   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   rq   r   r   r  r   r   r   rK   r  r   r   union)rq   headsindexs      rL   prune_headsConvBertAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rt   r   r   r   r   r   rw   c                 l    U R                  UUUUU5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )rq   r  )	rq   r   r   r   r   r   self_outputsattention_outputr   s	            rL   r   ConvBertAttention.forward  sQ     yy!
  ;;|AF#%QR(88rt   )r  r  rq   r   )r   r   r   r   r[   r  r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s    ";* 7;158<,1|| !!2!23 E--.	
  (5 $D> 
u||Xe&7&788	9 rt   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )r   i  c                   > [         TU ]  5         Xl        X l        X0l        U R                  U R                  -  U l        U R                  U R                  -  U l        [        R                  " [        R                  " U R                  U R
                  U R                  5      5      U l        [        R                  " [        R                  " U5      5      U l        g r   )rZ   r[   
input_sizeoutput_sizer.   group_in_dimgroup_out_dimr   r   r2   emptyr   r   )rq   r  r  r.   rr   s       rL   r[   GroupedLinearLayer.__init__  s    $&$ OOt>!--@ll5;;t@Q@QSWSeSe#fgLL[!9:	rt   r   rw   c                    [        UR                  5       5      S   n[        R                  " USU R                  U R
                  /5      nUR                  SSS5      n[        R                  " X0R                  5      nUR                  SSS5      n[        R                  " X2SU R                  /5      nX0R                  -   nU$ )Nr   r"   r   r!   )listrn   r2   r   r.   r   r6   r   r   r  r   )rq   r   r   r   s       rL   r   GroupedLinearLayer.forward  s    -,,./2
MM-"doot?P?P)QRIIaALLKK(IIaAMM!"d.>.>?@		Mrt   )r   r   r!  r  r.   r  r   r	  r   s   @rL   r   r     s(    ;U\\ ell  rt   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ConvBertIntermediatei  c                   > [         TU ]  5         UR                  S:X  a1  [        R                  " UR
                  UR                  5      U l        O.[        UR
                  UR                  UR                  S9U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g )Nr   r  r  r.   )rZ   r[   r.   r   r   r   intermediate_sizer   r   r   
hidden_actstrr   intermediate_act_fnrp   s     rL   r[   ConvBertIntermediate.__init__  s    !6#5#5v7O7OPDJ+!--6;S;S`f`q`qDJ f''--'-f.?.?'@D$'-'8'8D$rt   r   rw   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r.  rq   r   s     rL   r   ConvBertIntermediate.forward  s&    

=100?rt   r1  r	  r   s   @rL   r(  r(    s(    9U\\ ell  rt   r(  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ConvBertOutputi  c                   > [         TU ]  5         UR                  S:X  a1  [        R                  " UR
                  UR                  5      U l        O.[        UR
                  UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr   r*  rS   )rZ   r[   r.   r   r   r+  r   r   r   re   rf   rg   rh   ri   rp   s     rL   r[   ConvBertOutput.__init__  s    !6#;#;V=O=OPDJ+!33ASAS`f`q`qDJ f&8&8f>S>STzz&"<"<=rt   r   r  rw   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r  r  s      rL   r   ConvBertOutput.forward  r  rt   r  r	  r   s   @rL   r5  r5    s6    	>U\\  RWR^R^  rt   r5  c                     ^  \ rS rSrU 4S jr     SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\
\R                  \\R                     4   4S
 jjrS rSrU =r$ )ConvBertLayeri  c                 v  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a/  U R                  (       d  [        U  S35      e[	        U5      U l	        [        U5      U l        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is added)rZ   r[   chunk_size_feed_forwardseq_len_dimr  	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr(  intermediater5  r  rp   s     rL   r[   ConvBertLayer.__init__  s    '-'E'E$*62 ++#)#=#= ##??4&(f ghh"3F";D08$V,rt   r   r   r   r   encoder_attention_maskr   rw   c                 L   U R                  UUUUS9nUS   nUSS  n	U R                  (       aD  UbA  [        U S5      (       d  [        SU  S35      eU R	                  UUUUU5      n
U
S   nXSS  -   n	[        U R                  U R                  U R                  U5      nU4U	-   n	U	$ )N)r   r   r   rC  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r?  r@  r{   AttributeErrorrC  r   feed_forward_chunkr=  r>  )rq   r   r   r   r   rF  r   self_attention_outputsr  r   cross_attention_outputslayer_outputs               rL   r   ConvBertLayer.forward&  s     "&/	 "0 "
 2!4(,??4@4!122$=dV DD D  '+&9&9 &%!'#  7q9 ;;G0##T%A%A4CSCSUe
  /G+rt   c                 J    U R                  U5      nU R                  X!5      nU$ r   )rD  r  )rq   r  intermediate_outputrL  s       rL   rI   ConvBertLayer.feed_forward_chunkN  s)    "//0@A{{#6Irt   )rA  r?  r=  rC  rD  r@  r  r>  )NNNNF)r   r   r   r   r[   r2   r   r   r   r   r   r   rI  r   r   r   s   @rL   r;  r;    s    -" 7;158<9=,1&||& !!2!23& E--.	&
  (5& !) 6& $D>& 
u||Xe&7&788	9&P rt   r;  c                      ^  \ rS rSrU 4S jr       SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\\	   S
\\	   S\
\\4   4S jjrSrU =r$ )ConvBertEncoderiT  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rZ   r[   r:   r   
ModuleListr/   r0   r;  layergradient_checkpointing)rq   r:   _rr   s      rL   r[   ConvBertEncoder.__init__U  sR    ]]5IaIaCb#cCbaM&$9Cb#cd
&+# $ds   A&r   r   r   r   rF  r   output_hidden_statesreturn_dictrw   c	           
      ^   U(       a  SOS n	U(       a  SOS n
U(       a  U R                   R                  (       a  SOS n[        U R                  5       H  u  pU(       a  X4-   n	Ub  X<   OS nU R                  (       a3  U R
                  (       a"  U R                  UR                  UUUUUU5      nOU" UUUUUU5      nUS   nU(       d  My  XS   4-   n
U R                   R                  (       d  M  XS   4-   nM     U(       a  X4-   n	U(       d  [        S XX4 5       5      $ [        UU	U
US9$ )Nr   r   r   r!   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   ).0vs     rL   	<genexpr>*ConvBertEncoder.forward.<locals>.<genexpr>  s      fA fs   	)last_hidden_stater   
attentionscross_attentions)
r:   rA  	enumeraterU  rV  training_gradient_checkpointing_func__call__tupler   )rq   r   r   r   r   rF  r   rY  rZ  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                   rL   r   ConvBertEncoder.forward[  sM    #7BD$5b4%64;;;Z;Zr`d(4OA#$58H$H!.7.CilO**t}} $ A A ))!"#)*%! !-!"#)*%! *!,M  &91=M<O&O#;;222+?QRCSBU+U(;  5>   14D D '<Of  
 2++*1	
 	
rt   )r:   rV  rU  )NNNNFFT)r   r   r   r   r[   r2   r   r   r   r   r   r   r   r   r   r   r   s   @rL   rR  rR  T  s    , 7;158<9=,1/4&*;
||;
 !!2!23;
 E--.	;

  (5;
 !) 6;
 $D>;
 'tn;
 d^;
 
u88	9;
 ;
rt   rR  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ConvBertPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rZ   r[   r   r   r   r   r   r,  r-  r   transform_act_fnre   rf   rp   s     rL   r[   (ConvBertPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrt   r   rw   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   rt  re   r2  s     rL   r   'ConvBertPredictionHeadTransform.forward  s4    

=1--m<}5rt   )re   r   rt  r	  r   s   @rL   rr  rr    s)    UU\\ ell  rt   rr  c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\R                  4S jjrS	rU =r$ )ConvBertSequenceSummaryi  a  
Compute a single vector summary of a sequence hidden states.

Args:
    config ([`ConvBertConfig`]):
        The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
        config class of your model for the default values it uses):

        - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

            - `"last"` -- Take the last token hidden state (like XLNet)
            - `"first"` -- Take the first token hidden state (like Bert)
            - `"mean"` -- Take the mean of all tokens hidden states
            - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
            - `"attn"` -- Not implemented now, use multi-head attention

        - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
        - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
          (otherwise to `config.hidden_size`).
        - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
          another string or `None` will add no activation.
        - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
        - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
r:   c                   > [         TU ]  5         [        USS5      U l        U R                  S:X  a  [        e[
        R                  " 5       U l        [        US5      (       a  UR                  (       aq  [        US5      (       a.  UR                  (       a  UR                  S:  a  UR                  nOUR                  n[
        R                  " UR                  U5      U l        [        USS 5      nU(       a  [        U5      O[
        R                  " 5       U l        [
        R                  " 5       U l        [        US5      (       a5  UR"                  S:  a%  [
        R$                  " UR"                  5      U l        [
        R                  " 5       U l        [        US	5      (       a7  UR(                  S:  a&  [
        R$                  " UR(                  5      U l        g g g )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)rZ   r[   getattrr{  NotImplementedErrorr   Identitysummaryr{   r~  r  
num_labelsr   r   r   
activationfirst_dropoutr  rg   last_dropoutr  )rq   r:   num_classesactivation_stringrr   s       rL   r[    ConvBertSequenceSummary.__init__  sa   #FNFC& &%{{}6-..63J3Jv788V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]62338T8TWX8X!#F,H,H!IDKKM6122v7R7RUV7V "

6+F+F GD 8W2rt   r   	cls_indexrw   c                    U R                   S:X  a  USS2S4   nGOU R                   S:X  a  USS2S4   nGOU R                   S:X  a  UR                  SS9nOU R                   S	:X  a  Uc?  [        R                  " US
SS2SS24   UR                  S   S-
  [        R
                  S9nOXUR                  S5      R                  S5      nUR                  SUR                  5       S-
  -  UR                  S5      4-   5      nUR                  SU5      R                  S5      nOU R                   S:X  a  [        eU R                  W5      nU R                  U5      nU R                  U5      nU R!                  U5      nU$ )a#  
Compute a single vector summary of a sequence hidden states.

Args:
    hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
        The hidden states of the last layer.
    cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
        Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

Returns:
    `torch.FloatTensor`: The summary of the sequence hidden states.
r|  Nr"   firstr   r   r   r   r  .r   rX   )r"   r}  )r{  r   r2   	full_likerA   ro   r7   rl   r   rn   gathersqueezer  r  r  r  r  )rq   r   r  r  s       rL   r   ConvBertSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rt   )r  r  r  r  r{  r   )r   r   r   r   r   r   r[   r2   r   r   r   r   r   r   r   s   @rL   ry  ry    sV    2H~ H< Y])"..);CEDTDT;U)			) )rt   ry  c                   D  ^  \ rS rSrU 4S jrS rS rS r\         SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )ConvBertModeli  c                 $  > [         TU ]  U5        [        U5      U l        UR                  UR
                  :w  a0  [        R                  " UR                  UR
                  5      U l        [        U5      U l
        Xl        U R                  5         g r   )rZ   r[   rO   r   r^   r   r   r   embeddings_projectrR  encoderr:   	post_initrp   s     rL   r[   ConvBertModel.__init__  sj     ,V4  F$6$66&(ii0E0EvGYGY&ZD#&v.rt   c                 .    U R                   R                  $ r   r   r`   rq   s    rL   get_input_embeddings"ConvBertModel.get_input_embeddings  s    ...rt   c                 $    XR                   l        g r   r  )rq   rK   s     rL   set_input_embeddings"ConvBertModel.set_input_embeddings  s    */'rt   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  rU  r?  r  )rq   heads_to_prunerU  r  s       rL   _prune_headsConvBertModel._prune_heads"  s<    
 +002LELLu%//;;EB 3rt   ru   r   rW   rU   r   rv   r   rY  rZ  rw   c
           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " XS9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U
[        R                  US9nU R!                  X*5      nU R#                  XPR                   R$                  5      nU R                  XX6S9n[        U S5      (       a  U R'                  U5      nU R)                  UUUUUU	S	9nU$ )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer"   z5You have to specify either input_ids or inputs_embeds)rz   rW   ry   )ru   rU   rW   rv   r  )r   r   r   rY  rZ  )r:   r   rY  use_return_dictr   %warn_if_padding_and_no_attention_maskrn   rz   r2   onesr{   r   rW   rl   rm   ro   get_extended_attention_maskget_head_maskr0   r  r  )rq   ru   r   rW   rU   r   rv   r   rY  rZ  r|   r   r}   rz   r~   r   extended_attention_maskr   s                     rL   r   ConvBertModel.forward*  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZCN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z"&"B"B>"_&&y++2O2OP	> ( 
 4-.. 33MBM2/!5# % 
 rt   )r:   r   r  r  )	NNNNNNNNN)r   r   r   r   r[   r  r  r  r   r   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s   
/0C  156:59371559,0/3&*<E,,-< !!2!23< !!1!12	<
 u//0< E--.<   1 12< $D>< 'tn< d^< 
u88	9< <rt   r  c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )ConvBertGeneratorPredictionsij  zAPrediction module for the generator, made up of two dense layers.c                    > [         TU ]  5         [        S5      U l        [        R
                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        g )NgelurS   )rZ   r[   r   r  r   re   r^   rf   r   r   r   rp   s     rL   r[   %ConvBertGeneratorPredictions.__init__m  sV    (0f&;&;AVAVWYYv1163H3HI
rt   generator_hidden_statesrw   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r  re   )rq   r  r   s      rL   r   $ConvBertGeneratorPredictions.forwardt  s3    

#:;6}5rt   )re   r  r   )r   r   r   r   r   r[   r2   r   r   r   r   r   s   @rL   r  r  j  s0    KJu/@/@ UEVEV  rt   r  c                   d  ^  \ rS rSrS/rU 4S jrS rS r\          SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )ConvBertForMaskedLMi|  zgenerator.lm_head.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  " UR                  UR                  5      U l
        U R                  5         g r   )rZ   r[   r  r   r  generator_predictionsr   r   r^   r]   generator_lm_headr  rp   s     rL   r[   ConvBertForMaskedLM.__init__  sR     %f-%A&%I"!#6+@+@&BSBS!Trt   c                     U R                   $ r   r  r  s    rL   get_output_embeddings)ConvBertForMaskedLM.get_output_embeddings  s    %%%rt   c                     Xl         g r   r  )rq   r`   s     rL   set_output_embeddings)ConvBertForMaskedLM.set_output_embeddings  s    !0rt   ru   r   rW   rU   r   rv   labelsr   rY  rZ  rw   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
5	      nUS   nU R                  U5      nU R	                  U5      nSnUbQ  [
        R                  " 5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Nr   r"   r   losslogitsr   rb  )r:   r  r   r  r  r   r
   r   r]   r   r   rb  )rq   ru   r   rW   rU   r   rv   r  r   rY  rZ  r  generator_sequence_outputprediction_scoresr  loss_fctr  s                    rL   r   ConvBertForMaskedLM.forward  s   ( &1%<k$++B]B]"&-- 
#
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D'),CAB,GGF)-)9TGf$EvE$1??.99	
 	
rt   )r   r  r  
NNNNNNNNNN)r   r   r   r   _tied_weights_keysr[   r  r  r   r   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r  r  |  s   45&1  156:59371559-1,0/3&*4
E,,-4
 !!2!234
 !!1!12	4

 u//04
 E--.4
   1 124
 ))*4
 $D>4
 'tn4
 d^4
 
un$	%4
 4
rt   r  c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )ConvBertClassificationHeadi  z-Head for sentence-level classification tasks.c                 n  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        Xl        g r   )rZ   r[   r   r   r   r   classifier_dropoutrh   rg   ri   r  out_projr:   rq   r:   r  rr   s      rL   r[   #ConvBertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrt   r   rw   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        U R                  R                     " U5      nU R                  U5      nU R                  U5      nU$ )Nr   )ri   r   r   r:   r,  r  )rq   r   r   r   s       rL   r   "ConvBertClassificationHead.forward  se    !Q'"LLOJJqM4;;))*1-LLOMM!rt   )r:   r   ri   r  r   r   s   @rL   r  r    s+    7	U\\   rt   r  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )!ConvBertForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        U R                  5         g r   )	rZ   r[   r  r:   r  r   r  
classifierr  rp   s     rL   r[   *ConvBertForSequenceClassification.__init__  sF      ++%f-4V< 	rt   ru   r   rW   rU   r   rv   r  r   rY  rZ  rw   c                 0   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   rW   rU   r   rv   r   rY  rZ  r   r   
regressionsingle_label_classificationmulti_label_classificationr"   r  )r:   r  r   r  problem_typer  rY   r2   ro   r   r   r  r
   r   r	   r   r   rb  rq   ru   r   rW   rU   r   rv   r  r   rY  rZ  r   sequence_outputr  r  r  r  s                    rL   r   )ConvBertForSequenceClassification.forward  s   ( &1%<k$++B]B]--))%'/!5#   

 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rt   )r  r:   r   r  r  )r   r   r   r   r[   r   r   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s     156:59371559-1,0/3&*D
E,,-D
 !!2!23D
 !!1!12	D

 u//0D
 E--.D
   1 12D
 ))*D
 $D>D
 'tnD
 d^D
 
u..	/D
 D
rt   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )ConvBertForMultipleChoicei9  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  " UR                  S5      U l	        U R                  5         g )Nr   )rZ   r[   r  r   ry  sequence_summaryr   r   r   r  r  rp   s     rL   r[   "ConvBertForMultipleChoice.__init__;  sM     %f- 7 ?))F$6$6: 	rt   ru   r   rW   rU   r   rv   r  r   rY  rZ  rw   c                 \   U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:


    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r"   r   r  r   r  )r:   r  rA   r   rn   r   r  r  r
   r   r   rb  )rq   ru   r   rW   rU   r   rv   r  r   rY  rZ  num_choicesr   r  pooled_outputr  reshaped_logitsr  r  r  s                       rL   r   !ConvBertForMultipleChoice.forwardE  s   Z &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 --))%'/!5#   

 "!*--o>/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rt   )r  r   r  r  )r   r   r   r   r[   r   r   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r  r  9  s     156:59371559-1,0/3&*Y
E,,-Y
 !!2!23Y
 !!1!12	Y

 u//0Y
 E--.Y
   1 12Y
 ))*Y
 $D>Y
 'tnY
 d^Y
 
u//	0Y
 Y
rt   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )ConvBertForTokenClassificationi  c                 f  > [         TU ]  U5        UR                  U l        [        U5      U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r   )rZ   r[   r  r  r   r  rh   r   rg   ri   r   r   r  r  r  s      rL   r[   'ConvBertForTokenClassification.__init__  s      ++%f-)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rt   ru   r   rW   rU   r   rv   r  r   rY  rZ  rw   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r"   r   r  )r:   r  r   ri   r  r
   r   r  r   r   rb  r  s                    rL   r   &ConvBertForTokenClassification.forward  s    $ &1%<k$++B]B]--))%'/!5#   

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rt   )r  r   ri   r  r  )r   r   r   r   r[   r   r   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s     156:59371559-1,0/3&*2
E,,-2
 !!2!232
 !!1!12	2

 u//02
 E--.2
   1 122
 ))*2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
rt   r  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )ConvBertForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   )
rZ   r[   r  r  r   r   r   r   
qa_outputsr  rp   s     rL   r[   %ConvBertForQuestionAnswering.__init__  sS      ++%f-))F$6$68I8IJ 	rt   ru   r   rW   rU   r   rv   start_positionsend_positionsr   rY  rZ  rw   c                 $   Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r"   r   )ignore_indexr!   )r  start_logits
end_logitsr   rb  )r:   r  r   r  splitr  r   r  rn   clampr
   r   r   rb  )rq   ru   r   rW   rU   r   rv   r  r  r   rY  rZ  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          rL   r   $ConvBertForQuestionAnswering.forward  s    &1%<k$++B]B]--))%'/!5#   

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rt   )r   r  r  )NNNNNNNNNNN)r   r   r   r   r[   r   r   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s$     156:593715596:48,0/3&*>
E,,->
 !!2!23>
 !!1!12	>

 u//0>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
rt   r  )	r  r  r  r  r  r;  r  r   rM   )Ar   r   r'   operatorr   typingr   r   r   r   r2   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   r   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_convbertr   
get_loggerr   r%   rM   ModulerO   r   r   r   r   r  r   r(  r5  r;  rR  rr  ry  r  r  r  r  r  r  r  r  __all__r   rt   rL   <module>r     s;     	  3 3    A A 1  . l l 3 
		H	%yx9 9x %o % %8bii 4}BII }@ *		 *Z ,299 (RYY &:BII :zB
bii B
Jbii $`bii `F X+ X Xv299 $ H
1 H
 H
V 0 P
(? P
P
f e
 7 e
 e
P B
%< B
 B
J J
#: J
 J
Z
rt   