
    fTh                        S r SSKrSSKJrJrJr  SSKrSSKrSSKJr  SSK	J
r
JrJr  SSKJr  SSKJrJrJrJrJrJrJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%J&r&J'r'  \RP                  " \)5      r* " S S\RV                  5      r, " S S\RV                  5      r- " S S\RV                  5      r. " S S\RV                  5      r/ " S S\RV                  5      r0 " S S\RV                  5      r1 " S S\RV                  5      r2 " S S\RV                  5      r3 " S S \RV                  5      r4\ " S! S"\5      5       r5\ " S# S$\55      5       r6\ " S% S&\55      5       r7 " S' S(\RV                  5      r8\" S)S*9 " S+ S,\55      5       r9\ " S- S.\55      5       r:\ " S/ S0\55      5       r; " S1 S2\RV                  5      r<\ " S3 S4\55      5       r=S7S5 jr>/ S6Qr?g)8zPyTorch I-BERT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )gelu))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )IBertConfig)IntGELUIntLayerNorm
IntSoftmaxQuantActQuantEmbeddingQuantLinearc                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )IBertEmbeddings0   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                 r  > [         TU ]  5         UR                  U l        SU l        SU l        SU l        SU l        SU l        [        UR                  UR                  UR                  U R                  U R                  S9U l        [        UR                  UR                  U R                  U R                  S9U l        U R                  S[         R"                  " UR$                  5      R'                  S5      S	S
9  [)        USS5      U l        UR                  U l        [        UR$                  UR                  U R,                  U R                  U R                  S9U l        [1        U R                  U R                  S9U l        [1        U R                  U R                  S9U l        [7        UR                  UR8                  U R                  U R                  UR:                  S9U l        [1        U R
                  U R                  S9U l        [@        RB                  " URD                  5      U l#        g )N             )padding_idx
weight_bit
quant_mode)r)   r*   position_ids)r   F)
persistentposition_embedding_typeabsoluter*   eps
output_bitr*   force_dequant)$super__init__r*   embedding_bitembedding_act_bitact_bitln_input_bitln_output_bitr   
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsregister_buffertorcharangemax_position_embeddingsexpandgetattrr.   r(   position_embeddingsr   embeddings_act1embeddings_act2r   layer_norm_epsr4   	LayerNormoutput_activationr   Dropouthidden_dropout_probdropoutselfconfig	__class__s     `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/ibert/modeling_ibert.pyr6   IBertEmbeddings.__init__5   s    ++!#-++)) 
 &4""F$6$64CUCUbfbqbq&
"
 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$ "..#1**(())$
   ((>(>4??['(>(>4??[ &%%)) ..
 "*$,,4??!Szz&"<"<=    c                    UcD  Ub0  [        XR                  U5      R                  UR                  5      nOU R	                  U5      nUb  UR                  5       nOUR                  5       S S nUc8  [        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      u  pGOS nU R                  U5      u  pU R                  UUUU	S9u  pU R                  S:X  a'  U R                  U5      u  pU R                  U
UUUS9u  pU R                  X5      u  pU R!                  U
5      n
U R#                  X5      u  pX4$ )Nr,   dtypedeviceidentityidentity_scaling_factorr/   )"create_position_ids_from_input_idsr(   tor[   &create_position_ids_from_inputs_embedssizerC   zeroslongr+   r?   rA   rI   r.   rH   rL   rP   rM   )rR   	input_idstoken_type_idsr+   inputs_embedspast_key_values_lengthinput_shapeinputs_embeds_scaling_factorrA   $token_type_embeddings_scaling_factor
embeddingsembeddings_scaling_factorrH   "position_embeddings_scaling_factors                 rU   forwardIBertEmbeddings.forwardi   s}    $A//1G "Y%%&   $JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN :>:N:Ny:Y7M7+/(FJF`F`aoFpC040D0D(*$H	 1E 1
-
 '':5FJF^F^_kFlC484H4H),(J	 5I 51J 15z0e-
\\*-
040F0Fz0m-
44rW   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr,   r   rY   r   )rb   rC   rD   r(   rd   r[   	unsqueezerF   )rR   rg   ri   sequence_lengthr+   s        rU   ra   6IBertEmbeddings.create_position_ids_from_inputs_embeds   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rW   )rL   r9   rP   r8   r7   rI   rJ   r:   r;   rM   r(   r.   rH   r*   rA   r?   )NNNNr   )
__name__
__module____qualname____firstlineno____doc__r6   ro   ra   __static_attributes____classcell__rT   s   @rU   r!   r!   0   s%    2>j rs-5^= =rW   r!   c                   >   ^  \ rS rSrU 4S jrS r   SS jrSrU =r$ )IBertSelfAttention   c           
      ~  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        SU l        SU l        SU l	        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        UR                  U R                  SU R                  U R                  U R                  SS	9U l        [        UR                  U R                  SU R                  U R                  U R                  SS	9U l        [        UR                  U R                  SU R                  U R                  U R                  SS	9U l        [#        U R                  U R                  S
9U l        [#        U R                  U R                  S
9U l        [#        U R                  U R                  S
9U l        [#        U R                  U R                  S
9U l        [,        R.                  " UR0                  5      U l        [5        USS5      U l        U R6                  S:w  a  [        S5      e[9        U R                  U R                  UR:                  S9U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r$   r'   Tbiasr)   bias_bitr*   per_channelr0   r.   r/   zDI-BERT only supports 'absolute' for `config.position_embedding_type`r*   r4   )r5   r6   r=   num_attention_headshasattr
ValueErrorr*   r)   r   r9   intattention_head_sizeall_head_sizer   querykeyvaluer   query_activationkey_activationvalue_activationrM   r   rN   attention_probs_dropout_probrP   rG   r.   r   r4   softmaxrQ   s     rU   r6   IBertSelfAttention.__init__   sB    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  !++#)#=#= #&v'9'9F<V<V'V#W !558P8PP !]]

 ]]
 !]]

 !)$// R&t||P ($// R!)$,,4??!Szz&"E"EF'.v7PR\']$'':5cdd!$,,4??Z`ZnZnorW   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nr,   r      r   r
   )rb   r   r   viewpermute)rR   xnew_x_shapes      rU   transpose_for_scores'IBertSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$rW   c                    U R                  X5      u  pgU R                  X5      u  pU R                  X5      u  pU R                  Xg5      u  pU R	                  X5      u  pU R                  X5      u  nnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  " XR                  SS5      5      n[        R                  " U R                  5      nUU-  nU R                  (       a  X-  U-  nOS nUb  UU-   nU R                  UU5      u  nnU R                  U5      nUb  UU-  n[        R                  " UU5      nUb  UU-  nOS nUR!                  SSSS5      R#                  5       nUR%                  5       S S U R&                  4-   nUR(                  " U6 nU R+                  UU5      u  nnU(       a  UU4OU4nU(       a  UU4OU4nUU4$ )Nr,   r   r   r   r
   )r   r   r   r   r   r   r   rC   matmul	transposemathsqrtr   r*   r   rP   r   
contiguousrb   r   r   rM   )rR   hidden_stateshidden_states_scaling_factorattention_mask	head_maskoutput_attentionsmixed_query_layer mixed_query_layer_scaling_factormixed_key_layermixed_key_layer_scaling_factormixed_value_layer mixed_value_layer_scaling_factorquery_layerquery_layer_scaling_factor	key_layerkey_layer_scaling_factorvalue_layervalue_layer_scaling_factorattention_scoresscaleattention_scores_scaling_factorattention_probsattention_probs_scaling_factorcontext_layercontext_layer_scaling_factornew_context_layer_shapeoutputsoutput_scaling_factors                               rU   ro   IBertSelfAttention.forward   s1    ?Cjj>u;:>((=:o7>Bjj>u; 372G2G3
/ /3.A.A/.r+	262G2G3
//
 //<--i8	//< !<<5H5HR5PQ		$223+e3??.H.cfk.k+.2+%/.@ ;?,,=;
77 ,,7  -	9O_kB)5+ILf+f(+/(%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD 7;6L6L77
33 7H=/2mM] ! *+IJ.0 	 ---rW   )r9   r   r   r   rP   r   r   r   rM   r.   r*   r   r   r   r   r   r)   NNF)	ru   rv   rw   rx   r6   r   ro   rz   r{   r|   s   @rU   r~   r~      s&    8pt% K. K.rW   r~   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertSelfOutputi8  c           
      t  > [         TU ]  5         UR                  U l        SU l        SU l        SU l        SU l        SU l        [        UR                  UR                  SU R                  U R
                  U R                  SS9U l
        [        U R                  U R                  S9U l        [        UR                  UR                  U R                  U R                  UR                  S9U l        [        U R                  U R                  S9U l        [$        R&                  " UR(                  5      U l        g Nr$   r'   r&   Tr   r0   r1   )r5   r6   r*   r9   r)   r   r:   r;   r   r=   denser   ln_input_actr   rK   r4   rL   rM   r   rN   rO   rP   rQ   s     rU   r6   IBertSelfOutput.__init__9  s     ++ ]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rW   c                     U R                  X5      u  pU R                  U5      nU R                  UUUUS9u  pU R                  X5      u  pU R	                  X5      u  pX4$ Nr\   r   rP   r   rL   rM   rR   r   r   input_tensorinput_tensor_scaling_factors        rU   ro   IBertSelfOutput.forwardV  z    6:jj6m3]36:6G6G(!$?	 7H 7
3 7;nn]6q36:6L6L7
3 ::rW   rL   r9   r   r   rP   r   r:   r;   rM   r*   r)   ru   rv   rw   rx   r6   ro   rz   r{   r|   s   @rU   r   r   8      >:; ;rW   r   c                   >   ^  \ rS rSrU 4S jrS r   SS jrSrU =r$ )IBertAttentionig  c                    > [         TU ]  5         UR                  U l        [        U5      U l        [        U5      U l        [        5       U l        g N)	r5   r6   r*   r~   rR   r   outputsetpruned_headsrQ   s     rU   r6   IBertAttention.__init__h  s=     ++&v.	%f-ErW   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   dim)lenr   rR   r   r   r   r   r   r   r   r   r   r   union)rR   headsindexs      rU   prune_headsIBertAttention.prune_headso  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rW   c                     U R                  UUUUU5      u  pgU R                  US   US   X5      u  pU4USS  -   n
U	4USS  -   nX4$ )Nr   r   )rR   r   )rR   r   r   r   r   r   self_outputsself_outputs_scaling_factorattention_outputattention_output_scaling_factorr   outputs_scaling_factors               rU   ro   IBertAttention.forward  s     59II(5
1 =AKKO8;]=
9 $%QR(88"A!CFabcbdFe!e..rW   )r   r   r*   rR   r   )	ru   rv   rw   rx   r6   r   ro   rz   r{   r|   s   @rU   r   r   g  s"    ";, / /rW   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertIntermediatei  c           
        > [         TU ]  5         UR                  U l        SU l        SU l        SU l        [        UR                  UR                  SU R                  U R
                  U R                  SS9U l	        UR                  S:w  a  [        S5      e[        U R                  UR                  S9U l        [        U R                  U R                  S9U l        g )	Nr$   r'   Tr   r   z3I-BERT only supports 'gelu' for `config.hidden_act`r   r0   )r5   r6   r*   r9   r)   r   r   r=   intermediate_sizer   
hidden_actr   r   r4   intermediate_act_fnr   rM   rQ   s     rU   r6   IBertIntermediate.__init__  s     ++ $$]]

 &RSS#*dooU[UiUi#j !)$,,4??!SrW   c                 z    U R                  X5      u  pU R                  X5      u  pU R                  X5      u  pX4$ r   )r   r   rM   )rR   r   r   s      rU   ro   IBertIntermediate.forward  sL    6:jj6m36:6N6N7
3
 7;6L6L7
3 ::rW   )r9   r   r   r   rM   r*   r)   r   r|   s   @rU   r   r     s    T(
; 
;rW   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertOutputi  c           
      t  > [         TU ]  5         UR                  U l        SU l        SU l        SU l        SU l        SU l        [        UR                  UR                  SU R                  U R
                  U R                  SS9U l        [        U R                  U R                  S9U l        [        UR                  UR                  U R                  U R                  UR                   S9U l        [        U R                  U R                  S9U l        [&        R(                  " UR*                  5      U l        g r   )r5   r6   r*   r9   r)   r   r:   r;   r   r   r=   r   r   r   r   rK   r4   rL   rM   r   rN   rO   rP   rQ   s     rU   r6   IBertOutput.__init__  s     ++ $$]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rW   c                     U R                  X5      u  pU R                  U5      nU R                  UUUUS9u  pU R                  X5      u  pU R	                  X5      u  pX4$ r   r   r   s        rU   ro   IBertOutput.forward  r   rW   r   r   r|   s   @rU   r   r     r   rW   r   c                   >   ^  \ rS rSrU 4S jr   SS jrS rSrU =r$ )
IBertLayeri  c                 L  > [         TU ]  5         UR                  U l        SU l        SU l        [        U5      U l        [        U5      U l        [        U5      U l
        [        U R                  U R                  S9U l        [        U R                  U R                  S9U l        g )Nr$   r   r0   )r5   r6   r*   r9   seq_len_dimr   	attentionr   intermediater   r   r   pre_intermediate_actpre_output_actrQ   s     rU   r6   IBertLayer.__init__  s}     ++'/-f5!&)$,T\\doo$V!&t||PrW   c                     U R                  UUUUUS9u  pgUS   nUS   n	USS  n
U R                  X5      u  pU4U
-   n
U
$ )N)r   r   r   )r   feed_forward_chunk)rR   r   r   r   r   r   self_attention_outputs%self_attention_outputs_scaling_factorr   r   r   layer_outputlayer_output_scaling_factors                rU   ro   IBertLayer.forward  s}     IM(/ IW I
E 2!4*OPQ*R'(,484K4K5
1  /G+rW   c                     U R                  X5      u  pU R                  X5      u  p4U R                  X45      u  p4U R                  X4X5      u  pVXV4$ r   )r  r  r  r   )rR   r   r   intermediate_output"intermediate_output_scaling_factorr	  r
  s          rU   r  IBertLayer.feed_forward_chunk  su    <@<U<U=
9 CGBSBSC
? CGBUBUC
? 59KKEU5
1 88rW   )r9   r   r  r   r  r  r*   r   r   )	ru   rv   rw   rx   r6   ro   r  rz   r{   r|   s   @rU   r   r     s#    Q" 69 9rW   r   c                   <   ^  \ rS rSrU 4S jr     SS jrSrU =r$ )IBertEncoderi#  c                    > [         TU ]  5         Xl        UR                  U l        [        R
                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l	        g s  snf r   )
r5   r6   rS   r*   r   
ModuleListrangenum_hidden_layersr   layer)rR   rS   _rT   s      rU   r6   IBertEncoder.__init__$  sT     ++]]fF^F^@_#`@_1Jv$6@_#`a
#`s   A0c                 N   U(       a  SOS nU(       a  SOS n	S n
S n[        U R                  5       H<  u  pU(       a  X4-   nUb  XL   OS nU" UUUUU5      nUS   nU(       d  M4  XS   4-   n	M>     U(       a  X4-   nU(       d  [        S UUUU	U
4 5       5      $ [        UUUU	U
S9$ )N r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r  ).0vs     rU   	<genexpr>'IBertEncoder.forward.<locals>.<genexpr>O  s"      
A  s   	)last_hidden_statepast_key_valuesr   
attentionscross_attentions)	enumerater  tupler   )rR   r   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_masklayer_outputss                   rU   ro   IBertEncoder.forward*  s     #7BD$5b4#!(4OA#$58H$H!.7.CilO(,!M *!,M  &91=M<O&O#!  5$   14D D 
 "&%'(
 
 
 9+.+*1
 	
rW   )rS   r  r*   )NNFFTr   r|   s   @rU   r  r  #  s$    b "6
 6
rW   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )IBertPooleric  c                    > [         TU ]  5         UR                  U l        [        R                  " UR
                  UR
                  5      U l        [        R                  " 5       U l        g r   )	r5   r6   r*   r   Linearr=   r   Tanh
activationrQ   s     rU   r6   IBertPooler.__init__d  sF     ++YYv1163E3EF
'')rW   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r   r6  )rR   r   first_token_tensorpooled_outputs       rU   ro   IBertPooler.forwardj  s6     +1a40

#566rW   )r6  r   r*   r   r|   s   @rU   r2  r2  c  s    $ rW   r2  c                   ,    \ rS rSr\rSrS rSS jrSr	g)IBertPreTrainedModelis  ibertc                    [        U[        [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        [        R                  45      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        [        R                  45      (       aJ  UR                  R
                  R                  5         UR                  R
                  R!                  S5        g[        U["        5      (       a%  UR                  R
                  R                  5         gg)zInitialize the weightsg        )meanstdNg      ?)
isinstancer   r   r4  weightdatanormal_rS   initializer_ranger   zero_r   	Embeddingr(   r   rL   fill_IBertLMHead)rR   modules     rU   _init_weights"IBertPreTrainedModel._init_weightsx  sA   f{BII677 MM&&CT[[5R5R&S{{&  &&( ' >??MM&&CT[[5R5R&S!!-""6#5#56<<> .r|| <==KK""$MM$$S),,KK""$ -rW   Nc                     [        S5      e)Nz6`resize_token_embeddings` is not supported for I-BERT.)NotImplementedError)rR   new_num_tokenss     rU   resize_token_embeddings,IBertPreTrainedModel.resize_token_embeddings  s    !"Z[[rW   r  r   )
ru   rv   rw   rx   r   config_classbase_model_prefixrM  rR  rz   r  rW   rU   r>  r>  s  s    L%$\rW   r>  c                   f  ^  \ rS rSrSrSU 4S jjrS rS rS r\	         SS\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\\\\R                     4   4S jj5       rSrU =r$ )
IBertModeli  a  

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

c                    > [         TU ]  U5        Xl        UR                  U l        [	        U5      U l        [        U5      U l        U(       a  [        U5      OSU l	        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)r5   r6   rS   r*   r!   rl   r  encoderr2  pooler	post_init)rR   rS   add_pooling_layerrT   s      rU   r6   IBertModel.__init__  sX    
 	  ++)&1#F+->k&)D 	rW   c                 .    U R                   R                  $ r   rl   r?   rR   s    rU   get_input_embeddingsIBertModel.get_input_embeddings  s    ...rW   c                 $    XR                   l        g r   r_  )rR   r   s     rU   set_input_embeddingsIBertModel.set_input_embeddings  s    */'rW   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrY  r  r   r   )rR   heads_to_pruner  r   s       rU   _prune_headsIBertModel._prune_heads  s<    
 +002LELLu%//;;EB 3rW   re   r   rf   r+   r   rg   r   r&  r'  returnc
           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUc$  [        R                  " U
[        R                  US9nU R                  X*5      nU R                  XPR                   R                  5      nU R                  UUUUS9u  nnU R!                  UUUUUUU	S9nUS   nU R"                  b  U R#                  U5      OS nU	(       d
  UU4US	S  -   $ [%        UUUR&                  UR(                  UR*                  UR,                  S
9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer,   z5You have to specify either input_ids or inputs_embeds)r[   rY   )re   r+   rf   rg   )r   r   r   r&  r'  r   r   )r   pooler_outputr!  r   r"  r#  )rS   r   r&  use_return_dictr   %warn_if_padding_and_no_attention_maskrb   r[   rC   onesrc   rd   get_extended_attention_maskget_head_maskr  rl   rY  rZ  r   r!  r   r"  r#  )rR   re   r   rf   r+   r   rg   r   r&  r'  ri   
batch_size
seq_lengthr[   extended_attention_maskembedding_outputembedding_output_scaling_factorencoder_outputssequence_outputr;  s                       rU   ro   IBertModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!"[[EJJvVN 150P0PQ_0m &&y++2O2OP	<@OO%)'	 =L =
99 ,,+2/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rW   )rS   rl   rY  rZ  r*   )T)	NNNNNNNNN)ru   rv   rw   rx   ry   r6   ra  rd  ri  r   r   rC   
LongTensorFloatTensorboolr   r   r   ro   rz   r{   r|   s   @rU   rW  rW    s    "/0C  156:59371559,0/3&*K
E,,-K
 !!2!23K
 !!1!12	K

 u//0K
 E--.K
   1 12K
 $D>K
 'tnK
 d^K
 
;U5CTCT=UU	VK
 K
rW   rW  c                     ^  \ rS rSrSS/rU 4S jrS rS r\          SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\\
R                     4   4S jj5       rSrU =r$ )IBertForMaskedLMi  zlm_head.decoder.biaszlm_head.decoder.weightc                    > [         TU ]  U5        [        USS9U l        [	        U5      U l        U R                  5         g NF)r\  )r5   r6   rW  r?  rK  lm_headr[  rQ   s     rU   r6   IBertForMaskedLM.__init__  s6     %@
"6* 	rW   c                 .    U R                   R                  $ r   )r  decoderr`  s    rU   get_output_embeddings&IBertForMaskedLM.get_output_embeddings  s    ||###rW   c                 Z    XR                   l        UR                  U R                   l        g r   )r  r  r   )rR   new_embeddingss     rU   set_output_embeddings&IBertForMaskedLM.set_output_embeddings  s    -*//rW   re   r   rf   r+   r   rg   labelsr   r&  r'  rk  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Nr   rf   r+   r   rg   r   r&  r'  r   r,   r   losslogitsr   r"  )
rS   rn  r?  r  r   r   r<   r   r   r"  )rR   re   r   rf   r+   r   rg   r  r   r&  r'  r   ry  prediction_scoresmasked_lm_lossloss_fctr   s                    rU   ro   IBertForMaskedLM.forward  s    ( &1%<k$++B]B]**))%'/!5#  

 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rW   )r?  r  
NNNNNNNNNN)ru   rv   rw   rx   _tied_weights_keysr6   r  r  r   r   rC   r{  r|  r}  r   r   r   ro   rz   r{   r|   s   @rU   r  r    s,   02JK$0  156:59371559-1,0/3&*1
E,,-1
 !!2!231
 !!1!12	1

 u//01
 E--.1
   1 121
 ))*1
 $D>1
 'tn1
 d^1
 
~uU%6%677	81
 1
rW   r  c                   <   ^  \ rS rSrSrU 4S jrS rSS jrSrU =r	$ )rK  iP  z)I-BERT Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )N)r2   )r5   r6   r   r4  r=   r   rL   rK   
layer_normr<   r  	ParameterrC   rc   r   rQ   s     rU   r6   IBertLMHead.__init__S  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrW   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r  r  )rR   featureskwargsr   s       rU   ro   IBertLMHead.forward\  s;    JJx GOOA LLOrW   c                     U R                   R                  R                  R                  S:X  a  U R                  U R                   l        g U R                   R                  U l        g )Nmeta)r  r   r[   typer`  s    rU   _tie_weightsIBertLMHead._tie_weightsf  sC    <<##((F2 $		DLL ))DIrW   )r   r  r   r  )rk  N)
ru   rv   rw   rx   ry   r6   ro   r  rz   r{   r|   s   @rU   rK  rK  P  s    3&* *rW   rK  z
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\\R                     4   4S jj5       rSrU =r$ )IBertForSequenceClassificationio  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [        U5      U l        U R                  5         g r  )r5   r6   
num_labelsrW  r?  IBertClassificationHead
classifierr[  rQ   s     rU   r6   'IBertForSequenceClassification.__init__v  sC      ++%@
1&9 	rW   re   r   rf   r+   r   rg   r  r   r&  r'  rk  c                 0   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU
(       d  U4US	S -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S
9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr,   r   r  )rS   rn  r?  r  problem_typer  rZ   rC   rd   r   r	   squeezer   r   r   r   r   r"  rR   re   r   rf   r+   r   rg   r  r   r&  r'  r   ry  r  r  r  r   s                    rU   ro   &IBertForSequenceClassification.forward  s   ( &1%<k$++B]B]**))%'/!5#  

 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rW   )r  r?  r  r  )ru   rv   rw   rx   r6   r   r   rC   r{  r|  r}  r   r   r   ro   rz   r{   r|   s   @rU   r  r  o  s$     156:59371559-1,0/3&*B
E,,-B
 !!2!23B
 !!1!12	B

 u//0B
 E--.B
   1 12B
 ))*B
 $D>B
 'tnB
 d^B
 
'u/@/@)AA	BB
 B
rW   r  c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\\R                     4   4S jj5       rSrU =r$ )IBertForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r5   r6   rW  r?  r   rN   rO   rP   r4  r=   r  r[  rQ   s     rU   r6   IBertForMultipleChoice.__init__  sV     '
zz&"<"<=))F$6$6: 	rW   re   rf   r   r  r+   r   rg   r   r&  r'  rk  c                 \   U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nr   r,   r   )r+   rf   r   r   rg   r   r&  r'  r   r  )rS   rn  shaper   rb   r?  rP   r  r   r   r   r"  )rR   re   rf   r   r  r+   r   rg   r   r&  r'  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r;  r  reshaped_logitsr  r  r   s                           rU   ro   IBertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ***..,/!5#  

  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rW   )r  rP   r?  r  )ru   rv   rw   rx   r6   r   r   rC   r{  r|  r}  r   r   r   ro   rz   r{   r|   s   @rU   r  r    s$     15596:-1371559,0/3&*W
E,,-W
 !!1!12W
 !!2!23	W

 ))*W
 u//0W
 E--.W
   1 12W
 $D>W
 'tnW
 d^W
 
(%0A0A*BB	CW
 W
rW   r  c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\\R                     4   4S jj5       rSrU =r$ )IBertForTokenClassificationi-  c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )r5   r6   r  rW  r?  r   rN   rO   rP   r4  r=   r  r[  rQ   s     rU   r6   $IBertForTokenClassification.__init__/  sk      ++%@
zz&"<"<=))F$6$68I8IJ 	rW   re   r   rf   r+   r   rg   r  r   r&  r'  rk  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r,   r   r  )rS   rn  r?  rP   r  r   r   r  r   r   r"  r  s                    rU   ro   #IBertForTokenClassification.forward:  s    $ &1%<k$++B]B]**))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rW   )r  rP   r?  r  r  )ru   rv   rw   rx   r6   r   r   rC   r{  r|  r}  r   r   r   ro   rz   r{   r|   s   @rU   r  r  -  s   	  156:59371559-1,0/3&*2
E,,-2
 !!2!232
 !!1!12	2

 u//02
 E--.2
   1 122
 ))*2
 $D>2
 'tn2
 d^2
 
$eE,=,=&>>	?2
 2
rW   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  ip  z-Head for sentence-level classification tasks.c                 ,  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        g r   )r5   r6   r   r4  r=   r   rN   rO   rP   r  out_projrQ   s     rU   r6    IBertClassificationHead.__init__s  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHrW   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r9  )rP   r   rC   tanhr  )rR   r  r  r   s       rU   ro   IBertClassificationHead.forwardy  s^     Aq)]3

=1

=1]3m4rW   )r   rP   r  )	ru   rv   rw   rx   ry   r6   ro   rz   r{   r|   s   @rU   r  r  p  s    7I rW   r  c                     ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\\R                     4   4S jj5       rSrU =r$ )IBertForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r5   r6   r  rW  r?  r   r4  r=   
qa_outputsr[  rQ   s     rU   r6   "IBertForQuestionAnswering.__init__  sU      ++%@
))F$6$68I8IJ 	rW   re   r   rf   r+   r   rg   start_positionsend_positionsr   r&  r'  rk  c                 $   Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r,   r   )ignore_indexr   )r  start_logits
end_logitsr   r"  )rS   rn  r?  r  splitr  r   r   rb   clampr   r   r   r"  )rR   re   r   rf   r+   r   rg   r  r  r   r&  r'  r   ry  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rU   ro   !IBertForQuestionAnswering.forward  s    &1%<k$++B]B]**))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rW   )r?  r  r  )NNNNNNNNNNN)ru   rv   rw   rx   r6   r   r   rC   r{  r|  r}  r   r   r   ro   rz   r{   r|   s   @rU   r  r    s/     156:593715596:48,0/3&*>
E,,->
 !!2!23>
 !!1!12	>

 u//0>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
+U53D3D-EE	F>
 >
rW   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )a1  
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's *utils.make_positions*.

Args:
input_ids (`torch.LongTensor`):
       Indices of input sequence tokens in the vocabulary.

Returns: torch.Tensor
r   r   )ner   rC   cumsumtype_asrd   )re   r(   rh   maskincremental_indicess        rU   r_   r_     sW     <<$((*D <<!4<<TBE[[_cc##%33rW   )r  r  r  r  r  rW  r>  )r   )@ry   r   typingr   r   r   rC   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   configuration_ibertr   quant_modulesr   r   r   r   r   r   
get_loggerru   loggerModuler!   r~   r   r   r   r   r   r  r2  r>  rW  r  rK  r  r  r  r  r  r_   __all__r  rW   rU   <module>r     s  $   ) )    A A    . Q , , c c 
		H	%w=bii w=tK. K.\,;bii ,;^./RYY ./b;		 ;D,;")) ,;^79 79t=
299 =
@"))   \? \ \4 u
% u
 u
p E
+ E
 E
P*")) *> N
%9 N
N
b c
1 c
 c
L ?
"6 ?
 ?
Dbii & J
 4 J
 J
Z4"rW   